screenhand 0.3.8 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4219,6 +4219,27 @@ server.tool("click_with_fallback", "Click a target by text using the canonical f
4219
4219
  }
4220
4220
  throw new Error("Target not found via OCR");
4221
4221
  }
4222
+ case "window_buffer": {
4223
+ // Last resort: capture GPU window buffer (works even when window is hidden),
4224
+ // OCR it, find target text, translate window-relative to screen-absolute coords
4225
+ const wbWindowId = await resolveWindowId(targetPid);
4226
+ if (!wbWindowId)
4227
+ throw new Error("No window found for window_buffer capture");
4228
+ const wbShot = await bridge.call("cg.captureWindow", { windowId: wbWindowId });
4229
+ const wbMatches = await bridge.call("vision.findText", { imagePath: wbShot.path, searchText: target });
4230
+ const wbMatch = Array.isArray(wbMatches) ? wbMatches[0] : null;
4231
+ if (!wbMatch?.bounds)
4232
+ throw new Error("Target not found via window buffer OCR");
4233
+ // Translate window-relative coords to screen-absolute
4234
+ const allWins = await bridge.call("app.windows");
4235
+ const winInfo = allWins.find((w) => w.windowId === wbWindowId);
4236
+ const winX = winInfo?.bounds?.x ?? 0;
4237
+ const winY = winInfo?.bounds?.y ?? 0;
4238
+ const absX = winX + wbMatch.bounds.x + wbMatch.bounds.width / 2;
4239
+ const absY = winY + wbMatch.bounds.y + wbMatch.bounds.height / 2;
4240
+ await bridge.call("cg.mouseClick", { x: absX, y: absY });
4241
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: `${target} at (${Math.round(absX)},${Math.round(absY)}) [window_buffer]` };
4242
+ }
4222
4243
  }
4223
4244
  throw new Error(`Unknown method: ${method}`);
4224
4245
  }
@@ -4553,6 +4574,22 @@ server.tool("read_with_fallback", "Read text content from the screen or a specif
4553
4574
  const ocr = await bridge.call("vision.ocr", { imagePath: shot.path });
4554
4575
  return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: ocr.text?.slice(0, 4000) ?? "" };
4555
4576
  }
4577
+ case "window_buffer": {
4578
+ // GPU window buffer capture — reads content even when window is behind other apps
4579
+ const rbWindowId = await resolveWindowId(targetPid);
4580
+ if (!rbWindowId)
4581
+ throw new Error("No window found for window_buffer read");
4582
+ const rbShot = await bridge.call("cg.captureWindow", { windowId: rbWindowId });
4583
+ if (target) {
4584
+ const rbMatches = await bridge.call("vision.findText", { imagePath: rbShot.path, searchText: target });
4585
+ const rbMatch = Array.isArray(rbMatches) ? rbMatches[0] : null;
4586
+ if (!rbMatch)
4587
+ throw new Error("Text not found via window buffer OCR");
4588
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: rbMatch.text };
4589
+ }
4590
+ const rbOcr = await bridge.call("vision.ocr", { imagePath: rbShot.path });
4591
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: rbOcr.text?.slice(0, 4000) ?? "" };
4592
+ }
4556
4593
  }
4557
4594
  throw new Error(`Method ${method} does not support read`);
4558
4595
  }
@@ -4668,6 +4705,29 @@ server.tool("locate_with_fallback", "Find an element's position on screen using
4668
4705
  const b = match.bounds;
4669
4706
  return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: `${target} at (${b.x},${b.y} ${b.width}x${b.height})` };
4670
4707
  }
4708
+ case "window_buffer": {
4709
+ // GPU window buffer capture + OCR — works even when window is hidden
4710
+ const lbWindowId = await resolveWindowId(targetPid);
4711
+ if (!lbWindowId)
4712
+ throw new Error("No window found for window_buffer locate");
4713
+ const lbShot = await bridge.call("cg.captureWindow", { windowId: lbWindowId });
4714
+ const lbMatches = await bridge.call("vision.findText", { imagePath: lbShot.path, searchText: target });
4715
+ const lbMatch = Array.isArray(lbMatches) ? lbMatches[0] : null;
4716
+ if (!lbMatch?.bounds)
4717
+ throw new Error("Target not found via window buffer OCR");
4718
+ // Translate window-relative to screen-absolute bounds
4719
+ const lbWins = await bridge.call("app.windows");
4720
+ const lbWinInfo = lbWins.find((w) => w.windowId === lbWindowId);
4721
+ const lbOffX = lbWinInfo?.bounds?.x ?? 0;
4722
+ const lbOffY = lbWinInfo?.bounds?.y ?? 0;
4723
+ const lbBounds = {
4724
+ x: lbOffX + lbMatch.bounds.x,
4725
+ y: lbOffY + lbMatch.bounds.y,
4726
+ width: lbMatch.bounds.width,
4727
+ height: lbMatch.bounds.height,
4728
+ };
4729
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: `${target} at (${lbBounds.x},${lbBounds.y} ${lbBounds.width}x${lbBounds.height}) [window_buffer]` };
4730
+ }
4671
4731
  }
4672
4732
  throw new Error(`Method ${method} does not support locate`);
4673
4733
  }
@@ -126,6 +126,26 @@ export class LearningEngine {
126
126
  rankSensors(bundleId) {
127
127
  return this.sensors.rank(bundleId);
128
128
  }
129
+ /**
130
+ * Detect whether an app is "vision-only" — AX can't see its content,
131
+ * so window buffer capture + OCR is the only viable perception source.
132
+ * Returns true when AX has failed enough times with a low score and
133
+ * at least one other source (vision/ocr) has succeeded.
134
+ */
135
+ isVisionOnlyApp(bundleId) {
136
+ const ranked = this.sensors.rank(bundleId);
137
+ if (ranked.length < 2)
138
+ return false;
139
+ const ax = ranked.find(r => r.sourceType === "ax");
140
+ const vision = ranked.find(r => r.sourceType === "vision" || r.sourceType === "ocr");
141
+ // AX score near zero + vision/ocr has some success
142
+ if (ax && ax.score < 0.15 && vision && vision.score > 0.3)
143
+ return true;
144
+ // No AX entry at all but vision works
145
+ if (!ax && vision && vision.score > 0.3)
146
+ return true;
147
+ return false;
148
+ }
129
149
  /**
130
150
  * Query verified UI patterns for a given app, optionally filtered by tool.
131
151
  */
@@ -768,13 +768,18 @@ export class PerceptionCoordinator extends EventEmitter {
768
768
  // Safe CLI mode is already enabled via setSafeCLI() in start().
769
769
  // This allows vision/OCR for canvas-heavy apps like Canva in Chrome.
770
770
  // Skip vision if learning engine shows it consistently fails for this app,
771
- // but retry every 20th cycle to re-evaluate (apps may gain windows later)
771
+ // but retry every 20th cycle to re-evaluate (apps may gain windows later).
772
+ // Exception: vision-only apps (AX blind) — vision/OCR is their ONLY perception
773
+ // source, so never skip it. Window buffer capture works even when window is hidden.
772
774
  if (this.learningEngine && this.activeAppContext) {
773
- const ranked = this.learningEngine.rankSensors(this.activeAppContext.bundleId);
774
- const visionRank = ranked.find(r => r.sourceType === "vision");
775
- if (visionRank && visionRank.score < 0.1 && ranked.length >= 2 && this.stats.slowCycles % 20 !== 0) {
776
- this.stats.slowCycles++;
777
- return; // Vision consistently fails for this app skip (retry every 20th cycle)
775
+ const isVisionOnly = this.learningEngine.isVisionOnlyApp(this.activeAppContext.bundleId);
776
+ if (!isVisionOnly) {
777
+ const ranked = this.learningEngine.rankSensors(this.activeAppContext.bundleId);
778
+ const visionRank = ranked.find(r => r.sourceType === "vision");
779
+ if (visionRank && visionRank.score < 0.1 && ranked.length >= 2 && this.stats.slowCycles % 20 !== 0) {
780
+ this.stats.slowCycles++;
781
+ return; // Vision consistently fails for this app — skip (retry every 20th cycle)
782
+ }
778
783
  }
779
784
  }
780
785
  const timestamp = new Date().toISOString();
@@ -860,14 +865,24 @@ export class PerceptionCoordinator extends EventEmitter {
860
865
  },
861
866
  });
862
867
  }
863
- // Record vision sensor outcome
868
+ // Record vision sensor outcome — also record as window_buffer for vision-only apps
869
+ // so the fallback chain knows this source works for element location
864
870
  if (this.learningEngine && this.activeAppContext) {
871
+ const latencyMs = Date.now() - new Date(timestamp).getTime();
865
872
  this.learningEngine.recordSensorOutcome({
866
873
  bundleId: this.activeAppContext.bundleId,
867
874
  sourceType: "vision",
868
875
  success: !!diffEvent,
869
- latencyMs: Date.now() - new Date(timestamp).getTime(),
876
+ latencyMs,
870
877
  });
878
+ if (this.learningEngine.isVisionOnlyApp(this.activeAppContext.bundleId) && ocrEvent) {
879
+ this.learningEngine.recordSensorOutcome({
880
+ bundleId: this.activeAppContext.bundleId,
881
+ sourceType: "window_buffer",
882
+ success: true,
883
+ latencyMs,
884
+ });
885
+ }
871
886
  }
872
887
  }
873
888
  catch {
@@ -21,6 +21,7 @@ export const DEFAULT_PERCEPTION_CONFIG = {
21
21
  enableAX: true,
22
22
  enableCDP: true,
23
23
  enableVision: true,
24
+ enableWindowBuffer: true,
24
25
  maxROIsPerCycle: 3,
25
26
  skipCaptureLock: false,
26
27
  };
@@ -23,7 +23,7 @@
23
23
  */
24
24
  // ── 1. Fallback Chain ──────────────────────────────────────────────────
25
25
  /** Ordered list of execution methods, from fastest/most reliable to slowest/least reliable */
26
- const EXECUTION_METHODS = ["ax", "cdp", "ocr", "coordinates"];
26
+ const EXECUTION_METHODS = ["ax", "cdp", "ocr", "window_buffer", "coordinates"];
27
27
  const METHOD_CAPABILITIES = {
28
28
  ax: {
29
29
  method: "ax",
@@ -61,6 +61,18 @@ const METHOD_CAPABILITIES = {
61
61
  requiresBridge: true,
62
62
  requiresCDP: false,
63
63
  },
64
+ window_buffer: {
65
+ method: "window_buffer",
66
+ canClick: true,
67
+ canType: false,
68
+ canRead: true,
69
+ canLocate: true,
70
+ canSelect: false,
71
+ canScroll: false,
72
+ avgLatencyMs: 350,
73
+ requiresBridge: true,
74
+ requiresCDP: false,
75
+ },
64
76
  coordinates: {
65
77
  method: "coordinates",
66
78
  canClick: true,
@@ -90,6 +102,7 @@ const SENSOR_TO_METHOD = {
90
102
  chrome: "cdp",
91
103
  ocr: "ocr",
92
104
  vision: "ocr",
105
+ window_buffer: "window_buffer",
93
106
  coordinates: "coordinates",
94
107
  };
95
108
  /**
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "screenhand",
3
- "version": "0.3.8",
3
+ "version": "0.3.9",
4
4
  "mcpName": "io.github.manushi4/screenhand",
5
5
  "description": "Give AI eyes and hands on your desktop. ScreenHand is an open-source MCP server that lets Claude and other AI agents see your screen, click buttons, type text, and control any app on macOS and Windows.",
6
6
  "homepage": "https://screenhand.com",