screenhand 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,325 @@
1
+ // Copyright (C) 2025 Clazro Technology Private Limited
2
+ // SPDX-License-Identifier: AGPL-3.0-only
3
+ /**
4
+ * VisualMapper — Phase 3: Visual App Mapping.
5
+ *
6
+ * Two-phase approach:
7
+ * Phase A (quickScan): Screenshot → fast OCR → spatial clustering → element coordinates.
8
+ * No LLM needed, ~500ms, works offline.
9
+ * Phase B (llmEnrich): Screenshot + AX tree → Claude Vision API → semantic zone labels.
10
+ * Needs ANTHROPIC_API_KEY, ~5-15s, runs in background.
11
+ *
12
+ * Results populate the existing AppMap (fills -1,-1 coordinates, adds zone labels).
13
+ * LLM labels are hypotheses (confidence 0.5) until validated by 3+ AX matches.
14
+ */
15
+ import * as crypto from "node:crypto";
16
+ // ── Sensitive App Blocklist ────────────────────────────────────────
17
/**
 * Bundle identifiers of apps that must never be visually mapped.
 * Entries are stored lower-cased so that lookups are case-insensitive.
 */
const BLOCKED_BUNDLE_IDS = new Set([
    "com.1password.1password",
    "com.agilebits.onepassword7",
    "com.lastpass.LastPass",
    "com.bitwarden.desktop",
    "com.dashlane.dashlanephonefinal",
    "com.apple.keychainaccess",
    "com.apple.systempreferences", // System Settings (may show accounts)
].map((id) => id.toLowerCase()));
/**
 * Decide whether an app is too sensitive to screenshot or map.
 *
 * An app is sensitive when its bundle ID is on the blocklist (compared
 * case-insensitively) or contains one of the substrings "bank", "health",
 * "medical" or "wallet" (also case-insensitive).
 *
 * @param {string} bundleId - Bundle identifier of the app. `null`/`undefined` are treated as "".
 * @returns {boolean} `true` when the app must be skipped.
 */
export function isSensitiveApp(bundleId) {
    const id = String(bundleId ?? "");
    // The pattern check below is already case-insensitive; keep the exact-ID
    // check consistent so "COM.LASTPASS.LASTPASS" cannot slip past the blocklist.
    if (BLOCKED_BUNDLE_IDS.has(id.toLowerCase()))
        return true;
    // Block known banking/health patterns
    return /bank|health|medical|wallet/i.test(id);
}
34
+ // ── Quick Scan (Phase A) ──────────────────────────────────────────
35
/**
 * Hash a screenshot for staleness detection.
 *
 * Hashes the file's bytes so two captures with identical bytes yield the same
 * value. If the file cannot be read, falls back to hashing the path string so
 * the caller still receives a 16-hex-character token.
 *
 * @param {string} imagePath - Path of the captured screenshot file.
 * @returns {Promise<string>} First 16 hex characters of the SHA-256 digest.
 */
async function hashScreenshot(imagePath) {
    let contents = null;
    try {
        const { readFile } = await import("node:fs/promises");
        contents = await readFile(imagePath);
    }
    catch {
        // Best effort: an unreadable file must not abort the scan.
    }
    return crypto
        .createHash("sha256")
        .update(contents ?? String(imagePath))
        .digest("hex")
        .slice(0, 16);
}
/**
 * Perform a quick visual scan using OCR only (no LLM).
 *
 * Captures the screen through the bridge, runs OCR on the captured file and
 * converts each recognised text region into an element whose x/y are fractions
 * (0-1) of the window, then groups the elements into zones.
 *
 * @param {{call: Function}} bridge - Object exposing `call(method, params)`; used for
 *   "cg.captureScreen" and "vision.ocr".
 * @param {number} pid - Not used by this function.
 * @param {{x: number, y: number, width: number, height: number}} [windowBounds] - Logical
 *   window rectangle. When missing or zero-sized, the whole capture is used as the window.
 * @returns {Promise<null | {scan: {zones: Array, elements: Array, confidence: number},
 *   hash: string, captureSize: {w: number, h: number}}>} `null` when the capture fails or
 *   yields no path; an empty scan with confidence 0.2 when OCR fails or finds nothing.
 */
export async function quickScan(bridge, pid, windowBounds) {
    // Take screenshot using cg.captureScreen (the bridge's actual method)
    let screenshotResult;
    try {
        screenshotResult = await bridge.call("cg.captureScreen", {});
    }
    catch {
        return null;
    }
    if (!screenshotResult?.path)
        return null;
    // Zero-sized or partial bounds would make every element collapse to (0,0);
    // treat them as "no window known" and normalise against the full capture.
    const hasWindow = windowBounds != null && windowBounds.width > 0 && windowBounds.height > 0;
    const usableBounds = hasWindow ? windowBounds : undefined;
    const captureW = screenshotResult.width ?? usableBounds?.width ?? 1440;
    const captureH = screenshotResult.height ?? usableBounds?.height ?? 900;
    // Detect Retina scale factor: capture pixels / logical window size.
    // NOTE(review): this divides the capture width by the *window* width, so it only
    // equals the display scale when the window spans the full capture — confirm
    // against what cg.captureScreen actually returns.
    const scaleFactor = hasWindow
        ? Math.round(captureW / windowBounds.width) || 2
        : 2;
    // Window bounds in capture-pixel space (for coordinate normalization)
    const winPixelX = hasWindow ? (windowBounds.x ?? 0) * scaleFactor : 0;
    const winPixelY = hasWindow ? (windowBounds.y ?? 0) * scaleFactor : 0;
    const winPixelW = hasWindow ? windowBounds.width * scaleFactor : captureW;
    const winPixelH = hasWindow ? windowBounds.height * scaleFactor : captureH;
    // Compute screenshot hash for staleness detection (content-based)
    const hash = await hashScreenshot(screenshotResult.path);
    const captureSize = { w: captureW, h: captureH };
    const emptyResult = () => ({
        scan: { zones: [], elements: [], confidence: 0.2 },
        hash,
        captureSize: { ...captureSize },
    });
    // Run OCR on the captured screenshot file
    // Bridge returns: { text, confidence, bounds: { x, y, width, height } }
    let ocrResult;
    try {
        ocrResult = await bridge.call("vision.ocr", {
            imagePath: screenshotResult.path,
        });
    }
    catch {
        return emptyResult();
    }
    if (!ocrResult?.regions || ocrResult.regions.length === 0) {
        return emptyResult();
    }
    const clampUnit = (value) => Math.max(0, Math.min(1, value));
    // Convert OCR regions to elements with positions relative to WINDOW (0-1),
    // not full screen. This matches the coordinator's AX normalization.
    const elements = [];
    for (const region of ocrResult.regions) {
        const text = (region?.text ?? "").trim();
        if (!text || text.length < 2 || text.length > 100)
            continue;
        // OCR returns bounds nested: { bounds: { x, y, width, height } }
        const pixelX = region.bounds?.x ?? region.x ?? 0;
        const pixelY = region.bounds?.y ?? region.y ?? 0;
        // Convert from full-screen pixel coords to window-relative (0-1)
        const relX = winPixelW > 0 ? (pixelX - winPixelX) / winPixelW : 0;
        const relY = winPixelH > 0 ? (pixelY - winPixelY) / winPixelH : 0;
        // Skip elements outside the window (5% tolerance on each side)
        if (relX < -0.05 || relX > 1.05 || relY < -0.05 || relY > 1.05)
            continue;
        elements.push({
            label: text,
            role: "staticText",
            x: Math.round(clampUnit(relX) * 1000) / 1000,
            y: Math.round(clampUnit(relY) * 1000) / 1000,
            zone: classifyZone(clampUnit(relY)),
            confidence: 0.6,
        });
    }
    // Cluster elements into zones based on Y position
    const zones = clusterIntoZones(elements, captureW, captureH);
    // More recognised text means a more trustworthy scan.
    const confidence = elements.length > 10 ? 0.7 : elements.length > 3 ? 0.5 : 0.3;
    return {
        scan: { zones, elements, confidence },
        hash,
        captureSize,
    };
}
112
+ // ── LLM Enrichment (Phase B) ──────────────────────────────────────
113
/**
 * Enrich a visual map using the Claude Vision API.
 *
 * Sends the screenshot plus a prompt built from the app metadata and AX tree to
 * the Anthropic Messages endpoint and parses the model's text reply.
 *
 * NOTE(review): this function does not call isSensitiveApp itself; callers are
 * presumably expected to gate on it before passing a screenshot — confirm.
 *
 * @param {string} screenshotBase64 - Base64 PNG data, sent as an image block.
 * @param {string} axTree - Serialized AX tree, embedded in the prompt.
 * @param {string} appName - App name, embedded in the prompt.
 * @param {string} bundleId - Bundle identifier, embedded in the prompt.
 * @param {string} windowTitle - Window title, embedded in the prompt.
 * @param {{w: number, h: number}} captureSize - Capture size, embedded in the prompt.
 * @returns {Promise<object|null>} The value returned by parseLLMResponse, or `null`
 *   when ANTHROPIC_API_KEY is unset, the request fails or times out (30 s), the
 *   response is not OK, or the response contains no text.
 */
export async function llmEnrich(screenshotBase64, axTree, appName, bundleId, windowTitle, captureSize) {
    const apiKey = process.env.ANTHROPIC_API_KEY;
    if (!apiKey)
        return null;
    try {
        // Built inside the try so malformed inputs result in `null`, like every
        // other failure of this best-effort step, instead of a rejected promise.
        const prompt = buildLLMPrompt(appName, bundleId, windowTitle, captureSize, axTree);
        const resp = await fetch("https://api.anthropic.com/v1/messages", {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                "x-api-key": apiKey,
                "anthropic-version": "2023-06-01",
            },
            body: JSON.stringify({
                model: "claude-haiku-4-5-20251001",
                max_tokens: 4096,
                messages: [{
                        role: "user",
                        content: [
                            {
                                type: "image",
                                source: {
                                    type: "base64",
                                    media_type: "image/png",
                                    data: screenshotBase64,
                                },
                            },
                            { type: "text", text: prompt },
                        ],
                    }],
            }),
            signal: AbortSignal.timeout(30_000),
        });
        if (!resp.ok) {
            process.stderr.write(`[visual-mapper] LLM API error: ${resp.status}\n`);
            return null;
        }
        const body = await resp.json();
        // Use the first content block that actually carries text rather than
        // assuming it is always at index 0.
        const blocks = Array.isArray(body?.content) ? body.content : [];
        const text = blocks.find((block) => typeof block?.text === "string" && block.text.length > 0)?.text;
        if (!text)
            return null;
        return parseLLMResponse(text);
    }
    catch (err) {
        process.stderr.write(`[visual-mapper] LLM enrichment failed: ${err instanceof Error ? err.message : String(err)}\n`);
        return null;
    }
}
166
/**
 * Build the text prompt sent alongside the screenshot.
 *
 * The AX tree is cut to its first 3000 characters (with a "...(truncated)"
 * marker) to bound the prompt size. A `null`/`undefined` tree is treated as empty.
 *
 * @param {string} appName - App name shown in the prompt header.
 * @param {string} bundleId - Bundle identifier shown next to the app name.
 * @param {string} windowTitle - Title of the window being labeled.
 * @param {{w: number, h: number}} captureSize - Size reported to the model.
 * @param {string} [axTree] - Serialized AX tree used for cross-reference.
 * @returns {string} The prompt text.
 */
function buildLLMPrompt(appName, bundleId, windowTitle, captureSize, axTree) {
    const maxAxChars = 3000;
    const axText = typeof axTree === "string" ? axTree : String(axTree ?? "");
    const truncatedAX = axText.length > maxAxChars
        ? axText.slice(0, maxAxChars) + "\n...(truncated)"
        : axText;
    return `You are labeling a macOS application screenshot for UI automation.

App: ${appName} (${bundleId})
Window: ${windowTitle}
Size: ${captureSize.w}x${captureSize.h}

AX accessibility tree (for cross-reference):
${truncatedAX}

Return a JSON object with this exact structure:
{
"screenDescription": "one-line description of this screen",
"confidence": 0.0-1.0,
"zones": [
{"label": "zone name", "type": "toolbar|sidebar|canvas|panel|dialog|menu|status-bar|tab-bar|other", "bounds": {"top": 0.0, "left": 0.0, "width": 1.0, "height": 0.1}, "purpose": "what this zone does"}
],
"elements": [
{"label": "element name", "role": "button|textField|menu|checkbox|list|tab|other", "x": 0.0, "y": 0.0, "zone": "zone name", "purpose": "what clicking this does", "confidence": 0.0-1.0}
]
}

Rules:
1. Match AX tree labels where possible. Use the EXACT AX label when available.
2. All positions are fractions of window dimensions (0.0-1.0). x,y is the center point.
3. Only label INTERACTIVE elements (buttons, fields, menus, tabs, checkboxes, links).
4. Do NOT include specific user data (names, emails, file contents) in labels — only UI structure.
5. Return ONLY the JSON object. No markdown fences, no commentary.`;
}
196
/**
 * Parse the LLM's JSON reply into a sanitized result.
 *
 * Tolerates a reply wrapped in markdown fences and a reply with prose around
 * the JSON object (the text between the first "{" and the last "}" is tried
 * when the whole text is not valid JSON). Zones without a label or bounds and
 * elements without a label are dropped; numeric fields are clamped to 0-1.
 *
 * @param {string} text - Raw text returned by the model.
 * @returns {null | {screenDescription: string, confidence: number, zones: Array, elements: Array}}
 *   `null` when `text` is not a string, cannot be parsed, or does not contain a JSON object.
 */
export function parseLLMResponse(text) {
    if (typeof text !== "string")
        return null;
    // Strip markdown fences if present
    let cleaned = text.trim();
    if (cleaned.startsWith("```")) {
        cleaned = cleaned.replace(/^```(?:json)?\s*/i, "").replace(/\s*```$/, "");
    }
    try {
        let parsed;
        try {
            parsed = JSON.parse(cleaned);
        }
        catch (err) {
            // Models sometimes add commentary before/after the object despite the
            // instructions; retry on the outermost {...} span.
            const start = cleaned.indexOf("{");
            const end = cleaned.lastIndexOf("}");
            if (start === -1 || end <= start)
                throw err;
            parsed = JSON.parse(cleaned.slice(start, end + 1));
        }
        // Arrays are objects too, but the contract is a single JSON object.
        if (!parsed || typeof parsed !== "object" || Array.isArray(parsed))
            return null;
        const result = {
            screenDescription: typeof parsed.screenDescription === "string" ? parsed.screenDescription : "",
            confidence: typeof parsed.confidence === "number" ? Math.min(1, Math.max(0, parsed.confidence)) : 0.5,
            zones: [],
            elements: [],
        };
        // Parse zones
        if (Array.isArray(parsed.zones)) {
            for (const z of parsed.zones) {
                if (!z?.label || !z?.bounds)
                    continue;
                result.zones.push({
                    label: String(z.label),
                    type: validateZoneType(z.type),
                    bounds: {
                        top: clamp01(z.bounds.top ?? 0),
                        left: clamp01(z.bounds.left ?? 0),
                        width: clamp01(z.bounds.width ?? 0),
                        height: clamp01(z.bounds.height ?? 0),
                    },
                    purpose: String(z.purpose ?? ""),
                });
            }
        }
        // Parse elements
        if (Array.isArray(parsed.elements)) {
            for (const el of parsed.elements) {
                if (!el?.label)
                    continue;
                result.elements.push({
                    label: String(el.label),
                    role: String(el.role ?? "other"),
                    x: clamp01(el.x ?? 0),
                    y: clamp01(el.y ?? 0),
                    zone: String(el.zone ?? "other"),
                    purpose: String(el.purpose ?? ""),
                    confidence: typeof el.confidence === "number" ? clamp01(el.confidence) : 0.5,
                });
            }
        }
        return result;
    }
    catch {
        process.stderr.write("[visual-mapper] Failed to parse LLM JSON response\n");
        return null;
    }
}
255
+ // ── Helpers ────────────────────────────────────────────────────────
256
/** Zone types accepted in a visual map. */
const VALID_ZONE_TYPES = new Set([
    "toolbar", "sidebar", "canvas", "canvas-zoomable", "panel",
    "dialog", "menu", "nested-menu", "status-bar", "palette", "tab-bar", "other",
]);
/**
 * Map an arbitrary value to one of the accepted zone types.
 *
 * Exact matches are returned as-is. Otherwise the string is normalized —
 * trimmed, camelCase split, lower-cased, and runs of spaces/underscores/hyphens
 * collapsed to a single "-" — so common variants such as "Status Bar",
 * "tab_bar" or "statusBar" are recognized.
 *
 * @param {unknown} t - Zone type as produced by the LLM.
 * @returns {string} A member of VALID_ZONE_TYPES; "other" when nothing matches.
 */
function validateZoneType(t) {
    if (typeof t !== "string")
        return "other";
    if (VALID_ZONE_TYPES.has(t))
        return t;
    // Common LLM output normalization
    const normalized = t
        .trim()
        .replace(/([a-z0-9])([A-Z])/g, "$1-$2")
        .toLowerCase()
        .replace(/[\s_-]+/g, "-");
    return VALID_ZONE_TYPES.has(normalized) ? normalized : "other";
}
271
/**
 * Clamp a value to the range [0, 1].
 *
 * @param {unknown} n - Candidate number.
 * @returns {number} `n` limited to 0-1; 0 when `n` is not a number or is NaN.
 */
function clamp01(n) {
    // Number.isNaN does not coerce its argument, unlike the global isNaN.
    if (typeof n !== "number" || Number.isNaN(n))
        return 0;
    return Math.min(1, Math.max(0, n));
}
276
/**
 * Pick a zone name from a window-relative Y position.
 *
 * @param {number} relY - Vertical position as a fraction of the window height.
 * @returns {"toolbar"|"status-bar"|"canvas"} "toolbar" below 0.06, "status-bar"
 *   above 0.95, "canvas" for everything else.
 */
function classifyZone(relY) {
    const inTopBand = relY < 0.06;
    const inBottomBand = relY > 0.95;
    return inTopBand ? "toolbar" : inBottomBand ? "status-bar" : "canvas";
}
284
/**
 * Cluster elements into full-width zones, one per distinct `zone` name.
 *
 * Each zone spans from slightly above its topmost element to slightly below its
 * lowest one (at least 0.05 tall) and never extends past the bottom of the
 * window (top + height <= 1). Zones are returned in order of first appearance.
 *
 * @param {Array<{zone: string, y: number}>} elements - Elements with window-relative Y.
 * @param {number} _captureW - Unused; kept for the existing call signature.
 * @param {number} _captureH - Unused; kept for the existing call signature.
 * @returns {Array<{label: string, type: string, bounds: {top: number, left: number, width: number, height: number}}>}
 */
function clusterIntoZones(elements, _captureW, _captureH) {
    // A Map avoids collisions with Object.prototype keys such as "constructor".
    const bands = new Map();
    for (const el of elements) {
        const key = String(el.zone);
        const band = bands.get(key);
        if (band === undefined) {
            bands.set(key, { minY: el.y, maxY: el.y });
            continue;
        }
        band.minY = Math.min(band.minY, el.y);
        band.maxY = Math.max(band.maxY, el.y);
    }
    return Array.from(bands, ([label, { minY, maxY }]) => {
        const top = Math.max(0, minY - 0.01);
        const padded = Math.max(0.05, maxY - minY + 0.02);
        return {
            label,
            type: label,
            bounds: {
                top,
                left: 0,
                width: 1,
                // Bounds are fractions of the window; keep the bottom edge inside it.
                height: Math.min(padded, 1 - top),
            },
        };
    });
}
312
/**
 * Assemble the VisualMeta record for a completed scan.
 *
 * @param {string} hash - Screenshot hash, stored as `screenshotHash`.
 * @param {{w: number, h: number}} captureSize - Stored unchanged.
 * @param {string} windowTitle - Becomes the single entry of `screensMapped`.
 * @param {string} appVersion - Stored unchanged.
 * @param {number} confidence - Stored unchanged.
 * @param {number} [scaleFactor=2] - Stored unchanged.
 * @returns {object} Metadata stamped with the current time (ISO 8601) in `lastScannedAt`.
 */
export function buildVisualMeta(hash, captureSize, windowTitle, appVersion, confidence, scaleFactor = 2) {
    const scannedAt = new Date();
    const meta = {
        lastScannedAt: scannedAt.toISOString(),
        appVersion,
        scaleFactor,
        captureSize,
        screenshotHash: hash,
        screensMapped: [windowTitle],
        confidence,
    };
    return meta;
}
@@ -131,6 +131,35 @@ const BUNDLE_FAMILY_MAP = [
131
131
  * Normalize AX role names: strip "AX" prefix and lowercase first char.
132
132
  * e.g. "AXRadioButton" → "radioButton", "AXWindow" → "window", "button" → "button"
133
133
  */
134
/**
 * Extract window bounds from an AX tree.
 *
 * Uses the root node when it has a position, a positive size and the role
 * "window" or "application"; otherwise the first direct child with a position,
 * a positive size and the role "window". Falls back to `existing`, then to a
 * tracked all-zero rectangle. A missing tree or malformed children are
 * tolerated and lead to the fallback.
 *
 * @param {object} tree - AX tree root (`role`, `position`, `size`, `children`).
 * @param {object} [existing] - Previously known bounds, returned when none are found.
 * @returns {object} Bounds wrapped by `tracked`, or `existing`.
 */
function extractWindowBounds(tree, existing) {
    // Returns tracked bounds when `node` has a usable frame and an accepted role.
    const boundsOf = (node, acceptedRoles) => {
        const position = node?.position;
        const size = node?.size;
        if (!position || !size || !(size.width > 0) || !(size.height > 0))
            return null;
        const role = node.role ? normalizeRoleForBounds(node.role) : "";
        if (!acceptedRoles.includes(role))
            return null;
        return tracked({ x: position.x, y: position.y, width: size.width, height: size.height });
    };
    // Try root node first
    const fromRoot = boundsOf(tree, ["window", "application"]);
    if (fromRoot !== null)
        return fromRoot;
    // Check window children (root may be "application")
    if (Array.isArray(tree?.children)) {
        for (const child of tree.children) {
            const fromChild = boundsOf(child, ["window"]);
            if (fromChild !== null)
                return fromChild;
        }
    }
    return existing ?? tracked({ x: 0, y: 0, width: 0, height: 0 });
}
159
/**
 * Lightweight role normalization for bounds extraction (avoids depending on
 * where the full normalizeRole is declared): drops a leading "AX" and
 * lower-cases the first remaining character.
 *
 * @param {string} raw - Raw AX role, e.g. "AXWindow".
 * @returns {string} Normalized role, e.g. "window".
 */
function normalizeRoleForBounds(raw) {
    const withoutPrefix = raw.startsWith("AX") ? raw.slice(2) : raw;
    return withoutPrefix.charAt(0).toLowerCase() + withoutPrefix.slice(1);
}
134
163
  function normalizeRole(raw) {
135
164
  if (raw.startsWith("AX") && raw.length > 2) {
136
165
  return raw[2].toLowerCase() + raw.slice(3);
@@ -463,7 +492,7 @@ export class WorldModel {
463
492
  title: tracked(redactSensitiveLabel(sanitizeString(windowTitle || existing?.title.value || ""))),
464
493
  bundleId: appContext.bundleId,
465
494
  pid: appContext.pid,
466
- bounds: existing?.bounds ?? tracked({ x: 0, y: 0, width: 0, height: 0 }),
495
+ bounds: extractWindowBounds(tree, existing?.bounds),
467
496
  controls,
468
497
  isOnScreen: true,
469
498
  focusedElement,