ucu-mcp 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,51 @@
1
1
  import { execFile, execFileSync } from "node:child_process";
2
2
  import { randomUUID } from "node:crypto";
3
3
  import { promisify } from "node:util";
4
- import { captureFullScreen, captureRegion, captureWindow } from "../utils/screenshot.js";
4
+ import { captureFullScreen, captureRegion } from "../utils/screenshot.js";
5
5
  import { click as inputClick, doubleClick as inputDoubleClick, move as inputMove, drag as inputDrag, scroll as inputScroll, typeText, pressShortcut } from "../utils/input.js";
6
+ import { CaptureError, ElementNotFoundError, InputSynthesisError, PermissionError, PlatformError, UcuError, WindowNotFoundError } from "../util/errors.js";
6
7
  const execFileAsync = promisify(execFile);
8
/**
 * Extracts a human-readable message from any thrown value.
 *
 * @param {unknown} error - Value caught in a `catch` clause; may be anything.
 * @returns {string} `error.message` for `Error` instances and for error-like
 *   objects carrying a string `message`; otherwise `String(error)`.
 */
function errorMessage(error) {
    if (error instanceof Error) {
        return error.message;
    }
    // Error-like values (errors created in another realm, plain rejection
    // payloads) fail `instanceof Error` and would otherwise stringify to the
    // useless "[object Object]".
    if (typeof error === "object" && error !== null && typeof error.message === "string") {
        return error.message;
    }
    return String(error);
}
11
/**
 * Reports whether a thrown value's message matches one of the permission
 * phrases checked below ("not allowed", "permission", "assistive",
 * "accessibility"), case-insensitively.
 *
 * @param {unknown} error - Value caught in a `catch` clause.
 * @returns {boolean} `true` when the message contains any of those phrases.
 */
function isAccessibilityPermissionError(error) {
    const message = errorMessage(error);
    const permissionHint = /not allowed|permission|assistive|accessibility/i;
    return permissionHint.test(message);
}
14
/**
 * Normalizes a failure from a capture operation and always throws.
 *
 * @param {unknown} error - Value caught by the caller.
 * @param {string} operation - Label used as the prefix of the wrapped message.
 * @throws {UcuError} The original error, unchanged, when it is already a `UcuError`.
 * @throws {CaptureError} Otherwise, with message `"<operation> failed: <message>"`
 *   and the original value attached as `cause`.
 */
function rethrowCaptureError(error, operation) {
    if (error instanceof UcuError) {
        throw error;
    }
    const wrapped = new CaptureError(`${operation} failed: ${errorMessage(error)}`);
    // Keep the underlying failure reachable for debugging. The property is
    // non-enumerable, as a `cause` set through the Error constructor would be.
    Object.defineProperty(wrapped, "cause", {
        value: error,
        writable: true,
        configurable: true,
        enumerable: false,
    });
    throw wrapped;
}
19
/**
 * Normalizes a failure from an accessibility query and always throws.
 *
 * @param {unknown} error - Value caught by the caller.
 * @param {string} operation - Label used as the prefix of the wrapped message.
 * @throws {UcuError} The original error, unchanged, when it is already a `UcuError`.
 * @throws {PermissionError} When `isAccessibilityPermissionError(error)` is true.
 * @throws {PlatformError} Otherwise, with message `"<operation> failed: <message>"`.
 *   Both wrapped errors carry the original value as `cause`.
 */
function rethrowAccessibilityError(error, operation) {
    if (error instanceof UcuError) {
        throw error;
    }
    const wrapped = isAccessibilityPermissionError(error)
        ? new PermissionError("accessibility", "darwin")
        : new PlatformError(`${operation} failed: ${errorMessage(error)}`);
    // Keep the underlying failure reachable for debugging. The property is
    // non-enumerable, as a `cause` set through the Error constructor would be.
    Object.defineProperty(wrapped, "cause", {
        value: error,
        writable: true,
        configurable: true,
        enumerable: false,
    });
    throw wrapped;
}
27
/**
 * Normalizes a failure from an element action and always throws.
 *
 * Checks run in this order: already-typed error, permission problem,
 * "element not found", then a generic platform failure.
 *
 * @param {unknown} error - Value caught by the caller.
 * @param {string} operation - Label used as the prefix of the generic message.
 * @param {string} elementId - Identifier passed to `ElementNotFoundError`.
 * @throws {UcuError} The original error, unchanged, when it is already a `UcuError`.
 * @throws {PermissionError} When `isAccessibilityPermissionError(error)` is true.
 * @throws {ElementNotFoundError} When the message contains "element not found"
 *   (case-insensitive).
 * @throws {PlatformError} Otherwise, with message `"<operation> failed: <message>"`.
 *   All wrapped errors carry the original value as `cause`.
 */
function rethrowElementActionError(error, operation, elementId) {
    if (error instanceof UcuError) {
        throw error;
    }
    const message = errorMessage(error);
    let wrapped;
    if (isAccessibilityPermissionError(error)) {
        wrapped = new PermissionError("accessibility", "darwin");
    } else if (/element not found/i.test(message)) {
        wrapped = new ElementNotFoundError(elementId);
    } else {
        wrapped = new PlatformError(`${operation} failed: ${message}`);
    }
    // Keep the underlying failure reachable for debugging. The property is
    // non-enumerable, as a `cause` set through the Error constructor would be.
    Object.defineProperty(wrapped, "cause", {
        value: error,
        writable: true,
        configurable: true,
        enumerable: false,
    });
    throw wrapped;
}
38
/**
 * Normalizes a failure from an input-synthesis call and always throws.
 *
 * @param {unknown} error - Value caught by the caller.
 * @param {string} operation - Label used as the prefix of the wrapped message.
 * @throws {UcuError} The original error, unchanged, when it is already a `UcuError`.
 * @throws {InputSynthesisError} Otherwise, with message
 *   `"<operation> failed: <message>"` and the original value attached as `cause`.
 */
function rethrowInputError(error, operation) {
    if (error instanceof UcuError) {
        throw error;
    }
    const wrapped = new InputSynthesisError(`${operation} failed: ${errorMessage(error)}`);
    // Keep the underlying failure reachable for debugging. The property is
    // non-enumerable, as a `cause` set through the Error constructor would be.
    Object.defineProperty(wrapped, "cause", {
        value: error,
        writable: true,
        configurable: true,
        enumerable: false,
    });
    throw wrapped;
}
7
43
  export class MacOSPlatform {
8
44
  elementCache = new Map();
9
45
  elementCacheTtlMs = 30_000;
10
46
  elementCacheMaxSize = 100;
47
+ windowCacheTtlMs = 300;
48
+ windowCache;
11
49
  activeTarget;
12
50
  savedFocus;
13
51
  // ── Element Cache Management ────────────────────────────────────────────
@@ -79,14 +117,22 @@ export class MacOSPlatform {
79
117
  }
80
118
  // ── Screenshot ──────────────────────────────────────────────────────────
81
119
  async screenshot(_display, region, options) {
82
- const base64 = region
83
- ? await captureRegion(region.x, region.y, region.width, region.height, options)
84
- : await captureFullScreen(options);
85
- return Buffer.from(base64, "base64");
120
+ try {
121
+ const base64 = region
122
+ ? await captureRegion(region.x, region.y, region.width, region.height, options)
123
+ : await captureFullScreen(options);
124
+ return Buffer.from(base64, "base64");
125
+ }
126
+ catch (error) {
127
+ rethrowCaptureError(error, region ? "capture region" : "capture full screen");
128
+ }
86
129
  }
87
130
  async screenshotWindow(windowId, options) {
88
- const base64 = await captureWindow(windowId, options);
89
- return Buffer.from(base64, "base64");
131
+ const win = (await this.listWindows(true)).find((w) => w.id === windowId);
132
+ if (!win) {
133
+ throw new WindowNotFoundError(windowId);
134
+ }
135
+ return this.screenshot(undefined, win.bounds, options);
90
136
  }
91
137
  // ── Screen Info ─────────────────────────────────────────────────────────
92
138
  getScreenSize(display) {
@@ -154,10 +200,26 @@ export class MacOSPlatform {
154
200
  }
155
201
  async focusApp(app) {
156
202
  const appLower = app.toLowerCase();
157
- const windows = await this.listWindows(true);
158
- const target = windows.find((w) => w.processName.toLowerCase().includes(appLower));
203
+ const escapedApp = app.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
204
+ this.windowCache = undefined;
205
+ try {
206
+ execFileSync("osascript", ["-e", `tell application "${escapedApp}" to activate`], { timeout: 5000 });
207
+ }
208
+ catch {
209
+ // Some app names are process labels rather than AppleScript application names.
210
+ // Continue with the AX window lookup below so existing callers still work.
211
+ }
212
+ let target;
213
+ const deadline = Date.now() + 3000;
214
+ do {
215
+ const windows = await this.listWindows(true);
216
+ target = windows.find((w) => w.processName.toLowerCase().includes(appLower));
217
+ if (target)
218
+ break;
219
+ await new Promise((resolve) => setTimeout(resolve, 150));
220
+ } while (Date.now() < deadline);
159
221
  if (!target) {
160
- throw new Error(`No on-screen window found for app "${app}". Use list_apps to inspect localized macOS app names.`);
222
+ throw new WindowNotFoundError(app);
161
223
  }
162
224
  this.activeTarget = {
163
225
  appName: target.processName,
@@ -219,44 +281,49 @@ export class MacOSPlatform {
219
281
  }
220
282
  }
221
283
  async listWindows(_includeMinimized) {
284
+ const now = Date.now();
285
+ if (this.windowCache && now - this.windowCache.cachedAt <= this.windowCacheTtlMs) {
286
+ return this.windowCache.windows.map((window) => ({
287
+ ...window,
288
+ bounds: { ...window.bounds },
289
+ }));
290
+ }
222
291
  try {
292
+ // Use System Events instead of CGWindowListCopyWindowInfo.
293
+ // The CoreGraphics API returns CFArrayRef/CFDictionaryRef which JXA
294
+ // cannot iterate reliably — CFArrayGetCount works but objectAtIndex
295
+ // does not. System Events JXA is slower (~3-6s) but correct.
223
296
  const jxaScript = `
224
- ObjC.import('CoreGraphics');
225
- ObjC.import('Foundation');
226
- var winList = $.CGWindowListCopyWindowInfo(1, 0);
227
- var count = winList.count;
297
+ var se = Application('System Events');
228
298
  var result = [];
229
- for (var i = 0; i < count; i++) {
230
- var w = $(winList).objectAtIndex(i);
231
- var bounds = w.objectForKey('kCGWindowBounds');
232
- var numberVal = w.objectForKey('kCGWindowNumber');
233
- var nameVal = w.objectForKey('kCGWindowName');
234
- var ownerVal = w.objectForKey('kCGWindowOwnerName');
235
- var pidVal = w.objectForKey('kCGWindowOwnerPID');
236
- var onScreenVal = w.objectForKey('kCGWindowIsOnscreen');
237
- var layerVal = w.objectForKey('kCGWindowLayer');
238
-
239
- // Skip windows at layer > 0 (menus, overlays, etc.)
240
- if (layerVal && layerVal.intValue > 0) continue;
241
-
242
- var bx = 0, by = 0, bw = 0, bh = 0;
243
- try { bx = $(bounds).objectForKey('X').intValue; } catch(e) {}
244
- try { by = $(bounds).objectForKey('Y').intValue; } catch(e) {}
245
- try { bw = $(bounds).objectForKey('Width').intValue; } catch(e) {}
246
- try { bh = $(bounds).objectForKey('Height').intValue; } catch(e) {}
247
-
248
- // Skip zero-size windows
249
- if (bw === 0 && bh === 0) continue;
250
-
251
- result.push({
252
- id: String(numberVal ? numberVal.intValue : 0),
253
- title: nameVal ? String(nameVal) : '',
254
- processName: ownerVal ? String(ownerVal) : '',
255
- pid: pidVal ? pidVal.intValue : 0,
256
- bounds: { x: bx, y: by, width: bw, height: bh },
257
- isMinimized: false,
258
- isOnScreen: onScreenVal ? onScreenVal.boolValue : true
259
- });
299
+ var procs = se.processes();
300
+ for (var i = 0; i < procs.length; i++) {
301
+ var p = procs[i];
302
+ var pName = '';
303
+ var pPid = 0;
304
+ try { pName = p.name(); } catch(e) {}
305
+ try { pPid = p.unixId(); } catch(e) {}
306
+ try {
307
+ var wins = p.windows();
308
+ for (var j = 0; j < wins.length; j++) {
309
+ var w = wins[j];
310
+ var pos, sz;
311
+ try { pos = w.position(); } catch(e) { pos = [0, 0]; }
312
+ try { sz = w.size(); } catch(e) { sz = [0, 0]; }
313
+ if (sz[0] === 0 && sz[1] === 0) continue;
314
+ var title = '';
315
+ try { title = w.name() || ''; } catch(e) {}
316
+ result.push({
317
+ id: pName + '/win' + j,
318
+ title: title,
319
+ processName: pName,
320
+ pid: pPid,
321
+ bounds: { x: pos[0], y: pos[1], width: sz[0], height: sz[1] },
322
+ isMinimized: false,
323
+ isOnScreen: true
324
+ });
325
+ }
326
+ } catch(e) {}
260
327
  }
261
328
  JSON.stringify(result);
262
329
  `;
@@ -264,7 +331,15 @@ export class MacOSPlatform {
264
331
  "-l", "JavaScript",
265
332
  "-e", jxaScript
266
333
  ], { encoding: "utf-8", timeout: 15000 });
267
- return JSON.parse(jxaOut.trim());
334
+ const windows = JSON.parse(jxaOut.trim());
335
+ this.windowCache = {
336
+ cachedAt: Date.now(),
337
+ windows: windows.map((window) => ({
338
+ ...window,
339
+ bounds: { ...window.bounds },
340
+ })),
341
+ };
342
+ return windows;
268
343
  }
269
344
  catch {
270
345
  // Fallback: return empty list if JXA fails
@@ -274,7 +349,7 @@ export class MacOSPlatform {
274
349
  async getWindowState(windowId, depth, includeBounds = true) {
275
350
  const resolvedWindowId = windowId || this.activeTarget?.windowId;
276
351
  if (!resolvedWindowId) {
277
- throw new Error("getWindowState requires windowId or a prior focus_app target");
352
+ throw new WindowNotFoundError("active target");
278
353
  }
279
354
  const maxDepth = Math.min(depth || 3, 10);
280
355
  const maxElements = 50;
@@ -285,6 +360,11 @@ export class MacOSPlatform {
285
360
  const jxaScript = `
286
361
  ObjC.import('AppKit');
287
362
  var se = Application('System Events');
363
+ function childElements(elem) {
364
+ try { return elem.uiElements(); } catch(e1) {
365
+ try { return elem.elements(); } catch(e2) { return []; }
366
+ }
367
+ }
288
368
  var result = {window: null, focusedElement: null, tree: null, error: null};
289
369
  var target = ${targetJson};
290
370
  var includeBounds = ${includeBounds ? "true" : "false"};
@@ -319,25 +399,47 @@ export class MacOSPlatform {
319
399
  return false;
320
400
  }
321
401
 
402
+ var foundWin = null;
403
+ var foundProc = null;
404
+
405
+ // Fast path: resolve "ProcessName/winN" format directly
406
+ var idParts = "${escapedWindowId}".split('/');
407
+ if (idParts.length >= 2 && idParts[0]) {
408
+ var procName = idParts[0];
409
+ var winIdx = 0;
410
+ var winMatch = idParts[1].match(/^win(\d+)$/);
411
+ if (winMatch) winIdx = parseInt(winMatch[1]);
412
+ try {
413
+ var proc = se.processes[procName]();
414
+ var ws = proc.windows();
415
+ if (winIdx < ws.length) {
416
+ foundWin = ws[winIdx];
417
+ foundProc = proc;
418
+ }
419
+ } catch(e) {}
420
+ }
421
+
322
422
  try {
323
- var foundWin = null;
324
- var foundProc = null;
325
- var procs = se.processes();
326
- for (var p = 0; p < procs.length; p++) {
327
- var proc = procs[p];
328
- try {
329
- var wins = proc.windows();
330
- for (var w = 0; w < wins.length; w++) {
331
- if (windowMatches(wins[w], proc)) {
332
- foundWin = wins[w];
333
- foundProc = proc;
334
- break;
423
+ if (!foundWin) {
424
+ var procs = se.processes();
425
+ for (var p = 0; p < procs.length; p++) {
426
+ var proc = procs[p];
427
+ try {
428
+ var wins = proc.windows();
429
+ for (var w = 0; w < wins.length; w++) {
430
+ if (windowMatches(wins[w], proc)) {
431
+ foundWin = wins[w];
432
+ foundProc = proc;
433
+ break;
434
+ }
335
435
  }
336
- }
337
- } catch(e) {}
338
- if (foundWin) break;
436
+ } catch(e) {}
437
+ if (foundWin) break;
438
+ }
339
439
  }
340
- if (!foundWin) { result.error = 'Window not found'; JSON.stringify(result); return; }
440
+ if (!foundWin) {
441
+ result.error = 'Window not found';
442
+ } else {
341
443
 
342
444
  var winPos = foundWin.position();
343
445
  var winSize = foundWin.size();
@@ -423,7 +525,7 @@ export class MacOSPlatform {
423
525
 
424
526
  if (currentDepth < ${maxDepth}) {
425
527
  try {
426
- var kids = axElem.elements();
528
+ var kids = childElements(axElem);
427
529
  for (var k = 0; k < kids.length && elemCount[0] < ${maxElements}; k++) {
428
530
  var child = extractElement(kids[k], currentDepth + 1);
429
531
  if (child) info.children.push(child);
@@ -433,7 +535,8 @@ export class MacOSPlatform {
433
535
  return info;
434
536
  }
435
537
 
436
- result.tree = extractElement(foundWin, 0);
538
+ result.tree = extractElement(foundWin, 0);
539
+ }
437
540
  } catch(e) {
438
541
  result.error = String(e.message || e);
439
542
  }
@@ -445,7 +548,7 @@ export class MacOSPlatform {
445
548
  ], { encoding: "utf-8", timeout: 15000 }).trim();
446
549
  const parsed = JSON.parse(out);
447
550
  if (parsed.error && !parsed.window) {
448
- throw new Error(parsed.error);
551
+ throw new WindowNotFoundError(resolvedWindowId);
449
552
  }
450
553
  const windowInfo = parsed.window || {
451
554
  id: resolvedWindowId,
@@ -463,31 +566,48 @@ export class MacOSPlatform {
463
566
  };
464
567
  }
465
568
  catch (error) {
466
- if (String(error.message || error).includes("not allowed") ||
467
- String(error.message || error).includes("permission") ||
468
- String(error.message || error).includes("assistive")) {
469
- throw new Error(`Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility`);
470
- }
471
- throw new Error(`Window ${resolvedWindowId} not found or Accessibility permission missing`);
569
+ if (error instanceof WindowNotFoundError)
570
+ throw error;
571
+ rethrowAccessibilityError(error, "get_window_state");
472
572
  }
473
573
  }
474
574
  // ── Mouse ───────────────────────────────────────────────────────────────
475
575
  async click(x, y, button, doubleClick) {
476
- if (doubleClick) {
477
- await inputDoubleClick(x, y, button);
576
+ try {
577
+ if (doubleClick) {
578
+ await inputDoubleClick(x, y, button);
579
+ }
580
+ else {
581
+ await inputClick(x, y, button);
582
+ }
478
583
  }
479
- else {
480
- await inputClick(x, y, button);
584
+ catch (error) {
585
+ rethrowInputError(error, doubleClick ? "double_click" : "click");
481
586
  }
482
587
  }
483
588
  async move(x, y) {
484
- await inputMove(x, y);
589
+ try {
590
+ await inputMove(x, y);
591
+ }
592
+ catch (error) {
593
+ rethrowInputError(error, "move");
594
+ }
485
595
  }
486
596
  async drag(startX, startY, endX, endY, button, duration) {
487
- await inputDrag(startX, startY, endX, endY, button, duration);
597
+ try {
598
+ await inputDrag(startX, startY, endX, endY, button, duration);
599
+ }
600
+ catch (error) {
601
+ rethrowInputError(error, "drag");
602
+ }
488
603
  }
489
604
  async scroll(x, y, deltaX, deltaY) {
490
- await inputScroll(x, y, deltaX, deltaY);
605
+ try {
606
+ await inputScroll(x, y, deltaX, deltaY);
607
+ }
608
+ catch (error) {
609
+ rethrowInputError(error, "scroll");
610
+ }
491
611
  }
492
612
  // ── Cursor ──────────────────────────────────────────────────────────────
493
613
  getCursorPosition() {
@@ -502,14 +622,12 @@ export class MacOSPlatform {
502
622
  return JSON.parse(out);
503
623
  }
504
624
  catch (error) {
505
- throw new Error(`get_cursor_position failed: ${error.message || error}`);
625
+ throw new PlatformError(`get_cursor_position failed: ${errorMessage(error)}`);
506
626
  }
507
627
  }
508
628
  // ── OCR ──────────────────────────────────────────────────────────────────
509
629
  async ocr(display, region) {
510
- // Take a screenshot first (reuse existing logic)
511
630
  const buf = await this.screenshot(display, region);
512
- // Write screenshot to a temp file so Vision framework can read it
513
631
  const { writeFile, unlink } = await import("node:fs/promises");
514
632
  const { join } = await import("node:path");
515
633
  const { tmpdir } = await import("node:os");
@@ -518,93 +636,46 @@ export class MacOSPlatform {
518
636
  try {
519
637
  const screenSize = this.getScreenSize(display);
520
638
  const scaleFactor = screenSize.scaleFactor ?? 2;
521
- // Build JXA script that uses Vision framework for OCR
522
- // JXA does not allow return statements at global scope, so we wrap in a function
523
- const jxaScript = `
524
- function run() {
525
- ObjC.import('Vision');
526
- ObjC.import('AppKit');
527
- ObjC.import('Foundation');
528
-
529
- var app = Application.currentApplication();
530
- app.includeStandardAdditions = true;
531
-
532
- var path = "${tmpPath.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, '\\`').replace(/\$/g, '\\$')}";
533
- var url = $.NSURL.fileURLWithPath(path);
534
- var image = $.NSImage.alloc.initWithContentsOfURL(url);
535
-
536
- if (!image || image.isValid() === false) {
537
- return JSON.stringify({error: "Failed to load screenshot image", elements: [], fullText: ""});
538
- }
539
-
540
- var cgImage = image.CGImageForProposedRectContextHints(null, null, null);
541
- if (!cgImage) {
542
- return JSON.stringify({error: "Failed to get CGImage from screenshot", elements: [], fullText: ""});
543
- }
544
-
545
- var request = $.VNRecognizeTextRequest.alloc.init;
546
- request.recognitionLevel = $.VNRequestTextRecognitionLevelAccurate;
547
- request.usesLanguageCorrection = true;
548
-
549
- var handler = $.VNImageRequestHandler.alloc.initWithCGImageOptions(cgImage, null);
550
- var performError = $();
551
-
552
- var success = handler.performRequestsError([request], performError);
553
- if (!success) {
554
- return JSON.stringify({error: "OCR request failed", elements: [], fullText: ""});
555
- }
556
-
557
- var results = request.results;
558
- var elements = [];
559
- var fullTextParts = [];
560
-
561
- var imgWidth = cgImage.width;
562
- var imgHeight = cgImage.height;
563
-
564
- for (var i = 0; i < results.count; i++) {
565
- var obs = $(results).objectAtIndex(i);
566
- var candidates = obs.topCandidates(1);
567
- if (candidates && candidates.count > 0) {
568
- var candidate = $(candidates).objectAtIndex(0);
569
- var text = candidate.string.toString();
570
- var confidence = candidate.confidence;
571
- var bbox = obs.boundingBox;
572
-
573
- // Vision boundingBox is normalized (0-1) with origin at bottom-left
574
- // Convert to screen coordinates (origin at top-left)
575
- var bx = bbox.origin.x * imgWidth;
576
- var by = (1 - bbox.origin.y - bbox.size.height) * imgHeight;
577
- var bw = bbox.size.width * imgWidth;
578
- var bh = bbox.size.height * imgHeight;
579
-
580
- elements.push({
581
- text: text,
582
- x: Math.round(bx),
583
- y: Math.round(by),
584
- width: Math.round(bw),
585
- height: Math.round(bh),
586
- confidence: confidence
587
- });
588
- fullTextParts.push(text);
639
+ // Try native Swift OCR helper first (avoids JXA ObjC bridge bugs on macOS Sequoia+)
640
+ const nativeResult = await this.ocrNative(tmpPath, scaleFactor, region);
641
+ if (nativeResult)
642
+ return nativeResult;
643
+ // Fallback to JXA Vision framework
644
+ return await this.ocrJxa(tmpPath, screenSize, scaleFactor, region, buf);
645
+ }
646
+ finally {
647
+ await unlink(tmpPath).catch(() => { });
648
+ }
649
+ }
650
+ async ocrNative(tmpPath, scaleFactor, region) {
651
+ const { existsSync } = await import("node:fs");
652
+ const { join, dirname } = await import("node:path");
653
+ const { fileURLToPath } = await import("node:url");
654
+ // Resolve native binary path (same pattern as input.ts CGEvent helper)
655
+ const candidates = [
656
+ join(dirname(fileURLToPath(import.meta.url)), "..", "..", "native", "ocr", "ocr-helper"),
657
+ join(dirname(fileURLToPath(import.meta.url)), "..", "native", "ocr", "ocr-helper"),
658
+ join(process.cwd(), "native", "ocr", "ocr-helper"),
659
+ ];
660
+ let binaryPath;
661
+ for (const p of candidates) {
662
+ if (existsSync(p)) {
663
+ binaryPath = p;
664
+ break;
589
665
  }
590
- }
591
-
592
- return JSON.stringify({elements: elements, fullText: fullTextParts.join("\\n"), error: null});
593
666
  }
594
- run();
595
- `;
596
- const out = execFileSync("osascript", [
597
- "-l", "JavaScript",
598
- "-e", jxaScript,
599
- ], { encoding: "utf-8", timeout: 30000 }).trim();
667
+ if (!binaryPath)
668
+ return null;
669
+ try {
670
+ const input = JSON.stringify({ imagePath: tmpPath });
671
+ const out = execFileSync(binaryPath, [], {
672
+ input,
673
+ encoding: "utf-8",
674
+ timeout: 30000,
675
+ }).trim();
600
676
  const parsed = JSON.parse(out);
601
- if (parsed.error) {
602
- throw new Error(parsed.error);
603
- }
604
- // Scale coordinates from image space to screen space
605
- // The screenshot may be taken at a different resolution than screen coordinates
606
- const imgWidth = buf.readUInt32BE(16); // PNG width at offset 16
607
- const scaleFactorX = screenSize.width / (region ? region.width : (imgWidth / scaleFactor));
677
+ if (parsed.error)
678
+ return null;
608
679
  const elements = parsed.elements.map((el) => ({
609
680
  text: el.text,
610
681
  x: Math.round(el.x / scaleFactor) + (region ? region.x : 0),
@@ -613,15 +684,81 @@ export class MacOSPlatform {
613
684
  height: Math.round(el.height / scaleFactor),
614
685
  confidence: el.confidence,
615
686
  }));
616
- return {
617
- elements,
618
- fullText: parsed.fullText,
619
- };
687
+ return { elements, fullText: parsed.fullText };
620
688
  }
621
- finally {
622
- await unlink(tmpPath).catch(() => { });
689
+ catch {
690
+ return null;
623
691
  }
624
692
  }
693
+ async ocrJxa(tmpPath, screenSize, scaleFactor, region, buf) {
694
+ const escapedPath = tmpPath.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, "\\`").replace(/$/g, "\\$");
695
+ const jxaScript = `
696
+ function run() {
697
+ ObjC.import('Vision');
698
+ ObjC.import('AppKit');
699
+ ObjC.import('Foundation');
700
+ var app = Application.currentApplication();
701
+ app.includeStandardAdditions = true;
702
+ var path = "${escapedPath}";
703
+ var url = $.NSURL.fileURLWithPath(path);
704
+ var image = $.NSImage.alloc.initWithContentsOfURL(url);
705
+ if (!image || !image.isValid) {
706
+ return JSON.stringify({error: "Failed to load screenshot image", elements: [], fullText: ""});
707
+ }
708
+ var cgImage = image.CGImageForProposedRectContextHints(null, null, null);
709
+ if (!cgImage) {
710
+ return JSON.stringify({error: "Failed to get CGImage from screenshot", elements: [], fullText: ""});
711
+ }
712
+ var request = $.VNRecognizeTextRequest.alloc.init;
713
+ request.recognitionLevel = $.VNRequestTextRecognitionLevelAccurate;
714
+ request.usesLanguageCorrection = true;
715
+ var handler = $.VNImageRequestHandler.alloc.initWithCGImageOptions(cgImage, null);
716
+ var performError = $();
717
+ var success = handler.performRequestsError([request], performError);
718
+ if (!success) {
719
+ return JSON.stringify({error: "OCR request failed", elements: [], fullText: ""});
720
+ }
721
+ var results = request.results;
722
+ var elements = [];
723
+ var fullTextParts = [];
724
+ var imgWidth = cgImage.width;
725
+ var imgHeight = cgImage.height;
726
+ for (var i = 0; i < results.count; i++) {
727
+ var obs = $(results).objectAtIndex(i);
728
+ var candidates = obs.topCandidates(1);
729
+ if (candidates && candidates.count > 0) {
730
+ var candidate = $(candidates).objectAtIndex(0);
731
+ var text = candidate.string.toString();
732
+ var confidence = candidate.confidence;
733
+ var bbox = obs.boundingBox;
734
+ var bx = bbox.origin.x * imgWidth;
735
+ var by = (1 - bbox.origin.y - bbox.size.height) * imgHeight;
736
+ var bw = bbox.size.width * imgWidth;
737
+ var bh = bbox.size.height * imgHeight;
738
+ elements.push({text:text,x:Math.round(bx),y:Math.round(by),width:Math.round(bw),height:Math.round(bh),confidence:confidence});
739
+ fullTextParts.push(text);
740
+ }
741
+ }
742
+ return JSON.stringify({elements:elements,fullText:fullTextParts.join("\\n"),error:null});
743
+ }
744
+ run();
745
+ `;
746
+ const out = execFileSync("osascript", ["-l", "JavaScript", "-e", jxaScript], { encoding: "utf-8", timeout: 30000 }).trim();
747
+ const parsed = JSON.parse(out);
748
+ if (parsed.error)
749
+ throw new CaptureError(`ocr failed: ${parsed.error}`);
750
+ const imgWidth = buf.readUInt32BE(16);
751
+ const scaleFactorX = screenSize.width / (region ? region.width : (imgWidth / scaleFactor));
752
+ const elements = parsed.elements.map((el) => ({
753
+ text: el.text,
754
+ x: Math.round(el.x / scaleFactor) + (region ? region.x : 0),
755
+ y: Math.round(el.y / scaleFactor) + (region ? region.y : 0),
756
+ width: Math.round(el.width / scaleFactor),
757
+ height: Math.round(el.height / scaleFactor),
758
+ confidence: el.confidence,
759
+ }));
760
+ return { elements, fullText: parsed.fullText };
761
+ }
625
762
  // ── Keyboard ────────────────────────────────────────────────────────────
626
763
  async type(text, delay) {
627
764
  await typeText(text, delay);
@@ -640,8 +777,13 @@ export class MacOSPlatform {
640
777
  const escapedText = text ? text.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, '\\`').replace(/\$/g, '\\$') : "";
641
778
  const escapedRole = role ? role.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, '\\`').replace(/\$/g, '\\$') : "";
642
779
  const jxaScript = `
643
- var se = Application('System Events');
644
- var results = [];
780
+ var se = Application('System Events');
781
+ function childElements(elem) {
782
+ try { return elem.uiElements(); } catch(e1) {
783
+ try { return elem.elements(); } catch(e2) { return []; }
784
+ }
785
+ }
786
+ var results = [];
645
787
  var resultCount = [0];
646
788
  var maxResults = ${maxResults};
647
789
  var includeBounds = ${includeBounds ? "true" : "false"};
@@ -723,7 +865,7 @@ export class MacOSPlatform {
723
865
 
724
866
  if (currentDepth < ${maxDepth}) {
725
867
  try {
726
- var kids = elem.elements();
868
+ var kids = childElements(elem);
727
869
  for (var k = 0; k < kids.length && resultCount[0] < maxResults; k++) {
728
870
  traverse(kids[k], path + '/' + k, currentDepth + 1);
729
871
  }
@@ -736,7 +878,7 @@ export class MacOSPlatform {
736
878
  var proc = se.processes["${escapedApp}"]();
737
879
  var wins = proc.windows();
738
880
  for (var w = 0; w < wins.length && resultCount[0] < maxResults; w++) {
739
- traverse(wins[w], "win" + w, 0);
881
+ traverse(wins[w], "${escapedApp}/win" + w, 0);
740
882
  }
741
883
  } else {
742
884
  var procs = se.processes();
@@ -779,12 +921,7 @@ export class MacOSPlatform {
779
921
  return results;
780
922
  }
781
923
  catch (error) {
782
- if (String(error.message || error).includes("not allowed") ||
783
- String(error.message || error).includes("permission") ||
784
- String(error.message || error).includes("assistive")) {
785
- throw new Error("Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility");
786
- }
787
- throw new Error(`find_element failed: ${error.message || error}`);
924
+ rethrowAccessibilityError(error, "find_element");
788
925
  }
789
926
  }
790
927
  async clickElement(elementId, app) {
@@ -799,6 +936,11 @@ export class MacOSPlatform {
799
936
  const cachedJson = JSON.stringify(this.elementCache.get(elementId) ?? null);
800
937
  const jxaScript = `
801
938
  var se = Application('System Events');
939
+ function childElements(elem) {
940
+ try { return elem.uiElements(); } catch(e1) {
941
+ try { return elem.elements(); } catch(e2) { return []; }
942
+ }
943
+ }
802
944
  var elemPath = "${escapedElementId}";
803
945
  var appName = "${escapedApp}";
804
946
  var cached = ${cachedJson};
@@ -825,7 +967,7 @@ export class MacOSPlatform {
825
967
  var idx = parseInt(parts[i]);
826
968
  if (isNaN(idx)) return null;
827
969
  try {
828
- var kids = current.elements();
970
+ var kids = childElements(current);
829
971
  if (idx >= kids.length) return null;
830
972
  current = kids[idx];
831
973
  } catch(e) { return null; }
@@ -910,7 +1052,7 @@ export class MacOSPlatform {
910
1052
  bestScore = score;
911
1053
  }
912
1054
  try {
913
- var kids = elem.elements();
1055
+ var kids = childElements(elem);
914
1056
  for (var i = 0; i < kids.length; i++) visit(kids[i], depth + 1);
915
1057
  } catch(e) {}
916
1058
  }
@@ -948,7 +1090,7 @@ export class MacOSPlatform {
948
1090
  var idx = parseInt(parts[i]);
949
1091
  if (isNaN(idx)) break;
950
1092
  try {
951
- var kids = current.elements();
1093
+ var kids = childElements(current);
952
1094
  if (idx >= kids.length) break;
953
1095
  current = kids[idx];
954
1096
  } catch(e) { break; }
@@ -1003,17 +1145,13 @@ export class MacOSPlatform {
1003
1145
  ], { encoding: "utf-8", timeout: 15000 }).trim();
1004
1146
  const result = JSON.parse(out);
1005
1147
  if (!result.success) {
1006
- throw new Error(result.error || `click_element failed for element ${elementId}`);
1148
+ throw result.error
1149
+ ? new Error(result.error)
1150
+ : new ElementNotFoundError(elementId);
1007
1151
  }
1008
1152
  }
1009
1153
  catch (error) {
1010
- if (error.message && error.message.includes("click_element failed"))
1011
- throw error;
1012
- if (String(error.message || error).includes("not allowed") ||
1013
- String(error.message || error).includes("permission")) {
1014
- throw new Error("Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility");
1015
- }
1016
- throw new Error(`click_element failed: ${error.message || error}`);
1154
+ rethrowElementActionError(error, "click_element", elementId);
1017
1155
  }
1018
1156
  }
1019
1157
  async typeInElement(elementId, text, app, clearFirst) {
@@ -1029,6 +1167,11 @@ export class MacOSPlatform {
1029
1167
  const cachedJson = JSON.stringify(this.elementCache.get(elementId) ?? null);
1030
1168
  const jxaScript = `
1031
1169
  var se = Application('System Events');
1170
+ function childElements(elem) {
1171
+ try { return elem.uiElements(); } catch(e1) {
1172
+ try { return elem.elements(); } catch(e2) { return []; }
1173
+ }
1174
+ }
1032
1175
  var elemPath = "${escapedElementId}";
1033
1176
  var appName = "${escapedApp}";
1034
1177
  var textToType = "${escapedText}";
@@ -1057,7 +1200,7 @@ export class MacOSPlatform {
1057
1200
  var idx = parseInt(parts[i]);
1058
1201
  if (isNaN(idx)) return null;
1059
1202
  try {
1060
- var kids = current.elements();
1203
+ var kids = childElements(current);
1061
1204
  if (idx >= kids.length) return null;
1062
1205
  current = kids[idx];
1063
1206
  } catch(e) { return null; }
@@ -1142,7 +1285,7 @@ export class MacOSPlatform {
1142
1285
  bestScore = score;
1143
1286
  }
1144
1287
  try {
1145
- var kids = elem.elements();
1288
+ var kids = childElements(elem);
1146
1289
  for (var i = 0; i < kids.length; i++) visit(kids[i], depth + 1);
1147
1290
  } catch(e) {}
1148
1291
  }
@@ -1180,7 +1323,7 @@ export class MacOSPlatform {
1180
1323
  var idx = parseInt(parts[i]);
1181
1324
  if (isNaN(idx)) break;
1182
1325
  try {
1183
- var kids = current.elements();
1326
+ var kids = childElements(current);
1184
1327
  if (idx >= kids.length) break;
1185
1328
  current = kids[idx];
1186
1329
  } catch(e) { break; }
@@ -1245,17 +1388,13 @@ export class MacOSPlatform {
1245
1388
  ], { encoding: "utf-8", timeout: 15000 }).trim();
1246
1389
  const result = JSON.parse(out);
1247
1390
  if (!result.success) {
1248
- throw new Error(result.error || `type_in_element failed for element ${elementId}`);
1391
+ throw result.error
1392
+ ? new Error(result.error)
1393
+ : new ElementNotFoundError(elementId);
1249
1394
  }
1250
1395
  }
1251
1396
  catch (error) {
1252
- if (error.message && error.message.includes("type_in_element failed"))
1253
- throw error;
1254
- if (String(error.message || error).includes("not allowed") ||
1255
- String(error.message || error).includes("permission")) {
1256
- throw new Error("Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility");
1257
- }
1258
- throw new Error(`type_in_element failed: ${error.message || error}`);
1397
+ rethrowElementActionError(error, "type_in_element", elementId);
1259
1398
  }
1260
1399
  }
1261
1400
  async setElementValue(elementId, value, app) {
@@ -1271,6 +1410,11 @@ export class MacOSPlatform {
1271
1410
  const cachedJson = JSON.stringify(this.elementCache.get(elementId) ?? null);
1272
1411
  const jxaScript = `
1273
1412
  var se = Application('System Events');
1413
+ function childElements(elem) {
1414
+ try { return elem.uiElements(); } catch(e1) {
1415
+ try { return elem.elements(); } catch(e2) { return []; }
1416
+ }
1417
+ }
1274
1418
  var elemPath = ${elementIdLiteral};
1275
1419
  var appName = ${appLiteral};
1276
1420
  var valueToSet = ${valueLiteral};
@@ -1296,7 +1440,7 @@ export class MacOSPlatform {
1296
1440
  var idx = parseInt(parts[i]);
1297
1441
  if (isNaN(idx)) return null;
1298
1442
  try {
1299
- var kids = current.elements();
1443
+ var kids = childElements(current);
1300
1444
  if (idx >= kids.length) return null;
1301
1445
  current = kids[idx];
1302
1446
  } catch(e) { return null; }
@@ -1323,7 +1467,7 @@ export class MacOSPlatform {
1323
1467
  var idx = parseInt(parts[i]);
1324
1468
  if (isNaN(idx)) return null;
1325
1469
  try {
1326
- var kids = current.elements();
1470
+ var kids = childElements(current);
1327
1471
  if (idx >= kids.length) return null;
1328
1472
  current = kids[idx];
1329
1473
  } catch(e) { return null; }
@@ -1408,7 +1552,7 @@ export class MacOSPlatform {
1408
1552
  bestScore = score;
1409
1553
  }
1410
1554
  try {
1411
- var kids = elem.elements();
1555
+ var kids = childElements(elem);
1412
1556
  for (var i = 0; i < kids.length; i++) visit(kids[i], depth + 1);
1413
1557
  } catch(e) {}
1414
1558
  }
@@ -1456,7 +1600,9 @@ export class MacOSPlatform {
1456
1600
  ], { encoding: "utf-8", timeout: 15000 }).trim();
1457
1601
  const result = JSON.parse(out);
1458
1602
  if (!result.success) {
1459
- throw new Error(result.error || `set_value failed for element ${elementId}`);
1603
+ throw result.error
1604
+ ? new Error(result.error)
1605
+ : new ElementNotFoundError(elementId);
1460
1606
  }
1461
1607
  const currentCached = this.elementCache.get(elementId);
1462
1608
  if (currentCached) {
@@ -1464,13 +1610,7 @@ export class MacOSPlatform {
1464
1610
  }
1465
1611
  }
1466
1612
  catch (error) {
1467
- if (error.message && error.message.includes("set_value failed"))
1468
- throw error;
1469
- if (String(error.message || error).includes("not allowed") ||
1470
- String(error.message || error).includes("permission")) {
1471
- throw new Error("Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility");
1472
- }
1473
- throw new Error(`set_value failed: ${error.message || error}`);
1613
+ rethrowElementActionError(error, "set_value", elementId);
1474
1614
  }
1475
1615
  }
1476
1616
  }