ucu-mcp 0.1.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,69 @@
1
1
  import { execFile, execFileSync } from "node:child_process";
2
2
  import { randomUUID } from "node:crypto";
3
3
  import { promisify } from "node:util";
4
- import { captureFullScreen, captureRegion, captureWindow } from "../utils/screenshot.js";
4
+ import { captureFullScreen, captureRegion } from "../utils/screenshot.js";
5
5
  import { click as inputClick, doubleClick as inputDoubleClick, move as inputMove, drag as inputDrag, scroll as inputScroll, typeText, pressShortcut } from "../utils/input.js";
6
+ import { CaptureError, ElementNotFoundError, InputSynthesisError, PermissionError, PlatformError, TargetStaleError, UcuError, WindowNotFoundError } from "../util/errors.js";
6
7
  const execFileAsync = promisify(execFile);
8
/**
 * Produce a printable message for any thrown value.
 *
 * @param {unknown} error - The caught value; it may or may not be an Error.
 * @returns {string} `error.message` for Error instances, otherwise `String(error)`.
 */
function errorMessage(error) {
    if (error instanceof Error) {
        return error.message;
    }
    return String(error);
}
11
/**
 * Decide whether a failure looks like a missing permission grant.
 *
 * The check is a case-insensitive keyword search over the error's message text.
 *
 * @param {unknown} error - The caught value.
 * @returns {boolean} True when the message mentions "not allowed", "permission",
 *   "assistive" or "accessibility".
 */
function isAccessibilityPermissionError(error) {
    const permissionPattern = /not allowed|permission|assistive|accessibility/i;
    const message = errorMessage(error);
    return permissionPattern.test(message);
}
14
/**
 * Always throws: passes UcuError instances through untouched and wraps
 * everything else in a CaptureError.
 *
 * @param {unknown} error - The caught value.
 * @param {string} operation - Label used as the prefix of the wrapped message.
 * @throws {UcuError} The original error when it already is a UcuError.
 * @throws {CaptureError} For any other thrown value.
 */
function rethrowCaptureError(error, operation) {
    if (!(error instanceof UcuError)) {
        const detail = errorMessage(error);
        throw new CaptureError(`${operation} failed: ${detail}`);
    }
    throw error;
}
19
/**
 * Always throws: passes UcuError instances through, maps permission-looking
 * failures to PermissionError, and wraps the rest in PlatformError.
 *
 * @param {unknown} error - The caught value.
 * @param {string} operation - Label used as the prefix of the wrapped message.
 * @throws {UcuError} The original error when it already is a UcuError.
 * @throws {PermissionError} When the message matches the permission keywords.
 * @throws {PlatformError} For any other thrown value.
 */
function rethrowAccessibilityError(error, operation) {
    if (error instanceof UcuError) {
        throw error;
    }
    const wrapped = isAccessibilityPermissionError(error)
        ? new PermissionError("accessibility", "darwin")
        : new PlatformError(`${operation} failed: ${errorMessage(error)}`);
    throw wrapped;
}
27
/**
 * Always throws: classifies a failure raised while acting on a UI element.
 *
 * Checks are applied in order: an existing UcuError is rethrown as-is, then
 * permission-looking messages, then "element not found" messages, and finally
 * a generic PlatformError.
 *
 * @param {unknown} error - The caught value.
 * @param {string} operation - Label used as the prefix of the generic message.
 * @param {string} elementId - Identifier passed to ElementNotFoundError.
 * @throws {UcuError} The original error when it already is a UcuError.
 * @throws {PermissionError} When the message matches the permission keywords.
 * @throws {ElementNotFoundError} When the message contains "element not found".
 * @throws {PlatformError} For any other thrown value.
 */
function rethrowElementActionError(error, operation, elementId) {
    if (error instanceof UcuError) {
        throw error;
    }
    if (isAccessibilityPermissionError(error)) {
        throw new PermissionError("accessibility", "darwin");
    }
    const message = errorMessage(error);
    const elementMissing = /element not found/i.test(message);
    if (elementMissing) {
        throw new ElementNotFoundError(elementId);
    }
    throw new PlatformError(`${operation} failed: ${message}`);
}
38
/**
 * Always throws: passes UcuError instances through untouched and wraps
 * everything else in an InputSynthesisError.
 *
 * @param {unknown} error - The caught value.
 * @param {string} operation - Label used as the prefix of the wrapped message.
 * @throws {UcuError} The original error when it already is a UcuError.
 * @throws {InputSynthesisError} For any other thrown value.
 */
function rethrowInputError(error, operation) {
    if (!(error instanceof UcuError)) {
        const detail = errorMessage(error);
        throw new InputSynthesisError(`${operation} failed: ${detail}`);
    }
    throw error;
}
43
/**
 * Canonical form of an app/process name used for comparisons.
 *
 * @param {string} name - Raw application or process name.
 * @returns {string} The name trimmed and lower-cased.
 */
function normalizeAppName(name) {
    return name.trim().toLowerCase();
}
/**
 * Loose, case-insensitive test of whether a process name refers to the requested app.
 *
 * A match is an exact name match, a hyphenated prefix ("code-insiders" for
 * "code"), or the requested name appearing as a whole space-delimited word
 * sequence anywhere in the process name: first, middle or last position.
 * Substrings inside a word ("code" in "Xcode") do not match.
 *
 * @param {string} processName - Name reported for the running process.
 * @param {string} requestedApp - Name supplied by the caller.
 * @returns {boolean} True when the process name matches; false when either
 *   name is empty after normalisation.
 */
function appNameMatches(processName, requestedApp) {
    // Named `candidate`, not `process`, so Node's global `process` is not shadowed.
    const candidate = normalizeAppName(processName);
    const requested = normalizeAppName(requestedApp);
    if (!candidate || !requested) {
        return false;
    }
    if (candidate === requested || candidate.startsWith(`${requested}-`)) {
        return true;
    }
    // Padding both ends with a space lets one `includes` cover the leading,
    // middle and trailing word positions, so "Google Chrome" matches "chrome"
    // just like "Google Chrome Helper" does.
    return ` ${candidate} `.includes(` ${requested} `);
}
/**
 * Pick the window that belongs to the requested app.
 *
 * An exact (normalised) process-name match wins; otherwise the first window
 * whose process name loosely matches is used.
 *
 * @param {Array<{processName: string}>} windows - Candidate windows.
 * @param {string} requestedApp - Name supplied by the caller.
 * @returns {object|undefined} The chosen window, or undefined when none match.
 */
function selectWindowForApp(windows, requestedApp) {
    const requested = normalizeAppName(requestedApp);
    const exact = windows.find((win) => normalizeAppName(win.processName) === requested);
    return exact ?? windows.find((win) => appNameMatches(win.processName, requestedApp));
}
7
61
  export class MacOSPlatform {
8
62
  elementCache = new Map();
9
63
  elementCacheTtlMs = 30_000;
10
64
  elementCacheMaxSize = 100;
65
+ windowCacheTtlMs = 300;
66
+ windowCache;
11
67
  activeTarget;
12
68
  savedFocus;
13
69
  // ── Element Cache Management ────────────────────────────────────────────
@@ -43,6 +99,18 @@ export class MacOSPlatform {
43
99
  isCacheEntryExpired(descriptor) {
44
100
  return Date.now() - descriptor.cachedAt > this.elementCacheTtlMs;
45
101
  }
102
+ // ── Target Validation ────────────────────────────────────────────────────
103
+ /** Validate that the active target window still exists. */
104
+ async validateActiveTarget() {
105
+ if (!this.activeTarget?.windowId)
106
+ return;
107
+ this.windowCache = undefined; // Bypass cache — stale detection must use fresh data
108
+ const windows = await this.listWindows(true);
109
+ const stillExists = windows.some(w => w.id === this.activeTarget.windowId);
110
+ if (!stillExists) {
111
+ throw new TargetStaleError(this.activeTarget.windowId);
112
+ }
113
+ }
46
114
  // ── Focus Management ────────────────────────────────────────────────────
47
115
  /** Save the current frontmost app/window so we can restore after an action. */
48
116
  async saveFocus() {
@@ -79,14 +147,22 @@ export class MacOSPlatform {
79
147
  }
80
148
  // ── Screenshot ──────────────────────────────────────────────────────────
81
149
  async screenshot(_display, region, options) {
82
- const base64 = region
83
- ? await captureRegion(region.x, region.y, region.width, region.height, options)
84
- : await captureFullScreen(options);
85
- return Buffer.from(base64, "base64");
150
+ try {
151
+ const base64 = region
152
+ ? await captureRegion(region.x, region.y, region.width, region.height, options)
153
+ : await captureFullScreen(options);
154
+ return Buffer.from(base64, "base64");
155
+ }
156
+ catch (error) {
157
+ rethrowCaptureError(error, region ? "capture region" : "capture full screen");
158
+ }
86
159
  }
87
160
  async screenshotWindow(windowId, options) {
88
- const base64 = await captureWindow(windowId, options);
89
- return Buffer.from(base64, "base64");
161
+ const win = (await this.listWindows(true)).find((w) => w.id === windowId);
162
+ if (!win) {
163
+ throw new WindowNotFoundError(windowId);
164
+ }
165
+ return this.screenshot(undefined, win.bounds, options);
90
166
  }
91
167
  // ── Screen Info ─────────────────────────────────────────────────────────
92
168
  getScreenSize(display) {
@@ -153,17 +229,34 @@ export class MacOSPlatform {
153
229
  return JSON.parse(out);
154
230
  }
155
231
  async focusApp(app) {
156
- const appLower = app.toLowerCase();
157
- const windows = await this.listWindows(true);
158
- const target = windows.find((w) => w.processName.toLowerCase().includes(appLower));
232
+ const escapedApp = app.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
233
+ this.windowCache = undefined;
234
+ try {
235
+ execFileSync("osascript", ["-e", `tell application "${escapedApp}" to activate`], { timeout: 5000 });
236
+ }
237
+ catch {
238
+ // Some app names are process labels rather than AppleScript application names.
239
+ // Continue with the AX window lookup below so existing callers still work.
240
+ }
241
+ let target;
242
+ const deadline = Date.now() + 3000;
243
+ do {
244
+ const windows = await this.listWindows(true);
245
+ target = selectWindowForApp(windows, app);
246
+ if (target)
247
+ break;
248
+ await new Promise((resolve) => setTimeout(resolve, 150));
249
+ } while (Date.now() < deadline);
159
250
  if (!target) {
160
- throw new Error(`No on-screen window found for app "${app}". Use list_apps to inspect localized macOS app names.`);
251
+ throw new WindowNotFoundError(app);
161
252
  }
162
253
  this.activeTarget = {
254
+ targetId: randomUUID(),
163
255
  appName: target.processName,
164
256
  pid: target.pid,
165
257
  windowId: target.id,
166
258
  title: target.title,
259
+ capturedAt: new Date().toISOString(),
167
260
  };
168
261
  return this.activeTarget;
169
262
  }
@@ -219,44 +312,49 @@ export class MacOSPlatform {
219
312
  }
220
313
  }
221
314
  async listWindows(_includeMinimized) {
315
+ const now = Date.now();
316
+ if (this.windowCache && now - this.windowCache.cachedAt <= this.windowCacheTtlMs) {
317
+ return this.windowCache.windows.map((window) => ({
318
+ ...window,
319
+ bounds: { ...window.bounds },
320
+ }));
321
+ }
222
322
  try {
323
+ // Use System Events instead of CGWindowListCopyWindowInfo.
324
+ // The CoreGraphics API returns CFArrayRef/CFDictionaryRef which JXA
325
+ // cannot iterate reliably — CFArrayGetCount works but objectAtIndex
326
+ // does not. System Events JXA is slower (~3-6s) but correct.
223
327
  const jxaScript = `
224
- ObjC.import('CoreGraphics');
225
- ObjC.import('Foundation');
226
- var winList = $.CGWindowListCopyWindowInfo(1, 0);
227
- var count = winList.count;
328
+ var se = Application('System Events');
228
329
  var result = [];
229
- for (var i = 0; i < count; i++) {
230
- var w = $(winList).objectAtIndex(i);
231
- var bounds = w.objectForKey('kCGWindowBounds');
232
- var numberVal = w.objectForKey('kCGWindowNumber');
233
- var nameVal = w.objectForKey('kCGWindowName');
234
- var ownerVal = w.objectForKey('kCGWindowOwnerName');
235
- var pidVal = w.objectForKey('kCGWindowOwnerPID');
236
- var onScreenVal = w.objectForKey('kCGWindowIsOnscreen');
237
- var layerVal = w.objectForKey('kCGWindowLayer');
238
-
239
- // Skip windows at layer > 0 (menus, overlays, etc.)
240
- if (layerVal && layerVal.intValue > 0) continue;
241
-
242
- var bx = 0, by = 0, bw = 0, bh = 0;
243
- try { bx = $(bounds).objectForKey('X').intValue; } catch(e) {}
244
- try { by = $(bounds).objectForKey('Y').intValue; } catch(e) {}
245
- try { bw = $(bounds).objectForKey('Width').intValue; } catch(e) {}
246
- try { bh = $(bounds).objectForKey('Height').intValue; } catch(e) {}
247
-
248
- // Skip zero-size windows
249
- if (bw === 0 && bh === 0) continue;
250
-
251
- result.push({
252
- id: String(numberVal ? numberVal.intValue : 0),
253
- title: nameVal ? String(nameVal) : '',
254
- processName: ownerVal ? String(ownerVal) : '',
255
- pid: pidVal ? pidVal.intValue : 0,
256
- bounds: { x: bx, y: by, width: bw, height: bh },
257
- isMinimized: false,
258
- isOnScreen: onScreenVal ? onScreenVal.boolValue : true
259
- });
330
+ var procs = se.processes();
331
+ for (var i = 0; i < procs.length; i++) {
332
+ var p = procs[i];
333
+ var pName = '';
334
+ var pPid = 0;
335
+ try { pName = p.name(); } catch(e) {}
336
+ try { pPid = p.unixId(); } catch(e) {}
337
+ try {
338
+ var wins = p.windows();
339
+ for (var j = 0; j < wins.length; j++) {
340
+ var w = wins[j];
341
+ var pos, sz;
342
+ try { pos = w.position(); } catch(e) { pos = [0, 0]; }
343
+ try { sz = w.size(); } catch(e) { sz = [0, 0]; }
344
+ if (sz[0] === 0 && sz[1] === 0) continue;
345
+ var title = '';
346
+ try { title = w.name() || ''; } catch(e) {}
347
+ result.push({
348
+ id: pName + '/win' + j,
349
+ title: title,
350
+ processName: pName,
351
+ pid: pPid,
352
+ bounds: { x: pos[0], y: pos[1], width: sz[0], height: sz[1] },
353
+ isMinimized: false,
354
+ isOnScreen: true
355
+ });
356
+ }
357
+ } catch(e) {}
260
358
  }
261
359
  JSON.stringify(result);
262
360
  `;
@@ -264,7 +362,15 @@ export class MacOSPlatform {
264
362
  "-l", "JavaScript",
265
363
  "-e", jxaScript
266
364
  ], { encoding: "utf-8", timeout: 15000 });
267
- return JSON.parse(jxaOut.trim());
365
+ const windows = JSON.parse(jxaOut.trim());
366
+ this.windowCache = {
367
+ cachedAt: Date.now(),
368
+ windows: windows.map((window) => ({
369
+ ...window,
370
+ bounds: { ...window.bounds },
371
+ })),
372
+ };
373
+ return windows;
268
374
  }
269
375
  catch {
270
376
  // Fallback: return empty list if JXA fails
@@ -272,9 +378,12 @@ export class MacOSPlatform {
272
378
  }
273
379
  }
274
380
  async getWindowState(windowId, depth, includeBounds = true) {
381
+ if (!windowId || windowId === this.activeTarget?.windowId) {
382
+ await this.validateActiveTarget();
383
+ }
275
384
  const resolvedWindowId = windowId || this.activeTarget?.windowId;
276
385
  if (!resolvedWindowId) {
277
- throw new Error("getWindowState requires windowId or a prior focus_app target");
386
+ throw new WindowNotFoundError("active target");
278
387
  }
279
388
  const maxDepth = Math.min(depth || 3, 10);
280
389
  const maxElements = 50;
@@ -285,6 +394,11 @@ export class MacOSPlatform {
285
394
  const jxaScript = `
286
395
  ObjC.import('AppKit');
287
396
  var se = Application('System Events');
397
+ function childElements(elem) {
398
+ try { return elem.uiElements(); } catch(e1) {
399
+ try { return elem.elements(); } catch(e2) { return []; }
400
+ }
401
+ }
288
402
  var result = {window: null, focusedElement: null, tree: null, error: null};
289
403
  var target = ${targetJson};
290
404
  var includeBounds = ${includeBounds ? "true" : "false"};
@@ -319,25 +433,47 @@ export class MacOSPlatform {
319
433
  return false;
320
434
  }
321
435
 
436
+ var foundWin = null;
437
+ var foundProc = null;
438
+
439
+ // Fast path: resolve "ProcessName/winN" format directly
440
+ var idParts = "${escapedWindowId}".split('/');
441
+ if (idParts.length >= 2 && idParts[0]) {
442
+ var procName = idParts[0];
443
+ var winIdx = 0;
444
+ var winMatch = idParts[1].match(/^win(\d+)$/);
445
+ if (winMatch) winIdx = parseInt(winMatch[1]);
446
+ try {
447
+ var proc = se.processes[procName]();
448
+ var ws = proc.windows();
449
+ if (winIdx < ws.length) {
450
+ foundWin = ws[winIdx];
451
+ foundProc = proc;
452
+ }
453
+ } catch(e) {}
454
+ }
455
+
322
456
  try {
323
- var foundWin = null;
324
- var foundProc = null;
325
- var procs = se.processes();
326
- for (var p = 0; p < procs.length; p++) {
327
- var proc = procs[p];
328
- try {
329
- var wins = proc.windows();
330
- for (var w = 0; w < wins.length; w++) {
331
- if (windowMatches(wins[w], proc)) {
332
- foundWin = wins[w];
333
- foundProc = proc;
334
- break;
457
+ if (!foundWin) {
458
+ var procs = se.processes();
459
+ for (var p = 0; p < procs.length; p++) {
460
+ var proc = procs[p];
461
+ try {
462
+ var wins = proc.windows();
463
+ for (var w = 0; w < wins.length; w++) {
464
+ if (windowMatches(wins[w], proc)) {
465
+ foundWin = wins[w];
466
+ foundProc = proc;
467
+ break;
468
+ }
335
469
  }
336
- }
337
- } catch(e) {}
338
- if (foundWin) break;
470
+ } catch(e) {}
471
+ if (foundWin) break;
472
+ }
339
473
  }
340
- if (!foundWin) { result.error = 'Window not found'; JSON.stringify(result); return; }
474
+ if (!foundWin) {
475
+ result.error = 'Window not found';
476
+ } else {
341
477
 
342
478
  var winPos = foundWin.position();
343
479
  var winSize = foundWin.size();
@@ -423,7 +559,7 @@ export class MacOSPlatform {
423
559
 
424
560
  if (currentDepth < ${maxDepth}) {
425
561
  try {
426
- var kids = axElem.elements();
562
+ var kids = childElements(axElem);
427
563
  for (var k = 0; k < kids.length && elemCount[0] < ${maxElements}; k++) {
428
564
  var child = extractElement(kids[k], currentDepth + 1);
429
565
  if (child) info.children.push(child);
@@ -433,7 +569,8 @@ export class MacOSPlatform {
433
569
  return info;
434
570
  }
435
571
 
436
- result.tree = extractElement(foundWin, 0);
572
+ result.tree = extractElement(foundWin, 0);
573
+ }
437
574
  } catch(e) {
438
575
  result.error = String(e.message || e);
439
576
  }
@@ -445,7 +582,7 @@ export class MacOSPlatform {
445
582
  ], { encoding: "utf-8", timeout: 15000 }).trim();
446
583
  const parsed = JSON.parse(out);
447
584
  if (parsed.error && !parsed.window) {
448
- throw new Error(parsed.error);
585
+ throw new WindowNotFoundError(resolvedWindowId);
449
586
  }
450
587
  const windowInfo = parsed.window || {
451
588
  id: resolvedWindowId,
@@ -463,31 +600,48 @@ export class MacOSPlatform {
463
600
  };
464
601
  }
465
602
  catch (error) {
466
- if (String(error.message || error).includes("not allowed") ||
467
- String(error.message || error).includes("permission") ||
468
- String(error.message || error).includes("assistive")) {
469
- throw new Error(`Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility`);
470
- }
471
- throw new Error(`Window ${resolvedWindowId} not found or Accessibility permission missing`);
603
+ if (error instanceof WindowNotFoundError)
604
+ throw error;
605
+ rethrowAccessibilityError(error, "get_window_state");
472
606
  }
473
607
  }
474
608
  // ── Mouse ───────────────────────────────────────────────────────────────
475
609
  async click(x, y, button, doubleClick) {
476
- if (doubleClick) {
477
- await inputDoubleClick(x, y, button);
610
+ try {
611
+ if (doubleClick) {
612
+ await inputDoubleClick(x, y, button);
613
+ }
614
+ else {
615
+ await inputClick(x, y, button);
616
+ }
478
617
  }
479
- else {
480
- await inputClick(x, y, button);
618
+ catch (error) {
619
+ rethrowInputError(error, doubleClick ? "double_click" : "click");
481
620
  }
482
621
  }
483
622
  async move(x, y) {
484
- await inputMove(x, y);
623
+ try {
624
+ await inputMove(x, y);
625
+ }
626
+ catch (error) {
627
+ rethrowInputError(error, "move");
628
+ }
485
629
  }
486
630
  async drag(startX, startY, endX, endY, button, duration) {
487
- await inputDrag(startX, startY, endX, endY, button, duration);
631
+ try {
632
+ await inputDrag(startX, startY, endX, endY, button, duration);
633
+ }
634
+ catch (error) {
635
+ rethrowInputError(error, "drag");
636
+ }
488
637
  }
489
638
  async scroll(x, y, deltaX, deltaY) {
490
- await inputScroll(x, y, deltaX, deltaY);
639
+ try {
640
+ await inputScroll(x, y, deltaX, deltaY);
641
+ }
642
+ catch (error) {
643
+ rethrowInputError(error, "scroll");
644
+ }
491
645
  }
492
646
  // ── Cursor ──────────────────────────────────────────────────────────────
493
647
  getCursorPosition() {
@@ -502,14 +656,12 @@ export class MacOSPlatform {
502
656
  return JSON.parse(out);
503
657
  }
504
658
  catch (error) {
505
- throw new Error(`get_cursor_position failed: ${error.message || error}`);
659
+ throw new PlatformError(`get_cursor_position failed: ${errorMessage(error)}`);
506
660
  }
507
661
  }
508
662
  // ── OCR ──────────────────────────────────────────────────────────────────
509
663
  async ocr(display, region) {
510
- // Take a screenshot first (reuse existing logic)
511
664
  const buf = await this.screenshot(display, region);
512
- // Write screenshot to a temp file so Vision framework can read it
513
665
  const { writeFile, unlink } = await import("node:fs/promises");
514
666
  const { join } = await import("node:path");
515
667
  const { tmpdir } = await import("node:os");
@@ -518,93 +670,46 @@ export class MacOSPlatform {
518
670
  try {
519
671
  const screenSize = this.getScreenSize(display);
520
672
  const scaleFactor = screenSize.scaleFactor ?? 2;
521
- // Build JXA script that uses Vision framework for OCR
522
- // JXA does not allow return statements at global scope, so we wrap in a function
523
- const jxaScript = `
524
- function run() {
525
- ObjC.import('Vision');
526
- ObjC.import('AppKit');
527
- ObjC.import('Foundation');
528
-
529
- var app = Application.currentApplication();
530
- app.includeStandardAdditions = true;
531
-
532
- var path = "${tmpPath.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, '\\`').replace(/\$/g, '\\$')}";
533
- var url = $.NSURL.fileURLWithPath(path);
534
- var image = $.NSImage.alloc.initWithContentsOfURL(url);
535
-
536
- if (!image || image.isValid() === false) {
537
- return JSON.stringify({error: "Failed to load screenshot image", elements: [], fullText: ""});
538
- }
539
-
540
- var cgImage = image.CGImageForProposedRectContextHints(null, null, null);
541
- if (!cgImage) {
542
- return JSON.stringify({error: "Failed to get CGImage from screenshot", elements: [], fullText: ""});
543
- }
544
-
545
- var request = $.VNRecognizeTextRequest.alloc.init;
546
- request.recognitionLevel = $.VNRequestTextRecognitionLevelAccurate;
547
- request.usesLanguageCorrection = true;
548
-
549
- var handler = $.VNImageRequestHandler.alloc.initWithCGImageOptions(cgImage, null);
550
- var performError = $();
551
-
552
- var success = handler.performRequestsError([request], performError);
553
- if (!success) {
554
- return JSON.stringify({error: "OCR request failed", elements: [], fullText: ""});
555
- }
556
-
557
- var results = request.results;
558
- var elements = [];
559
- var fullTextParts = [];
560
-
561
- var imgWidth = cgImage.width;
562
- var imgHeight = cgImage.height;
563
-
564
- for (var i = 0; i < results.count; i++) {
565
- var obs = $(results).objectAtIndex(i);
566
- var candidates = obs.topCandidates(1);
567
- if (candidates && candidates.count > 0) {
568
- var candidate = $(candidates).objectAtIndex(0);
569
- var text = candidate.string.toString();
570
- var confidence = candidate.confidence;
571
- var bbox = obs.boundingBox;
572
-
573
- // Vision boundingBox is normalized (0-1) with origin at bottom-left
574
- // Convert to screen coordinates (origin at top-left)
575
- var bx = bbox.origin.x * imgWidth;
576
- var by = (1 - bbox.origin.y - bbox.size.height) * imgHeight;
577
- var bw = bbox.size.width * imgWidth;
578
- var bh = bbox.size.height * imgHeight;
579
-
580
- elements.push({
581
- text: text,
582
- x: Math.round(bx),
583
- y: Math.round(by),
584
- width: Math.round(bw),
585
- height: Math.round(bh),
586
- confidence: confidence
587
- });
588
- fullTextParts.push(text);
673
+ // Try native Swift OCR helper first (avoids JXA ObjC bridge bugs on macOS Sequoia+)
674
+ const nativeResult = await this.ocrNative(tmpPath, scaleFactor, region);
675
+ if (nativeResult)
676
+ return nativeResult;
677
+ // Fallback to JXA Vision framework
678
+ return await this.ocrJxa(tmpPath, screenSize, scaleFactor, region, buf);
679
+ }
680
+ finally {
681
+ await unlink(tmpPath).catch(() => { });
682
+ }
683
+ }
684
+ async ocrNative(tmpPath, scaleFactor, region) {
685
+ const { existsSync } = await import("node:fs");
686
+ const { join, dirname } = await import("node:path");
687
+ const { fileURLToPath } = await import("node:url");
688
+ // Resolve native binary path (same pattern as input.ts CGEvent helper)
689
+ const candidates = [
690
+ join(dirname(fileURLToPath(import.meta.url)), "..", "..", "native", "ocr", "ocr-helper"),
691
+ join(dirname(fileURLToPath(import.meta.url)), "..", "native", "ocr", "ocr-helper"),
692
+ join(process.cwd(), "native", "ocr", "ocr-helper"),
693
+ ];
694
+ let binaryPath;
695
+ for (const p of candidates) {
696
+ if (existsSync(p)) {
697
+ binaryPath = p;
698
+ break;
589
699
  }
590
- }
591
-
592
- return JSON.stringify({elements: elements, fullText: fullTextParts.join("\\n"), error: null});
593
700
  }
594
- run();
595
- `;
596
- const out = execFileSync("osascript", [
597
- "-l", "JavaScript",
598
- "-e", jxaScript,
599
- ], { encoding: "utf-8", timeout: 30000 }).trim();
701
+ if (!binaryPath)
702
+ return null;
703
+ try {
704
+ const input = JSON.stringify({ imagePath: tmpPath });
705
+ const out = execFileSync(binaryPath, [], {
706
+ input,
707
+ encoding: "utf-8",
708
+ timeout: 30000,
709
+ }).trim();
600
710
  const parsed = JSON.parse(out);
601
- if (parsed.error) {
602
- throw new Error(parsed.error);
603
- }
604
- // Scale coordinates from image space to screen space
605
- // The screenshot may be taken at a different resolution than screen coordinates
606
- const imgWidth = buf.readUInt32BE(16); // PNG width at offset 16
607
- const scaleFactorX = screenSize.width / (region ? region.width : (imgWidth / scaleFactor));
711
+ if (parsed.error)
712
+ return null;
608
713
  const elements = parsed.elements.map((el) => ({
609
714
  text: el.text,
610
715
  x: Math.round(el.x / scaleFactor) + (region ? region.x : 0),
@@ -613,15 +718,81 @@ export class MacOSPlatform {
613
718
  height: Math.round(el.height / scaleFactor),
614
719
  confidence: el.confidence,
615
720
  }));
616
- return {
617
- elements,
618
- fullText: parsed.fullText,
619
- };
721
+ return { elements, fullText: parsed.fullText };
620
722
  }
621
- finally {
622
- await unlink(tmpPath).catch(() => { });
723
+ catch {
724
+ return null;
623
725
  }
624
726
  }
727
+ async ocrJxa(tmpPath, screenSize, scaleFactor, region, buf) {
728
+ const escapedPath = tmpPath.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, "\\`").replace(/$/g, "\\$");
729
+ const jxaScript = `
730
+ function run() {
731
+ ObjC.import('Vision');
732
+ ObjC.import('AppKit');
733
+ ObjC.import('Foundation');
734
+ var app = Application.currentApplication();
735
+ app.includeStandardAdditions = true;
736
+ var path = "${escapedPath}";
737
+ var url = $.NSURL.fileURLWithPath(path);
738
+ var image = $.NSImage.alloc.initWithContentsOfURL(url);
739
+ if (!image || !image.isValid) {
740
+ return JSON.stringify({error: "Failed to load screenshot image", elements: [], fullText: ""});
741
+ }
742
+ var cgImage = image.CGImageForProposedRectContextHints(null, null, null);
743
+ if (!cgImage) {
744
+ return JSON.stringify({error: "Failed to get CGImage from screenshot", elements: [], fullText: ""});
745
+ }
746
+ var request = $.VNRecognizeTextRequest.alloc.init;
747
+ request.recognitionLevel = $.VNRequestTextRecognitionLevelAccurate;
748
+ request.usesLanguageCorrection = true;
749
+ var handler = $.VNImageRequestHandler.alloc.initWithCGImageOptions(cgImage, null);
750
+ var performError = $();
751
+ var success = handler.performRequestsError([request], performError);
752
+ if (!success) {
753
+ return JSON.stringify({error: "OCR request failed", elements: [], fullText: ""});
754
+ }
755
+ var results = request.results;
756
+ var elements = [];
757
+ var fullTextParts = [];
758
+ var imgWidth = cgImage.width;
759
+ var imgHeight = cgImage.height;
760
+ for (var i = 0; i < results.count; i++) {
761
+ var obs = $(results).objectAtIndex(i);
762
+ var candidates = obs.topCandidates(1);
763
+ if (candidates && candidates.count > 0) {
764
+ var candidate = $(candidates).objectAtIndex(0);
765
+ var text = candidate.string.toString();
766
+ var confidence = candidate.confidence;
767
+ var bbox = obs.boundingBox;
768
+ var bx = bbox.origin.x * imgWidth;
769
+ var by = (1 - bbox.origin.y - bbox.size.height) * imgHeight;
770
+ var bw = bbox.size.width * imgWidth;
771
+ var bh = bbox.size.height * imgHeight;
772
+ elements.push({text:text,x:Math.round(bx),y:Math.round(by),width:Math.round(bw),height:Math.round(bh),confidence:confidence});
773
+ fullTextParts.push(text);
774
+ }
775
+ }
776
+ return JSON.stringify({elements:elements,fullText:fullTextParts.join("\\n"),error:null});
777
+ }
778
+ run();
779
+ `;
780
+ const out = execFileSync("osascript", ["-l", "JavaScript", "-e", jxaScript], { encoding: "utf-8", timeout: 30000 }).trim();
781
+ const parsed = JSON.parse(out);
782
+ if (parsed.error)
783
+ throw new CaptureError(`ocr failed: ${parsed.error}`);
784
+ const imgWidth = buf.readUInt32BE(16);
785
+ const scaleFactorX = screenSize.width / (region ? region.width : (imgWidth / scaleFactor));
786
+ const elements = parsed.elements.map((el) => ({
787
+ text: el.text,
788
+ x: Math.round(el.x / scaleFactor) + (region ? region.x : 0),
789
+ y: Math.round(el.y / scaleFactor) + (region ? region.y : 0),
790
+ width: Math.round(el.width / scaleFactor),
791
+ height: Math.round(el.height / scaleFactor),
792
+ confidence: el.confidence,
793
+ }));
794
+ return { elements, fullText: parsed.fullText };
795
+ }
625
796
  // ── Keyboard ────────────────────────────────────────────────────────────
626
797
  async type(text, delay) {
627
798
  await typeText(text, delay);
@@ -632,24 +803,98 @@ export class MacOSPlatform {
632
803
  // ── Accessibility (AX) Element Actions ───────────────────────────────────
633
804
  async findElement(options) {
634
805
  this.evictExpiredCacheEntries();
635
- const { text, role, app, depth, includeBounds = true } = options;
806
+ const { text, role, app, depth, includeBounds = true, textMode = "contains", visibleOnly = false, value } = options;
636
807
  const effectiveApp = app || this.activeTarget?.appName;
637
808
  const maxDepth = Math.min(depth || 5, 10);
638
809
  const maxResults = Math.min(Math.max(options.maxResults ?? 50, 1), 200);
639
810
  const escapedApp = (effectiveApp || "").replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, '\\`').replace(/\$/g, '\\$');
640
811
  const escapedText = text ? text.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, '\\`').replace(/\$/g, '\\$') : "";
641
812
  const escapedRole = role ? role.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, '\\`').replace(/\$/g, '\\$') : "";
813
+ const escapedValue = value ? value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/`/g, '\\`').replace(/\$/g, '\\$') : "";
814
+ // Pre-compile regex on TS side to validate syntax before passing to JXA
815
+ if (text && textMode === "regex") {
816
+ try {
817
+ new RegExp(text);
818
+ }
819
+ catch {
820
+ throw new PlatformError(`Invalid regex pattern: ${text}`);
821
+ }
822
+ }
823
+ const startTime = Date.now();
642
824
  const jxaScript = `
643
- var se = Application('System Events');
644
- var results = [];
825
+ var se = Application('System Events');
826
+ function childElements(elem) {
827
+ try { return elem.uiElements(); } catch(e1) {
828
+ try { return elem.elements(); } catch(e2) { return []; }
829
+ }
830
+ }
831
+ var results = [];
832
+ var scannedCount = 0;
833
+ var matchedCount = 0;
645
834
  var resultCount = [0];
646
835
  var maxResults = ${maxResults};
647
836
  var includeBounds = ${includeBounds ? "true" : "false"};
837
+ var visibleOnly = ${visibleOnly ? "true" : "false"};
838
+ var textMode = "${textMode}";
648
839
 
649
840
  var textFilter = ${text ? `"${escapedText}"` : "null"};
650
841
  var roleFilter = ${role ? `"${escapedRole}"` : "null"};
842
+ var valueFilter = ${value ? `"${escapedValue}"` : "null"};
843
+
844
+ function isVisible(elem) {
845
+ try {
846
+ var pos = elem.position();
847
+ var sz = elem.size();
848
+ if (!pos || !sz) return false;
849
+ return sz[0] > 0 && sz[1] > 0 && pos[0] > -10000 && pos[1] > -10000;
850
+ } catch(e) {
851
+ return false;
852
+ }
853
+ }
854
+
855
+ function textMatches(elemName, elemValue, elemDesc) {
856
+ if (textFilter === null) return true;
857
+ var sources = [elemName, elemValue, elemDesc];
858
+ if (textMode === "exact") {
859
+ var t = textFilter.toLowerCase();
860
+ for (var i = 0; i < sources.length; i++) {
861
+ if (sources[i].toLowerCase() === t) return true;
862
+ }
863
+ return false;
864
+ } else if (textMode === "regex") {
865
+ try {
866
+ var re = new RegExp(textFilter, "i");
867
+ for (var i = 0; i < sources.length; i++) {
868
+ if (re.test(sources[i])) return true;
869
+ }
870
+ } catch(e) {}
871
+ return false;
872
+ } else {
873
+ // contains (default)
874
+ var t = textFilter.toLowerCase();
875
+ for (var i = 0; i < sources.length; i++) {
876
+ if (sources[i].toLowerCase().indexOf(t) !== -1) return true;
877
+ }
878
+ return false;
879
+ }
880
+ }
881
+
882
+ function valueMatches(elemValue) {
883
+ if (valueFilter === null) return true;
884
+ if (textMode === "exact") {
885
+ return elemValue.toLowerCase() === valueFilter.toLowerCase();
886
+ } else if (textMode === "regex") {
887
+ try {
888
+ return new RegExp(valueFilter, "i").test(elemValue);
889
+ } catch(e) { return false; }
890
+ } else {
891
+ // contains (default)
892
+ return elemValue.toLowerCase().indexOf(valueFilter.toLowerCase()) !== -1;
893
+ }
894
+ }
651
895
 
652
896
  function matches(elem) {
897
+ scannedCount++;
653
898
  var elemName = '';
654
899
  var elemRole = '';
655
900
  var elemDesc = '';
@@ -659,17 +904,14 @@ export class MacOSPlatform {
659
904
  try { elemDesc = elem.description() || ''; } catch(e) {}
660
905
  try { var v = elem.value(); elemValue = (v !== undefined && v !== null) ? String(v) : ''; } catch(e) {}
661
906
 
662
- if (textFilter !== null) {
663
- var t = textFilter.toLowerCase();
664
- if (elemName.toLowerCase().indexOf(t) === -1 &&
665
- elemValue.toLowerCase().indexOf(t) === -1 &&
666
- elemDesc.toLowerCase().indexOf(t) === -1) {
667
- return false;
668
- }
669
- }
907
+ if (visibleOnly && !isVisible(elem)) return false;
908
+
909
+ if (!textMatches(elemName, elemValue, elemDesc)) return false;
670
910
  if (roleFilter !== null) {
671
911
  if (elemRole !== roleFilter) return false;
672
912
  }
913
+ if (!valueMatches(elemValue)) return false;
914
+ matchedCount++;
673
915
  return true;
674
916
  }
675
917
 
@@ -723,7 +965,7 @@ export class MacOSPlatform {
723
965
 
724
966
  if (currentDepth < ${maxDepth}) {
725
967
  try {
726
- var kids = elem.elements();
968
+ var kids = childElements(elem);
727
969
  for (var k = 0; k < kids.length && resultCount[0] < maxResults; k++) {
728
970
  traverse(kids[k], path + '/' + k, currentDepth + 1);
729
971
  }
@@ -736,7 +978,7 @@ export class MacOSPlatform {
736
978
  var proc = se.processes["${escapedApp}"]();
737
979
  var wins = proc.windows();
738
980
  for (var w = 0; w < wins.length && resultCount[0] < maxResults; w++) {
739
- traverse(wins[w], "win" + w, 0);
981
+ traverse(wins[w], "${escapedApp}/win" + w, 0);
740
982
  }
741
983
  } else {
742
984
  var procs = se.processes();
@@ -752,15 +994,16 @@ export class MacOSPlatform {
752
994
  }
753
995
  } catch(e) {}
754
996
 
755
- JSON.stringify(results);
997
+ JSON.stringify({results: results, scannedCount: scannedCount, matchedCount: matchedCount});
756
998
  `;
757
999
  try {
758
1000
  const out = execFileSync("osascript", [
759
1001
  "-l", "JavaScript",
760
1002
  "-e", jxaScript,
761
1003
  ], { encoding: "utf-8", timeout: 30000 }).trim();
762
- const results = JSON.parse(out);
763
- for (const result of results) {
1004
+ const parsed = JSON.parse(out);
1005
+ const durationMs = Date.now() - startTime;
1006
+ for (const result of parsed.results) {
764
1007
  const appName = effectiveApp || result.id.split("/")[0] || "";
765
1008
  this.elementCache.set(result.id, {
766
1009
  elementId: result.id,
@@ -776,15 +1019,35 @@ export class MacOSPlatform {
776
1019
  });
777
1020
  }
778
1021
  this.evictOverflowCacheEntries();
779
- return results;
1022
+ let finalResults = parsed.results;
1023
+ if (options.near) {
1024
+ const nx = options.near.x;
1025
+ const ny = options.near.y;
1026
+ finalResults = [...finalResults].sort((a, b) => {
1027
+ const acx = (a.bounds?.x ?? 0) + (a.bounds?.width ?? 0) / 2;
1028
+ const acy = (a.bounds?.y ?? 0) + (a.bounds?.height ?? 0) / 2;
1029
+ const bcx = (b.bounds?.x ?? 0) + (b.bounds?.width ?? 0) / 2;
1030
+ const bcy = (b.bounds?.y ?? 0) + (b.bounds?.height ?? 0) / 2;
1031
+ return Math.hypot(acx - nx, acy - ny) - Math.hypot(bcx - nx, bcy - ny);
1032
+ });
1033
+ }
1034
+ if (typeof options.index === "number") {
1035
+ finalResults = options.index >= 0 && options.index < finalResults.length
1036
+ ? [finalResults[options.index]]
1037
+ : [];
1038
+ }
1039
+ return {
1040
+ results: finalResults,
1041
+ metrics: {
1042
+ scannedCount: parsed.scannedCount,
1043
+ matchedCount: parsed.matchedCount,
1044
+ durationMs,
1045
+ truncated: parsed.results.length >= maxResults,
1046
+ },
1047
+ };
780
1048
  }
781
1049
  catch (error) {
782
- if (String(error.message || error).includes("not allowed") ||
783
- String(error.message || error).includes("permission") ||
784
- String(error.message || error).includes("assistive")) {
785
- throw new Error("Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility");
786
- }
787
- throw new Error(`find_element failed: ${error.message || error}`);
1050
+ rethrowAccessibilityError(error, "find_element");
788
1051
  }
789
1052
  }
790
1053
  async clickElement(elementId, app) {
@@ -799,6 +1062,11 @@ export class MacOSPlatform {
799
1062
  const cachedJson = JSON.stringify(this.elementCache.get(elementId) ?? null);
800
1063
  const jxaScript = `
801
1064
  var se = Application('System Events');
1065
+ function childElements(elem) {
1066
+ try { return elem.uiElements(); } catch(e1) {
1067
+ try { return elem.elements(); } catch(e2) { return []; }
1068
+ }
1069
+ }
802
1070
  var elemPath = "${escapedElementId}";
803
1071
  var appName = "${escapedApp}";
804
1072
  var cached = ${cachedJson};
@@ -825,7 +1093,7 @@ export class MacOSPlatform {
825
1093
  var idx = parseInt(parts[i]);
826
1094
  if (isNaN(idx)) return null;
827
1095
  try {
828
- var kids = current.elements();
1096
+ var kids = childElements(current);
829
1097
  if (idx >= kids.length) return null;
830
1098
  current = kids[idx];
831
1099
  } catch(e) { return null; }
@@ -910,7 +1178,7 @@ export class MacOSPlatform {
910
1178
  bestScore = score;
911
1179
  }
912
1180
  try {
913
- var kids = elem.elements();
1181
+ var kids = childElements(elem);
914
1182
  for (var i = 0; i < kids.length; i++) visit(kids[i], depth + 1);
915
1183
  } catch(e) {}
916
1184
  }
@@ -948,7 +1216,7 @@ export class MacOSPlatform {
948
1216
  var idx = parseInt(parts[i]);
949
1217
  if (isNaN(idx)) break;
950
1218
  try {
951
- var kids = current.elements();
1219
+ var kids = childElements(current);
952
1220
  if (idx >= kids.length) break;
953
1221
  current = kids[idx];
954
1222
  } catch(e) { break; }
@@ -1003,17 +1271,13 @@ export class MacOSPlatform {
1003
1271
  ], { encoding: "utf-8", timeout: 15000 }).trim();
1004
1272
  const result = JSON.parse(out);
1005
1273
  if (!result.success) {
1006
- throw new Error(result.error || `click_element failed for element ${elementId}`);
1274
+ throw result.error
1275
+ ? new Error(result.error)
1276
+ : new ElementNotFoundError(elementId);
1007
1277
  }
1008
1278
  }
1009
1279
  catch (error) {
1010
- if (error.message && error.message.includes("click_element failed"))
1011
- throw error;
1012
- if (String(error.message || error).includes("not allowed") ||
1013
- String(error.message || error).includes("permission")) {
1014
- throw new Error("Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility");
1015
- }
1016
- throw new Error(`click_element failed: ${error.message || error}`);
1280
+ rethrowElementActionError(error, "click_element", elementId);
1017
1281
  }
1018
1282
  }
1019
1283
  async typeInElement(elementId, text, app, clearFirst) {
@@ -1029,6 +1293,11 @@ export class MacOSPlatform {
1029
1293
  const cachedJson = JSON.stringify(this.elementCache.get(elementId) ?? null);
1030
1294
  const jxaScript = `
1031
1295
  var se = Application('System Events');
1296
+ function childElements(elem) {
1297
+ try { return elem.uiElements(); } catch(e1) {
1298
+ try { return elem.elements(); } catch(e2) { return []; }
1299
+ }
1300
+ }
1032
1301
  var elemPath = "${escapedElementId}";
1033
1302
  var appName = "${escapedApp}";
1034
1303
  var textToType = "${escapedText}";
@@ -1057,7 +1326,7 @@ export class MacOSPlatform {
1057
1326
  var idx = parseInt(parts[i]);
1058
1327
  if (isNaN(idx)) return null;
1059
1328
  try {
1060
- var kids = current.elements();
1329
+ var kids = childElements(current);
1061
1330
  if (idx >= kids.length) return null;
1062
1331
  current = kids[idx];
1063
1332
  } catch(e) { return null; }
@@ -1142,7 +1411,7 @@ export class MacOSPlatform {
1142
1411
  bestScore = score;
1143
1412
  }
1144
1413
  try {
1145
- var kids = elem.elements();
1414
+ var kids = childElements(elem);
1146
1415
  for (var i = 0; i < kids.length; i++) visit(kids[i], depth + 1);
1147
1416
  } catch(e) {}
1148
1417
  }
@@ -1180,7 +1449,7 @@ export class MacOSPlatform {
1180
1449
  var idx = parseInt(parts[i]);
1181
1450
  if (isNaN(idx)) break;
1182
1451
  try {
1183
- var kids = current.elements();
1452
+ var kids = childElements(current);
1184
1453
  if (idx >= kids.length) break;
1185
1454
  current = kids[idx];
1186
1455
  } catch(e) { break; }
@@ -1245,17 +1514,31 @@ export class MacOSPlatform {
1245
1514
  ], { encoding: "utf-8", timeout: 15000 }).trim();
1246
1515
  const result = JSON.parse(out);
1247
1516
  if (!result.success) {
1248
- throw new Error(result.error || `type_in_element failed for element ${elementId}`);
1517
+ throw result.error
1518
+ ? new Error(result.error)
1519
+ : new ElementNotFoundError(elementId);
1249
1520
  }
1250
1521
  }
1251
1522
  catch (error) {
1252
- if (error.message && error.message.includes("type_in_element failed"))
1253
- throw error;
1254
- if (String(error.message || error).includes("not allowed") ||
1255
- String(error.message || error).includes("permission")) {
1256
- throw new Error("Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility");
1257
- }
1258
- throw new Error(`type_in_element failed: ${error.message || error}`);
1523
+ rethrowElementActionError(error, "type_in_element", elementId);
1524
+ }
1525
+ }
1526
+ // ── Clipboard ───────────────────────────────────────────────────────────
1527
+ async readClipboard() {
1528
+ try {
1529
+ const out = execFileSync("pbpaste", [], { encoding: "utf-8", timeout: 5000 });
1530
+ return out;
1531
+ }
1532
+ catch (error) {
1533
+ throw new PlatformError(`read_clipboard failed: ${errorMessage(error)}`);
1534
+ }
1535
+ }
1536
+ async writeClipboard(text) {
1537
+ try {
1538
+ execFileSync("pbcopy", [], { input: text, encoding: "utf-8", timeout: 5000 });
1539
+ }
1540
+ catch (error) {
1541
+ throw new PlatformError(`write_clipboard failed: ${errorMessage(error)}`);
1259
1542
  }
1260
1543
  }
1261
1544
  async setElementValue(elementId, value, app) {
@@ -1271,6 +1554,11 @@ export class MacOSPlatform {
1271
1554
  const cachedJson = JSON.stringify(this.elementCache.get(elementId) ?? null);
1272
1555
  const jxaScript = `
1273
1556
  var se = Application('System Events');
1557
+ function childElements(elem) {
1558
+ try { return elem.uiElements(); } catch(e1) {
1559
+ try { return elem.elements(); } catch(e2) { return []; }
1560
+ }
1561
+ }
1274
1562
  var elemPath = ${elementIdLiteral};
1275
1563
  var appName = ${appLiteral};
1276
1564
  var valueToSet = ${valueLiteral};
@@ -1296,7 +1584,7 @@ export class MacOSPlatform {
1296
1584
  var idx = parseInt(parts[i]);
1297
1585
  if (isNaN(idx)) return null;
1298
1586
  try {
1299
- var kids = current.elements();
1587
+ var kids = childElements(current);
1300
1588
  if (idx >= kids.length) return null;
1301
1589
  current = kids[idx];
1302
1590
  } catch(e) { return null; }
@@ -1323,7 +1611,7 @@ export class MacOSPlatform {
1323
1611
  var idx = parseInt(parts[i]);
1324
1612
  if (isNaN(idx)) return null;
1325
1613
  try {
1326
- var kids = current.elements();
1614
+ var kids = childElements(current);
1327
1615
  if (idx >= kids.length) return null;
1328
1616
  current = kids[idx];
1329
1617
  } catch(e) { return null; }
@@ -1408,7 +1696,7 @@ export class MacOSPlatform {
1408
1696
  bestScore = score;
1409
1697
  }
1410
1698
  try {
1411
- var kids = elem.elements();
1699
+ var kids = childElements(elem);
1412
1700
  for (var i = 0; i < kids.length; i++) visit(kids[i], depth + 1);
1413
1701
  } catch(e) {}
1414
1702
  }
@@ -1456,7 +1744,9 @@ export class MacOSPlatform {
1456
1744
  ], { encoding: "utf-8", timeout: 15000 }).trim();
1457
1745
  const result = JSON.parse(out);
1458
1746
  if (!result.success) {
1459
- throw new Error(result.error || `set_value failed for element ${elementId}`);
1747
+ throw result.error
1748
+ ? new Error(result.error)
1749
+ : new ElementNotFoundError(elementId);
1460
1750
  }
1461
1751
  const currentCached = this.elementCache.get(elementId);
1462
1752
  if (currentCached) {
@@ -1464,13 +1754,7 @@ export class MacOSPlatform {
1464
1754
  }
1465
1755
  }
1466
1756
  catch (error) {
1467
- if (error.message && error.message.includes("set_value failed"))
1468
- throw error;
1469
- if (String(error.message || error).includes("not allowed") ||
1470
- String(error.message || error).includes("permission")) {
1471
- throw new Error("Accessibility permission required: grant System Events access in System Preferences > Privacy & Accessibility");
1472
- }
1473
- throw new Error(`set_value failed: ${error.message || error}`);
1757
+ rethrowElementActionError(error, "set_value", elementId);
1474
1758
  }
1475
1759
  }
1476
1760
  }