ucu-mcp 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,16 @@
1
1
  /**
2
2
  * Tool registry for UCU-MCP.
3
3
  *
4
- * Registers 22 MCP tools on the server and dispatches each call through
4
+ * Registers 24 MCP tools on the server and dispatches each call through
5
5
  * a shared safety/permission/retry pipeline (`withSafety`).
6
6
  */
7
7
  import { z } from "zod";
8
8
  import { MacOSPlatform } from "../platform/macos.js";
9
- import { SafetyGuard } from "../safety/guard.js";
9
+ import { SafetyGuard, classifyAction } from "../safety/guard.js";
10
10
  import { checkPermission } from "../safety/permissions.js";
11
11
  import { retry } from "../util/retry.js";
12
12
  import { createLogger } from "../util/logger.js";
13
+ import { metrics } from "../util/metrics.js";
13
14
  import { SafetyError, PermissionError, UnsupportedParameterError, UcuError, WindowNotFoundError } from "../util/errors.js";
14
15
  const log = createLogger("tools");
15
16
  let _platform;
@@ -20,6 +21,14 @@ function getPlatform() {
20
21
  return _platform;
21
22
  }
22
23
  const safety = new SafetyGuard();
24
+ // Active target context — set by focus_app, used by AX element tools
25
+ let activeTargetContext;
26
+ /**
27
+ * Get the currently active target context (set by focus_app).
28
+ */
29
+ export function getActiveTarget() {
30
+ return activeTargetContext;
31
+ }
23
32
  // User activity monitor — pauses automation when user moves the cursor
24
33
  let lastCursorPos = { x: 0, y: 0 };
25
34
  let userActivityInterval;
@@ -43,6 +52,8 @@ function recoveryHint(code) {
43
52
  switch (code) {
44
53
  case "WINDOW_NOT_FOUND":
45
54
  return "Run list_windows again, then retry with a fresh windowId or omit windowId for screen coordinates.";
55
+ case "TARGET_STALE":
56
+ return "Run focus_app again for the target app, or run list_windows and retry with a fresh windowId.";
46
57
  case "ELEMENT_NOT_FOUND":
47
58
  return "Run find_element again, then retry with a fresh elementId.";
48
59
  case "PERMISSION_DENIED":
@@ -61,28 +72,63 @@ function recoveryHint(code) {
61
72
  return "Inspect the error message, observe the current UI state, and retry only if the operation is safe.";
62
73
  }
63
74
  }
64
- function mcpErrorResponse(error) {
75
+ function errorDetails(error) {
65
76
  const err = error instanceof Error ? error : new Error(String(error));
66
77
  const code = error instanceof UcuError ? error.code : "UNKNOWN_ERROR";
67
78
  const retryable = error instanceof UcuError ? error.retryable : false;
79
+ return {
80
+ name: err.name,
81
+ code,
82
+ retryable,
83
+ message: err.message,
84
+ recovery: recoveryHint(code),
85
+ };
86
+ }
87
+ let _actionCounter = 0;
88
+ function nextActionId() {
89
+ _actionCounter = (_actionCounter + 1) % 1_000_000;
90
+ return `a${Date.now().toString(36)}-${_actionCounter.toString(36)}`;
91
+ }
92
+ function buildActionReceipt(action, status, target, result, captureRequested, captureFormat, captureMaxWidth, captureError, warnings = []) {
93
+ const captureStatus = captureRequested
94
+ ? captureError ? "error" : "ok"
95
+ : "skipped";
96
+ return {
97
+ actionId: nextActionId(),
98
+ action,
99
+ status,
100
+ target,
101
+ result,
102
+ capture: {
103
+ requested: captureRequested,
104
+ status: captureStatus,
105
+ ...(captureFormat && { format: captureFormat }),
106
+ ...(captureMaxWidth && { maxWidth: captureMaxWidth }),
107
+ ...(captureError && { error: captureError }),
108
+ },
109
+ warnings,
110
+ next: captureError
111
+ ? "screenshot"
112
+ : status === "partial"
113
+ ? "get_window_state"
114
+ : "find_element or get_window_state",
115
+ };
116
+ }
117
+ function mcpErrorResponse(error) {
68
118
  return {
69
119
  isError: true,
70
120
  content: [
71
121
  jsonText({
72
- error: {
73
- name: err.name,
74
- code,
75
- retryable,
76
- message: err.message,
77
- recovery: recoveryHint(code),
78
- },
122
+ error: errorDetails(error),
79
123
  }),
80
124
  ],
81
125
  };
82
126
  }
83
- async function actionResponse(result, captureAfter, captureFormat = "jpeg", captureMaxWidth = 1280) {
84
- if (!captureAfter)
85
- return { content: [jsonText(result)] };
127
+ async function actionResponse(action, result, target, captureAfter, captureFormat = "jpeg", captureMaxWidth = 1280, warnings = []) {
128
+ const receipt = buildActionReceipt(action, "ok", target, result, captureAfter ?? false, captureFormat, captureMaxWidth, undefined, warnings);
129
+ if (!captureAfter) {
130
+ return { content: [jsonText(receipt)] };
131
+ }
86
132
  try {
87
133
  const buf = await getPlatform().screenshot(undefined, undefined, {
88
134
  format: captureFormat,
@@ -90,7 +136,7 @@ async function actionResponse(result, captureAfter, captureFormat = "jpeg", capt
90
136
  });
91
137
  return {
92
138
  content: [
93
- jsonText({ actionResult: result }),
139
+ jsonText(receipt),
94
140
  {
95
141
  type: "image",
96
142
  data: buf.toString("base64"),
@@ -99,8 +145,9 @@ async function actionResponse(result, captureAfter, captureFormat = "jpeg", capt
99
145
  ],
100
146
  };
101
147
  }
102
- catch {
103
- return { content: [jsonText(result)] };
148
+ catch (error) {
149
+ const partialReceipt = buildActionReceipt(action, "partial", target, result, true, captureFormat, captureMaxWidth, errorDetails(error), [...warnings, "Post-action screenshot capture failed"]);
150
+ return { content: [jsonText(partialReceipt)] };
104
151
  }
105
152
  }
106
153
  const retryableActions = new Set([
@@ -118,7 +165,9 @@ async function withSafety(sa) {
118
165
  const platform = getPlatform();
119
166
  if (platform.isScreenLocked?.())
120
167
  throw new SafetyError("Screen is locked");
121
- const check = safety.checkAction(sa.action, sa.params);
168
+ const check = safety.checkAction(sa.action, sa.params, {
169
+ skipUserActivityPause: sa.skipUserActivityPause ?? classifyAction(sa.action) === "observe",
170
+ });
122
171
  if (!check.allowed)
123
172
  throw new SafetyError(check.reason ?? "Action blocked by safety guard");
124
173
  if (sa.requiresAccessibility) {
@@ -136,12 +185,14 @@ async function withSafety(sa) {
136
185
  const shouldManageFocus = sa.requiresAccessibility && !["screenshot", "list_windows", "list_apps", "get_window_state", "get_cursor_position", "get_screen_size", "ocr", "doctor", "wait", "wait_for_element", "find_element", "focus_app"].includes(sa.action);
137
186
  if (shouldManageFocus)
138
187
  await platform.saveFocus?.();
188
+ const start = Date.now();
139
189
  try {
140
190
  return retryableActions.has(sa.action)
141
191
  ? await retry(() => sa.execute())
142
192
  : await sa.execute();
143
193
  }
144
194
  finally {
195
+ metrics.record(sa.action, Date.now() - start);
145
196
  if (shouldManageFocus)
146
197
  await platform.restoreFocus?.();
147
198
  }
@@ -224,13 +275,15 @@ export function registerTools(server) {
224
275
  app: z.string().describe("Application name to focus"),
225
276
  }, async (params) => {
226
277
  const target = await withSafety({ action: "focus_app", params: {}, requiresAccessibility: true, execute: () => getPlatform().focusApp(params.app) });
278
+ activeTargetContext = target;
227
279
  return { content: [{ type: "text", text: JSON.stringify(target, null, 2) }] };
228
280
  });
229
281
  registry.register("focus_app");
230
282
  registerTool("get_window_state", "Get detailed state of a window including accessibility tree", {
231
283
  windowId: z.string().optional().describe("Window ID"), depth: z.number().optional().describe("AX tree depth"), includeBounds: z.boolean().optional().describe("Include element bounds"),
232
284
  }, async (params) => {
233
- const state = await withSafety({ action: "get_window_state", params: {}, requiresAccessibility: true, execute: () => getPlatform().getWindowState(params.windowId, params.depth, params.includeBounds) });
285
+ const effectiveWindowId = params.windowId || getActiveTarget()?.windowId;
286
+ const state = await withSafety({ action: "get_window_state", params: {}, requiresAccessibility: true, execute: () => getPlatform().getWindowState(effectiveWindowId, params.depth, params.includeBounds) });
234
287
  return { content: [{ type: "text", text: JSON.stringify(state, null, 2) }] };
235
288
  });
236
289
  registry.register("get_window_state");
@@ -242,7 +295,7 @@ export function registerTools(server) {
242
295
  }, async (params) => {
243
296
  const pt = await resolvePoint(params.x, params.y, params.windowId);
244
297
  await withSafety({ action: "click", params: { x: pt.x, y: pt.y }, requiresAccessibility: true, execute: () => getPlatform().click(pt.x, pt.y, params.button) });
245
- return actionResponse({ clicked: true, x: pt.x, y: pt.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
298
+ return actionResponse("click", { clicked: true, x: pt.x, y: pt.y }, { x: pt.x, y: pt.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
246
299
  });
247
300
  registry.register("click");
248
301
  registerTool("double_click", "Double-click at screen coordinates", {
@@ -253,7 +306,7 @@ export function registerTools(server) {
253
306
  }, async (params) => {
254
307
  const pt = await resolvePoint(params.x, params.y, params.windowId);
255
308
  await withSafety({ action: "click", params: { x: pt.x, y: pt.y, doubleClick: true }, requiresAccessibility: true, execute: () => getPlatform().click(pt.x, pt.y, params.button, true) });
256
- return actionResponse({ doubleClicked: true, x: pt.x, y: pt.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
309
+ return actionResponse("double_click", { doubleClicked: true, x: pt.x, y: pt.y }, { x: pt.x, y: pt.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
257
310
  });
258
311
  registry.register("double_click");
259
312
  registerTool("type_text", "Type text at the current cursor position", {
@@ -264,7 +317,7 @@ export function registerTools(server) {
264
317
  if (params.windowId)
265
318
  throw new UnsupportedParameterError("windowId-targeted keyboard typing is not implemented");
266
319
  await withSafety({ action: "type_text", params: { text: params.text }, requiresAccessibility: true, execute: () => getPlatform().type(params.text, params.delay) });
267
- return actionResponse({ typed: true, charCount: params.text.length }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
320
+ return actionResponse("type_text", { typed: true, charCount: params.text.length }, {}, params.captureAfter, params.captureFormat, params.captureMaxWidth);
268
321
  });
269
322
  registry.register("type_text");
270
323
  registerTool("press_key", "Press a keyboard shortcut", {
@@ -283,7 +336,7 @@ export function registerTools(server) {
283
336
  if (keys.length === 0)
284
337
  throw new UnsupportedParameterError("press_key requires at least one key");
285
338
  await withSafety({ action: "press_key", params: { keys }, requiresAccessibility: true, execute: () => getPlatform().key(keys) });
286
- return actionResponse({ pressed: true, keys }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
339
+ return actionResponse("press_key", { pressed: true, keys }, {}, params.captureAfter, params.captureFormat, params.captureMaxWidth);
287
340
  });
288
341
  registry.register("press_key");
289
342
  registerTool("scroll", "Scroll at coordinates", {
@@ -295,7 +348,7 @@ export function registerTools(server) {
295
348
  const pt = await resolvePoint(params.x, params.y, params.windowId);
296
349
  const deltaX = params.deltaX ?? 0;
297
350
  await withSafety({ action: "scroll", params: { x: pt.x, y: pt.y }, requiresAccessibility: true, execute: () => getPlatform().scroll(pt.x, pt.y, deltaX, params.deltaY) });
298
- return actionResponse({ scrolled: true, x: pt.x, y: pt.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
351
+ return actionResponse("scroll", { scrolled: true, x: pt.x, y: pt.y }, { x: pt.x, y: pt.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
299
352
  });
300
353
  registry.register("scroll");
301
354
  registerTool("drag", "Drag from one point to another", {
@@ -309,30 +362,100 @@ export function registerTools(server) {
309
362
  const start = await resolvePoint(params.startX, params.startY, params.windowId);
310
363
  const end = await resolvePoint(params.endX, params.endY, params.windowId);
311
364
  await withSafety({ action: "drag", params: { startX: start.x, startY: start.y, endX: end.x, endY: end.y }, requiresAccessibility: true, execute: () => getPlatform().drag(start.x, start.y, end.x, end.y, params.button, params.duration) });
312
- return actionResponse({ dragged: true, startX: start.x, startY: start.y, endX: end.x, endY: end.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
365
+ return actionResponse("drag", { dragged: true, startX: start.x, startY: start.y, endX: end.x, endY: end.y }, { startX: start.x, startY: start.y, endX: end.x, endY: end.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
313
366
  });
314
367
  registry.register("drag");
315
- registerTool("doctor", "Check system permissions and diagnose common issues", {}, async () => {
368
+ registerTool("doctor", "Check system permissions, native helpers, and client readiness", {}, async () => {
316
369
  const { checkPermissions } = await import("../safety/permissions.js");
317
370
  const { MacOSPlatform: MacPlat } = await import("../platform/macos.js");
371
+ const { existsSync } = await import("node:fs");
372
+ const { join, dirname } = await import("node:path");
373
+ const { fileURLToPath } = await import("node:url");
374
+ const { execFileSync } = await import("node:child_process");
318
375
  const permissions = await checkPermissions();
319
376
  const screenLocked = process.platform === "darwin" ? new MacPlat().isScreenLocked?.() ?? false : false;
377
+ let nativeHelpers;
378
+ if (process.platform === "darwin") {
379
+ const moduleDir = dirname(fileURLToPath(import.meta.url));
380
+ const checkPaths = (subdirs) => {
381
+ const paths = [
382
+ join(process.cwd(), ...subdirs),
383
+ join(moduleDir, "..", ...subdirs),
384
+ join(moduleDir, "..", "..", ...subdirs),
385
+ ];
386
+ return paths.some(p => { try {
387
+ return existsSync(p);
388
+ }
389
+ catch {
390
+ return false;
391
+ } });
392
+ };
393
+ nativeHelpers = {
394
+ cgevent: checkPaths(["native", "cgevent", "cgevent-helper"]),
395
+ ocr: checkPaths(["native", "ocr", "ocr-helper"]),
396
+ };
397
+ }
398
+ let readiness = "ready";
399
+ const issues = [];
400
+ if (!permissions.granted) {
401
+ readiness = "blocked";
402
+ issues.push("Missing macOS permissions: " + permissions.missing.join(", "));
403
+ }
404
+ if (screenLocked) {
405
+ readiness = "blocked";
406
+ issues.push("Screen is locked");
407
+ }
408
+ if (process.platform === "darwin" && nativeHelpers) {
409
+ if (!nativeHelpers.cgevent) {
410
+ readiness = readiness === "ready" ? "degraded" : readiness;
411
+ issues.push("Native CGEvent helper not found (input synthesis may crash on macOS Sequoia+)");
412
+ }
413
+ if (!nativeHelpers.ocr) {
414
+ readiness = readiness === "ready" ? "degraded" : readiness;
415
+ issues.push("Native OCR helper not found (OCR may fail on macOS Sequoia+)");
416
+ }
417
+ }
418
+ const clients = {};
419
+ for (const bin of ["claude", "codex", "opencode", "npx"]) {
420
+ try {
421
+ const path = execFileSync("which", [bin], { encoding: "utf-8", timeout: 2000 }).trim();
422
+ clients[bin] = path || "not found";
423
+ }
424
+ catch {
425
+ clients[bin] = "not found";
426
+ }
427
+ }
428
+ const recommendations = [];
429
+ if (readiness === "blocked") {
430
+ recommendations.push("Grant missing permissions in System Settings > Privacy & Security, then restart the MCP client.");
431
+ }
432
+ else if (readiness === "degraded") {
433
+ if (nativeHelpers && (!nativeHelpers.cgevent || !nativeHelpers.ocr)) {
434
+ recommendations.push("Run 'npm run build' to compile native Swift helpers.");
435
+ }
436
+ }
437
+ else {
438
+ recommendations.push("All checks passed. MCP client can proceed with automation.");
439
+ }
320
440
  const report = {
321
- ok: permissions.granted && !screenLocked,
441
+ readiness,
442
+ issues: issues.length > 0 ? issues : undefined,
443
+ recommendations,
322
444
  platform: process.platform,
323
445
  node: process.version,
324
446
  permissions,
325
447
  screenLocked,
448
+ nativeHelpers,
449
+ clients,
326
450
  safety: {
327
451
  urlBlocklist: true,
328
452
  lockScreenGuard: process.platform === "darwin",
329
453
  typedTextInjectionScan: true,
330
454
  },
331
455
  stdioCommand: "ucu-mcp",
332
- clients: {
333
- claudeCodeCli: "Run ucu-mcp as an MCP stdio server.",
334
- claudeCodeDesktop: "Configure ucu-mcp as a local MCP stdio server.",
335
- openCode: "Configure ucu-mcp as a local MCP stdio server.",
456
+ metrics: {
457
+ global: metrics.stats(),
458
+ byTool: metrics.byTool(),
336
459
  },
337
460
  };
338
461
  return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
@@ -343,27 +466,56 @@ export function registerTools(server) {
343
466
  return { content: [{ type: "text", text: JSON.stringify({ waited: params.ms }) }] };
344
467
  });
345
468
  registry.register("wait");
346
- registerTool("wait_for_element", "Poll until an accessibility element matching the criteria appears", {
469
+ registerTool("wait_for_element", "Poll until an accessibility element matching the criteria reaches the desired state", {
347
470
  text: z.string().optional().describe("Element text"), role: z.string().optional().describe("Element role"),
348
471
  app: z.string().optional().describe("Target app"),
349
472
  timeout: z.number().optional().describe("Timeout ms (default 5000)"),
350
473
  timeoutMs: z.number().optional().describe("Alias for timeout"),
351
474
  interval: z.number().optional().describe("Poll interval ms (default 500)"),
352
475
  intervalMs: z.number().optional().describe("Alias for interval"),
476
+ until: z.enum(["appear", "disappear", "value_change"]).default("appear").describe("Wait condition: 'appear' (default) waits for a match, 'disappear' waits until no match, 'value_change' waits until first match's value changes"),
353
477
  }, async (params) => {
354
478
  const deadline = Date.now() + (params.timeout ?? params.timeoutMs ?? 5000);
355
479
  const interval = params.interval ?? params.intervalMs ?? 500;
356
- const query = { text: params.text, role: params.role, app: params.app, maxResults: 1 };
480
+ const until = params.until ?? "appear";
481
+ const effectiveApp = params.app || getActiveTarget()?.appName;
482
+ const query = { text: params.text, role: params.role, app: effectiveApp, maxResults: 1 };
357
483
  const { granted } = await checkPermission("accessibility");
358
484
  if (!granted)
359
485
  throw new PermissionError("accessibility", process.platform);
486
+ let initialValue;
487
+ let hasInitial = false;
360
488
  while (Date.now() < deadline) {
361
- const results = await getPlatform().findElement(query);
362
- if (results.length > 0)
363
- return { content: [{ type: "text", text: JSON.stringify({ found: true, element: results[0] }, null, 2) }] };
489
+ const response = await getPlatform().findElement(query);
490
+ const matched = response.results[0];
491
+ if (until === "appear") {
492
+ if (matched)
493
+ return { content: [{ type: "text", text: JSON.stringify({ found: true, element: matched }, null, 2) }] };
494
+ }
495
+ else if (until === "disappear") {
496
+ if (!matched)
497
+ return { content: [{ type: "text", text: JSON.stringify({ found: true, reason: "disappeared" }, null, 2) }] };
498
+ }
499
+ else {
500
+ // value_change: capture the initial value of the first match, then wait for it to differ.
501
+ // A separate `hasInitial` flag is required because the first match's `value` may itself be
502
+ // undefined; using `initialValue === undefined` to mean "not yet captured" would loop
503
+ // forever in that case. On timeout, distinguish "element never appeared" from "value stayed
504
+ // the same" so the model can branch on the result.
505
+ if (matched) {
506
+ if (!hasInitial) {
507
+ initialValue = matched.value;
508
+ hasInitial = true;
509
+ }
510
+ else if (matched.value !== initialValue) {
511
+ return { content: [{ type: "text", text: JSON.stringify({ found: true, oldValue: initialValue, newValue: matched.value }, null, 2) }] };
512
+ }
513
+ }
514
+ }
364
515
  await new Promise(r => setTimeout(r, interval));
365
516
  }
366
- return { content: [{ type: "text", text: JSON.stringify({ found: false, reason: "timeout" }) }] };
517
+ const reason = until === "value_change" ? (hasInitial ? "value_unchanged" : "never_appeared") : "timeout";
518
+ return { content: [{ type: "text", text: JSON.stringify({ found: false, reason }, null, 2) }] };
367
519
  });
368
520
  registry.register("wait_for_element");
369
521
  registerTool("get_cursor_position", "Get current cursor position", {}, async () => {
@@ -392,40 +544,61 @@ export function registerTools(server) {
392
544
  }, async (params) => {
393
545
  const pt = await resolvePoint(params.x, params.y, params.windowId);
394
546
  await withSafety({ action: "move", params: { x: pt.x, y: pt.y }, requiresAccessibility: true, execute: () => getPlatform().move(pt.x, pt.y) });
395
- return actionResponse({ moved: true, x: pt.x, y: pt.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
547
+ return actionResponse("move", { moved: true, x: pt.x, y: pt.y }, { x: pt.x, y: pt.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
396
548
  });
397
549
  registry.register("move");
398
550
  registerTool("find_element", "Find accessibility elements by text, role, or app", {
399
551
  text: z.string().optional().describe("Text to search"), role: z.string().optional().describe("AX role"), app: z.string().optional().describe("Target app"),
400
552
  depth: z.number().optional().describe("AX tree depth"), includeBounds: z.boolean().default(true).describe("Include bounds"), maxResults: z.number().min(1).max(200).default(50).describe("Max results"),
553
+ textMode: z.enum(["contains", "exact", "regex"]).default("contains").describe("Text matching mode: contains (default), exact, or regex"),
554
+ visibleOnly: z.boolean().default(false).describe("Only return elements with valid on-screen bounds"),
555
+ value: z.string().optional().describe("Filter by AX element value (respects textMode)"),
556
+ index: z.number().int().nonnegative().optional().describe("Return only the Nth match (0-based) after all other filtering and sorting"),
557
+ near: z.object({ x: z.number(), y: z.number() }).optional().describe("Sort results by ascending distance to this point and return closest first"),
401
558
  }, async (params) => {
402
- const results = await withSafety({ action: "find_element", params: {}, requiresAccessibility: true,
403
- execute: () => getPlatform().findElement({ text: params.text, role: params.role, app: params.app, depth: params.depth, includeBounds: params.includeBounds, maxResults: params.maxResults }) });
404
- return { content: [{ type: "text", text: JSON.stringify(results, null, 2) }] };
559
+ const effectiveApp = params.app || getActiveTarget()?.appName;
560
+ const response = await withSafety({ action: "find_element", params: {}, requiresAccessibility: true,
561
+ execute: () => getPlatform().findElement({ text: params.text, role: params.role, app: effectiveApp, depth: params.depth, includeBounds: params.includeBounds, maxResults: params.maxResults, textMode: params.textMode, visibleOnly: params.visibleOnly, value: params.value, index: params.index, near: params.near }) });
562
+ return { content: [{ type: "text", text: JSON.stringify({ results: response.results, metrics: response.metrics }, null, 2) }] };
405
563
  });
406
564
  registry.register("find_element");
407
565
  registerTool("click_element", "Click an accessibility element by its ID", {
408
566
  elementId: z.string().describe("AX element identifier"), app: z.string().optional().describe("Target app"), ...captureAfterFields,
409
567
  }, async (params) => {
410
- await withSafety({ action: "click_element", params: {}, requiresAccessibility: true, execute: () => getPlatform().clickElement(params.elementId, params.app) });
411
- return actionResponse({ clicked: true, elementId: params.elementId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
568
+ const effectiveApp = params.app || getActiveTarget()?.appName;
569
+ await withSafety({ action: "click_element", params: {}, requiresAccessibility: true, execute: () => getPlatform().clickElement(params.elementId, effectiveApp) });
570
+ return actionResponse("click_element", { clicked: true, elementId: params.elementId }, { elementId: params.elementId, app: effectiveApp }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
412
571
  });
413
572
  registry.register("click_element");
414
573
  registerTool("set_value", "Set the value of an accessibility element", {
415
574
  elementId: z.string().describe("AX element identifier"), value: z.string().describe("Value to set"), app: z.string().optional().describe("Target app"), ...captureAfterFields,
416
575
  }, async (params) => {
417
- await withSafety({ action: "set_value", params: { value: params.value }, requiresAccessibility: true, execute: () => getPlatform().setElementValue(params.elementId, params.value, params.app) });
418
- return actionResponse({ setValue: true, elementId: params.elementId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
576
+ const effectiveApp = params.app || getActiveTarget()?.appName;
577
+ await withSafety({ action: "set_value", params: { value: params.value }, requiresAccessibility: true, execute: () => getPlatform().setElementValue(params.elementId, params.value, effectiveApp) });
578
+ return actionResponse("set_value", { setValue: true, elementId: params.elementId }, { elementId: params.elementId, app: effectiveApp }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
419
579
  });
420
580
  registry.register("set_value");
421
581
  registerTool("type_in_element", "Type text into an accessibility element, optionally clearing first", {
422
582
  elementId: z.string().describe("AX element identifier"), text: z.string().describe("Text to type"),
423
583
  app: z.string().optional().describe("Target app"), clearFirst: z.boolean().optional().describe("Clear existing text before typing"), ...captureAfterFields,
424
584
  }, async (params) => {
425
- await withSafety({ action: "type_in_element", params: { text: params.text }, requiresAccessibility: true, execute: () => getPlatform().typeInElement(params.elementId, params.text, params.app, params.clearFirst) });
426
- return actionResponse({ typed: true, elementId: params.elementId, charCount: params.text.length }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
585
+ const effectiveApp = params.app || getActiveTarget()?.appName;
586
+ await withSafety({ action: "type_in_element", params: { text: params.text }, requiresAccessibility: true, execute: () => getPlatform().typeInElement(params.elementId, params.text, effectiveApp, params.clearFirst) });
587
+ return actionResponse("type_in_element", { typed: true, elementId: params.elementId, charCount: params.text.length }, { elementId: params.elementId, app: effectiveApp }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
427
588
  });
428
589
  registry.register("type_in_element");
590
+ registerTool("clipboard_read", "Read the current contents of the system clipboard", {}, async () => {
591
+ const text = await withSafety({ action: "clipboard_read", params: {}, execute: () => getPlatform().readClipboard() });
592
+ return { content: [{ type: "text", text: JSON.stringify({ text }, null, 2) }] };
593
+ });
594
+ registry.register("clipboard_read");
595
+ registerTool("clipboard_write", "Write text to the system clipboard (text injection patterns are blocked)", {
596
+ text: z.string().describe("Text to place on the clipboard"),
597
+ }, async (params) => {
598
+ await withSafety({ action: "clipboard_write", params: { text: params.text }, execute: () => getPlatform().writeClipboard(params.text) });
599
+ return { content: [{ type: "text", text: JSON.stringify({ written: true }, null, 2) }] };
600
+ });
601
+ registry.register("clipboard_write");
429
602
  log.info("Registered tools", { count: registry.tools.length, tools: registry.tools.join(", ") });
430
603
  }
431
604
  export class ToolRegistry {
@@ -38,10 +38,12 @@ export interface AppInfo {
38
38
  windowCount: number;
39
39
  }
40
40
  export interface AppTarget {
41
+ targetId: string;
41
42
  appName: string;
42
43
  pid: number;
43
44
  windowId?: string;
44
45
  title?: string;
46
+ capturedAt: string;
45
47
  }
46
48
  export interface BrowserContext {
47
49
  appName: string;
@@ -81,6 +83,17 @@ export interface FindElementOptions {
81
83
  depth?: number;
82
84
  includeBounds?: boolean;
83
85
  maxResults?: number;
86
+ textMode?: "contains" | "exact" | "regex";
87
+ visibleOnly?: boolean;
88
+ /** Match against the AX element's current value attribute (respects textMode). */
89
+ value?: string;
90
+ /** Return only the Nth match (0-based) after all other filtering and sorting. */
91
+ index?: number;
92
+ /** Sort results by ascending distance to this point and return closest first. */
93
+ near?: {
94
+ x: number;
95
+ y: number;
96
+ };
84
97
  }
85
98
  export interface FindElementResult {
86
99
  id: string;
@@ -95,6 +108,16 @@ export interface FindElementResult {
95
108
  };
96
109
  description?: string;
97
110
  }
111
+ export interface FindElementMetrics {
112
+ scannedCount: number;
113
+ matchedCount: number;
114
+ durationMs: number;
115
+ truncated: boolean;
116
+ }
117
+ export interface FindElementResponse {
118
+ results: FindElementResult[];
119
+ metrics: FindElementMetrics;
120
+ }
98
121
  export interface WindowState {
99
122
  window: WindowInfo;
100
123
  focusedElement?: ElementInfo;
@@ -117,11 +140,13 @@ export interface Platform {
117
140
  ocr(display?: number, region?: ScreenRegion): Promise<OcrResult>;
118
141
  type(text: string, delay?: number): Promise<void>;
119
142
  key(keys: string[]): Promise<void>;
120
- findElement(options: FindElementOptions): Promise<FindElementResult[]>;
143
+ findElement(options: FindElementOptions): Promise<FindElementResponse>;
121
144
  clickElement(elementId: string, app?: string): Promise<void>;
122
145
  typeInElement(elementId: string, text: string, app?: string, clearFirst?: boolean): Promise<void>;
123
146
  setElementValue?(elementId: string, value: string, app?: string): Promise<void>;
124
147
  isScreenLocked?(): boolean;
125
148
  saveFocus?(): Promise<void>;
126
149
  restoreFocus?(): Promise<void>;
150
+ readClipboard(): Promise<string>;
151
+ writeClipboard(text: string): Promise<void>;
127
152
  }
@@ -1,4 +1,4 @@
1
- import type { Platform, ScreenRegion, ScreenSize, CursorPosition, WindowInfo, WindowState, OcrResult, FindElementOptions, FindElementResult } from "./base.js";
1
+ import type { Platform, ScreenRegion, ScreenSize, CursorPosition, WindowInfo, WindowState, OcrResult, FindElementOptions, FindElementResponse } from "./base.js";
2
2
  /**
3
3
  * Linux platform adapter (AT-SPI2 + xdotool fallback)
4
4
  * TODO: Implement with D-Bus AT-SPI2 bindings
@@ -16,7 +16,9 @@ export declare class LinuxPlatform implements Platform {
16
16
  type(text: string, delay?: number): Promise<void>;
17
17
  key(keys: string[]): Promise<void>;
18
18
  ocr(_display?: number, _region?: ScreenRegion): Promise<OcrResult>;
19
- findElement(_options: FindElementOptions): Promise<FindElementResult[]>;
19
+ findElement(_options: FindElementOptions): Promise<FindElementResponse>;
20
20
  clickElement(_elementId: string, _app?: string): Promise<void>;
21
21
  typeInElement(_elementId: string, _text: string, _app?: string, _clearFirst?: boolean): Promise<void>;
22
+ readClipboard(): Promise<string>;
23
+ writeClipboard(text: string): Promise<void>;
22
24
  }
@@ -1,3 +1,27 @@
1
+ import { execFileSync } from "node:child_process";
2
+ import { existsSync } from "node:fs";
3
+ import { PlatformError } from "../util/errors.js";
4
+ /** Pick the first available clipboard utility, preferring xclip. */
5
+ function pickClipboardTool() {
6
+     const isAvailable = (candidate) => (candidate.startsWith("/") ? existsSync(candidate) : which(candidate));
7
+     // xclip is probed first, so it is chosen when both utilities are installed.
8
+     if (["/usr/bin/xclip", "/usr/local/bin/xclip", "xclip"].some(isAvailable)) {
9
+         return "xclip";
10
+     }
11
+     if (["/usr/bin/xsel", "/usr/local/bin/xsel", "xsel"].some(isAvailable)) {
12
+         return "xsel";
13
+     }
14
+     return undefined;
15
+ }
16
+ function which(bin) { // Report whether `bin` can be located by the system `which` command.
17
+ try {
18
+ execFileSync("which", [bin], { encoding: "utf-8", timeout: 2000, stdio: "ignore" }); // output is discarded; only success or failure matters
19
+ return true;
20
+ }
21
+ catch { // any failure of the call (including the 2 s timeout) is reported as "not found"
22
+ return false;
23
+ }
24
+ }
1
25
  /**
2
26
  * Linux platform adapter (AT-SPI2 + xdotool fallback)
3
27
  * TODO: Implement with D-Bus AT-SPI2 bindings
@@ -59,4 +83,31 @@ export class LinuxPlatform {
59
83
  async typeInElement(_elementId, _text, _app, _clearFirst) {
60
84
  throw new Error("Not implemented: Linux typeInElement");
61
85
  }
86
+ async readClipboard() { // Return the clipboard text via xclip/xsel; throws PlatformError if no tool is found or the command fails.
87
+     const clipboardBin = pickClipboardTool();
88
+     if (!clipboardBin) {
89
+         throw new PlatformError("readClipboard requires xclip or xsel on PATH", false);
90
+     }
91
+     // Each utility has its own flags for printing the clipboard selection to stdout.
92
+     const outputFlags = clipboardBin === "xclip" ? ["-selection", "clipboard", "-o"] : ["--clipboard", "--output"];
93
+     try {
94
+         return execFileSync(clipboardBin, outputFlags, { encoding: "utf-8", timeout: 5000 });
95
+     }
96
+     catch (error) {
97
+         throw new PlatformError(`read_clipboard failed: ${error.message}`);
98
+     }
99
+ }
100
+ async writeClipboard(text) { // Pipe `text` to xclip/xsel on stdin to replace the clipboard selection; throws PlatformError on failure.
101
+     const tool = pickClipboardTool();
102
+     if (!tool)
103
+         throw new PlatformError("writeClipboard requires xclip or xsel on PATH", false);
104
+     // xclip forks into the background to serve the selection and keeps inherited stdout/stderr open, so discard both or the sync call blocks until the timeout.
105
+     const args = tool === "xclip" ? ["-selection", "clipboard"] : ["--clipboard", "--input"];
106
+     try {
107
+         execFileSync(tool, args, { input: text, encoding: "utf-8", timeout: 5000, stdio: ["pipe", "ignore", "ignore"] });
108
+     }
109
+     catch (error) {
110
+         throw new PlatformError(`write_clipboard failed: ${error.message}`);
111
+     }
112
+ }
62
113
  }
@@ -1,4 +1,4 @@
1
- import type { Platform, ScreenRegion, ScreenSize, CursorPosition, WindowInfo, WindowState, OcrResult, FindElementOptions, FindElementResult, AppInfo, AppTarget, BrowserContext, ScreenshotOptions } from "./base.js";
1
+ import type { Platform, ScreenRegion, ScreenSize, CursorPosition, WindowInfo, WindowState, OcrResult, FindElementOptions, FindElementResponse, AppInfo, AppTarget, BrowserContext, ScreenshotOptions } from "./base.js";
2
2
  export declare class MacOSPlatform implements Platform {
3
3
  private readonly elementCache;
4
4
  private readonly elementCacheTtlMs;
@@ -13,6 +13,8 @@ export declare class MacOSPlatform implements Platform {
13
13
  private evictOverflowCacheEntries;
14
14
  /** Check whether a cached element descriptor has expired. */
15
15
  private isCacheEntryExpired;
16
+ /** Validate that the active target window still exists. */
17
+ validateActiveTarget(): Promise<void>;
16
18
  /** Save the current frontmost app/window so we can restore after an action. */
17
19
  saveFocus(): Promise<void>;
18
20
  /** Restore the previously saved frontmost app/window. */
@@ -36,8 +38,10 @@ export declare class MacOSPlatform implements Platform {
36
38
  private ocrJxa;
37
39
  type(text: string, delay?: number): Promise<void>;
38
40
  key(keys: string[]): Promise<void>;
39
- findElement(options: FindElementOptions): Promise<FindElementResult[]>;
41
+ findElement(options: FindElementOptions): Promise<FindElementResponse>;
40
42
  clickElement(elementId: string, app?: string): Promise<void>;
41
43
  typeInElement(elementId: string, text: string, app?: string, clearFirst?: boolean): Promise<void>;
44
+ readClipboard(): Promise<string>;
45
+ writeClipboard(text: string): Promise<void>;
42
46
  setElementValue(elementId: string, value: string, app?: string): Promise<void>;
43
47
  }