ucu-mcp 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,12 @@
1
1
  /**
2
2
  * Tool registry for UCU-MCP.
3
3
  *
4
- * Registers 22 MCP tools on the server and dispatches each call through
4
+ * Registers 24 MCP tools on the server and dispatches each call through
5
5
  * a shared safety/permission/retry pipeline (`withSafety`).
6
6
  */
7
7
  import { z } from "zod";
8
8
  import { MacOSPlatform } from "../platform/macos.js";
9
- import { SafetyGuard } from "../safety/guard.js";
9
+ import { SafetyGuard, classifyAction } from "../safety/guard.js";
10
10
  import { checkPermission } from "../safety/permissions.js";
11
11
  import { retry } from "../util/retry.js";
12
12
  import { createLogger } from "../util/logger.js";
@@ -20,6 +20,14 @@ function getPlatform() {
20
20
  return _platform;
21
21
  }
22
22
  const safety = new SafetyGuard();
23
+ // Active target context — set by focus_app, used by AX element tools
24
+ let activeTargetContext;
25
+ /**
26
+ * Get the currently active target context (set by focus_app).
27
+ */
28
+ export function getActiveTarget() {
29
+ return activeTargetContext;
30
+ }
23
31
  // User activity monitor — pauses automation when user moves the cursor
24
32
  let lastCursorPos = { x: 0, y: 0 };
25
33
  let userActivityInterval;
@@ -43,6 +51,8 @@ function recoveryHint(code) {
43
51
  switch (code) {
44
52
  case "WINDOW_NOT_FOUND":
45
53
  return "Run list_windows again, then retry with a fresh windowId or omit windowId for screen coordinates.";
54
+ case "TARGET_STALE":
55
+ return "Run focus_app again for the target app, or run list_windows and retry with a fresh windowId.";
46
56
  case "ELEMENT_NOT_FOUND":
47
57
  return "Run find_element again, then retry with a fresh elementId.";
48
58
  case "PERMISSION_DENIED":
@@ -61,28 +71,63 @@ function recoveryHint(code) {
61
71
  return "Inspect the error message, observe the current UI state, and retry only if the operation is safe.";
62
72
  }
63
73
  }
64
- function mcpErrorResponse(error) {
74
+ function errorDetails(error) {
65
75
  const err = error instanceof Error ? error : new Error(String(error));
66
76
  const code = error instanceof UcuError ? error.code : "UNKNOWN_ERROR";
67
77
  const retryable = error instanceof UcuError ? error.retryable : false;
78
+ return {
79
+ name: err.name,
80
+ code,
81
+ retryable,
82
+ message: err.message,
83
+ recovery: recoveryHint(code),
84
+ };
85
+ }
86
+ let _actionCounter = 0;
87
+ function nextActionId() {
88
+ _actionCounter = (_actionCounter + 1) % 1_000_000;
89
+ return `a${Date.now().toString(36)}-${_actionCounter.toString(36)}`;
90
+ }
91
+ function buildActionReceipt(action, status, target, result, captureRequested, captureFormat, captureMaxWidth, captureError, warnings = []) {
92
+ const captureStatus = captureRequested
93
+ ? captureError ? "error" : "ok"
94
+ : "skipped";
95
+ return {
96
+ actionId: nextActionId(),
97
+ action,
98
+ status,
99
+ target,
100
+ result,
101
+ capture: {
102
+ requested: captureRequested,
103
+ status: captureStatus,
104
+ ...(captureFormat && { format: captureFormat }),
105
+ ...(captureMaxWidth && { maxWidth: captureMaxWidth }),
106
+ ...(captureError && { error: captureError }),
107
+ },
108
+ warnings,
109
+ next: captureError
110
+ ? "screenshot"
111
+ : status === "partial"
112
+ ? "get_window_state"
113
+ : "find_element or get_window_state",
114
+ };
115
+ }
116
+ function mcpErrorResponse(error) {
68
117
  return {
69
118
  isError: true,
70
119
  content: [
71
120
  jsonText({
72
- error: {
73
- name: err.name,
74
- code,
75
- retryable,
76
- message: err.message,
77
- recovery: recoveryHint(code),
78
- },
121
+ error: errorDetails(error),
79
122
  }),
80
123
  ],
81
124
  };
82
125
  }
83
- async function actionResponse(result, captureAfter, captureFormat = "jpeg", captureMaxWidth = 1280) {
84
- if (!captureAfter)
85
- return { content: [jsonText(result)] };
126
+ async function actionResponse(action, result, target, captureAfter, captureFormat = "jpeg", captureMaxWidth = 1280, warnings = []) {
127
+ const receipt = buildActionReceipt(action, "ok", target, result, captureAfter ?? false, captureFormat, captureMaxWidth, undefined, warnings);
128
+ if (!captureAfter) {
129
+ return { content: [jsonText(receipt)] };
130
+ }
86
131
  try {
87
132
  const buf = await getPlatform().screenshot(undefined, undefined, {
88
133
  format: captureFormat,
@@ -90,7 +135,7 @@ async function actionResponse(result, captureAfter, captureFormat = "jpeg", capt
90
135
  });
91
136
  return {
92
137
  content: [
93
- jsonText({ actionResult: result }),
138
+ jsonText(receipt),
94
139
  {
95
140
  type: "image",
96
141
  data: buf.toString("base64"),
@@ -99,8 +144,9 @@ async function actionResponse(result, captureAfter, captureFormat = "jpeg", capt
99
144
  ],
100
145
  };
101
146
  }
102
- catch {
103
- return { content: [jsonText(result)] };
147
+ catch (error) {
148
+ const partialReceipt = buildActionReceipt(action, "partial", target, result, true, captureFormat, captureMaxWidth, errorDetails(error), [...warnings, "Post-action screenshot capture failed"]);
149
+ return { content: [jsonText(partialReceipt)] };
104
150
  }
105
151
  }
106
152
  const retryableActions = new Set([
@@ -118,7 +164,9 @@ async function withSafety(sa) {
118
164
  const platform = getPlatform();
119
165
  if (platform.isScreenLocked?.())
120
166
  throw new SafetyError("Screen is locked");
121
- const check = safety.checkAction(sa.action, sa.params);
167
+ const check = safety.checkAction(sa.action, sa.params, {
168
+ skipUserActivityPause: sa.skipUserActivityPause ?? classifyAction(sa.action) === "observe",
169
+ });
122
170
  if (!check.allowed)
123
171
  throw new SafetyError(check.reason ?? "Action blocked by safety guard");
124
172
  if (sa.requiresAccessibility) {
@@ -224,13 +272,15 @@ export function registerTools(server) {
224
272
  app: z.string().describe("Application name to focus"),
225
273
  }, async (params) => {
226
274
  const target = await withSafety({ action: "focus_app", params: {}, requiresAccessibility: true, execute: () => getPlatform().focusApp(params.app) });
275
+ activeTargetContext = target;
227
276
  return { content: [{ type: "text", text: JSON.stringify(target, null, 2) }] };
228
277
  });
229
278
  registry.register("focus_app");
230
279
  registerTool("get_window_state", "Get detailed state of a window including accessibility tree", {
231
280
  windowId: z.string().optional().describe("Window ID"), depth: z.number().optional().describe("AX tree depth"), includeBounds: z.boolean().optional().describe("Include element bounds"),
232
281
  }, async (params) => {
233
- const state = await withSafety({ action: "get_window_state", params: {}, requiresAccessibility: true, execute: () => getPlatform().getWindowState(params.windowId, params.depth, params.includeBounds) });
282
+ const effectiveWindowId = params.windowId || getActiveTarget()?.windowId;
283
+ const state = await withSafety({ action: "get_window_state", params: {}, requiresAccessibility: true, execute: () => getPlatform().getWindowState(effectiveWindowId, params.depth, params.includeBounds) });
234
284
  return { content: [{ type: "text", text: JSON.stringify(state, null, 2) }] };
235
285
  });
236
286
  registry.register("get_window_state");
@@ -242,7 +292,7 @@ export function registerTools(server) {
242
292
  }, async (params) => {
243
293
  const pt = await resolvePoint(params.x, params.y, params.windowId);
244
294
  await withSafety({ action: "click", params: { x: pt.x, y: pt.y }, requiresAccessibility: true, execute: () => getPlatform().click(pt.x, pt.y, params.button) });
245
- return actionResponse({ clicked: true, x: pt.x, y: pt.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
295
+ return actionResponse("click", { clicked: true, x: pt.x, y: pt.y }, { x: pt.x, y: pt.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
246
296
  });
247
297
  registry.register("click");
248
298
  registerTool("double_click", "Double-click at screen coordinates", {
@@ -253,7 +303,7 @@ export function registerTools(server) {
253
303
  }, async (params) => {
254
304
  const pt = await resolvePoint(params.x, params.y, params.windowId);
255
305
  await withSafety({ action: "click", params: { x: pt.x, y: pt.y, doubleClick: true }, requiresAccessibility: true, execute: () => getPlatform().click(pt.x, pt.y, params.button, true) });
256
- return actionResponse({ doubleClicked: true, x: pt.x, y: pt.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
306
+ return actionResponse("double_click", { doubleClicked: true, x: pt.x, y: pt.y }, { x: pt.x, y: pt.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
257
307
  });
258
308
  registry.register("double_click");
259
309
  registerTool("type_text", "Type text at the current cursor position", {
@@ -264,7 +314,7 @@ export function registerTools(server) {
264
314
  if (params.windowId)
265
315
  throw new UnsupportedParameterError("windowId-targeted keyboard typing is not implemented");
266
316
  await withSafety({ action: "type_text", params: { text: params.text }, requiresAccessibility: true, execute: () => getPlatform().type(params.text, params.delay) });
267
- return actionResponse({ typed: true, charCount: params.text.length }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
317
+ return actionResponse("type_text", { typed: true, charCount: params.text.length }, {}, params.captureAfter, params.captureFormat, params.captureMaxWidth);
268
318
  });
269
319
  registry.register("type_text");
270
320
  registerTool("press_key", "Press a keyboard shortcut", {
@@ -283,7 +333,7 @@ export function registerTools(server) {
283
333
  if (keys.length === 0)
284
334
  throw new UnsupportedParameterError("press_key requires at least one key");
285
335
  await withSafety({ action: "press_key", params: { keys }, requiresAccessibility: true, execute: () => getPlatform().key(keys) });
286
- return actionResponse({ pressed: true, keys }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
336
+ return actionResponse("press_key", { pressed: true, keys }, {}, params.captureAfter, params.captureFormat, params.captureMaxWidth);
287
337
  });
288
338
  registry.register("press_key");
289
339
  registerTool("scroll", "Scroll at coordinates", {
@@ -295,7 +345,7 @@ export function registerTools(server) {
295
345
  const pt = await resolvePoint(params.x, params.y, params.windowId);
296
346
  const deltaX = params.deltaX ?? 0;
297
347
  await withSafety({ action: "scroll", params: { x: pt.x, y: pt.y }, requiresAccessibility: true, execute: () => getPlatform().scroll(pt.x, pt.y, deltaX, params.deltaY) });
298
- return actionResponse({ scrolled: true, x: pt.x, y: pt.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
348
+ return actionResponse("scroll", { scrolled: true, x: pt.x, y: pt.y }, { x: pt.x, y: pt.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
299
349
  });
300
350
  registry.register("scroll");
301
351
  registerTool("drag", "Drag from one point to another", {
@@ -309,31 +359,97 @@ export function registerTools(server) {
309
359
  const start = await resolvePoint(params.startX, params.startY, params.windowId);
310
360
  const end = await resolvePoint(params.endX, params.endY, params.windowId);
311
361
  await withSafety({ action: "drag", params: { startX: start.x, startY: start.y, endX: end.x, endY: end.y }, requiresAccessibility: true, execute: () => getPlatform().drag(start.x, start.y, end.x, end.y, params.button, params.duration) });
312
- return actionResponse({ dragged: true, startX: start.x, startY: start.y, endX: end.x, endY: end.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
362
+ return actionResponse("drag", { dragged: true, startX: start.x, startY: start.y, endX: end.x, endY: end.y }, { startX: start.x, startY: start.y, endX: end.x, endY: end.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
313
363
  });
314
364
  registry.register("drag");
315
- registerTool("doctor", "Check system permissions and diagnose common issues", {}, async () => {
365
+ registerTool("doctor", "Check system permissions, native helpers, and client readiness", {}, async () => {
316
366
  const { checkPermissions } = await import("../safety/permissions.js");
317
367
  const { MacOSPlatform: MacPlat } = await import("../platform/macos.js");
368
+ const { existsSync } = await import("node:fs");
369
+ const { join, dirname } = await import("node:path");
370
+ const { fileURLToPath } = await import("node:url");
371
+ const { execFileSync } = await import("node:child_process");
318
372
  const permissions = await checkPermissions();
319
373
  const screenLocked = process.platform === "darwin" ? new MacPlat().isScreenLocked?.() ?? false : false;
374
+ let nativeHelpers;
375
+ if (process.platform === "darwin") {
376
+ const moduleDir = dirname(fileURLToPath(import.meta.url));
377
+ const checkPaths = (subdirs) => {
378
+ const paths = [
379
+ join(process.cwd(), ...subdirs),
380
+ join(moduleDir, "..", ...subdirs),
381
+ join(moduleDir, "..", "..", ...subdirs),
382
+ ];
383
+ return paths.some(p => { try {
384
+ return existsSync(p);
385
+ }
386
+ catch {
387
+ return false;
388
+ } });
389
+ };
390
+ nativeHelpers = {
391
+ cgevent: checkPaths(["native", "cgevent", "cgevent-helper"]),
392
+ ocr: checkPaths(["native", "ocr", "ocr-helper"]),
393
+ };
394
+ }
395
+ let readiness = "ready";
396
+ const issues = [];
397
+ if (!permissions.granted) {
398
+ readiness = "blocked";
399
+ issues.push("Missing macOS permissions: " + permissions.missing.join(", "));
400
+ }
401
+ if (screenLocked) {
402
+ readiness = "blocked";
403
+ issues.push("Screen is locked");
404
+ }
405
+ if (process.platform === "darwin" && nativeHelpers) {
406
+ if (!nativeHelpers.cgevent) {
407
+ readiness = readiness === "ready" ? "degraded" : readiness;
408
+ issues.push("Native CGEvent helper not found (input synthesis may crash on macOS Sequoia+)");
409
+ }
410
+ if (!nativeHelpers.ocr) {
411
+ readiness = readiness === "ready" ? "degraded" : readiness;
412
+ issues.push("Native OCR helper not found (OCR may fail on macOS Sequoia+)");
413
+ }
414
+ }
415
+ const clients = {};
416
+ for (const bin of ["claude", "codex", "opencode", "npx"]) {
417
+ try {
418
+ const path = execFileSync("which", [bin], { encoding: "utf-8", timeout: 2000 }).trim();
419
+ clients[bin] = path || "not found";
420
+ }
421
+ catch {
422
+ clients[bin] = "not found";
423
+ }
424
+ }
425
+ const recommendations = [];
426
+ if (readiness === "blocked") {
427
+ recommendations.push("Grant missing permissions in System Settings > Privacy & Security, then restart the MCP client.");
428
+ }
429
+ else if (readiness === "degraded") {
430
+ if (nativeHelpers && (!nativeHelpers.cgevent || !nativeHelpers.ocr)) {
431
+ recommendations.push("Run 'npm run build' to compile native Swift helpers.");
432
+ }
433
+ }
434
+ else {
435
+ recommendations.push("All checks passed. MCP client can proceed with automation.");
436
+ }
320
437
  const report = {
321
- ok: permissions.granted && !screenLocked,
438
+ readiness,
439
+ issues: issues.length > 0 ? issues : undefined,
440
+ recommendations,
322
441
  platform: process.platform,
323
442
  node: process.version,
324
443
  permissions,
325
444
  screenLocked,
445
+ nativeHelpers,
446
+ clients,
326
447
  safety: {
327
448
  urlBlocklist: true,
328
449
  lockScreenGuard: process.platform === "darwin",
329
450
  typedTextInjectionScan: true,
330
451
  },
331
452
  stdioCommand: "ucu-mcp",
332
- clients: {
333
- claudeCodeCli: "Run ucu-mcp as an MCP stdio server.",
334
- claudeCodeDesktop: "Configure ucu-mcp as a local MCP stdio server.",
335
- openCode: "Configure ucu-mcp as a local MCP stdio server.",
336
- },
337
453
  };
338
454
  return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
339
455
  });
@@ -343,27 +459,49 @@ export function registerTools(server) {
343
459
  return { content: [{ type: "text", text: JSON.stringify({ waited: params.ms }) }] };
344
460
  });
345
461
  registry.register("wait");
346
- registerTool("wait_for_element", "Poll until an accessibility element matching the criteria appears", {
462
+ registerTool("wait_for_element", "Poll until an accessibility element matching the criteria reaches the desired state", {
347
463
  text: z.string().optional().describe("Element text"), role: z.string().optional().describe("Element role"),
348
464
  app: z.string().optional().describe("Target app"),
349
465
  timeout: z.number().optional().describe("Timeout ms (default 5000)"),
350
466
  timeoutMs: z.number().optional().describe("Alias for timeout"),
351
467
  interval: z.number().optional().describe("Poll interval ms (default 500)"),
352
468
  intervalMs: z.number().optional().describe("Alias for interval"),
469
+ until: z.enum(["appear", "disappear", "value_change"]).default("appear").describe("Wait condition: 'appear' (default) waits for a match, 'disappear' waits until no match, 'value_change' waits until first match's value changes"),
353
470
  }, async (params) => {
354
471
  const deadline = Date.now() + (params.timeout ?? params.timeoutMs ?? 5000);
355
472
  const interval = params.interval ?? params.intervalMs ?? 500;
356
- const query = { text: params.text, role: params.role, app: params.app, maxResults: 1 };
473
+ const until = params.until ?? "appear";
474
+ const effectiveApp = params.app || getActiveTarget()?.appName;
475
+ const query = { text: params.text, role: params.role, app: effectiveApp, maxResults: 1 };
357
476
  const { granted } = await checkPermission("accessibility");
358
477
  if (!granted)
359
478
  throw new PermissionError("accessibility", process.platform);
479
+ let initialValue;
360
480
  while (Date.now() < deadline) {
361
- const results = await getPlatform().findElement(query);
362
- if (results.length > 0)
363
- return { content: [{ type: "text", text: JSON.stringify({ found: true, element: results[0] }, null, 2) }] };
481
+ const response = await getPlatform().findElement(query);
482
+ const matched = response.results[0];
483
+ if (until === "appear") {
484
+ if (matched)
485
+ return { content: [{ type: "text", text: JSON.stringify({ found: true, element: matched }, null, 2) }] };
486
+ }
487
+ else if (until === "disappear") {
488
+ if (!matched)
489
+ return { content: [{ type: "text", text: JSON.stringify({ found: true, reason: "disappeared" }, null, 2) }] };
490
+ }
491
+ else {
492
+ // value_change: capture the initial value of the first match, then wait for it to differ
493
+ if (matched) {
494
+ if (initialValue === undefined) {
495
+ initialValue = matched.value;
496
+ }
497
+ else if (matched.value !== initialValue) {
498
+ return { content: [{ type: "text", text: JSON.stringify({ found: true, oldValue: initialValue, newValue: matched.value }, null, 2) }] };
499
+ }
500
+ }
501
+ }
364
502
  await new Promise(r => setTimeout(r, interval));
365
503
  }
366
- return { content: [{ type: "text", text: JSON.stringify({ found: false, reason: "timeout" }) }] };
504
+ return { content: [{ type: "text", text: JSON.stringify({ found: false, reason: "timeout" }, null, 2) }] };
367
505
  });
368
506
  registry.register("wait_for_element");
369
507
  registerTool("get_cursor_position", "Get current cursor position", {}, async () => {
@@ -392,40 +530,61 @@ export function registerTools(server) {
392
530
  }, async (params) => {
393
531
  const pt = await resolvePoint(params.x, params.y, params.windowId);
394
532
  await withSafety({ action: "move", params: { x: pt.x, y: pt.y }, requiresAccessibility: true, execute: () => getPlatform().move(pt.x, pt.y) });
395
- return actionResponse({ moved: true, x: pt.x, y: pt.y }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
533
+ return actionResponse("move", { moved: true, x: pt.x, y: pt.y }, { x: pt.x, y: pt.y, windowId: params.windowId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
396
534
  });
397
535
  registry.register("move");
398
536
  registerTool("find_element", "Find accessibility elements by text, role, or app", {
399
537
  text: z.string().optional().describe("Text to search"), role: z.string().optional().describe("AX role"), app: z.string().optional().describe("Target app"),
400
538
  depth: z.number().optional().describe("AX tree depth"), includeBounds: z.boolean().default(true).describe("Include bounds"), maxResults: z.number().min(1).max(200).default(50).describe("Max results"),
539
+ textMode: z.enum(["contains", "exact", "regex"]).default("contains").describe("Text matching mode: contains (default), exact, or regex"),
540
+ visibleOnly: z.boolean().default(false).describe("Only return elements with valid on-screen bounds"),
541
+ value: z.string().optional().describe("Filter by AX element value (respects textMode)"),
542
+ index: z.number().int().nonnegative().optional().describe("Return only the Nth match (0-based) after all other filtering and sorting"),
543
+ near: z.object({ x: z.number(), y: z.number() }).optional().describe("Sort results by ascending distance to this point and return closest first"),
401
544
  }, async (params) => {
402
- const results = await withSafety({ action: "find_element", params: {}, requiresAccessibility: true,
403
- execute: () => getPlatform().findElement({ text: params.text, role: params.role, app: params.app, depth: params.depth, includeBounds: params.includeBounds, maxResults: params.maxResults }) });
404
- return { content: [{ type: "text", text: JSON.stringify(results, null, 2) }] };
545
+ const effectiveApp = params.app || getActiveTarget()?.appName;
546
+ const response = await withSafety({ action: "find_element", params: {}, requiresAccessibility: true,
547
+ execute: () => getPlatform().findElement({ text: params.text, role: params.role, app: effectiveApp, depth: params.depth, includeBounds: params.includeBounds, maxResults: params.maxResults, textMode: params.textMode, visibleOnly: params.visibleOnly, value: params.value, index: params.index, near: params.near }) });
548
+ return { content: [{ type: "text", text: JSON.stringify({ results: response.results, metrics: response.metrics }, null, 2) }] };
405
549
  });
406
550
  registry.register("find_element");
407
551
  registerTool("click_element", "Click an accessibility element by its ID", {
408
552
  elementId: z.string().describe("AX element identifier"), app: z.string().optional().describe("Target app"), ...captureAfterFields,
409
553
  }, async (params) => {
410
- await withSafety({ action: "click_element", params: {}, requiresAccessibility: true, execute: () => getPlatform().clickElement(params.elementId, params.app) });
411
- return actionResponse({ clicked: true, elementId: params.elementId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
554
+ const effectiveApp = params.app || getActiveTarget()?.appName;
555
+ await withSafety({ action: "click_element", params: {}, requiresAccessibility: true, execute: () => getPlatform().clickElement(params.elementId, effectiveApp) });
556
+ return actionResponse("click_element", { clicked: true, elementId: params.elementId }, { elementId: params.elementId, app: effectiveApp }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
412
557
  });
413
558
  registry.register("click_element");
414
559
  registerTool("set_value", "Set the value of an accessibility element", {
415
560
  elementId: z.string().describe("AX element identifier"), value: z.string().describe("Value to set"), app: z.string().optional().describe("Target app"), ...captureAfterFields,
416
561
  }, async (params) => {
417
- await withSafety({ action: "set_value", params: { value: params.value }, requiresAccessibility: true, execute: () => getPlatform().setElementValue(params.elementId, params.value, params.app) });
418
- return actionResponse({ setValue: true, elementId: params.elementId }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
562
+ const effectiveApp = params.app || getActiveTarget()?.appName;
563
+ await withSafety({ action: "set_value", params: { value: params.value }, requiresAccessibility: true, execute: () => getPlatform().setElementValue(params.elementId, params.value, effectiveApp) });
564
+ return actionResponse("set_value", { setValue: true, elementId: params.elementId }, { elementId: params.elementId, app: effectiveApp }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
419
565
  });
420
566
  registry.register("set_value");
421
567
  registerTool("type_in_element", "Type text into an accessibility element, optionally clearing first", {
422
568
  elementId: z.string().describe("AX element identifier"), text: z.string().describe("Text to type"),
423
569
  app: z.string().optional().describe("Target app"), clearFirst: z.boolean().optional().describe("Clear existing text before typing"), ...captureAfterFields,
424
570
  }, async (params) => {
425
- await withSafety({ action: "type_in_element", params: { text: params.text }, requiresAccessibility: true, execute: () => getPlatform().typeInElement(params.elementId, params.text, params.app, params.clearFirst) });
426
- return actionResponse({ typed: true, elementId: params.elementId, charCount: params.text.length }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
571
+ const effectiveApp = params.app || getActiveTarget()?.appName;
572
+ await withSafety({ action: "type_in_element", params: { text: params.text }, requiresAccessibility: true, execute: () => getPlatform().typeInElement(params.elementId, params.text, effectiveApp, params.clearFirst) });
573
+ return actionResponse("type_in_element", { typed: true, elementId: params.elementId, charCount: params.text.length }, { elementId: params.elementId, app: effectiveApp }, params.captureAfter, params.captureFormat, params.captureMaxWidth);
427
574
  });
428
575
  registry.register("type_in_element");
576
+ registerTool("clipboard_read", "Read the current contents of the system clipboard", {}, async () => {
577
+ const text = await withSafety({ action: "clipboard_read", params: {}, execute: () => getPlatform().readClipboard() });
578
+ return { content: [{ type: "text", text: JSON.stringify({ text }, null, 2) }] };
579
+ });
580
+ registry.register("clipboard_read");
581
+ registerTool("clipboard_write", "Write text to the system clipboard (text injection patterns are blocked)", {
582
+ text: z.string().describe("Text to place on the clipboard"),
583
+ }, async (params) => {
584
+ await withSafety({ action: "clipboard_write", params: { text: params.text }, execute: () => getPlatform().writeClipboard(params.text) });
585
+ return { content: [{ type: "text", text: JSON.stringify({ written: true }, null, 2) }] };
586
+ });
587
+ registry.register("clipboard_write");
429
588
  log.info("Registered tools", { count: registry.tools.length, tools: registry.tools.join(", ") });
430
589
  }
431
590
  export class ToolRegistry {
@@ -38,10 +38,12 @@ export interface AppInfo {
38
38
  windowCount: number;
39
39
  }
40
40
  export interface AppTarget {
41
+ targetId: string;
41
42
  appName: string;
42
43
  pid: number;
43
44
  windowId?: string;
44
45
  title?: string;
46
+ capturedAt: string;
45
47
  }
46
48
  export interface BrowserContext {
47
49
  appName: string;
@@ -81,6 +83,17 @@ export interface FindElementOptions {
81
83
  depth?: number;
82
84
  includeBounds?: boolean;
83
85
  maxResults?: number;
86
+ textMode?: "contains" | "exact" | "regex";
87
+ visibleOnly?: boolean;
88
+ /** Match against the AX element's current value attribute (respects textMode). */
89
+ value?: string;
90
+ /** Return only the Nth match (0-based) after all other filtering and sorting. */
91
+ index?: number;
92
+ /** Sort results by ascending distance to this point and return closest first. */
93
+ near?: {
94
+ x: number;
95
+ y: number;
96
+ };
84
97
  }
85
98
  export interface FindElementResult {
86
99
  id: string;
@@ -95,6 +108,16 @@ export interface FindElementResult {
95
108
  };
96
109
  description?: string;
97
110
  }
111
+ export interface FindElementMetrics {
112
+ scannedCount: number;
113
+ matchedCount: number;
114
+ durationMs: number;
115
+ truncated: boolean;
116
+ }
117
+ export interface FindElementResponse {
118
+ results: FindElementResult[];
119
+ metrics: FindElementMetrics;
120
+ }
98
121
  export interface WindowState {
99
122
  window: WindowInfo;
100
123
  focusedElement?: ElementInfo;
@@ -117,11 +140,13 @@ export interface Platform {
117
140
  ocr(display?: number, region?: ScreenRegion): Promise<OcrResult>;
118
141
  type(text: string, delay?: number): Promise<void>;
119
142
  key(keys: string[]): Promise<void>;
120
- findElement(options: FindElementOptions): Promise<FindElementResult[]>;
143
+ findElement(options: FindElementOptions): Promise<FindElementResponse>;
121
144
  clickElement(elementId: string, app?: string): Promise<void>;
122
145
  typeInElement(elementId: string, text: string, app?: string, clearFirst?: boolean): Promise<void>;
123
146
  setElementValue?(elementId: string, value: string, app?: string): Promise<void>;
124
147
  isScreenLocked?(): boolean;
125
148
  saveFocus?(): Promise<void>;
126
149
  restoreFocus?(): Promise<void>;
150
+ readClipboard(): Promise<string>;
151
+ writeClipboard(text: string): Promise<void>;
127
152
  }
@@ -1,4 +1,4 @@
1
- import type { Platform, ScreenRegion, ScreenSize, CursorPosition, WindowInfo, WindowState, OcrResult, FindElementOptions, FindElementResult } from "./base.js";
1
+ import type { Platform, ScreenRegion, ScreenSize, CursorPosition, WindowInfo, WindowState, OcrResult, FindElementOptions, FindElementResponse } from "./base.js";
2
2
  /**
3
3
  * Linux platform adapter (AT-SPI2 + xdotool fallback)
4
4
  * TODO: Implement with D-Bus AT-SPI2 bindings
@@ -16,7 +16,9 @@ export declare class LinuxPlatform implements Platform {
16
16
  type(text: string, delay?: number): Promise<void>;
17
17
  key(keys: string[]): Promise<void>;
18
18
  ocr(_display?: number, _region?: ScreenRegion): Promise<OcrResult>;
19
- findElement(_options: FindElementOptions): Promise<FindElementResult[]>;
19
+ findElement(_options: FindElementOptions): Promise<FindElementResponse>;
20
20
  clickElement(_elementId: string, _app?: string): Promise<void>;
21
21
  typeInElement(_elementId: string, _text: string, _app?: string, _clearFirst?: boolean): Promise<void>;
22
+ readClipboard(): Promise<string>;
23
+ writeClipboard(text: string): Promise<void>;
22
24
  }
@@ -1,3 +1,27 @@
1
+ import { execFileSync } from "node:child_process";
2
+ import { existsSync } from "node:fs";
3
+ import { PlatformError } from "../util/errors.js";
4
+ /** Pick the first available clipboard utility, preferring xclip. */
5
+ function pickClipboardTool() {
6
+ for (const bin of ["/usr/bin/xclip", "/usr/local/bin/xclip", "xclip"]) {
7
+ if (bin.startsWith("/") ? existsSync(bin) : which(bin))
8
+ return "xclip";
9
+ }
10
+ for (const bin of ["/usr/bin/xsel", "/usr/local/bin/xsel", "xsel"]) {
11
+ if (bin.startsWith("/") ? existsSync(bin) : which(bin))
12
+ return "xsel";
13
+ }
14
+ return undefined;
15
+ }
16
+ function which(bin) {
17
+ try {
18
+ execFileSync("which", [bin], { encoding: "utf-8", timeout: 2000, stdio: "ignore" });
19
+ return true;
20
+ }
21
+ catch {
22
+ return false;
23
+ }
24
+ }
1
25
  /**
2
26
  * Linux platform adapter (AT-SPI2 + xdotool fallback)
3
27
  * TODO: Implement with D-Bus AT-SPI2 bindings
@@ -59,4 +83,31 @@ export class LinuxPlatform {
59
83
  async typeInElement(_elementId, _text, _app, _clearFirst) {
60
84
  throw new Error("Not implemented: Linux typeInElement");
61
85
  }
86
+ async readClipboard() {
87
+ const tool = pickClipboardTool();
88
+ if (!tool) {
89
+ throw new PlatformError("readClipboard requires xclip or xsel on PATH", false);
90
+ }
91
+ try {
92
+ const args = tool === "xclip" ? ["-selection", "clipboard", "-o"] : ["--clipboard", "--output"];
93
+ const out = execFileSync(tool, args, { encoding: "utf-8", timeout: 5000 });
94
+ return out;
95
+ }
96
+ catch (error) {
97
+ throw new PlatformError(`read_clipboard failed: ${error.message}`);
98
+ }
99
+ }
100
+ async writeClipboard(text) {
101
+ const tool = pickClipboardTool();
102
+ if (!tool) {
103
+ throw new PlatformError("writeClipboard requires xclip or xsel on PATH", false);
104
+ }
105
+ try {
106
+ const args = tool === "xclip" ? ["-selection", "clipboard"] : ["--clipboard", "--input"];
107
+ execFileSync(tool, args, { input: text, encoding: "utf-8", timeout: 5000 });
108
+ }
109
+ catch (error) {
110
+ throw new PlatformError(`write_clipboard failed: ${error.message}`);
111
+ }
112
+ }
62
113
  }
@@ -1,4 +1,4 @@
1
- import type { Platform, ScreenRegion, ScreenSize, CursorPosition, WindowInfo, WindowState, OcrResult, FindElementOptions, FindElementResult, AppInfo, AppTarget, BrowserContext, ScreenshotOptions } from "./base.js";
1
+ import type { Platform, ScreenRegion, ScreenSize, CursorPosition, WindowInfo, WindowState, OcrResult, FindElementOptions, FindElementResponse, AppInfo, AppTarget, BrowserContext, ScreenshotOptions } from "./base.js";
2
2
  export declare class MacOSPlatform implements Platform {
3
3
  private readonly elementCache;
4
4
  private readonly elementCacheTtlMs;
@@ -13,6 +13,8 @@ export declare class MacOSPlatform implements Platform {
13
13
  private evictOverflowCacheEntries;
14
14
  /** Check whether a cached element descriptor has expired. */
15
15
  private isCacheEntryExpired;
16
+ /** Validate that the active target window still exists. */
17
+ validateActiveTarget(): Promise<void>;
16
18
  /** Save the current frontmost app/window so we can restore after an action. */
17
19
  saveFocus(): Promise<void>;
18
20
  /** Restore the previously saved frontmost app/window. */
@@ -36,8 +38,10 @@ export declare class MacOSPlatform implements Platform {
36
38
  private ocrJxa;
37
39
  type(text: string, delay?: number): Promise<void>;
38
40
  key(keys: string[]): Promise<void>;
39
- findElement(options: FindElementOptions): Promise<FindElementResult[]>;
41
+ findElement(options: FindElementOptions): Promise<FindElementResponse>;
40
42
  clickElement(elementId: string, app?: string): Promise<void>;
41
43
  typeInElement(elementId: string, text: string, app?: string, clearFirst?: boolean): Promise<void>;
44
+ readClipboard(): Promise<string>;
45
+ writeClipboard(text: string): Promise<void>;
42
46
  setElementValue(elementId: string, value: string, app?: string): Promise<void>;
43
47
  }