screenhand 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. package/.claude/commands/automate.md +28 -0
  2. package/.claude/commands/debug-ui.md +19 -0
  3. package/.claude/commands/screenshot.md +15 -0
  4. package/.github/FUNDING.yml +1 -0
  5. package/.github/ISSUE_TEMPLATE/bug_report.md +27 -0
  6. package/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  7. package/.mcp.json +8 -0
  8. package/DESKTOP_MCP_GUIDE.md +92 -0
  9. package/LICENSE +661 -21
  10. package/README.md +97 -292
  11. package/SECURITY.md +44 -0
  12. package/docs/architecture.md +47 -0
  13. package/install-skills.sh +19 -0
  14. package/mcp-bridge.ts +271 -0
  15. package/mcp-desktop.ts +1221 -0
  16. package/native/macos-bridge/Package.swift +21 -0
  17. package/native/macos-bridge/Sources/AccessibilityBridge.swift +261 -0
  18. package/native/macos-bridge/Sources/AppManagement.swift +129 -0
  19. package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +242 -0
  20. package/native/macos-bridge/Sources/ObserverBridge.swift +120 -0
  21. package/native/macos-bridge/Sources/VisionBridge.swift +80 -0
  22. package/native/macos-bridge/Sources/main.swift +345 -0
  23. package/native/windows-bridge/AppManagement.cs +234 -0
  24. package/native/windows-bridge/InputBridge.cs +436 -0
  25. package/native/windows-bridge/Program.cs +265 -0
  26. package/native/windows-bridge/ScreenCapture.cs +329 -0
  27. package/native/windows-bridge/UIAutomationBridge.cs +571 -0
  28. package/native/windows-bridge/WindowsBridge.csproj +17 -0
  29. package/package.json +3 -14
  30. package/playbooks/devpost.json +186 -0
  31. package/playbooks/instagram.json +41 -0
  32. package/playbooks/instagram_v2.json +201 -0
  33. package/playbooks/x_v1.json +211 -0
  34. package/scripts/devpost-live-loop.mjs +421 -0
  35. package/src/config.ts +30 -0
  36. package/src/index.ts +92 -0
  37. package/src/logging/timeline-logger.ts +55 -0
  38. package/src/mcp/server.ts +449 -0
  39. package/src/memory/recall.ts +191 -0
  40. package/src/memory/research.ts +146 -0
  41. package/src/memory/seeds.ts +123 -0
  42. package/src/memory/session.ts +201 -0
  43. package/src/memory/store.ts +434 -0
  44. package/src/memory/types.ts +69 -0
  45. package/src/native/bridge-client.ts +239 -0
  46. package/src/native/macos-bridge-client.ts +22 -0
  47. package/src/runtime/accessibility-adapter.ts +487 -0
  48. package/src/runtime/app-adapter.ts +169 -0
  49. package/src/runtime/applescript-adapter.ts +376 -0
  50. package/src/runtime/ax-role-map.ts +102 -0
  51. package/src/runtime/browser-adapter.ts +129 -0
  52. package/src/runtime/cdp-chrome-adapter.ts +676 -0
  53. package/src/runtime/composite-adapter.ts +274 -0
  54. package/src/runtime/executor.ts +396 -0
  55. package/src/runtime/locator-cache.ts +33 -0
  56. package/src/runtime/planning-loop.ts +81 -0
  57. package/src/runtime/service.ts +448 -0
  58. package/src/runtime/session-manager.ts +50 -0
  59. package/src/runtime/state-observer.ts +136 -0
  60. package/src/runtime/vision-adapter.ts +297 -0
  61. package/src/types.ts +297 -0
  62. package/tests/bridge-client.test.ts +176 -0
  63. package/tests/browser-stealth.test.ts +210 -0
  64. package/tests/composite-adapter.test.ts +64 -0
  65. package/tests/mcp-server.test.ts +151 -0
  66. package/tests/memory-recall.test.ts +339 -0
  67. package/tests/memory-research.test.ts +159 -0
  68. package/tests/memory-seeds.test.ts +120 -0
  69. package/tests/memory-store.test.ts +392 -0
  70. package/tests/types.test.ts +92 -0
  71. package/tsconfig.check.json +17 -0
  72. package/tsconfig.json +19 -0
  73. package/vitest.config.ts +8 -0
  74. package/dist/config.js +0 -9
  75. package/dist/index.js +0 -55
  76. package/dist/logging/timeline-logger.js +0 -29
  77. package/dist/mcp/mcp-stdio-server.js +0 -284
  78. package/dist/mcp/server.js +0 -347
  79. package/dist/mcp-entry.js +0 -62
  80. package/dist/memory/recall.js +0 -160
  81. package/dist/memory/research.js +0 -98
  82. package/dist/memory/seeds.js +0 -89
  83. package/dist/memory/session.js +0 -161
  84. package/dist/memory/store.js +0 -391
  85. package/dist/memory/types.js +0 -4
  86. package/dist/native/bridge-client.js +0 -173
  87. package/dist/native/macos-bridge-client.js +0 -5
  88. package/dist/runtime/accessibility-adapter.js +0 -377
  89. package/dist/runtime/app-adapter.js +0 -48
  90. package/dist/runtime/applescript-adapter.js +0 -283
  91. package/dist/runtime/ax-role-map.js +0 -80
  92. package/dist/runtime/browser-adapter.js +0 -36
  93. package/dist/runtime/cdp-chrome-adapter.js +0 -505
  94. package/dist/runtime/composite-adapter.js +0 -205
  95. package/dist/runtime/executor.js +0 -250
  96. package/dist/runtime/locator-cache.js +0 -12
  97. package/dist/runtime/planning-loop.js +0 -47
  98. package/dist/runtime/service.js +0 -372
  99. package/dist/runtime/session-manager.js +0 -28
  100. package/dist/runtime/state-observer.js +0 -105
  101. package/dist/runtime/vision-adapter.js +0 -208
  102. package/dist/test-mcp-protocol.js +0 -138
  103. package/dist/types.js +0 -1
@@ -0,0 +1,92 @@
1
+ import { describe, it, expect, expectTypeOf } from "vitest";
2
+ import type {
3
+ Target,
4
+ WaitCondition,
5
+ ToolName,
6
+ UIEventType,
7
+ SessionInfo,
8
+ AppContext,
9
+ LocatedElement,
10
+ WindowInfo,
11
+ RunningApp,
12
+ } from "../src/types.js";
13
+
14
+ /**
15
+ * Type-level tests to ensure the type definitions compile correctly
16
+ * and cover expected shapes. These catch regressions in the public API.
17
+ */
18
+
19
// Compile-time shape checks for the public types. Each case builds literals
// annotated with the imported type, so a breaking change to a type surfaces as
// a type error; the runtime assertions only keep the test runner satisfied.
describe("Type definitions", () => {
  it("Target union covers all locator strategies", () => {
    const selectorTarget: Target = { type: "selector", value: "button.submit" };
    const textTarget: Target = { type: "text", value: "Click me", exact: true };
    const roleTarget: Target = { type: "role", role: "button", name: "Submit" };
    const axPathTarget: Target = { type: "ax_path", path: ["0", "1", "3"] };
    const axAttrTarget: Target = { type: "ax_attribute", attribute: "AXIdentifier", value: "btn1" };
    const coordTarget: Target = { type: "coordinates", x: 100, y: 200 };
    const imageTarget: Target = { type: "image", base64: "abc123", confidence: 0.9 };

    // All should compile and be assignable
    const targets: Target[] = [
      selectorTarget, textTarget, roleTarget,
      axPathTarget, axAttrTarget, coordTarget, imageTarget,
    ];
    expect(targets).toHaveLength(7);
  });

  it("WaitCondition covers all condition types", () => {
    const conditions: WaitCondition[] = [
      { type: "selector_visible", selector: ".loaded" },
      { type: "selector_hidden", selector: ".spinner" },
      { type: "url_matches", regex: "https://.*" },
      { type: "text_appears", text: "Success" },
      { type: "element_exists", target: { type: "text", value: "OK" } },
      { type: "element_gone", target: { type: "text", value: "Loading" } },
      { type: "window_title_matches", regex: "Untitled" },
    ];
    // Pin the exact count (as the Target test does) so that dropping an entry
    // from the list is noticed; "greater than zero" could never fail here.
    expect(conditions).toHaveLength(7);
  });

  it("SessionInfo has required fields", () => {
    const session: SessionInfo = {
      sessionId: "test-123",
      profile: "default",
      createdAt: Date.now(),
      adapterType: "composite",
    };
    expect(session.sessionId).toBe("test-123");
  });

  it("AppContext has required fields", () => {
    const ctx: AppContext = {
      bundleId: "com.apple.Notes",
      appName: "Notes",
      pid: 1234,
      windowTitle: "My Note",
    };
    expect(ctx.pid).toBe(1234);
  });

  it("WindowInfo has required fields", () => {
    const win: WindowInfo = {
      windowId: 42,
      title: "Test Window",
      appName: "TestApp",
      bundleId: "com.test.app",
      pid: 5678,
      bounds: { x: 0, y: 0, width: 800, height: 600 },
      isOnScreen: true,
    };
    expect(win.windowId).toBe(42);
  });

  it("RunningApp has required fields", () => {
    const app: RunningApp = {
      name: "Finder",
      bundleId: "com.apple.finder",
      pid: 100,
      isActive: true,
    };
    expect(app.name).toBe("Finder");
  });
});
@@ -0,0 +1,17 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "NodeNext",
5
+ "moduleResolution": "NodeNext",
6
+ "lib": ["ES2022"],
7
+ "strict": true,
8
+ "noUncheckedIndexedAccess": true,
9
+ "exactOptionalPropertyTypes": true,
10
+ "skipLibCheck": true,
11
+ "resolveJsonModule": true,
12
+ "esModuleInterop": true,
13
+ "noEmit": true
14
+ },
15
+ "include": ["src/**/*.ts", "mcp-desktop.ts", "mcp-bridge.ts"],
16
+ "exclude": ["dist", "node_modules"]
17
+ }
package/tsconfig.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "module": "NodeNext",
5
+ "moduleResolution": "NodeNext",
6
+ "lib": ["ES2022"],
7
+ "strict": true,
8
+ "noUncheckedIndexedAccess": true,
9
+ "exactOptionalPropertyTypes": true,
10
+ "skipLibCheck": true,
11
+ "rootDir": "src",
12
+ "outDir": "dist",
13
+ "resolveJsonModule": true,
14
+ "esModuleInterop": true
15
+ },
16
+ "include": ["src/**/*.ts"],
17
+ "exclude": ["dist", "node_modules"]
18
+ }
19
+
@@ -0,0 +1,8 @@
1
+ import { defineConfig } from "vitest/config";
2
+
3
// Vitest settings: run every *.test.ts under tests/ with a 15 s per-test timeout.
const testOptions = {
  include: ["tests/**/*.test.ts"],
  testTimeout: 15_000,
};

export default defineConfig({ test: testOptions });
package/dist/config.js DELETED
@@ -1,9 +0,0 @@
1
/** Profile name used when none is supplied. */
export const DEFAULT_PROFILE = "automation";

/** Default timeout for navigation, in milliseconds. */
export const DEFAULT_NAVIGATE_TIMEOUT_MS = 10000;

/** Default timeout for wait conditions, in milliseconds. */
export const DEFAULT_WAIT_TIMEOUT_MS = 2000;

/** Default per-action budget: time limits (ms) for each phase plus the retry cap. */
export const DEFAULT_ACTION_BUDGET = {
  locateMs: 800,
  actMs: 200,
  verifyMs: 2000,
  maxRetries: 1,
};
package/dist/index.js DELETED
@@ -1,55 +0,0 @@
1
- import { TimelineLogger } from "./logging/timeline-logger.js";
2
- import { MvpMcpServer } from "./mcp/server.js";
3
- import { PlaceholderAppAdapter, } from "./runtime/app-adapter.js";
4
- import { CdpChromeAdapter } from "./runtime/cdp-chrome-adapter.js";
5
- import { AutomationRuntimeService } from "./runtime/service.js";
6
- export { PlaceholderAppAdapter } from "./runtime/app-adapter.js";
7
- export { CdpChromeAdapter } from "./runtime/cdp-chrome-adapter.js";
8
- export { AccessibilityAdapter } from "./runtime/accessibility-adapter.js";
9
- export { AppleScriptAdapter } from "./runtime/applescript-adapter.js";
10
- export { VisionAdapter } from "./runtime/vision-adapter.js";
11
- export { CompositeAdapter } from "./runtime/composite-adapter.js";
12
- export { BridgeClient, BridgeClient as MacOSBridgeClient } from "./native/bridge-client.js";
13
- export { StateObserver } from "./runtime/state-observer.js";
14
- export { PlanningLoop } from "./runtime/planning-loop.js";
15
- export { AutomationRuntimeService } from "./runtime/service.js";
16
- export { MvpMcpServer } from "./mcp/server.js";
17
- export { createMcpStdioServer, startMcpStdioServer } from "./mcp/mcp-stdio-server.js";
18
/**
 * Wires an adapter into a runtime service (with its own timeline logger) and
 * wraps that runtime in an MCP server facade.
 *
 * @param adapter - the app adapter the runtime service drives
 * @returns the `runtime` service and the `mcp` server built on top of it
 */
export function createRuntimeApp(adapter) {
  const runtime = new AutomationRuntimeService(adapter, new TimelineLogger());
  return { runtime, mcp: new MvpMcpServer(runtime) };
}
24
/**
 * Picks the adapter implementation from the AUTOMATOR_ADAPTER environment
 * variable: "placeholder", "composite", "accessibility", or (any other value,
 * including unset) the CDP Chrome adapter.
 *
 * AUTOMATOR_HEADLESS === "1" turns on the `headless` option for the composite
 * and CDP adapters.
 */
async function createDefaultAdapter() {
  const headlessRequested = () => process.env.AUTOMATOR_HEADLESS === "1";

  switch (process.env.AUTOMATOR_ADAPTER) {
    case "placeholder":
      return new PlaceholderAppAdapter();

    case "composite": {
      // Lazy import to avoid requiring Swift bridge for CDP-only usage
      const bridgeModule = await import("./native/macos-bridge-client.js");
      const compositeModule = await import("./runtime/composite-adapter.js");
      const bridge = new bridgeModule.MacOSBridgeClient();
      return new compositeModule.CompositeAdapter(bridge, { headless: headlessRequested() });
    }

    case "accessibility": {
      const bridgeModule = await import("./native/macos-bridge-client.js");
      const accessibilityModule = await import("./runtime/accessibility-adapter.js");
      const bridge = new bridgeModule.MacOSBridgeClient();
      return new accessibilityModule.AccessibilityAdapter(bridge);
    }

    default:
      return new CdpChromeAdapter({ headless: headlessRequested() });
  }
}
47
// Module bootstrap: build the runtime around the environment-selected adapter.
const app = createRuntimeApp(await createDefaultAdapter());

// With --healthcheck on the command line, start one session and print a JSON status report.
const healthcheckRequested = process.argv.includes("--healthcheck");
if (healthcheckRequested) {
  const session = await app.runtime.sessionStart("automation");
  const report = {
    status: "ok",
    session,
    note: "Runtime loaded with universal adapter support.",
  };
  console.log(JSON.stringify(report, null, 2));
}
@@ -1,29 +0,0 @@
1
- export class TimelineLogger {
2
- timeline = [];
3
- start(action, sessionId) {
4
- return {
5
- action,
6
- sessionId,
7
- startedAt: new Date().toISOString(),
8
- locateMs: 0,
9
- actMs: 0,
10
- verifyMs: 0,
11
- retries: 0,
12
- };
13
- }
14
- finish(telemetry, status) {
15
- const finishedAt = new Date().toISOString();
16
- const totalMs = new Date(finishedAt).getTime() - new Date(telemetry.startedAt).getTime();
17
- const finalized = {
18
- ...telemetry,
19
- finishedAt,
20
- totalMs,
21
- status,
22
- };
23
- this.timeline.push(finalized);
24
- return finalized;
25
- }
26
- getRecent(limit = 50) {
27
- return this.timeline.slice(-limit);
28
- }
29
- }
@@ -1,284 +0,0 @@
1
- import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2
- import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
3
- import { z } from "zod";
4
- // ── Schema building blocks ──
5
// The accepted target forms, declared individually and combined below in the
// same order as before (union members are tried in order).
const shorthandTarget = z
  .string()
  .describe("Shorthand: text to find, or 'css=...' / 'text=...' / 'ax_id=...' prefix");

const textTarget = z
  .object({ text: z.string(), exact: z.boolean().optional() })
  .describe("Find by visible text");

const roleTarget = z
  .object({ role: z.string(), name: z.string(), exact: z.boolean().optional() })
  .describe("Find by ARIA/AX role and accessible name");

const selectorTarget = z
  .object({ selector: z.string() })
  .describe("Find by CSS selector (browser) or AX identifier (desktop)");

const coordinatesTarget = z
  .object({ x: z.number(), y: z.number() })
  .describe("Click at screen coordinates");

const attributeTarget = z
  .object({ attribute: z.string(), value: z.string() })
  .describe("Find by accessibility attribute");

/** Schema for a tool-call target: a shorthand string or one of the object forms. */
const TargetSchema = z.union([
  shorthandTarget,
  textTarget,
  roleTarget,
  selectorTarget,
  coordinatesTarget,
  attributeTarget,
]);
28
// Field set of a wait condition. Only `type` is mandatory at the schema level;
// every payload field is optional here.
const waitConditionShape = {
  type: z.enum([
    "selector_visible",
    "selector_hidden",
    "url_matches",
    "text_appears",
    "spinner_disappears",
    "element_exists",
    "element_gone",
    "window_title_matches",
    "app_idle",
  ]),
  selector: z.string().optional(),
  regex: z.string().optional(),
  text: z.string().optional(),
  target: TargetSchema.optional(),
  bundleId: z.string().optional(),
  timeoutMs: z.number().optional(),
};

/** Schema for the condition object accepted by `wait_for` and the `verify` options. */
const WaitConditionSchema = z.object(waitConditionShape).describe("Condition to wait for");
47
/** Schema for a rectangular region: origin (`x`, `y`) plus `width` and `height`, all numbers. */
const RegionSchema = z.object({ x: z.number(), y: z.number(), width: z.number(), height: z.number() });
53
- // ── Target parser ──
54
- function parseTarget(input) {
55
- if (typeof input === "string") {
56
- if (input.startsWith("css="))
57
- return { type: "selector", value: input.slice(4) };
58
- if (input.startsWith("text="))
59
- return { type: "text", value: input.slice(5), exact: true };
60
- if (input.startsWith("ax_id="))
61
- return { type: "ax_attribute", attribute: "identifier", value: input.slice(6) };
62
- return { type: "text", value: input };
63
- }
64
- const obj = input;
65
- if (typeof obj.selector === "string")
66
- return { type: "selector", value: obj.selector };
67
- if (typeof obj.text === "string")
68
- return { type: "text", value: obj.text, exact: obj.exact === true };
69
- if (typeof obj.role === "string" && typeof obj.name === "string")
70
- return { type: "role", role: obj.role, name: obj.name, exact: obj.exact === true };
71
- if (typeof obj.x === "number" && typeof obj.y === "number")
72
- return { type: "coordinates", x: obj.x, y: obj.y };
73
- if (typeof obj.attribute === "string" && typeof obj.value === "string")
74
- return { type: "ax_attribute", attribute: obj.attribute, value: obj.value };
75
- throw new Error("Invalid target");
76
- }
77
/**
 * Converts a raw wait-condition object into a tagged condition carrying only
 * the fields relevant to its `type`.
 *
 * The string payloads of "selector_visible", "selector_hidden", "url_matches",
 * "text_appears" and "window_title_matches" are validated, as is the presence
 * of `target` for "element_exists" / "element_gone", so a malformed condition
 * fails here with a descriptive message instead of being forwarded with an
 * undefined field.
 *
 * @param input - the raw condition object; its `type` selects the variant
 * @returns the tagged condition object
 * @throws {Error} when the input is not an object, a validated field is
 *   missing, the target is invalid, or the type is unknown
 */
function parseWaitCondition(input) {
  if (input === null || typeof input !== "object") {
    throw new Error("Invalid wait condition");
  }
  const obj = input;
  const type = obj.type;

  // Reads a string field that the current condition type cannot work without.
  const requireString = (field) => {
    const value = obj[field];
    if (typeof value !== "string") {
      throw new Error(`Wait condition "${type}" requires a string "${field}"`);
    }
    return value;
  };

  // Checks presence before delegating, so a missing target is reported as such.
  const requireTarget = () => {
    if (obj.target === undefined || obj.target === null) {
      throw new Error(`Wait condition "${type}" requires a "target"`);
    }
    return parseTarget(obj.target);
  };

  switch (type) {
    case "selector_visible":
      return { type: "selector_visible", selector: requireString("selector") };
    case "selector_hidden":
      return { type: "selector_hidden", selector: requireString("selector") };
    case "url_matches":
      return { type: "url_matches", regex: requireString("regex") };
    case "text_appears":
      return { type: "text_appears", text: requireString("text") };
    case "spinner_disappears":
      // NOTE(review): selector is passed through unvalidated, as before — it is
      // not visible from here whether this variant requires one; confirm.
      return { type: "spinner_disappears", selector: obj.selector };
    case "element_exists":
      return { type: "element_exists", target: requireTarget() };
    case "element_gone":
      return { type: "element_gone", target: requireTarget() };
    case "window_title_matches":
      return { type: "window_title_matches", regex: requireString("regex") };
    case "app_idle":
      // NOTE(review): bundleId is passed through unvalidated, as before; confirm
      // whether it is mandatory. timeoutMs is only included when it is a number.
      return typeof obj.timeoutMs === "number"
        ? { type: "app_idle", bundleId: obj.bundleId, timeoutMs: obj.timeoutMs }
        : { type: "app_idle", bundleId: obj.bundleId };
    default:
      throw new Error(`Unknown wait condition type: ${type}`);
  }
}
98
- // ── Helpers ──
99
/**
 * Wraps a value as a successful tool response: a single text content item
 * holding the value pretty-printed as JSON (2-space indent).
 *
 * @param data - value to serialize
 * @returns an object with a one-element `content` array
 */
function ok(data) {
  // JSON.stringify returns undefined (not a string) for undefined, functions
  // and symbols; fall back to "null" so `text` is always a string.
  const text = JSON.stringify(data, null, 2) ?? "null";
  return { content: [{ type: "text", text }] };
}
102
/**
 * Wraps a message as a failed tool response: one text content item carrying
 * the message verbatim, with `isError` set.
 *
 * @param message - error text to report
 */
function err(message) {
  const content = [{ type: "text", text: message }];
  return { content, isError: true };
}
105
- // ── Server builder ──
106
/**
 * Builds an MCP server named "screenhand" and registers one tool per runtime
 * operation. Each handler converts the tool arguments into the runtime's input
 * object (parsing targets and wait conditions), awaits the matching runtime
 * method, and maps its result onto a tool response.
 *
 * @param runtime - service whose methods (sessionStart, press, typeInto, ...)
 *   back the tools
 * @returns the configured server; it is not connected to a transport here
 */
export function createMcpStdioServer(runtime) {
  const server = new McpServer(
    { name: "screenhand", version: "0.1.0" },
    {
      capabilities: { tools: {} },
      instructions: "ScreenHand gives AI agents eyes and hands on the desktop. Use session_start to begin, then call tools to control apps.",
    },
  );

  // Runtime calls resolve to an envelope: `ok` set on success, otherwise an
  // `error` whose message is reported back to the caller.
  const reply = (result) => (result.ok ? ok(result) : err(result.error.message));

  // ── session_start ──
  server.tool(
    "session_start",
    "Start a new automation session. Returns a sessionId needed by all other tools. Automatically attaches to the frontmost app.",
    { profile: z.string().optional().describe("Session profile name (default: 'automation')") },
    async (args) => {
      try {
        return ok(await runtime.sessionStart(args.profile));
      } catch (e) {
        const reason = e instanceof Error ? e.message : String(e);
        return err(`Failed to start session: ${reason}`);
      }
    },
  );

  // ── press ──
  server.tool(
    "press",
    "Click/press a UI element. Finds the element by text, role, selector, or coordinates, then clicks it.",
    {
      sessionId: z.string().describe("Session ID from session_start"),
      target: TargetSchema.describe("What to click — text string, {role, name}, {selector}, or {x, y}"),
      verify: WaitConditionSchema.optional().describe("Optional condition to verify after clicking"),
    },
    async (args) =>
      reply(
        await runtime.press({
          sessionId: args.sessionId,
          target: parseTarget(args.target),
          ...(args.verify ? { verify: parseWaitCondition(args.verify) } : {}),
        }),
      ),
  );

  // ── type_into ──
  server.tool(
    "type_into",
    "Type text into a UI element (text field, search box, etc). Locates the field, optionally clears it, then types.",
    {
      sessionId: z.string(),
      target: TargetSchema.describe("The input field to type into"),
      text: z.string().describe("Text to type"),
      clear: z.boolean().optional().describe("Clear the field first (default: true)"),
      verify: WaitConditionSchema.optional(),
    },
    async (args) =>
      reply(
        await runtime.typeInto({
          sessionId: args.sessionId,
          target: parseTarget(args.target),
          text: args.text,
          ...(typeof args.clear === "boolean" ? { clear: args.clear } : {}),
          ...(args.verify ? { verify: parseWaitCondition(args.verify) } : {}),
        }),
      ),
  );

  // ── navigate ──
  server.tool(
    "navigate",
    "Navigate a browser to a URL, or open an app via 'app://com.bundle.id'.",
    {
      sessionId: z.string(),
      url: z.string().describe("URL to navigate to, or 'app://bundleId' to launch an app"),
      timeoutMs: z.number().optional().describe("Navigation timeout in ms (default: 10000)"),
    },
    async (args) =>
      reply(
        await runtime.navigate({
          sessionId: args.sessionId,
          url: args.url,
          ...(typeof args.timeoutMs === "number" ? { timeoutMs: args.timeoutMs } : {}),
        }),
      ),
  );

  // ── wait_for ──
  server.tool(
    "wait_for",
    "Wait for a condition: element appears/disappears, text appears, URL changes, window title matches, etc.",
    {
      sessionId: z.string(),
      condition: WaitConditionSchema,
      timeoutMs: z.number().optional().describe("Timeout in ms (default: 2000)"),
    },
    async (args) =>
      reply(
        await runtime.waitFor({
          sessionId: args.sessionId,
          condition: parseWaitCondition(args.condition),
          ...(typeof args.timeoutMs === "number" ? { timeoutMs: args.timeoutMs } : {}),
        }),
      ),
  );

  // ── extract ──
  server.tool(
    "extract",
    "Extract data from a UI element. Returns text content, table data, or structured JSON from the element.",
    {
      sessionId: z.string(),
      target: TargetSchema,
      format: z.enum(["text", "table", "json"]).describe("Output format"),
    },
    async (args) =>
      reply(
        await runtime.extract({
          sessionId: args.sessionId,
          target: parseTarget(args.target),
          format: args.format,
        }),
      ),
  );

  // ── screenshot ──
  server.tool(
    "screenshot",
    "Capture a screenshot of the current app window or a specific screen region. Returns the file path.",
    {
      sessionId: z.string(),
      region: RegionSchema.optional().describe("Optional screen region to capture"),
    },
    async (args) =>
      reply(
        await runtime.screenshot({
          sessionId: args.sessionId,
          ...(args.region ? { region: args.region } : {}),
        }),
      ),
  );

  // ── app_launch ──
  server.tool(
    "app_launch",
    "Launch a macOS/Windows application by bundle ID (e.g., 'com.apple.Safari', 'com.google.Chrome').",
    {
      sessionId: z.string(),
      bundleId: z.string().describe("macOS bundle ID or Windows process name"),
    },
    async (args) => reply(await runtime.appLaunch({ sessionId: args.sessionId, bundleId: args.bundleId })),
  );

  // ── app_focus ──
  server.tool(
    "app_focus",
    "Bring a running application to the foreground.",
    {
      sessionId: z.string(),
      bundleId: z.string(),
    },
    async (args) => reply(await runtime.appFocus({ sessionId: args.sessionId, bundleId: args.bundleId })),
  );

  // ── app_list ──
  server.tool(
    "app_list",
    "List all running applications with their bundle IDs, names, and PIDs.",
    { sessionId: z.string() },
    async (args) => reply(await runtime.appList(args.sessionId)),
  );

  // ── window_list ──
  server.tool(
    "window_list",
    "List all visible windows with their titles, positions, and sizes.",
    { sessionId: z.string() },
    async (args) => reply(await runtime.windowList(args.sessionId)),
  );

  // ── menu_click ──
  server.tool(
    "menu_click",
    "Click a menu item by path. For example ['File', 'Save As...'] clicks File → Save As.",
    {
      sessionId: z.string(),
      menuPath: z.array(z.string()).describe("Menu path, e.g. ['File', 'New Window']"),
    },
    async (args) => reply(await runtime.menuClick({ sessionId: args.sessionId, menuPath: args.menuPath })),
  );

  // ── key_combo ──
  server.tool(
    "key_combo",
    "Send a keyboard shortcut. Keys: 'cmd', 'ctrl', 'alt', 'shift', plus any character. E.g. ['cmd', 'c'] for copy.",
    {
      sessionId: z.string(),
      keys: z.array(z.string()).describe("Key combination, e.g. ['cmd', 's']"),
    },
    async (args) => reply(await runtime.keyCombo({ sessionId: args.sessionId, keys: args.keys })),
  );

  // ── element_tree ──
  server.tool(
    "element_tree",
    "Get the accessibility element tree of the current app. Useful for understanding the UI structure and finding elements to interact with.",
    {
      sessionId: z.string(),
      maxDepth: z.number().optional().describe("Max tree depth (default: 5)"),
    },
    async (args) =>
      reply(
        await runtime.elementTree({
          sessionId: args.sessionId,
          ...(typeof args.maxDepth === "number" ? { maxDepth: args.maxDepth } : {}),
        }),
      ),
  );

  // ── drag ──
  server.tool(
    "drag",
    "Drag from one UI element to another.",
    {
      sessionId: z.string(),
      from: TargetSchema.describe("Element to drag from"),
      to: TargetSchema.describe("Element to drag to"),
    },
    async (args) =>
      reply(
        await runtime.drag({
          sessionId: args.sessionId,
          from: parseTarget(args.from),
          to: parseTarget(args.to),
        }),
      ),
  );

  // ── scroll ──
  server.tool(
    "scroll",
    "Scroll in a direction, optionally targeting a specific element.",
    {
      sessionId: z.string(),
      direction: z.enum(["up", "down", "left", "right"]),
      amount: z.number().optional().describe("Scroll amount (default: 3)"),
      target: TargetSchema.optional().describe("Element to scroll within"),
    },
    async (args) =>
      reply(
        await runtime.scroll({
          sessionId: args.sessionId,
          direction: args.direction,
          ...(typeof args.amount === "number" ? { amount: args.amount } : {}),
          ...(args.target ? { target: parseTarget(args.target) } : {}),
        }),
      ),
  );

  return server;
}
280
/**
 * Builds the MCP server for `runtime` and connects it to a new stdio
 * transport. Resolves once `connect` has resolved.
 *
 * @param runtime - service passed through to `createMcpStdioServer`
 */
export async function startMcpStdioServer(runtime) {
  await createMcpStdioServer(runtime).connect(new StdioServerTransport());
}