screenhand 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +427 -0
- package/dist/config.js +9 -0
- package/dist/index.js +55 -0
- package/dist/logging/timeline-logger.js +29 -0
- package/dist/mcp/mcp-stdio-server.js +284 -0
- package/dist/mcp/server.js +347 -0
- package/dist/mcp-entry.js +62 -0
- package/dist/memory/recall.js +160 -0
- package/dist/memory/research.js +98 -0
- package/dist/memory/seeds.js +89 -0
- package/dist/memory/session.js +161 -0
- package/dist/memory/store.js +391 -0
- package/dist/memory/types.js +4 -0
- package/dist/native/bridge-client.js +173 -0
- package/dist/native/macos-bridge-client.js +5 -0
- package/dist/runtime/accessibility-adapter.js +377 -0
- package/dist/runtime/app-adapter.js +48 -0
- package/dist/runtime/applescript-adapter.js +283 -0
- package/dist/runtime/ax-role-map.js +80 -0
- package/dist/runtime/browser-adapter.js +36 -0
- package/dist/runtime/cdp-chrome-adapter.js +505 -0
- package/dist/runtime/composite-adapter.js +205 -0
- package/dist/runtime/executor.js +250 -0
- package/dist/runtime/locator-cache.js +12 -0
- package/dist/runtime/planning-loop.js +47 -0
- package/dist/runtime/service.js +372 -0
- package/dist/runtime/session-manager.js +28 -0
- package/dist/runtime/state-observer.js +105 -0
- package/dist/runtime/vision-adapter.js +208 -0
- package/dist/test-mcp-protocol.js +138 -0
- package/dist/types.js +1 -0
- package/package.json +72 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
// Delay, in milliseconds, between polling attempts in locate() and waitFor().
const POLL_INTERVAL_MS = 200;
|
|
2
|
+
/**
 * Vision-based adapter for apps with poor/no accessibility support.
 * Uses screenshots + OCR to locate elements and CG events to interact.
 *
 * All native work is delegated to `bridge`, which must provide an async
 * `start()` and an async `call(method, params)`.
 */
export class VisionAdapter {
    /** Native bridge client used for every capture, OCR and input call. */
    bridge;
    /** Session state keyed by session id. */
    sessions = new Map();
    /** Session state keyed by profile, so a profile is attached at most once. */
    sessionsByProfile = new Map();
    /**
     * @param bridge Bridge client exposing `start()` and `call(method, params)`.
     */
    constructor(bridge) {
        this.bridge = bridge;
    }
    /**
     * Returns the session info for `profile`, creating a session bound to the
     * currently frontmost app when the profile has none yet.
     *
     * @param profile Profile key identifying the session.
     * @returns Session info (`sessionId`, `profile`, `createdAt`, `adapterType`).
     */
    async attach(profile) {
        const existing = this.sessionsByProfile.get(profile);
        if (existing)
            return existing.info;
        await this.bridge.start();
        const frontmost = await this.bridge.call("app.frontmost");
        const info = {
            sessionId: `vision_session_${profile}_${Date.now()}`,
            profile,
            createdAt: new Date().toISOString(),
            adapterType: "vision",
        };
        const state = {
            info,
            pid: frontmost.pid,
            bundleId: frontmost.bundleId,
            appName: frontmost.name,
        };
        this.sessions.set(info.sessionId, state);
        this.sessionsByProfile.set(profile, state);
        return info;
    }
    /**
     * Describes the app the session is bound to.
     *
     * @throws Error when the session id is unknown.
     */
    async getAppContext(sessionId) {
        const state = this.requireSession(sessionId);
        return {
            bundleId: state.bundleId,
            appName: state.appName,
            pid: state.pid,
            // No window title is tracked here; the app name is reported instead.
            windowTitle: state.appName,
        };
    }
    /**
     * Returns browser-style page metadata (`app://<bundleId>` URL and the app
     * name as title) for the session's app.
     *
     * @throws Error when the session id is unknown.
     */
    async getPageMeta(sessionId) {
        const ctx = await this.getAppContext(sessionId);
        return {
            url: `app://${ctx.bundleId}`,
            title: ctx.appName,
        };
    }
    /**
     * Launches the app named by an `app://<bundleId>` URL and rebinds the
     * session to it. Any other URL leaves the session untouched.
     *
     * @param sessionId Session to rebind.
     * @param url Target URL; only the `app://` scheme is acted on.
     * @param _timeoutMs Unused.
     * @returns Page metadata for the session after navigation.
     * @throws Error when the session id is unknown.
     */
    async navigate(sessionId, url, _timeoutMs) {
        // Validate first so an unknown session never launches an app as a
        // side effect.
        const state = this.requireSession(sessionId);
        const appScheme = "app://";
        if (url.startsWith(appScheme)) {
            const bundleId = url.slice(appScheme.length);
            const result = await this.bridge.call("app.launch", { bundleId });
            state.pid = result.pid;
            state.bundleId = result.bundleId;
            // NOTE(review): attach() reads `name` from "app.frontmost" while this
            // reads `appName` from "app.launch" — confirm the bridge schema.
            state.appName = result.appName;
        }
        return this.getPageMeta(sessionId);
    }
    /**
     * Finds an element on screen.
     *
     * Coordinate targets are returned immediately as a 1x1 box. Targets with
     * search text are looked up by capturing the screen and running OCR text
     * search, retrying every POLL_INTERVAL_MS until `timeoutMs` has elapsed.
     * At least one probe is always made, even when `timeoutMs` is 0 or missing.
     *
     * @param sessionId Session to search in.
     * @param target Locator object; `target.type` selects the strategy.
     * @param timeoutMs How long to keep retrying, in milliseconds.
     * @returns A handle with `handleId`, `locatorUsed` and `coordinates`
     *          (plus `label` for text matches), or `null` when nothing matched
     *          or the target type cannot be resolved visually.
     * @throws Error when the session id is unknown.
     */
    async locate(sessionId, target, timeoutMs) {
        const state = this.requireSession(sessionId);
        // Caller-supplied coordinates need neither a screenshot nor OCR.
        if (target.type === "coordinates") {
            return {
                handleId: `vision_coords_${target.x}_${target.y}`,
                locatorUsed: "vision:coordinates",
                coordinates: { x: target.x, y: target.y, width: 1, height: 1 },
            };
        }
        // The search text does not change between polls, so resolve it once.
        const searchText = this.getSearchText(target);
        if (!searchText) {
            return null;
        }
        const deadline = Date.now() + timeoutMs;
        for (;;) {
            // Take a screenshot
            const screenshotResult = await this.bridge.call("cg.captureScreen", {});
            state.lastScreenshotPath = screenshotResult.path;
            // OCR the screenshot
            const matches = await this.bridge.call("vision.findText", {
                imagePath: screenshotResult.path,
                searchText,
            });
            if (Array.isArray(matches) && matches.length > 0) {
                // Keep the highest-confidence match (the later one wins ties).
                const best = matches.reduce((a, b) => (a.confidence > b.confidence ? a : b));
                return {
                    handleId: `vision_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
                    locatorUsed: `vision:text:${searchText}`,
                    label: best.text,
                    coordinates: best.bounds,
                };
            }
            // Written as a negated comparison so a missing/NaN timeout also stops.
            const remaining = deadline - Date.now();
            if (!(remaining > 0)) {
                return null;
            }
            await sleep(Math.min(POLL_INTERVAL_MS, remaining));
        }
    }
    /**
     * Clicks the center of the element's bounding box.
     *
     * @throws Error when the element carries no coordinates.
     */
    async click(_sessionId, element) {
        if (!element.coordinates) {
            throw new Error("Vision adapter requires coordinates to click");
        }
        const cx = element.coordinates.x + element.coordinates.width / 2;
        const cy = element.coordinates.y + element.coordinates.height / 2;
        await this.bridge.call("cg.mouseClick", { x: cx, y: cy });
    }
    /**
     * Types `text` into the element: clicks it to focus, optionally selects
     * all existing content with Cmd+A, then sends the text.
     *
     * @param clear When truthy, existing content is selected before typing.
     */
    async setValue(_sessionId, element, text, clear) {
        // Click to focus
        await this.click(_sessionId, element);
        await sleep(100);
        if (clear) {
            await this.bridge.call("cg.keyCombo", { keys: ["cmd", "a"] });
            await sleep(50);
        }
        await this.bridge.call("cg.typeText", { text });
    }
    /**
     * Returns the element's label, or an empty string when it has none.
     */
    async getValue(_sessionId, element) {
        // Vision can't reliably read values; return label if available
        return element.label ?? "";
    }
    /**
     * Polls until `condition` holds or `timeoutMs` elapses.
     *
     * Supported condition types: "text_appears" (`condition.text`),
     * "element_exists" and "element_gone" (`condition.target`). The condition
     * is always checked at least once, even when `timeoutMs` is 0.
     *
     * @returns `true` when the condition was met; `false` on timeout or for an
     *          unsupported condition type.
     */
    async waitFor(sessionId, condition, timeoutMs) {
        let target;
        let wantPresent;
        switch (condition.type) {
            case "text_appears":
                target = { type: "text", value: condition.text };
                wantPresent = true;
                break;
            case "element_exists":
                target = condition.target;
                wantPresent = true;
                break;
            case "element_gone":
                target = condition.target;
                wantPresent = false;
                break;
            default:
                // Unsupported condition types
                return false;
        }
        const deadline = Date.now() + timeoutMs;
        for (;;) {
            // Zero timeout = a single probe; this loop owns the retry cadence.
            const found = await this.locate(sessionId, target, 0);
            if (Boolean(found) === wantPresent) {
                return true;
            }
            const remaining = deadline - Date.now();
            if (!(remaining > 0)) {
                return false;
            }
            await sleep(Math.min(POLL_INTERVAL_MS, remaining));
        }
    }
    /**
     * Captures the screen, runs OCR on it and returns the result in the
     * requested format.
     *
     * @param _target Unused; the whole screen is always processed.
     * @param format "text" returns the recognized text, "json" the raw OCR
     *               result, anything else a `{ headers, rows }` table with one
     *               row per OCR region.
     * @throws Error when the session id is unknown.
     */
    async extract(sessionId, _target, format) {
        const state = this.requireSession(sessionId);
        // Take a fresh screenshot and OCR it
        const screenshotResult = await this.bridge.call("cg.captureScreen", {});
        state.lastScreenshotPath = screenshotResult.path;
        const ocrResult = await this.bridge.call("vision.ocr", { imagePath: screenshotResult.path });
        if (format === "text") {
            return ocrResult.text;
        }
        if (format === "json") {
            return ocrResult;
        }
        // table format; an OCR result without regions yields an empty table
        const regions = ocrResult.regions ?? [];
        return {
            headers: ["text", "confidence", "x", "y", "width", "height"],
            rows: regions.map((r) => [
                r.text,
                r.confidence,
                r.bounds.x,
                r.bounds.y,
                r.bounds.width,
                r.bounds.height,
            ]),
        };
    }
    /**
     * Captures the screen (or just `region` when given).
     *
     * @returns The `path` reported by the bridge for the captured image.
     */
    async screenshot(_sessionId, region) {
        const result = await this.bridge.call("cg.captureScreen", region ? { region } : {});
        return result.path;
    }
    /** Sends a key combination through the bridge. */
    async keyCombo(_sessionId, keys) {
        await this.bridge.call("cg.keyCombo", { keys });
    }
    /**
     * Not supported by this adapter.
     *
     * @throws Error always.
     */
    async elementTree(_sessionId, _maxDepth, _root) {
        throw new Error("Vision adapter does not support elementTree — use accessibility adapter");
    }
    // ── Private ──
    /**
     * Looks up session state by id.
     *
     * @throws Error when the session id is unknown.
     */
    requireSession(sessionId) {
        const state = this.sessions.get(sessionId);
        if (!state)
            throw new Error(`Session not found: ${sessionId}`);
        return state;
    }
    /**
     * Maps a locator target to the text OCR should search for.
     *
     * @returns The search text, or `null` when the target type has no textual
     *          form (image, coordinates, ax_path, or an unknown type).
     */
    getSearchText(target) {
        switch (target.type) {
            case "text":
            case "selector":
            case "ax_attribute":
                return target.value ?? null;
            case "role":
                return target.name ?? null;
            case "image":
            case "coordinates":
            case "ax_path":
            default:
                return null;
        }
    }
}
|
|
206
|
+
/**
 * Returns a promise that resolves (with no value) after roughly `ms`
 * milliseconds.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import { createInterface } from "node:readline";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
// Resolve the project root (one level above this file's directory) and the
// locally installed tsx binary.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const projectRoot = path.resolve(__dirname, "..");
const tsxBin = path.join(projectRoot, "node_modules", ".bin", "tsx");
// Upper bound, in milliseconds, on waiting for a single server response.
const TIMEOUT_MS = 10_000;
// Start the MCP server entry point under tsx with piped stdio and the
// placeholder adapter selected through the environment.
const serverEntry = path.join(projectRoot, "src/mcp-entry.ts");
const childEnv = { ...process.env, SCREENHAND_ADAPTER: "placeholder" };
const proc = spawn(tsxBin, [serverEntry], {
    cwd: projectRoot,
    env: childEnv,
    stdio: ["pipe", "pipe", "pipe"],
});
// Accumulate the server's stderr so its tail can be shown in timeout errors.
let stderrBuf = "";
proc.stderr.on("data", (chunk) => {
    stderrBuf += chunk.toString();
});
|
|
16
|
+
// MCP SDK v1.27 uses newline-delimited JSON (NDJSON), not Content-Length framing
/**
 * Serializes `msg` as JSON and writes it to the server's stdin as one
 * newline-terminated line.
 */
function send(msg) {
    const frame = `${JSON.stringify(msg)}\n`;
    proc.stdin.write(frame);
}
|
|
20
|
+
// Read the server's stdout line by line. A line goes straight to the pending
// reader when one is registered; otherwise it is buffered until read.
const rl = createInterface({ input: proc.stdout });
const lineQueue = [];
let lineWaiter = null;
rl.on("line", (line) => {
    if (!lineWaiter) {
        lineQueue.push(line);
        return;
    }
    // Clear the slot before invoking the waiter so each waiter receives
    // exactly one line.
    const deliver = lineWaiter;
    lineWaiter = null;
    deliver(line);
});
|
|
33
|
+
/**
 * Resolves with the next line of server output, parsed as JSON.
 *
 * A buffered line from `lineQueue` is consumed first; otherwise a waiter is
 * registered in `lineWaiter` and the promise settles when the next line
 * arrives. Blank lines are skipped on both paths.
 *
 * @returns {Promise<any>} The parsed message.
 * @throws (rejects) Error when no line arrives within TIMEOUT_MS, or when the
 *         line is not valid JSON (the SyntaxError is attached as `cause`).
 */
function readResponse() {
    return new Promise((resolve, reject) => {
        let timer = null;
        const handle = (line) => {
            if (line.trim() === "") {
                // The line listener cleared the waiter slot before calling us;
                // re-arm so a blank line does not consume this read.
                lineWaiter = handle;
                return;
            }
            clearTimeout(timer);
            // Parse here, inside the promise, so malformed output rejects the
            // caller instead of throwing out of the readline "line" listener.
            try {
                resolve(JSON.parse(line));
            }
            catch (err) {
                reject(new Error(`Invalid JSON from server: ${line.slice(0, 200)}`, { cause: err }));
            }
        };
        // Drain buffered lines, ignoring blank ones. Compare against undefined
        // rather than truthiness so only an empty queue means "nothing buffered".
        let queued = lineQueue.shift();
        while (queued !== undefined && queued.trim() === "") {
            queued = lineQueue.shift();
        }
        if (queued !== undefined) {
            handle(queued);
            return;
        }
        timer = setTimeout(() => {
            // Only release the slot if it still belongs to this read.
            if (lineWaiter === handle) {
                lineWaiter = null;
            }
            reject(new Error(`Timeout. stderr: ${stderrBuf.slice(-300)}`));
        }, TIMEOUT_MS);
        lineWaiter = handle;
    });
}
|
|
53
|
+
/**
 * Reports a failure on stderr, kills the server child process and exits the
 * script with status 1. Does not return.
 */
function fail(msg) {
    console.error("FAIL:", msg);
    // Stop the child first so it does not outlive this script.
    proc.kill();
    process.exit(1);
}
|
|
58
|
+
// End-to-end smoke test of the MCP stdio protocol: initialize, list tools,
// start a session and call app_list. Any failure exits through fail().
try {
    // Wait for server to start
    await new Promise((r) => setTimeout(r, 2000));
    // 1. Initialize
    console.log("Sending initialize...");
    send({
        jsonrpc: "2.0",
        id: 1,
        method: "initialize",
        params: {
            protocolVersion: "2024-11-05",
            capabilities: {},
            clientInfo: { name: "test-client", version: "1.0" },
        },
    });
    const initResp = await readResponse();
    const initResult = initResp.result;
    if (!initResult)
        fail(`No init result: ${JSON.stringify(initResp)}`);
    console.log("=== Initialize ===");
    console.log(`  Protocol: ${initResult.protocolVersion}`);
    console.log(`  Server: ${JSON.stringify(initResult.serverInfo)}`);
    // 2. Send initialized notification
    send({ jsonrpc: "2.0", method: "notifications/initialized" });
    await new Promise((r) => setTimeout(r, 300));
    // 3. List tools
    console.log("\nListing tools...");
    send({ jsonrpc: "2.0", id: 2, method: "tools/list", params: {} });
    const toolsResp = await readResponse();
    const toolsResult = toolsResp.result;
    if (!toolsResult)
        fail(`No tools result: ${JSON.stringify(toolsResp)}`);
    const tools = toolsResult.tools ?? [];
    console.log("=== Tools ===");
    for (const tool of tools) {
        console.log(`  ${tool.name}: ${(tool.description ?? "").slice(0, 70)}`);
    }
    console.log(`\n  Total: ${tools.length} tools`);
    // Keep the threshold and the message in sync: the check is a lower bound.
    const minExpectedTools = 10;
    if (tools.length < minExpectedTools)
        fail(`Expected at least ${minExpectedTools} tools, got ${tools.length}`);
    // 4. Test session_start
    console.log("\nCalling session_start...");
    send({
        jsonrpc: "2.0",
        id: 3,
        method: "tools/call",
        params: { name: "session_start", arguments: {} },
    });
    const sessionResp = await readResponse();
    const sessionResult = sessionResp.result;
    if (!sessionResult)
        fail(`No session result: ${JSON.stringify(sessionResp)}`);
    const sessionContent = sessionResult.content;
    const sessionData = JSON.parse(sessionContent?.[0]?.text ?? "{}");
    console.log("=== session_start ===");
    console.log(`  Session ID: ${sessionData.sessionId}`);
    console.log(`  Profile: ${sessionData.profile}`);
    if (!sessionData.sessionId)
        fail("No sessionId returned");
    // 5. Test app_list (should work with placeholder)
    console.log("\nCalling app_list...");
    send({
        jsonrpc: "2.0",
        id: 4,
        method: "tools/call",
        params: { name: "app_list", arguments: { sessionId: sessionData.sessionId } },
    });
    const appResp = await readResponse();
    const appResult = appResp.result;
    // Like every other step, a response without a result is a protocol
    // failure and must not be reported as a pass. A tool-level `isError` is
    // only printed, as before.
    if (!appResult)
        fail(`No app_list result: ${JSON.stringify(appResp)}`);
    console.log("=== app_list ===");
    const appContent = appResult.content;
    const isError = appResult.isError;
    console.log(`  isError: ${isError ?? false}`);
    console.log(`  Response: ${(appContent?.[0]?.text ?? "").slice(0, 100)}`);
    proc.kill();
    console.log("\nAll tests passed!");
    process.exit(0);
}
catch (e) {
    fail(e instanceof Error ? e.message : String(e));
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/package.json
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "screenhand",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"mcpName": "io.github.manushi4/screenhand",
|
|
5
|
+
"description": "Give AI eyes and hands on your desktop. ScreenHand is an open-source MCP server that lets Claude and other AI agents see your screen, click buttons, type text, and control any app on macOS and Windows.",
|
|
6
|
+
"homepage": "https://screenhand.com",
|
|
7
|
+
"type": "module",
|
|
8
|
+
"bin": {
|
|
9
|
+
"screenhand": "dist/mcp-entry.js"
|
|
10
|
+
},
|
|
11
|
+
"files": [
|
|
12
|
+
"dist",
|
|
13
|
+
"README.md",
|
|
14
|
+
"LICENSE"
|
|
15
|
+
],
|
|
16
|
+
"scripts": {
|
|
17
|
+
"dev": "tsx src/index.ts",
|
|
18
|
+
"dev:mcp": "tsx src/mcp-entry.ts",
|
|
19
|
+
"build": "tsc -p tsconfig.json",
|
|
20
|
+
"check": "tsc --noEmit -p tsconfig.check.json",
|
|
21
|
+
"start": "node dist/index.js",
|
|
22
|
+
"start:mcp": "node dist/mcp-entry.js",
|
|
23
|
+
"build:native": "cd native/macos-bridge && swift build -c release",
|
|
24
|
+
"build:native:windows": "cd native/windows-bridge && dotnet build -c Release",
|
|
25
|
+
"test": "vitest run",
|
|
26
|
+
"test:watch": "vitest"
|
|
27
|
+
},
|
|
28
|
+
"repository": {
|
|
29
|
+
"type": "git",
|
|
30
|
+
"url": "https://github.com/manushi4/screenhand"
|
|
31
|
+
},
|
|
32
|
+
"bugs": {
|
|
33
|
+
"url": "https://github.com/manushi4/screenhand/issues"
|
|
34
|
+
},
|
|
35
|
+
"author": "Khushi Singhal",
|
|
36
|
+
"license": "MIT",
|
|
37
|
+
"keywords": [
|
|
38
|
+
"screenhand",
|
|
39
|
+
"mcp",
|
|
40
|
+
"mcp-server",
|
|
41
|
+
"model-context-protocol",
|
|
42
|
+
"desktop-automation",
|
|
43
|
+
"ui-automation",
|
|
44
|
+
"accessibility",
|
|
45
|
+
"macos",
|
|
46
|
+
"windows",
|
|
47
|
+
"claude",
|
|
48
|
+
"ai-agent",
|
|
49
|
+
"ai-tools",
|
|
50
|
+
"screen-control",
|
|
51
|
+
"browser-automation",
|
|
52
|
+
"cdp",
|
|
53
|
+
"chrome-devtools-protocol",
|
|
54
|
+
"rpa",
|
|
55
|
+
"ui-testing",
|
|
56
|
+
"ocr",
|
|
57
|
+
"computer-use"
|
|
58
|
+
],
|
|
59
|
+
"dependencies": {
|
|
60
|
+
"@anthropic-ai/sdk": "^0.78.0",
|
|
61
|
+
"@modelcontextprotocol/sdk": "^1.27.1",
|
|
62
|
+
"chrome-launcher": "^1.2.1",
|
|
63
|
+
"chrome-remote-interface": "^0.33.3"
|
|
64
|
+
},
|
|
65
|
+
"devDependencies": {
|
|
66
|
+
"@types/chrome-remote-interface": "^0.31.14",
|
|
67
|
+
"@types/node": "^22.13.9",
|
|
68
|
+
"tsx": "^4.19.2",
|
|
69
|
+
"typescript": "^5.8.2",
|
|
70
|
+
"vitest": "^3.2.4"
|
|
71
|
+
}
|
|
72
|
+
}
|