screenhand 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +458 -93
- package/dist/.audit-log.jsonl +55 -0
- package/dist/.screenhand/memory/.lock +1 -0
- package/dist/.screenhand/memory/actions.jsonl +85 -0
- package/dist/.screenhand/memory/errors.jsonl +5 -0
- package/dist/.screenhand/memory/errors.jsonl.bak +4 -0
- package/dist/.screenhand/memory/state.json +35 -0
- package/dist/.screenhand/memory/state.json.bak +35 -0
- package/dist/.screenhand/memory/strategies.jsonl +12 -0
- package/dist/agent/cli.js +73 -0
- package/dist/agent/loop.js +258 -0
- package/dist/config.js +9 -0
- package/dist/index.js +56 -0
- package/dist/logging/timeline-logger.js +29 -0
- package/dist/mcp/mcp-stdio-server.js +448 -0
- package/dist/mcp/server.js +347 -0
- package/dist/mcp-desktop.js +2731 -0
- package/dist/mcp-entry.js +59 -0
- package/dist/memory/recall.js +160 -0
- package/dist/memory/research.js +98 -0
- package/dist/memory/seeds.js +89 -0
- package/dist/memory/session.js +161 -0
- package/dist/memory/store.js +391 -0
- package/dist/memory/types.js +4 -0
- package/dist/monitor/codex-monitor.js +377 -0
- package/dist/monitor/task-queue.js +84 -0
- package/dist/monitor/types.js +49 -0
- package/dist/native/bridge-client.js +174 -0
- package/dist/native/macos-bridge-client.js +5 -0
- package/dist/npm-publish-helper.js +117 -0
- package/dist/npm-token-cdp.js +113 -0
- package/dist/npm-token-create.js +135 -0
- package/dist/npm-token-finish.js +126 -0
- package/dist/playbook/engine.js +193 -0
- package/dist/playbook/index.js +4 -0
- package/dist/playbook/recorder.js +519 -0
- package/dist/playbook/runner.js +392 -0
- package/dist/playbook/store.js +166 -0
- package/dist/playbook/types.js +4 -0
- package/dist/runtime/accessibility-adapter.js +377 -0
- package/dist/runtime/app-adapter.js +48 -0
- package/dist/runtime/applescript-adapter.js +283 -0
- package/dist/runtime/ax-role-map.js +80 -0
- package/dist/runtime/browser-adapter.js +36 -0
- package/dist/runtime/cdp-chrome-adapter.js +505 -0
- package/dist/runtime/composite-adapter.js +205 -0
- package/dist/runtime/executor.js +250 -0
- package/dist/runtime/locator-cache.js +12 -0
- package/dist/runtime/planning-loop.js +47 -0
- package/dist/runtime/service.js +372 -0
- package/dist/runtime/session-manager.js +28 -0
- package/dist/runtime/state-observer.js +105 -0
- package/dist/runtime/vision-adapter.js +208 -0
- package/dist/scripts/codex-monitor-daemon.js +335 -0
- package/dist/scripts/supervisor-daemon.js +272 -0
- package/dist/scripts/worker-daemon.js +228 -0
- package/dist/src/agent/cli.js +82 -0
- package/dist/src/agent/loop.js +274 -0
- package/{src/config.ts → dist/src/config.js} +5 -10
- package/{src/index.ts → dist/src/index.js} +32 -52
- package/dist/src/jobs/manager.js +237 -0
- package/dist/src/jobs/runner.js +683 -0
- package/dist/src/jobs/store.js +102 -0
- package/dist/src/jobs/types.js +30 -0
- package/dist/src/jobs/worker.js +97 -0
- package/dist/src/logging/timeline-logger.js +45 -0
- package/dist/src/mcp/mcp-stdio-server.js +464 -0
- package/dist/src/mcp/server.js +363 -0
- package/dist/src/mcp-entry.js +60 -0
- package/dist/src/memory/recall.js +170 -0
- package/dist/src/memory/research.js +104 -0
- package/dist/src/memory/seeds.js +101 -0
- package/dist/src/memory/service.js +421 -0
- package/dist/src/memory/session.js +169 -0
- package/dist/src/memory/store.js +422 -0
- package/dist/src/memory/types.js +17 -0
- package/dist/src/monitor/codex-monitor.js +382 -0
- package/dist/src/monitor/task-queue.js +97 -0
- package/dist/src/monitor/types.js +62 -0
- package/dist/src/native/bridge-client.js +190 -0
- package/{src/native/macos-bridge-client.ts → dist/src/native/macos-bridge-client.js} +0 -1
- package/dist/src/playbook/engine.js +201 -0
- package/dist/src/playbook/index.js +20 -0
- package/dist/src/playbook/recorder.js +535 -0
- package/dist/src/playbook/runner.js +408 -0
- package/dist/src/playbook/store.js +183 -0
- package/dist/src/playbook/types.js +17 -0
- package/dist/src/runtime/accessibility-adapter.js +393 -0
- package/dist/src/runtime/app-adapter.js +64 -0
- package/dist/src/runtime/applescript-adapter.js +299 -0
- package/dist/src/runtime/ax-role-map.js +96 -0
- package/dist/src/runtime/browser-adapter.js +52 -0
- package/dist/src/runtime/cdp-chrome-adapter.js +521 -0
- package/dist/src/runtime/composite-adapter.js +221 -0
- package/dist/src/runtime/execution-contract.js +159 -0
- package/dist/src/runtime/executor.js +266 -0
- package/{src/runtime/locator-cache.ts → dist/src/runtime/locator-cache.js} +10 -15
- package/dist/src/runtime/planning-loop.js +63 -0
- package/dist/src/runtime/service.js +388 -0
- package/dist/src/runtime/session-manager.js +60 -0
- package/dist/src/runtime/state-observer.js +121 -0
- package/dist/src/runtime/vision-adapter.js +224 -0
- package/dist/src/supervisor/locks.js +186 -0
- package/dist/src/supervisor/supervisor.js +403 -0
- package/dist/src/supervisor/types.js +30 -0
- package/dist/src/test-mcp-protocol.js +154 -0
- package/dist/src/types.js +17 -0
- package/dist/src/util/atomic-write.js +118 -0
- package/dist/test-mcp-protocol.js +138 -0
- package/dist/types.js +1 -0
- package/package.json +18 -4
- package/.claude/commands/automate.md +0 -28
- package/.claude/commands/debug-ui.md +0 -19
- package/.claude/commands/screenshot.md +0 -15
- package/.github/FUNDING.yml +0 -1
- package/.github/ISSUE_TEMPLATE/bug_report.md +0 -27
- package/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
- package/.mcp.json +0 -8
- package/DESKTOP_MCP_GUIDE.md +0 -92
- package/SECURITY.md +0 -44
- package/docs/architecture.md +0 -47
- package/install-skills.sh +0 -19
- package/mcp-bridge.ts +0 -271
- package/mcp-desktop.ts +0 -1221
- package/native/macos-bridge/Package.swift +0 -21
- package/native/macos-bridge/Sources/AccessibilityBridge.swift +0 -261
- package/native/macos-bridge/Sources/AppManagement.swift +0 -129
- package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +0 -242
- package/native/macos-bridge/Sources/ObserverBridge.swift +0 -120
- package/native/macos-bridge/Sources/VisionBridge.swift +0 -80
- package/native/macos-bridge/Sources/main.swift +0 -345
- package/native/windows-bridge/AppManagement.cs +0 -234
- package/native/windows-bridge/InputBridge.cs +0 -436
- package/native/windows-bridge/Program.cs +0 -265
- package/native/windows-bridge/ScreenCapture.cs +0 -329
- package/native/windows-bridge/UIAutomationBridge.cs +0 -571
- package/native/windows-bridge/WindowsBridge.csproj +0 -17
- package/playbooks/devpost.json +0 -186
- package/playbooks/instagram.json +0 -41
- package/playbooks/instagram_v2.json +0 -201
- package/playbooks/x_v1.json +0 -211
- package/scripts/devpost-live-loop.mjs +0 -421
- package/src/logging/timeline-logger.ts +0 -55
- package/src/mcp/server.ts +0 -449
- package/src/memory/recall.ts +0 -191
- package/src/memory/research.ts +0 -146
- package/src/memory/seeds.ts +0 -123
- package/src/memory/session.ts +0 -201
- package/src/memory/store.ts +0 -434
- package/src/memory/types.ts +0 -69
- package/src/native/bridge-client.ts +0 -239
- package/src/runtime/accessibility-adapter.ts +0 -487
- package/src/runtime/app-adapter.ts +0 -169
- package/src/runtime/applescript-adapter.ts +0 -376
- package/src/runtime/ax-role-map.ts +0 -102
- package/src/runtime/browser-adapter.ts +0 -129
- package/src/runtime/cdp-chrome-adapter.ts +0 -676
- package/src/runtime/composite-adapter.ts +0 -274
- package/src/runtime/executor.ts +0 -396
- package/src/runtime/planning-loop.ts +0 -81
- package/src/runtime/service.ts +0 -448
- package/src/runtime/session-manager.ts +0 -50
- package/src/runtime/state-observer.ts +0 -136
- package/src/runtime/vision-adapter.ts +0 -297
- package/src/types.ts +0 -297
- package/tests/bridge-client.test.ts +0 -176
- package/tests/browser-stealth.test.ts +0 -210
- package/tests/composite-adapter.test.ts +0 -64
- package/tests/mcp-server.test.ts +0 -151
- package/tests/memory-recall.test.ts +0 -339
- package/tests/memory-research.test.ts +0 -159
- package/tests/memory-seeds.test.ts +0 -120
- package/tests/memory-store.test.ts +0 -392
- package/tests/types.test.ts +0 -92
- package/tsconfig.check.json +0 -17
- package/tsconfig.json +0 -19
- package/vitest.config.ts +0 -8
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
// Copyright (C) 2025 Clazro Technology Private Limited
|
|
2
|
+
// SPDX-License-Identifier: AGPL-3.0-only
|
|
3
|
+
//
|
|
4
|
+
// This file is part of ScreenHand.
|
|
5
|
+
//
|
|
6
|
+
// ScreenHand is free software: you can redistribute it and/or modify
|
|
7
|
+
// it under the terms of the GNU Affero General Public License as
|
|
8
|
+
// published by the Free Software Foundation, version 3.
|
|
9
|
+
//
|
|
10
|
+
// ScreenHand is distributed in the hope that it will be useful,
|
|
11
|
+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
+
// GNU Affero General Public License for more details.
|
|
14
|
+
//
|
|
15
|
+
// You should have received a copy of the GNU Affero General Public License
|
|
16
|
+
// along with ScreenHand. If not, see <https://www.gnu.org/licenses/>.
|
|
17
|
+
/**
|
|
18
|
+
* Atomic file writes — temp file + rename to prevent corruption on crash.
|
|
19
|
+
*
|
|
20
|
+
* Also provides corrupt-file recovery: if the primary file is unreadable,
|
|
21
|
+
* falls back to the `.bak` backup created on the previous successful write.
|
|
22
|
+
*/
|
|
23
|
+
import fs from "node:fs";
|
|
24
|
+
import path from "node:path";
|
|
25
|
+
import crypto from "node:crypto";
|
|
26
|
+
/**
 * Write `data` to `filePath` atomically (synchronous).
 *
 * The content is written to a uniquely named hidden temp file in the same
 * directory as the target, flushed to disk with fsync, and then renamed over
 * the target. On POSIX, rename is atomic within the same filesystem, so
 * readers always see either the old or the new content. The fsync before the
 * rename ensures the rename never publishes a temp file whose bytes are still
 * only in the OS write cache.
 *
 * Before the rename, the current target (if any) is copied to
 * `<filePath>.bak` on a best-effort basis, keeping a single previous version
 * for recovery.
 *
 * @param {string} filePath Destination path.
 * @param {*} data Content to write; anything `fs.writeFileSync` accepts.
 * @throws Rethrows any error from creating, writing, flushing or renaming the
 *   temp file. The temp file is removed (best-effort) before rethrowing.
 */
export function writeFileAtomicSync(filePath, data) {
    const dir = path.dirname(filePath);
    const tmp = path.join(dir, `.${path.basename(filePath)}.${crypto.randomBytes(4).toString("hex")}.tmp`);
    try {
        // Write through a file descriptor so the data can be fsync'ed
        // before the rename makes it visible under the final name.
        const fd = fs.openSync(tmp, "w", 0o644);
        try {
            fs.writeFileSync(fd, data);
            fs.fsyncSync(fd);
        }
        finally {
            fs.closeSync(fd);
        }
        // Back up current file before overwriting. This is deliberately
        // best-effort: a missing target (first write) or an unwritable
        // backup must not prevent the new content from being saved.
        try {
            fs.copyFileSync(filePath, filePath + ".bak");
        }
        catch {
            // No existing file to back up (or backup not possible) — fine
        }
        fs.renameSync(tmp, filePath);
    }
    catch (err) {
        // Clean up temp file on failure
        try {
            fs.unlinkSync(tmp);
        }
        catch { /* ignore */ }
        throw err;
    }
}
|
|
56
|
+
/**
 * Async (callback-style) variant of the atomic write.
 *
 * Uses the same temp-file + rename approach: the data is written to a hidden
 * temp file next to the target, flushed with fsync, and renamed over the
 * target. The write, fsync and rename are non-blocking; only the best-effort
 * `.bak` copy of the current target and temp-file cleanup are synchronous.
 *
 * @param {string} filePath Destination path.
 * @param {*} data Content to write; anything `fs.writeFile` accepts.
 * @param {(err: Error | null) => void} callback Called exactly once, with the
 *   first error encountered, or with the (null) rename result on success.
 */
export function writeFileAtomic(filePath, data, callback) {
    const dir = path.dirname(filePath);
    const tmp = path.join(dir, `.${path.basename(filePath)}.${crypto.randomBytes(4).toString("hex")}.tmp`);
    // Best-effort removal of the temp file; it may not exist yet.
    const discardTmp = () => {
        try {
            fs.unlinkSync(tmp);
        }
        catch { /* ignore */ }
    };
    const failWith = (err) => {
        discardTmp();
        callback(err);
    };
    fs.open(tmp, "w", 0o644, (openErr, fd) => {
        if (openErr) {
            failWith(openErr);
            return;
        }
        fs.writeFile(fd, data, (writeErr) => {
            if (writeErr) {
                // Release the descriptor before reporting the write failure.
                fs.close(fd, () => failWith(writeErr));
                return;
            }
            // Flush to disk before the rename publishes the file.
            fs.fsync(fd, (syncErr) => {
                fs.close(fd, (closeErr) => {
                    const flushErr = syncErr || closeErr;
                    if (flushErr) {
                        failWith(flushErr);
                        return;
                    }
                    // Back up current file (best-effort, sync is fine for a copy)
                    try {
                        fs.copyFileSync(filePath, filePath + ".bak");
                    }
                    catch { /* ignore */ }
                    fs.rename(tmp, filePath, (renameErr) => {
                        if (renameErr) {
                            discardTmp();
                        }
                        callback(renameErr);
                    });
                });
            });
        });
    });
}
|
|
88
|
+
/**
 * Read a JSON file with corrupt-file recovery.
 *
 * If the primary file cannot be read or parsed, tries the `.bak` backup.
 * When the backup is usable it is copied back over the primary (best-effort)
 * so the next read is fast.
 *
 * A primary file that parses successfully is always returned as-is, even
 * when its content is the JSON literal `null`; only a read/parse failure
 * triggers the fallback.
 *
 * @param {string} filePath Path of the primary JSON file.
 * @returns {*} The parsed value, or null if both files are unreadable.
 */
export function readJsonWithRecovery(filePath) {
    // Try primary file
    const primary = tryParseJsonFile(filePath);
    if (primary.ok)
        return primary.value;
    // Primary is missing or corrupt — try backup
    const backupPath = filePath + ".bak";
    const backup = tryParseJsonFile(backupPath);
    if (!backup.ok)
        return null;
    // Restore backup as primary so next read is fast
    try {
        fs.copyFileSync(backupPath, filePath);
    }
    catch { /* ignore */ }
    return backup.value;
}
/**
 * Read and parse one JSON file without throwing.
 *
 * Returns an explicit result object so that a successfully parsed `null`
 * can be told apart from a read or parse failure.
 *
 * @param {string} filePath File to read as UTF-8 JSON.
 * @returns {{ ok: boolean, value: * }} `ok` is true with the parsed `value`
 *   on success; `ok` is false (value null) if reading or parsing failed.
 */
function tryParseJsonFile(filePath) {
    try {
        const data = fs.readFileSync(filePath, "utf-8");
        return { ok: true, value: JSON.parse(data) };
    }
    catch {
        return { ok: false, value: null };
    }
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import { createInterface } from "node:readline";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
// Resolve everything relative to this file: the project root is its parent
// directory, and the server is launched through the project-local tsx binary.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const projectRoot = path.resolve(__dirname, "..");
const tsxBin = path.join(projectRoot, "node_modules", ".bin", "tsx");
// Upper bound, in milliseconds, that readResponse() waits for one line.
const TIMEOUT_MS = 10_000;
// Launch the MCP entry point with the placeholder adapter selected and all
// three stdio streams piped back to this script.
const serverEntry = path.join(projectRoot, "src/mcp-entry.ts");
const childEnv = { ...process.env, SCREENHAND_ADAPTER: "placeholder" };
const proc = spawn(tsxBin, [serverEntry], {
    stdio: ["pipe", "pipe", "pipe"],
    env: childEnv,
    cwd: projectRoot,
});
// Accumulate the child's stderr so a timeout can report its tail.
let stderrBuf = "";
proc.stderr.on("data", (chunk) => {
    stderrBuf += chunk.toString();
});
|
|
16
|
+
// MCP SDK v1.27 uses newline-delimited JSON (NDJSON), not Content-Length framing
/**
 * Serialize one message as a single JSON line and write it to the child
 * process's stdin.
 *
 * @param {*} msg Message to serialize with JSON.stringify.
 */
function send(msg) {
    const frame = `${JSON.stringify(msg)}\n`;
    proc.stdin.write(frame);
}
|
|
20
|
+
// Line-oriented reader over the child's stdout.
const rl = createInterface({ input: proc.stdout });
// Lines that arrived while nobody was waiting, oldest first.
const lineQueue = [];
// Callback of the single pending reader, or null when none is waiting.
let lineWaiter = null;
rl.on("line", (line) => {
    if (!lineWaiter) {
        // Nobody is waiting: buffer the line for a later read.
        lineQueue.push(line);
        return;
    }
    // Hand the line to the pending reader, clearing the slot first so the
    // reader is invoked at most once.
    const pending = lineWaiter;
    lineWaiter = null;
    pending(line);
});
|
|
33
|
+
/**
 * Resolve with the next stdout line from the server, parsed as JSON.
 *
 * A line already buffered in `lineQueue` is consumed immediately; otherwise
 * the function registers itself as `lineWaiter` and waits up to TIMEOUT_MS.
 * Lines are returned in arrival order and are not matched against a
 * JSON-RPC id.
 *
 * @returns {Promise<*>} The parsed message. Rejects on timeout (including
 *   the tail of the captured stderr) or when the line is not valid JSON.
 */
function readResponse() {
    return new Promise((resolve, reject) => {
        // Parse inside a try/catch so a malformed line rejects this promise
        // instead of throwing out of the readline "line" event handler.
        const settle = (line) => {
            try {
                resolve(JSON.parse(line));
            }
            catch {
                reject(new Error(`Invalid JSON from server: ${line.slice(0, 200)}`));
            }
        };
        // Check the queue by length so an empty buffered line is not
        // mistaken for "nothing queued".
        if (lineQueue.length > 0) {
            settle(lineQueue.shift());
            return;
        }
        // Nothing buffered: arm the timeout only now that we actually wait.
        const timer = setTimeout(() => {
            lineWaiter = null;
            reject(new Error(`Timeout. stderr: ${stderrBuf.slice(-300)}`));
        }, TIMEOUT_MS);
        lineWaiter = (line) => {
            clearTimeout(timer);
            settle(line);
        };
    });
}
|
|
53
|
+
/**
 * Abort the test run: print the failure to stderr, kill the spawned server
 * process and exit this script with status 1. Never returns.
 *
 * @param {*} msg Failure description, printed after the "FAIL:" prefix.
 */
function fail(msg) {
    const prefix = "FAIL:";
    console.error(prefix, msg);
    // Stop the child before exiting this process.
    proc.kill();
    process.exit(1);
}
|
|
58
|
+
// End-to-end protocol smoke test: initialize, list tools, start a session
// and call app_list against the spawned server. Any failed step goes through
// fail(), which exits with status 1; reaching the end exits with status 0.
try {
    // Wait for server to start
    await new Promise((r) => setTimeout(r, 2000));
    // 1. Initialize
    console.log("Sending initialize...");
    send({
        jsonrpc: "2.0",
        id: 1,
        method: "initialize",
        params: {
            protocolVersion: "2024-11-05",
            capabilities: {},
            clientInfo: { name: "test-client", version: "1.0" },
        },
    });
    const initResp = await readResponse();
    const initResult = initResp.result;
    if (!initResult)
        fail(`No init result: ${JSON.stringify(initResp)}`);
    console.log("=== Initialize ===");
    console.log(` Protocol: ${initResult.protocolVersion}`);
    console.log(` Server: ${JSON.stringify(initResult.serverInfo)}`);
    // 2. Send initialized notification
    send({ jsonrpc: "2.0", method: "notifications/initialized" });
    // Short pause after the notification (it has no response to wait on).
    await new Promise((r) => setTimeout(r, 300));
    // 3. List tools
    console.log("\nListing tools...");
    send({ jsonrpc: "2.0", id: 2, method: "tools/list", params: {} });
    const toolsResp = await readResponse();
    const toolsResult = toolsResp.result;
    if (!toolsResult)
        fail(`No tools result: ${JSON.stringify(toolsResp)}`);
    const tools = toolsResult.tools ?? [];
    console.log("=== Tools ===");
    for (const tool of tools) {
        console.log(` ${tool.name}: ${(tool.description ?? "").slice(0, 70)}`);
    }
    console.log(`\n Total: ${tools.length} tools`);
    // The message states the threshold that is actually enforced.
    if (tools.length < 10)
        fail(`Expected at least 10 tools, got ${tools.length}`);
    // 4. Test session_start
    console.log("\nCalling session_start...");
    send({
        jsonrpc: "2.0",
        id: 3,
        method: "tools/call",
        params: { name: "session_start", arguments: {} },
    });
    const sessionResp = await readResponse();
    const sessionResult = sessionResp.result;
    if (!sessionResult)
        fail(`No session result: ${JSON.stringify(sessionResp)}`);
    const sessionContent = sessionResult.content;
    // The tool result carries its payload as JSON text in the first content item.
    const sessionData = JSON.parse(sessionContent?.[0]?.text ?? "{}");
    console.log("=== session_start ===");
    console.log(` Session ID: ${sessionData.sessionId}`);
    console.log(` Profile: ${sessionData.profile}`);
    if (!sessionData.sessionId)
        fail("No sessionId returned");
    // 5. Test app_list (should work with placeholder)
    console.log("\nCalling app_list...");
    send({
        jsonrpc: "2.0",
        id: 4,
        method: "tools/call",
        params: { name: "app_list", arguments: { sessionId: sessionData.sessionId } },
    });
    const appResp = await readResponse();
    const appResult = appResp.result;
    // Like the earlier steps, a response without a result is a failure;
    // otherwise the run would report success on a JSON-RPC error.
    if (!appResult)
        fail(`No app_list result: ${JSON.stringify(appResp)}`);
    console.log("=== app_list ===");
    const appContent = appResult?.content;
    const isError = appResult?.isError;
    console.log(` isError: ${isError ?? false}`);
    console.log(` Response: ${(appContent?.[0]?.text ?? "").slice(0, 100)}`);
    proc.kill();
    console.log("\nAll tests passed!");
    process.exit(0);
}
catch (e) {
    fail(e instanceof Error ? e.message : String(e));
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/package.json
CHANGED
|
@@ -1,18 +1,32 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "screenhand",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"mcpName": "io.github.manushi4/screenhand",
|
|
4
5
|
"description": "Give AI eyes and hands on your desktop. ScreenHand is an open-source MCP server that lets Claude and other AI agents see your screen, click buttons, type text, and control any app on macOS and Windows.",
|
|
5
6
|
"homepage": "https://screenhand.com",
|
|
6
7
|
"type": "module",
|
|
8
|
+
"bin": {
|
|
9
|
+
"screenhand": "dist/mcp-desktop.js",
|
|
10
|
+
"screenhand-agent": "dist/src/agent/cli.js"
|
|
11
|
+
},
|
|
12
|
+
"files": [
|
|
13
|
+
"dist",
|
|
14
|
+
"README.md",
|
|
15
|
+
"LICENSE"
|
|
16
|
+
],
|
|
7
17
|
"scripts": {
|
|
8
|
-
"dev": "tsx
|
|
18
|
+
"dev": "tsx mcp-desktop.ts",
|
|
19
|
+
"dev:modular": "tsx src/mcp-entry.ts",
|
|
9
20
|
"build": "tsc -p tsconfig.json",
|
|
10
21
|
"check": "tsc --noEmit -p tsconfig.check.json",
|
|
11
|
-
"start": "node dist/
|
|
22
|
+
"start": "node dist/mcp-desktop.js",
|
|
23
|
+
"agent": "tsx src/agent/cli.ts",
|
|
12
24
|
"build:native": "cd native/macos-bridge && swift build -c release",
|
|
13
25
|
"build:native:windows": "cd native/windows-bridge && dotnet build -c Release",
|
|
14
26
|
"test": "vitest run",
|
|
15
|
-
"test:watch": "vitest"
|
|
27
|
+
"test:watch": "vitest",
|
|
28
|
+
"codex:monitor": "tsx scripts/codex-monitor-daemon.ts",
|
|
29
|
+
"codex:watch": "node scripts/vscode-codex-watch.mjs"
|
|
16
30
|
},
|
|
17
31
|
"repository": {
|
|
18
32
|
"type": "git",
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
Automate a desktop workflow described by the user.
|
|
2
|
-
|
|
3
|
-
The user will describe what they want done: $ARGUMENTS
|
|
4
|
-
|
|
5
|
-
Plan and execute the workflow step by step using the desktop automation MCP tools:
|
|
6
|
-
|
|
7
|
-
## Planning
|
|
8
|
-
1. Break the task into discrete steps
|
|
9
|
-
2. Identify which apps are involved (`apps`, `windows`)
|
|
10
|
-
3. For each step, pick the FASTEST approach — try them in this order:
|
|
11
|
-
- **Accessibility (FASTEST — always try first)**: `ui_tree` → `ui_find` → `ui_press` / `ui_set_value`. ~50ms per action, no screenshots.
|
|
12
|
-
- **Keyboard shortcuts**: `key` for known shortcuts (cmd+s, cmd+c, etc.) — instant
|
|
13
|
-
- **AppleScript**: `applescript` for scriptable apps (Finder, Mail, Notes) — fast
|
|
14
|
-
- **Chrome CDP**: `browser_dom` → `browser_click` / `browser_type` — direct DOM, no vision
|
|
15
|
-
- **Visual (LAST RESORT only)**: `screenshot` → `click_text` — slow, only when Accessibility can't see the element (canvas, games, images)
|
|
16
|
-
|
|
17
|
-
IMPORTANT: Do NOT use screenshot/OCR/click_text to interact with standard UI elements. Use ui_tree + ui_press instead — it's 10x faster and more reliable.
|
|
18
|
-
|
|
19
|
-
## Execution
|
|
20
|
-
- Execute each step, verifying success before moving to the next
|
|
21
|
-
- After key actions, use `screenshot` or `ui_tree` to confirm the expected state
|
|
22
|
-
- If a step fails, try an alternative approach before giving up
|
|
23
|
-
- Report progress as you go
|
|
24
|
-
|
|
25
|
-
## Completion
|
|
26
|
-
- Summarize what was done
|
|
27
|
-
- Note any steps that required fallbacks
|
|
28
|
-
- Flag anything that didn't work as expected
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
Inspect and debug the UI structure of an app.
|
|
2
|
-
|
|
3
|
-
1. Use `apps` to list running applications
|
|
4
|
-
2. If the user specified an app name ($ARGUMENTS), find its PID. Otherwise use the frontmost app.
|
|
5
|
-
3. Use `focus` to bring the app to the front
|
|
6
|
-
4. Use `ui_tree` with the app's PID to get the full Accessibility tree
|
|
7
|
-
5. Use `windows` to get the window bounds
|
|
8
|
-
|
|
9
|
-
Then analyze and report:
|
|
10
|
-
- App name and bundle ID
|
|
11
|
-
- Window hierarchy and layout
|
|
12
|
-
- Interactive elements (buttons, text fields, menus) with their states (enabled/disabled, value)
|
|
13
|
-
- Navigation structure
|
|
14
|
-
- Any elements that look broken or inaccessible
|
|
15
|
-
- Suggested selectors for automating key actions (titles to use with `ui_press`, `ui_find`)
|
|
16
|
-
|
|
17
|
-
Format as a structured report with sections.
|
|
18
|
-
|
|
19
|
-
$ARGUMENTS
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
Take a screenshot of the current screen and describe what you see.
|
|
2
|
-
|
|
3
|
-
1. Use the `screenshot` MCP tool to capture the screen and OCR it
|
|
4
|
-
2. Use `apps` to identify which apps are running
|
|
5
|
-
3. Use `windows` to see window positions
|
|
6
|
-
|
|
7
|
-
Then provide a clear summary:
|
|
8
|
-
- What apps are visible
|
|
9
|
-
- What the user appears to be doing
|
|
10
|
-
- Key UI elements and text on screen
|
|
11
|
-
- Any notable state (dialogs open, errors visible, etc.)
|
|
12
|
-
|
|
13
|
-
If the user provides an app name as argument, focus on that app: use `focus` to bring it forward first, then screenshot.
|
|
14
|
-
|
|
15
|
-
$ARGUMENTS
|
package/.github/FUNDING.yml
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
github: manushi4
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: Bug Report
|
|
3
|
-
about: Report a bug in ScreenHand
|
|
4
|
-
title: "[Bug] "
|
|
5
|
-
labels: bug
|
|
6
|
-
---
|
|
7
|
-
|
|
8
|
-
**Platform**
|
|
9
|
-
- [ ] macOS
|
|
10
|
-
- [ ] Windows
|
|
11
|
-
|
|
12
|
-
**Describe the bug**
|
|
13
|
-
A clear description of what went wrong.
|
|
14
|
-
|
|
15
|
-
**To reproduce**
|
|
16
|
-
1. Tool called: `...`
|
|
17
|
-
2. Parameters: `...`
|
|
18
|
-
3. Error/unexpected output: `...`
|
|
19
|
-
|
|
20
|
-
**Expected behavior**
|
|
21
|
-
What you expected to happen.
|
|
22
|
-
|
|
23
|
-
**Environment**
|
|
24
|
-
- OS version:
|
|
25
|
-
- Node.js version:
|
|
26
|
-
- ScreenHand version:
|
|
27
|
-
- AI client (Claude Desktop / Claude Code / Cursor / other):
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: Feature Request
|
|
3
|
-
about: Suggest a new tool or improvement for ScreenHand
|
|
4
|
-
title: "[Feature] "
|
|
5
|
-
labels: enhancement
|
|
6
|
-
---
|
|
7
|
-
|
|
8
|
-
**What problem does this solve?**
|
|
9
|
-
Describe the use case.
|
|
10
|
-
|
|
11
|
-
**Proposed solution**
|
|
12
|
-
How should it work? What MCP tool name/parameters would you expect?
|
|
13
|
-
|
|
14
|
-
**Alternatives considered**
|
|
15
|
-
Any workarounds you've tried.
|
|
16
|
-
|
|
17
|
-
**Platform**
|
|
18
|
-
- [ ] macOS
|
|
19
|
-
- [ ] Windows
|
|
20
|
-
- [ ] Both
|
package/.mcp.json
DELETED
package/DESKTOP_MCP_GUIDE.md
DELETED
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
# ScreenHand MCP — Usage Guide
|
|
2
|
-
|
|
3
|
-
You have access to the ScreenHand MCP server that can control any macOS/Windows application and Chrome browser. Use it for app debugging, design inspection, UI testing, and automation.
|
|
4
|
-
|
|
5
|
-
## Quick Reference
|
|
6
|
-
|
|
7
|
-
### How to discover what's on screen
|
|
8
|
-
1. `apps` → get running apps with PIDs
|
|
9
|
-
2. `windows` → get window IDs and positions
|
|
10
|
-
3. `ui_tree(pid, maxDepth)` → get full UI structure instantly (50ms, no OCR)
|
|
11
|
-
4. `screenshot(windowId)` → capture + OCR a window (600ms, use when you need visual text)
|
|
12
|
-
|
|
13
|
-
### How to interact with native macOS apps (Finder, Notes, Xcode, etc.)
|
|
14
|
-
- **Read UI structure**: `ui_tree(pid=1234, maxDepth=4)` — returns every button, menu, text field with positions
|
|
15
|
-
- **Find element**: `ui_find(pid=1234, title="Save")` — find by text
|
|
16
|
-
- **Click element**: `ui_press(pid=1234, title="Save")` — click by accessibility
|
|
17
|
-
- **Click menu**: `menu_click(pid=1234, menuPath="File/Save As...")` — click any menu item
|
|
18
|
-
- **Set text**: `ui_set_value(pid=1234, title="Search", value="hello")`
|
|
19
|
-
- **Key combo**: `key(combo="cmd+s")` or `key(combo="cmd+shift+n")`
|
|
20
|
-
|
|
21
|
-
### How to interact with Chrome/web pages (FAST — use this, not OCR)
|
|
22
|
-
- **List tabs**: `browser_tabs()`
|
|
23
|
-
- **Open URL**: `browser_open(url="https://example.com")`
|
|
24
|
-
- **Run JS**: `browser_js(code="document.title")` — execute any JavaScript, returns result
|
|
25
|
-
- **Query DOM**: `browser_dom(selector="button.primary")` — find elements with text, positions, attributes
|
|
26
|
-
- **Click element**: `browser_click(selector="#submit-btn")`
|
|
27
|
-
- **Type in input**: `browser_type(selector="input[name=search]", text="hello")`
|
|
28
|
-
- **Wait for load**: `browser_wait(condition="document.querySelector('.results')")`
|
|
29
|
-
- **Page content**: `browser_page_info()` — title, URL, text content
|
|
30
|
-
|
|
31
|
-
### How to use AppleScript
|
|
32
|
-
- `applescript(script='tell application "Finder" to get name of every file of desktop')`
|
|
33
|
-
- `applescript(script='tell application "Safari" to set URL of current tab of front window to "https://google.com"')`
|
|
34
|
-
|
|
35
|
-
## Rules & Best Practices
|
|
36
|
-
|
|
37
|
-
### Speed hierarchy — always prefer the fastest method:
|
|
38
|
-
1. **Accessibility (ui_tree, ui_press, menu_click)** — 50ms, structured, reliable. Use for ALL native app interactions.
|
|
39
|
-
2. **CDP (browser_js, browser_dom, browser_click)** — 10ms, structured. Use for ALL web/browser interactions.
|
|
40
|
-
3. **AppleScript** — 50ms, for app-specific scripting (Finder files, Safari URLs, Mail compose).
|
|
41
|
-
4. **OCR (screenshot, click_text)** — 600ms, last resort. Only use when AX and CDP aren't available.
|
|
42
|
-
|
|
43
|
-
### For app debugging:
|
|
44
|
-
- Start with `apps` to find the PID
|
|
45
|
-
- Use `ui_tree(pid, maxDepth=4)` to see the full UI hierarchy — every button, text field, label, with positions
|
|
46
|
-
- Use `ui_tree(pid, maxDepth=6)` for deep inspection of complex views
|
|
47
|
-
- Use `screenshot(windowId)` only when you need to see actual rendered text/images
|
|
48
|
-
|
|
49
|
-
### For design inspection:
|
|
50
|
-
- `screenshot_file(windowId)` returns the image path — you can read it to see the actual design
|
|
51
|
-
- `ui_tree` shows the component structure (like React DevTools but for any app)
|
|
52
|
-
- `browser_dom(selector="*")` with limit shows the DOM tree of any web page
|
|
53
|
-
- `browser_js` can extract computed styles: `getComputedStyle(el).color`
|
|
54
|
-
|
|
55
|
-
### For web app debugging:
|
|
56
|
-
- Use `browser_js` to run any debugging code — console.log, inspect state, check network
|
|
57
|
-
- Use `browser_dom` to find elements and their properties
|
|
58
|
-
- Use `browser_wait` before interacting with dynamic content
|
|
59
|
-
- Chain: `browser_navigate` → `browser_wait` → `browser_dom` → `browser_click`
|
|
60
|
-
|
|
61
|
-
### Common patterns:
|
|
62
|
-
|
|
63
|
-
**Debug a native app's UI:**
|
|
64
|
-
```
|
|
65
|
-
apps → find pid
|
|
66
|
-
ui_tree(pid, 4) → see structure
|
|
67
|
-
ui_find(pid, "button text") → locate element
|
|
68
|
-
ui_press(pid, "button text") → interact
|
|
69
|
-
```
|
|
70
|
-
|
|
71
|
-
**Debug a web page:**
|
|
72
|
-
```
|
|
73
|
-
browser_tabs → find the tab
|
|
74
|
-
browser_dom("main", tabId) → see page structure
|
|
75
|
-
browser_js("document.querySelector('.error')?.textContent", tabId) → inspect
|
|
76
|
-
```
|
|
77
|
-
|
|
78
|
-
**Automate a flow:**
|
|
79
|
-
```
|
|
80
|
-
launch(bundleId) → open app
|
|
81
|
-
ui_tree(pid, 3) → understand layout
|
|
82
|
-
menu_click(pid, "File/New") → trigger action
|
|
83
|
-
ui_set_value(pid, "Name", "test") → fill form
|
|
84
|
-
ui_press(pid, "Save") → submit
|
|
85
|
-
```
|
|
86
|
-
|
|
87
|
-
### Important notes:
|
|
88
|
-
- Chrome must be running with `--remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug` for browser_* tools
|
|
89
|
-
- PIDs change when apps restart — always call `apps` first to get current PIDs
|
|
90
|
-
- Window IDs change when windows are recreated — call `windows` to get current IDs
|
|
91
|
-
- `ui_tree` requires Accessibility permissions (System Settings → Privacy → Accessibility)
|
|
92
|
-
- For clicking by coordinates, use `click(x, y)` — coordinates are screen-absolute
|
package/SECURITY.md
DELETED
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
# Security Policy
|
|
2
|
-
|
|
3
|
-
## Overview
|
|
4
|
-
|
|
5
|
-
ScreenHand is a desktop automation tool with significant system access. We take security seriously.
|
|
6
|
-
|
|
7
|
-
## What ScreenHand Can Access
|
|
8
|
-
|
|
9
|
-
- **Screen content** via screenshots and OCR
|
|
10
|
-
- **UI elements** via native Accessibility APIs (macOS) / UI Automation (Windows)
|
|
11
|
-
- **Keyboard and mouse** input simulation
|
|
12
|
-
- **Chrome browser** tabs via DevTools Protocol (requires Chrome launched with debug port)
|
|
13
|
-
- **AppleScript** execution (macOS only)
|
|
14
|
-
|
|
15
|
-
## What ScreenHand Cannot Do
|
|
16
|
-
|
|
17
|
-
- ScreenHand does **not** send screen data or any information to external servers
|
|
18
|
-
- It does **not** access browser cookies, passwords, or stored credentials
|
|
19
|
-
- It does **not** run with elevated/admin privileges
|
|
20
|
-
- It does **not** modify system settings or install background services
|
|
21
|
-
- It does **not** communicate with any remote server (all operations are local)
|
|
22
|
-
|
|
23
|
-
## Permissions Required
|
|
24
|
-
|
|
25
|
-
### macOS
|
|
26
|
-
- **Accessibility permission**: System Settings > Privacy & Security > Accessibility > enable your terminal app
|
|
27
|
-
- This is a standard macOS requirement for any app that reads UI elements or simulates input
|
|
28
|
-
|
|
29
|
-
### Windows
|
|
30
|
-
- No special permissions needed — UI Automation works without admin for most applications
|
|
31
|
-
|
|
32
|
-
## Audit Logging
|
|
33
|
-
|
|
34
|
-
All tool calls are logged to `.audit-log.jsonl` with timestamps. This file is gitignored by default and stays on your machine.
|
|
35
|
-
|
|
36
|
-
## Reporting a Vulnerability
|
|
37
|
-
|
|
38
|
-
If you discover a security vulnerability, please email **security@screenhand.com** instead of opening a public issue.
|
|
39
|
-
|
|
40
|
-
We will acknowledge receipt within 48 hours and aim to provide a fix within 7 days for critical issues.
|
|
41
|
-
|
|
42
|
-
## Responsible Use
|
|
43
|
-
|
|
44
|
-
ScreenHand is designed for legitimate automation, testing, and productivity use cases. Users are responsible for ensuring their use complies with applicable laws and the terms of service of any applications they automate.
|
package/docs/architecture.md
DELETED
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
# MVP Architecture
|
|
2
|
-
|
|
3
|
-
## Design Goals
|
|
4
|
-
- Fast execution by keeping session and context persistent.
|
|
5
|
-
- Predictable completion through hard action budgets.
|
|
6
|
-
- No infinite loops: each tool call returns success or structured failure.
|
|
7
|
-
- LLM plans high-level intent; runtime handles micro-logic.
|
|
8
|
-
|
|
9
|
-
## Layers
|
|
10
|
-
1. `MCP Server Layer`
|
|
11
|
-
- Accepts tool requests (`session_start`, `navigate`, `press`, `type_into`, `wait_for`, `extract`, `screenshot`).
|
|
12
|
-
- Validates args and forwards to runtime service.
|
|
13
|
-
|
|
14
|
-
2. `Runtime Service Layer`
|
|
15
|
-
- Orchestrates session manager, executor, adapter, logging, and cache.
|
|
16
|
-
- Converts low-level errors into structured failure payloads.
|
|
17
|
-
|
|
18
|
-
3. `Executor Layer`
|
|
19
|
-
- Runs bounded state machine for action tools:
|
|
20
|
-
- locate (cached first, fallback strategy)
|
|
21
|
-
- act
|
|
22
|
-
- verify
|
|
23
|
-
- optional retry
|
|
24
|
-
- Enforces per-step time budgets.
|
|
25
|
-
|
|
26
|
-
4. `Browser Adapter Layer`
|
|
27
|
-
- Thin contract for browser operations.
|
|
28
|
-
- Current scaffold uses a placeholder adapter; it will later be replaced with a CDP or Playwright robot-mode adapter.
|
|
29
|
-
|
|
30
|
-
## Core Runtime Flow
|
|
31
|
-
1. `session_start(profile)` ensures a persistent session ID.
|
|
32
|
-
2. `navigate(url)` completes within timeout and returns url/title.
|
|
33
|
-
3. `press` / `type_into` run bounded loop with max retries.
|
|
34
|
-
4. `wait_for(condition)` waits only for explicit UI conditions.
|
|
35
|
-
5. `extract(target, format)` returns structured data.
|
|
36
|
-
6. On failure, return structured diagnostics + timings.
|
|
37
|
-
|
|
38
|
-
## Key Data Contracts
|
|
39
|
-
- `ActionBudget`: `locateMs`, `actMs`, `verifyMs`, `maxRetries`.
|
|
40
|
-
- `ActionTelemetry`: per-action timing + retry count + status.
|
|
41
|
-
- `RuntimeError`: error code, attempts, page meta, and cause.
|
|
42
|
-
|
|
43
|
-
## Next Implementation Phase
|
|
44
|
-
- Harden the current CDP adapter with richer locator heuristics and cleanup hooks.
|
|
45
|
-
- Add locator strategy expansion (role/text/selector priority + fuzzy fallback).
|
|
46
|
-
- Persist locator cache per site/action.
|
|
47
|
-
- Wire transport for actual MCP protocol endpoint.
|
package/install-skills.sh
DELETED
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
# Install ScreenHand skills globally for Claude Code
|
|
3
|
-
# Usage: ./install-skills.sh
|
|
4
|
-
|
|
5
|
-
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
6
|
-
TARGET="$HOME/.claude/commands"
|
|
7
|
-
|
|
8
|
-
mkdir -p "$TARGET"
|
|
9
|
-
|
|
10
|
-
cp "$SCRIPT_DIR/.claude/commands/screenshot.md" "$TARGET/desktop-screenshot.md"
|
|
11
|
-
cp "$SCRIPT_DIR/.claude/commands/debug-ui.md" "$TARGET/desktop-debug-ui.md"
|
|
12
|
-
cp "$SCRIPT_DIR/.claude/commands/automate.md" "$TARGET/desktop-automate.md"
|
|
13
|
-
|
|
14
|
-
echo "Installed skills to $TARGET:"
|
|
15
|
-
echo " /desktop-screenshot — capture and describe your screen"
|
|
16
|
-
echo " /desktop-debug-ui — inspect any app's UI tree"
|
|
17
|
-
echo " /desktop-automate — automate a multi-step workflow"
|
|
18
|
-
echo ""
|
|
19
|
-
echo "These are now available globally in any Claude Code session."
|