screenhand 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +458 -93
- package/dist/.audit-log.jsonl +55 -0
- package/dist/.screenhand/memory/.lock +1 -0
- package/dist/.screenhand/memory/actions.jsonl +85 -0
- package/dist/.screenhand/memory/errors.jsonl +5 -0
- package/dist/.screenhand/memory/errors.jsonl.bak +4 -0
- package/dist/.screenhand/memory/state.json +35 -0
- package/dist/.screenhand/memory/state.json.bak +35 -0
- package/dist/.screenhand/memory/strategies.jsonl +12 -0
- package/dist/agent/cli.js +73 -0
- package/dist/agent/loop.js +258 -0
- package/dist/config.js +9 -0
- package/dist/index.js +56 -0
- package/dist/logging/timeline-logger.js +29 -0
- package/dist/mcp/mcp-stdio-server.js +448 -0
- package/dist/mcp/server.js +347 -0
- package/dist/mcp-desktop.js +2731 -0
- package/dist/mcp-entry.js +59 -0
- package/dist/memory/recall.js +160 -0
- package/dist/memory/research.js +98 -0
- package/dist/memory/seeds.js +89 -0
- package/dist/memory/session.js +161 -0
- package/dist/memory/store.js +391 -0
- package/dist/memory/types.js +4 -0
- package/dist/monitor/codex-monitor.js +377 -0
- package/dist/monitor/task-queue.js +84 -0
- package/dist/monitor/types.js +49 -0
- package/dist/native/bridge-client.js +174 -0
- package/dist/native/macos-bridge-client.js +5 -0
- package/dist/npm-publish-helper.js +117 -0
- package/dist/npm-token-cdp.js +113 -0
- package/dist/npm-token-create.js +135 -0
- package/dist/npm-token-finish.js +126 -0
- package/dist/playbook/engine.js +193 -0
- package/dist/playbook/index.js +4 -0
- package/dist/playbook/recorder.js +519 -0
- package/dist/playbook/runner.js +392 -0
- package/dist/playbook/store.js +166 -0
- package/dist/playbook/types.js +4 -0
- package/dist/runtime/accessibility-adapter.js +377 -0
- package/dist/runtime/app-adapter.js +48 -0
- package/dist/runtime/applescript-adapter.js +283 -0
- package/dist/runtime/ax-role-map.js +80 -0
- package/dist/runtime/browser-adapter.js +36 -0
- package/dist/runtime/cdp-chrome-adapter.js +505 -0
- package/dist/runtime/composite-adapter.js +205 -0
- package/dist/runtime/executor.js +250 -0
- package/dist/runtime/locator-cache.js +12 -0
- package/dist/runtime/planning-loop.js +47 -0
- package/dist/runtime/service.js +372 -0
- package/dist/runtime/session-manager.js +28 -0
- package/dist/runtime/state-observer.js +105 -0
- package/dist/runtime/vision-adapter.js +208 -0
- package/dist/scripts/codex-monitor-daemon.js +335 -0
- package/dist/scripts/supervisor-daemon.js +272 -0
- package/dist/scripts/worker-daemon.js +228 -0
- package/dist/src/agent/cli.js +82 -0
- package/dist/src/agent/loop.js +274 -0
- package/{src/config.ts → dist/src/config.js} +5 -10
- package/{src/index.ts → dist/src/index.js} +32 -52
- package/dist/src/jobs/manager.js +237 -0
- package/dist/src/jobs/runner.js +683 -0
- package/dist/src/jobs/store.js +102 -0
- package/dist/src/jobs/types.js +30 -0
- package/dist/src/jobs/worker.js +97 -0
- package/dist/src/logging/timeline-logger.js +45 -0
- package/dist/src/mcp/mcp-stdio-server.js +464 -0
- package/dist/src/mcp/server.js +363 -0
- package/dist/src/mcp-entry.js +60 -0
- package/dist/src/memory/recall.js +170 -0
- package/dist/src/memory/research.js +104 -0
- package/dist/src/memory/seeds.js +101 -0
- package/dist/src/memory/service.js +421 -0
- package/dist/src/memory/session.js +169 -0
- package/dist/src/memory/store.js +422 -0
- package/dist/src/memory/types.js +17 -0
- package/dist/src/monitor/codex-monitor.js +382 -0
- package/dist/src/monitor/task-queue.js +97 -0
- package/dist/src/monitor/types.js +62 -0
- package/dist/src/native/bridge-client.js +190 -0
- package/{src/native/macos-bridge-client.ts → dist/src/native/macos-bridge-client.js} +0 -1
- package/dist/src/playbook/engine.js +201 -0
- package/dist/src/playbook/index.js +20 -0
- package/dist/src/playbook/recorder.js +535 -0
- package/dist/src/playbook/runner.js +408 -0
- package/dist/src/playbook/store.js +183 -0
- package/dist/src/playbook/types.js +17 -0
- package/dist/src/runtime/accessibility-adapter.js +393 -0
- package/dist/src/runtime/app-adapter.js +64 -0
- package/dist/src/runtime/applescript-adapter.js +299 -0
- package/dist/src/runtime/ax-role-map.js +96 -0
- package/dist/src/runtime/browser-adapter.js +52 -0
- package/dist/src/runtime/cdp-chrome-adapter.js +521 -0
- package/dist/src/runtime/composite-adapter.js +221 -0
- package/dist/src/runtime/execution-contract.js +159 -0
- package/dist/src/runtime/executor.js +266 -0
- package/{src/runtime/locator-cache.ts → dist/src/runtime/locator-cache.js} +10 -15
- package/dist/src/runtime/planning-loop.js +63 -0
- package/dist/src/runtime/service.js +388 -0
- package/dist/src/runtime/session-manager.js +60 -0
- package/dist/src/runtime/state-observer.js +121 -0
- package/dist/src/runtime/vision-adapter.js +224 -0
- package/dist/src/supervisor/locks.js +186 -0
- package/dist/src/supervisor/supervisor.js +403 -0
- package/dist/src/supervisor/types.js +30 -0
- package/dist/src/test-mcp-protocol.js +154 -0
- package/dist/src/types.js +17 -0
- package/dist/src/util/atomic-write.js +118 -0
- package/dist/test-mcp-protocol.js +138 -0
- package/dist/types.js +1 -0
- package/package.json +18 -4
- package/.claude/commands/automate.md +0 -28
- package/.claude/commands/debug-ui.md +0 -19
- package/.claude/commands/screenshot.md +0 -15
- package/.github/FUNDING.yml +0 -1
- package/.github/ISSUE_TEMPLATE/bug_report.md +0 -27
- package/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
- package/.mcp.json +0 -8
- package/DESKTOP_MCP_GUIDE.md +0 -92
- package/SECURITY.md +0 -44
- package/docs/architecture.md +0 -47
- package/install-skills.sh +0 -19
- package/mcp-bridge.ts +0 -271
- package/mcp-desktop.ts +0 -1221
- package/native/macos-bridge/Package.swift +0 -21
- package/native/macos-bridge/Sources/AccessibilityBridge.swift +0 -261
- package/native/macos-bridge/Sources/AppManagement.swift +0 -129
- package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +0 -242
- package/native/macos-bridge/Sources/ObserverBridge.swift +0 -120
- package/native/macos-bridge/Sources/VisionBridge.swift +0 -80
- package/native/macos-bridge/Sources/main.swift +0 -345
- package/native/windows-bridge/AppManagement.cs +0 -234
- package/native/windows-bridge/InputBridge.cs +0 -436
- package/native/windows-bridge/Program.cs +0 -265
- package/native/windows-bridge/ScreenCapture.cs +0 -329
- package/native/windows-bridge/UIAutomationBridge.cs +0 -571
- package/native/windows-bridge/WindowsBridge.csproj +0 -17
- package/playbooks/devpost.json +0 -186
- package/playbooks/instagram.json +0 -41
- package/playbooks/instagram_v2.json +0 -201
- package/playbooks/x_v1.json +0 -211
- package/scripts/devpost-live-loop.mjs +0 -421
- package/src/logging/timeline-logger.ts +0 -55
- package/src/mcp/server.ts +0 -449
- package/src/memory/recall.ts +0 -191
- package/src/memory/research.ts +0 -146
- package/src/memory/seeds.ts +0 -123
- package/src/memory/session.ts +0 -201
- package/src/memory/store.ts +0 -434
- package/src/memory/types.ts +0 -69
- package/src/native/bridge-client.ts +0 -239
- package/src/runtime/accessibility-adapter.ts +0 -487
- package/src/runtime/app-adapter.ts +0 -169
- package/src/runtime/applescript-adapter.ts +0 -376
- package/src/runtime/ax-role-map.ts +0 -102
- package/src/runtime/browser-adapter.ts +0 -129
- package/src/runtime/cdp-chrome-adapter.ts +0 -676
- package/src/runtime/composite-adapter.ts +0 -274
- package/src/runtime/executor.ts +0 -396
- package/src/runtime/planning-loop.ts +0 -81
- package/src/runtime/service.ts +0 -448
- package/src/runtime/session-manager.ts +0 -50
- package/src/runtime/state-observer.ts +0 -136
- package/src/runtime/vision-adapter.ts +0 -297
- package/src/types.ts +0 -297
- package/tests/bridge-client.test.ts +0 -176
- package/tests/browser-stealth.test.ts +0 -210
- package/tests/composite-adapter.test.ts +0 -64
- package/tests/mcp-server.test.ts +0 -151
- package/tests/memory-recall.test.ts +0 -339
- package/tests/memory-research.test.ts +0 -159
- package/tests/memory-seeds.test.ts +0 -120
- package/tests/memory-store.test.ts +0 -392
- package/tests/types.test.ts +0 -92
- package/tsconfig.check.json +0 -17
- package/tsconfig.json +0 -19
- package/vitest.config.ts +0 -8
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"session": {
|
|
3
|
+
"id": "s_mmg7qvctmkvv",
|
|
4
|
+
"client": "unknown",
|
|
5
|
+
"startedAt": "2026-03-07T11:01:31.325Z",
|
|
6
|
+
"lastActionAt": "2026-03-07T11:01:31.510Z"
|
|
7
|
+
},
|
|
8
|
+
"mission": {
|
|
9
|
+
"current": null,
|
|
10
|
+
"phase": "idle"
|
|
11
|
+
},
|
|
12
|
+
"health": {
|
|
13
|
+
"actionsTotal": 85,
|
|
14
|
+
"actionsFailed": 16,
|
|
15
|
+
"successRate": 0.8117647058823529,
|
|
16
|
+
"lastError": "screencapture failed with exit code 1",
|
|
17
|
+
"consecutiveErrors": 2
|
|
18
|
+
},
|
|
19
|
+
"patterns": {
|
|
20
|
+
"topWorking": [],
|
|
21
|
+
"topFailing": [],
|
|
22
|
+
"knownBlockers": [
|
|
23
|
+
"browser_tabs: Chrome not running with --remote-debugging-port. Launch with: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug",
|
|
24
|
+
"browser_open: Chrome not running with --remote-debugging-port. Launch with: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug",
|
|
25
|
+
"launch: Not found: Application with bundle ID 'com.google.Chrome' not found",
|
|
26
|
+
"focus: Not found: No running application with bundle ID 'com.google.Chrome'"
|
|
27
|
+
]
|
|
28
|
+
},
|
|
29
|
+
"policy": {
|
|
30
|
+
"maxConsecutiveErrors": 5,
|
|
31
|
+
"stallThresholdMs": 300000,
|
|
32
|
+
"escalateAfterRetries": 3,
|
|
33
|
+
"pauseBetweenActionsMs": 500
|
|
34
|
+
}
|
|
35
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"session": {
|
|
3
|
+
"id": "s_mmg7qvctmkvv",
|
|
4
|
+
"client": "unknown",
|
|
5
|
+
"startedAt": "2026-03-07T11:01:31.325Z",
|
|
6
|
+
"lastActionAt": "2026-03-07T11:01:31.325Z"
|
|
7
|
+
},
|
|
8
|
+
"mission": {
|
|
9
|
+
"current": null,
|
|
10
|
+
"phase": "idle"
|
|
11
|
+
},
|
|
12
|
+
"health": {
|
|
13
|
+
"actionsTotal": 79,
|
|
14
|
+
"actionsFailed": 12,
|
|
15
|
+
"successRate": 0.8481012658227848,
|
|
16
|
+
"lastError": null,
|
|
17
|
+
"consecutiveErrors": 0
|
|
18
|
+
},
|
|
19
|
+
"patterns": {
|
|
20
|
+
"topWorking": [],
|
|
21
|
+
"topFailing": [],
|
|
22
|
+
"knownBlockers": [
|
|
23
|
+
"browser_tabs: Chrome not running with --remote-debugging-port. Launch with: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug",
|
|
24
|
+
"browser_open: Chrome not running with --remote-debugging-port. Launch with: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug",
|
|
25
|
+
"launch: Not found: Application with bundle ID 'com.google.Chrome' not found",
|
|
26
|
+
"focus: Not found: No running application with bundle ID 'com.google.Chrome'"
|
|
27
|
+
]
|
|
28
|
+
},
|
|
29
|
+
"policy": {
|
|
30
|
+
"maxConsecutiveErrors": 5,
|
|
31
|
+
"stallThresholdMs": 300000,
|
|
32
|
+
"escalateAfterRetries": 3,
|
|
33
|
+
"pauseBetweenActionsMs": 500
|
|
34
|
+
}
|
|
35
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{"id":"seed_001","task":"Take a photo with Photo Booth","steps":[{"tool":"launch","params":{"bundleId":"com.apple.PhotoBooth"}},{"tool":"ui_press","params":{"title":"Take Photo"}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["photo","booth","camera"],"fingerprint":"launch→ui_press"}
|
|
2
|
+
{"id":"seed_002","task":"Open a URL in Chrome","steps":[{"tool":"launch","params":{"bundleId":"com.google.Chrome"}},{"tool":"browser_navigate","params":{"url":""}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["chrome","browse","url"],"fingerprint":"launch→browser_navigate"}
|
|
3
|
+
{"id":"seed_003","task":"Save current document","steps":[{"tool":"focus","params":{"bundleId":""}},{"tool":"key","params":{"combo":"cmd+s"}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["save","document"],"fingerprint":"focus→key"}
|
|
4
|
+
{"id":"seed_004","task":"Copy from one app and paste into another","steps":[{"tool":"focus","params":{"bundleId":""}},{"tool":"key","params":{"combo":"cmd+c"}},{"tool":"focus","params":{"bundleId":""}},{"tool":"key","params":{"combo":"cmd+v"}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["copy","paste"],"fingerprint":"focus→key→focus→key"}
|
|
5
|
+
{"id":"seed_005","task":"Navigate to a folder in Finder","steps":[{"tool":"focus","params":{"bundleId":"com.apple.finder"}},{"tool":"key","params":{"combo":"cmd+shift+g"}},{"tool":"type_text","params":{"text":""}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["finder","folder","navigate"],"fingerprint":"focus→key→type_text"}
|
|
6
|
+
{"id":"seed_006","task":"Create a new folder in Finder","steps":[{"tool":"focus","params":{"bundleId":"com.apple.finder"}},{"tool":"key","params":{"combo":"cmd+shift+n"}},{"tool":"type_text","params":{"text":""}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["finder","folder","create"],"fingerprint":"focus→key→type_text"}
|
|
7
|
+
{"id":"seed_007","task":"Close the current window","steps":[{"tool":"focus","params":{"bundleId":""}},{"tool":"key","params":{"combo":"cmd+w"}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["close","window"],"fingerprint":"focus→key"}
|
|
8
|
+
{"id":"seed_008","task":"Select all content and copy","steps":[{"tool":"focus","params":{"bundleId":""}},{"tool":"key","params":{"combo":"cmd+a"}},{"tool":"key","params":{"combo":"cmd+c"}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["select","all","copy"],"fingerprint":"focus→key→key"}
|
|
9
|
+
{"id":"seed_009","task":"List all running applications","steps":[{"tool":"apps","params":{}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["apps","list","running"],"fingerprint":"apps"}
|
|
10
|
+
{"id":"seed_010","task":"Inspect an app's UI element tree","steps":[{"tool":"focus","params":{"bundleId":""}},{"tool":"ui_tree","params":{"pid":0}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["inspect","tree","accessibility"],"fingerprint":"focus→ui_tree"}
|
|
11
|
+
{"id":"seed_011","task":"Open a new Chrome tab and navigate to URL","steps":[{"tool":"focus","params":{"bundleId":"com.google.Chrome"}},{"tool":"key","params":{"combo":"cmd+t"}},{"tool":"browser_navigate","params":{"url":""}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["chrome","tab","new"],"fingerprint":"focus→key→browser_navigate"}
|
|
12
|
+
{"id":"seed_012","task":"Export document as PDF","steps":[{"tool":"focus","params":{"bundleId":""}},{"tool":"menu_click","params":{"menuPath":"File/Export as PDF"}}],"totalDurationMs":0,"successCount":10,"failCount":0,"lastUsed":"2026-03-06T03:28:32.310Z","tags":["export","pdf"],"fingerprint":"focus→menu_click"}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* ScreenHand Agent CLI
|
|
4
|
+
*
|
|
5
|
+
* Run a desktop task autonomously:
|
|
6
|
+
* npx tsx src/agent/cli.ts "Open Safari and go to github.com"
|
|
7
|
+
* npx tsx src/agent/cli.ts "Create a new file in TextEdit called notes.txt"
|
|
8
|
+
*/
|
|
9
|
+
import { TimelineLogger } from "../logging/timeline-logger.js";
|
|
10
|
+
import { AutomationRuntimeService } from "../runtime/service.js";
|
|
11
|
+
import { runAgentLoop } from "./loop.js";
|
|
12
|
+
// Every CLI argument after the script name is joined into one free-form task description.
const task = process.argv.slice(2).join(" ");
if (task.length === 0) {
    // No task supplied: print usage to stderr and exit with a failure code.
    for (const line of [
        "Usage: screenhand-agent <task description>",
        "Example: screenhand-agent \"Open Safari and search for MCP protocol\"",
    ]) {
        console.error(line);
    }
    process.exit(1);
}
|
|
18
|
+
/**
 * Instantiate the automation adapter selected by the SCREENHAND_ADAPTER
 * environment variable ("placeholder", "cdp", "composite"; anything else,
 * including unset, selects the accessibility adapter).
 *
 * Adapter modules are imported lazily so only the chosen one is loaded.
 * SCREENHAND_HEADLESS === "1" is forwarded as `headless` to the cdp and
 * composite adapters.
 *
 * @returns {Promise<object>} The constructed adapter instance.
 */
async function createAdapter() {
    const kind = process.env.SCREENHAND_ADAPTER ?? "accessibility";
    if (kind === "placeholder") {
        const placeholderModule = await import("../runtime/app-adapter.js");
        return new placeholderModule.PlaceholderAppAdapter();
    }
    if (kind === "cdp") {
        const cdpModule = await import("../runtime/cdp-chrome-adapter.js");
        return new cdpModule.CdpChromeAdapter({ headless: process.env.SCREENHAND_HEADLESS === "1" });
    }
    // Both remaining adapters sit on top of the native bridge client.
    const { BridgeClient } = await import("../native/bridge-client.js");
    if (kind === "composite") {
        const { CompositeAdapter } = await import("../runtime/composite-adapter.js");
        return new CompositeAdapter(new BridgeClient(), { headless: process.env.SCREENHAND_HEADLESS === "1" });
    }
    const { AccessibilityAdapter } = await import("../runtime/accessibility-adapter.js");
    return new AccessibilityAdapter(new BridgeClient());
}
|
|
41
|
+
// Entry point: build the runtime, run the agent loop, print progress, and
// map the outcome to the process exit code (0 = task reported done, 1 otherwise).
try {
    const adapter = await createAdapter();
    const runtime = new AutomationRuntimeService(adapter, new TimelineLogger());
    const session = await runtime.sessionStart();
    console.log(`\n🔄 Task: ${task}`);
    console.log(` Session: ${session.sessionId}`);
    console.log(` Model: ${process.env.SCREENHAND_MODEL ?? "claude-sonnet-4-20250514"}\n`);
    const cliModel = process.env.SCREENHAND_MODEL;
    // Validate SCREENHAND_MAX_STEPS. A non-numeric value would otherwise parse
    // to NaN, the loop condition `i < NaN` would never hold, and the run would
    // end after zero steps with no hint as to why.
    const defaultMaxSteps = 50;
    const rawMaxSteps = process.env.SCREENHAND_MAX_STEPS;
    let maxSteps = defaultMaxSteps;
    if (rawMaxSteps !== undefined) {
        const parsedMaxSteps = Number.parseInt(rawMaxSteps, 10);
        if (Number.isInteger(parsedMaxSteps) && parsedMaxSteps > 0) {
            maxSteps = parsedMaxSteps;
        }
        else {
            console.error(`Ignoring invalid SCREENHAND_MAX_STEPS="${rawMaxSteps}"; using ${defaultMaxSteps}`);
        }
    }
    const result = await runAgentLoop(runtime, session.sessionId, task, {
        maxSteps,
        // Only pass `model` when set so the loop's own default applies otherwise.
        ...(cliModel ? { model: cliModel } : {}),
        onStep: (step) => {
            const icon = step.done ? "✅" : step.action ? "→" : "⚠️";
            console.log(` ${icon} [${step.index}] ${step.reasoning.slice(0, 100)}`);
            if (step.action && step.action.tool !== "done") {
                console.log(` ${step.action.tool}: ${JSON.stringify(step.action).slice(0, 120)}`);
            }
            if (step.result) {
                console.log(` Result: ${step.result.slice(0, 100)}`);
            }
            console.log(` (${step.durationMs}ms)\n`);
        },
    });
    console.log(`\n${"=".repeat(60)}`);
    console.log(`${result.success ? "✅ SUCCESS" : "❌ INCOMPLETE"}: ${result.summary}`);
    console.log(`Steps: ${result.steps.length} | Total: ${result.totalMs}ms`);
    console.log(`${"=".repeat(60)}\n`);
    process.exit(result.success ? 0 : 1);
}
catch (e) {
    console.error(`Fatal: ${e instanceof Error ? e.message : String(e)}`);
    process.exit(1);
}
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ScreenHand Agent Loop
|
|
3
|
+
*
|
|
4
|
+
* Continuous observe → decide → act loop powered by Claude.
|
|
5
|
+
* Uses element_tree (accessibility tree) as the primary observation — not screenshots.
|
|
6
|
+
* ~50ms per observe, ~50ms per action. Only the LLM call adds latency.
|
|
7
|
+
*/
|
|
8
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
9
|
+
/**
 * Compact AX tree representation for LLM consumption.
 * Converts the full AXNode tree into a concise text format, one node per line,
 * indented by depth (roles are lower-cased with the "AX" marker removed):
 * [button] "Send" (350,200)
 * [textfield] "Search" value="hello" (100,50)
 *
 * Unlabelled, valueless, unfocused group/splitgroup/scrollarea nodes are
 * treated as noise: they are not printed and their children are rendered at
 * the noise node's own depth.
 *
 * @param {object} node - Tree node; reads role, title, description, identifier,
 *   value, position ({x, y}), focused, enabled and children.
 * @param {number} [depth=0] - Current depth; controls indentation.
 * @param {number} [maxDepth=5] - Nodes deeper than this render as "".
 * @returns {string} Newline-joined lines for this node and its descendants.
 */
function compactTree(node, depth = 0, maxDepth = 5) {
    if (depth > maxDepth) {
        return "";
    }
    const indent = " ".repeat(depth);
    // Role — tolerate a missing role instead of throwing on `undefined.replace`.
    const role = String(node.role ?? "unknown").replace("AX", "").toLowerCase();
    // Label — prefer title, then description, then identifier
    const label = node.title || node.description || node.identifier || "";
    // Value — coerce to string first so numeric/boolean values do not crash
    // on `.slice`; empty strings and absent values are still omitted.
    const rawValue = node.value;
    const hasValue = rawValue !== undefined && rawValue !== null && rawValue !== "";
    const val = hasValue ? ` value="${String(rawValue).slice(0, 50)}"` : "";
    // Position
    const pos = node.position ? ` (${Math.round(node.position.x)},${Math.round(node.position.y)})` : "";
    // Focused/enabled markers
    const markers = [];
    if (node.focused) {
        markers.push("focused");
    }
    if (node.enabled === false) {
        markers.push("disabled");
    }
    const markerStr = markers.length ? ` [${markers.join(",")}]` : "";
    // Skip noise nodes with no useful info
    const isNoise = !label && !val && !node.focused && (role === "group" || role === "splitgroup" || role === "scrollarea");
    const parts = [];
    if (!isNoise) {
        parts.push(`${indent}[${role}] "${label}"${val}${pos}${markerStr}`);
    }
    if (Array.isArray(node.children)) {
        // Children of a skipped noise node stay at the same depth so the
        // output is not indented for containers that were never printed.
        const childDepth = isNoise ? depth : depth + 1;
        for (const child of node.children) {
            const childStr = compactTree(child, childDepth, maxDepth);
            if (childStr) {
                parts.push(childStr);
            }
        }
    }
    return parts.join("\n");
}
|
|
49
|
+
const SYSTEM_PROMPT = `You are a desktop automation agent. You control a computer through ScreenHand tools.
|
|
50
|
+
|
|
51
|
+
On each turn you receive the current UI state as an accessibility tree. You must decide the SINGLE next action to take.
|
|
52
|
+
|
|
53
|
+
Respond in this exact JSON format (no markdown, no explanation outside the JSON):
|
|
54
|
+
{
|
|
55
|
+
"reasoning": "Brief explanation of what you see and why you're taking this action",
|
|
56
|
+
"action": { "tool": "...", ... },
|
|
57
|
+
"done": false
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
When the task is fully complete, respond with:
|
|
61
|
+
{
|
|
62
|
+
"reasoning": "Task is complete because ...",
|
|
63
|
+
"action": { "tool": "done", "summary": "What was accomplished" },
|
|
64
|
+
"done": true
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
Available actions:
|
|
68
|
+
- {"tool": "press", "target": "Button text or element name"}
|
|
69
|
+
- {"tool": "type_into", "target": "Field name", "text": "text to type"}
|
|
70
|
+
- {"tool": "navigate", "url": "https://..."}
|
|
71
|
+
- {"tool": "scroll", "direction": "up|down|left|right", "amount": 3}
|
|
72
|
+
- {"tool": "key_combo", "keys": ["cmd", "c"]}
|
|
73
|
+
- {"tool": "menu_click", "menuPath": ["File", "Save"]}
|
|
74
|
+
- {"tool": "app_launch", "bundleId": "com.apple.Safari"}
|
|
75
|
+
- {"tool": "app_focus", "bundleId": "com.apple.Safari"}
|
|
76
|
+
- {"tool": "extract", "target": "element name", "format": "text"}
|
|
77
|
+
- {"tool": "wait", "ms": 1000}
|
|
78
|
+
- {"tool": "done", "summary": "what was accomplished"}
|
|
79
|
+
|
|
80
|
+
Rules:
|
|
81
|
+
- Take ONE action per turn. After each action you'll see the updated UI.
|
|
82
|
+
- Use the accessibility tree to find elements — look for roles and labels.
|
|
83
|
+
- Target elements by their visible text/label, not coordinates (unless no label exists).
|
|
84
|
+
- If an action fails, try an alternative approach — don't repeat the same failed action.
|
|
85
|
+
- If you're stuck after 3 attempts, explain what's blocking you and mark done.
|
|
86
|
+
- Be efficient. Don't take unnecessary actions.`;
|
|
87
|
+
/**
 * Run the observe → decide → act loop until the model reports completion,
 * `maxSteps` is reached, or the LLM call keeps failing.
 *
 * Each iteration fetches the element tree and active app from `runtime`,
 * sends them to the model, parses the JSON reply, executes the chosen action
 * and records a step.
 *
 * @param {object} runtime - Automation runtime; this function calls
 *   screenshot, elementTree and appList on it, and passes it to executeAction.
 * @param {string} sessionId - Session to operate in.
 * @param {string} task - Natural-language task, sent with the first observation.
 * @param {object} [options]
 * @param {number} [options.maxSteps=50] - Maximum loop iterations.
 * @param {string} [options.model="claude-sonnet-4-20250514"] - Model id.
 * @param {number} [options.maxTokens=1024] - max_tokens per LLM call.
 * @param {(step: object) => void} [options.onStep] - Called after each step.
 * @param {boolean} [options.screenshotOnStart=false] - Take one screenshot first.
 * @param {number} [options.maxConsecutiveLlmErrors=3] - Stop after this many
 *   back-to-back failed LLM requests; pass Infinity to never stop early.
 * @returns {Promise<{success: boolean, summary: string, steps: object[], totalMs: number}>}
 */
export async function runAgentLoop(runtime, sessionId, task, options = {}) {
    const { maxSteps = 50, model = "claude-sonnet-4-20250514", maxTokens = 1024, onStep, screenshotOnStart = false, maxConsecutiveLlmErrors = 3, } = options;
    const client = new Anthropic();
    const steps = [];
    const messages = [];
    const startTime = Date.now();
    let consecutiveLlmErrors = 0;
    // Optional initial screenshot for context
    if (screenshotOnStart) {
        await runtime.screenshot({ sessionId });
    }
    for (let i = 0; i < maxSteps; i++) {
        const stepStart = Date.now();
        // 1. OBSERVE — get accessibility tree
        const treeResult = await runtime.elementTree({ sessionId, maxDepth: 5 });
        let observation;
        if (treeResult.ok) {
            observation = compactTree(treeResult.data);
            // Truncate if too large to keep tokens manageable
            if (observation.length > 8000) {
                observation = observation.slice(0, 8000) + "\n... (truncated)";
            }
        }
        else {
            observation = `[Error getting UI tree: ${treeResult.error.message}]`;
        }
        // Also get app context. This is best-effort: a failure here only
        // leaves the context line empty, so the error is deliberately ignored.
        let contextLine = "";
        try {
            const apps = await runtime.appList(sessionId);
            if (apps.ok) {
                const active = apps.data.find((a) => a.isActive);
                if (active) {
                    contextLine = `Active app: ${active.name} (${active.bundleId})`;
                }
            }
        }
        catch { /* ignore */ }
        // 2. BUILD prompt
        const userMsg = i === 0
            ? `Task: ${task}\n\nCurrent UI state:\n${contextLine}\n${observation}`
            : `Action result: ${steps[i - 1].result}\n\nUpdated UI state:\n${contextLine}\n${observation}`;
        messages.push({ role: "user", content: userMsg });
        // 3. DECIDE — ask Claude what to do next
        let reasoning = "";
        let action = null;
        let done = false;
        let text = "";
        let llmCallFailed = false;
        try {
            const resp = await client.messages.create({
                model,
                max_tokens: maxTokens,
                system: SYSTEM_PROMPT,
                messages,
            });
            const textBlock = resp.content.find((block) => block.type === "text");
            text = textBlock?.text ?? "";
        }
        catch (e) {
            llmCallFailed = true;
            reasoning = `LLM error: ${e instanceof Error ? e.message : String(e)}`;
        }
        if (llmCallFailed) {
            consecutiveLlmErrors += 1;
        }
        else {
            consecutiveLlmErrors = 0;
            // Never append an empty assistant turn: an empty message in the
            // history would make every later request invalid.
            if (text) {
                messages.push({ role: "assistant", content: text });
            }
            // Parse JSON response
            const jsonMatch = text.match(/\{[\s\S]*\}/);
            if (jsonMatch) {
                try {
                    const parsed = JSON.parse(jsonMatch[0]);
                    reasoning = parsed.reasoning ?? "";
                    action = parsed.action ?? null;
                    done = parsed.done === true;
                }
                catch (e) {
                    // The request succeeded but the reply was malformed; report
                    // it as such rather than as a transport error.
                    reasoning = `LLM response was not valid JSON: ${e instanceof Error ? e.message : String(e)}`;
                }
            }
            else {
                reasoning = text;
            }
        }
        // 4. ACT — execute the action
        let result = "";
        if (action) {
            try {
                result = await executeAction(runtime, sessionId, action);
            }
            catch (e) {
                result = `Error: ${e instanceof Error ? e.message : String(e)}`;
            }
        }
        else {
            result = "No action taken";
        }
        // Record step
        const step = {
            index: i,
            observation: observation.slice(0, 500),
            reasoning,
            action,
            result,
            done,
            durationMs: Date.now() - stepStart,
        };
        steps.push(step);
        if (onStep) {
            onStep(step);
        }
        if (done) {
            break;
        }
        // A persistently failing API (bad key, outage) would otherwise be
        // retried for every remaining step; stop once the limit is reached.
        if (consecutiveLlmErrors >= maxConsecutiveLlmErrors) {
            break;
        }
    }
    const lastStep = steps[steps.length - 1];
    const stoppedSummary = `Stopped after ${steps.length} steps`;
    // Fall back to the generic summary when the model omitted `summary`.
    const summary = lastStep?.action?.tool === "done"
        ? (lastStep.action.summary ?? stoppedSummary)
        : stoppedSummary;
    return {
        success: lastStep?.done ?? false,
        summary,
        steps,
        totalMs: Date.now() - startTime,
    };
}
|
|
196
|
+
/**
 * Execute one model-chosen action against the runtime and describe the outcome.
 *
 * Runtime calls are expected to resolve to `{ ok: true, ... }` or
 * `{ ok: false, error: { message } }`; failures are returned as
 * "Failed: <message>" strings rather than thrown, so the text can be fed back
 * to the model.
 *
 * @param {object} runtime - Object providing press, typeInto, navigate, scroll,
 *   keyCombo, menuClick, appLaunch, appFocus and extract.
 * @param {string} sessionId - Session passed through to every runtime call.
 * @param {object} action - Parsed action; `action.tool` selects the branch and
 *   the remaining fields are that tool's arguments.
 * @returns {Promise<string>} Human-readable result of the action.
 */
async function executeAction(runtime, sessionId, action) {
    switch (action.tool) {
        case "press": {
            const r = await runtime.press({
                sessionId,
                target: { type: "text", value: action.target },
            });
            return r.ok ? `Pressed "${action.target}"` : `Failed: ${r.error.message}`;
        }
        case "type_into": {
            const r = await runtime.typeInto({
                sessionId,
                target: { type: "text", value: action.target },
                text: action.text,
            });
            return r.ok ? `Typed "${action.text}" into "${action.target}"` : `Failed: ${r.error.message}`;
        }
        case "navigate": {
            const r = await runtime.navigate({ sessionId, url: action.url });
            return r.ok ? `Navigated to ${action.url}` : `Failed: ${r.error.message}`;
        }
        case "scroll": {
            const input = { sessionId, direction: action.direction };
            // Only forward `amount` when it is a real number.
            if (typeof action.amount === "number") {
                input.amount = action.amount;
            }
            const r = await runtime.scroll(input);
            return r.ok ? `Scrolled ${action.direction}` : `Failed: ${r.error.message}`;
        }
        case "key_combo": {
            const r = await runtime.keyCombo({ sessionId, keys: action.keys });
            return r.ok ? `Key combo: ${action.keys.join("+")}` : `Failed: ${r.error.message}`;
        }
        case "menu_click": {
            const r = await runtime.menuClick({ sessionId, menuPath: action.menuPath });
            return r.ok ? `Menu: ${action.menuPath.join(" → ")}` : `Failed: ${r.error.message}`;
        }
        case "app_launch": {
            const r = await runtime.appLaunch({ sessionId, bundleId: action.bundleId });
            return r.ok ? `Launched ${action.bundleId}` : `Failed: ${r.error.message}`;
        }
        case "app_focus": {
            const r = await runtime.appFocus({ sessionId, bundleId: action.bundleId });
            return r.ok ? `Focused ${action.bundleId}` : `Failed: ${r.error.message}`;
        }
        case "extract": {
            const r = await runtime.extract({
                sessionId,
                target: { type: "text", value: action.target },
                format: action.format,
            });
            return r.ok ? `Extracted: ${JSON.stringify(r.data).slice(0, 500)}` : `Failed: ${r.error.message}`;
        }
        case "wait": {
            // Accept numeric strings (setTimeout would coerce them anyway), but
            // reject missing, negative, non-finite or overflowing delays:
            // setTimeout treats those as ~1ms, so the loop would report a wait
            // that never happened.
            const maxTimerDelayMs = 2147483647;
            const ms = typeof action.ms === "string" && action.ms.trim() !== ""
                ? Number(action.ms)
                : action.ms;
            if (typeof ms !== "number" || !Number.isFinite(ms) || ms < 0 || ms > maxTimerDelayMs) {
                return `Failed: wait requires "ms" between 0 and ${maxTimerDelayMs}, got ${String(action.ms)}`;
            }
            await new Promise((resolve) => setTimeout(resolve, ms));
            return `Waited ${ms}ms`;
        }
        case "done": {
            return `Task complete: ${action.summary}`;
        }
        default:
            return `Unknown action: ${action.tool}`;
    }
}
|
package/dist/config.js
ADDED
package/dist/index.js
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { TimelineLogger } from "./logging/timeline-logger.js";
|
|
2
|
+
import { MvpMcpServer } from "./mcp/server.js";
|
|
3
|
+
import { PlaceholderAppAdapter, } from "./runtime/app-adapter.js";
|
|
4
|
+
import { CdpChromeAdapter } from "./runtime/cdp-chrome-adapter.js";
|
|
5
|
+
import { AutomationRuntimeService } from "./runtime/service.js";
|
|
6
|
+
export { PlaceholderAppAdapter } from "./runtime/app-adapter.js";
|
|
7
|
+
export { CdpChromeAdapter } from "./runtime/cdp-chrome-adapter.js";
|
|
8
|
+
export { AccessibilityAdapter } from "./runtime/accessibility-adapter.js";
|
|
9
|
+
export { AppleScriptAdapter } from "./runtime/applescript-adapter.js";
|
|
10
|
+
export { VisionAdapter } from "./runtime/vision-adapter.js";
|
|
11
|
+
export { CompositeAdapter } from "./runtime/composite-adapter.js";
|
|
12
|
+
export { BridgeClient, BridgeClient as MacOSBridgeClient } from "./native/bridge-client.js";
|
|
13
|
+
export { StateObserver } from "./runtime/state-observer.js";
|
|
14
|
+
export { PlanningLoop } from "./runtime/planning-loop.js";
|
|
15
|
+
export { AutomationRuntimeService } from "./runtime/service.js";
|
|
16
|
+
export { MvpMcpServer } from "./mcp/server.js";
|
|
17
|
+
export { createMcpStdioServer, startMcpStdioServer } from "./mcp/mcp-stdio-server.js";
|
|
18
|
+
export { runAgentLoop } from "./agent/loop.js";
|
|
19
|
+
/**
 * Wire an adapter into a runtime service (with a fresh TimelineLogger) and
 * wrap that runtime in an MvpMcpServer.
 *
 * @param {object} adapter - Adapter instance handed to AutomationRuntimeService.
 * @returns {{runtime: AutomationRuntimeService, mcp: MvpMcpServer}}
 */
export function createRuntimeApp(adapter) {
    const runtime = new AutomationRuntimeService(adapter, new TimelineLogger());
    return {
        runtime,
        mcp: new MvpMcpServer(runtime),
    };
}
|
|
25
|
+
/**
 * Choose the adapter from the AUTOMATOR_ADAPTER environment variable:
 * "placeholder", "composite" or "accessibility"; any other value (or unset)
 * yields a CdpChromeAdapter. AUTOMATOR_HEADLESS === "1" is forwarded as
 * `headless` to the composite and CDP adapters.
 *
 * @returns {Promise<object>} The constructed adapter instance.
 */
async function createDefaultAdapter() {
    switch (process.env.AUTOMATOR_ADAPTER) {
        case "placeholder":
            return new PlaceholderAppAdapter();
        case "composite": {
            // Lazy import to avoid requiring Swift bridge for CDP-only usage
            const bridgeModule = await import("./native/macos-bridge-client.js");
            const compositeModule = await import("./runtime/composite-adapter.js");
            const nativeBridge = new bridgeModule.MacOSBridgeClient();
            return new compositeModule.CompositeAdapter(nativeBridge, {
                headless: process.env.AUTOMATOR_HEADLESS === "1",
            });
        }
        case "accessibility": {
            const bridgeModule = await import("./native/macos-bridge-client.js");
            const accessibilityModule = await import("./runtime/accessibility-adapter.js");
            const nativeBridge = new bridgeModule.MacOSBridgeClient();
            return new accessibilityModule.AccessibilityAdapter(nativeBridge);
        }
        default:
            return new CdpChromeAdapter({
                headless: process.env.AUTOMATOR_HEADLESS === "1",
            });
    }
}
|
|
48
|
+
// Module bootstrap: build the default adapter and runtime as soon as this module loads.
const app = createRuntimeApp(await createDefaultAdapter());
// With --healthcheck, start an "automation" session and print a JSON status report.
if (process.argv.includes("--healthcheck")) {
    const session = await app.runtime.sessionStart("automation");
    const report = {
        status: "ok",
        session,
        note: "Runtime loaded with universal adapter support.",
    };
    console.log(JSON.stringify(report, null, 2));
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
 * In-memory log of finished action telemetry records.
 *
 * `start` creates a record, `finish` stamps it with an end time, duration and
 * status and appends it to `timeline`, and `getRecent` reads back the tail.
 */
export class TimelineLogger {
    /** Finalized telemetry records, oldest first. */
    timeline = [];
    /**
     * Create a new telemetry record stamped with the current time.
     * The record is not stored until it is passed to `finish`.
     *
     * @param {string} action - Name of the action being timed.
     * @param {string} sessionId - Session the action belongs to.
     * @returns {object} Record with zeroed locateMs/actMs/verifyMs/retries.
     */
    start(action, sessionId) {
        return {
            action,
            sessionId,
            startedAt: new Date().toISOString(),
            locateMs: 0,
            actMs: 0,
            verifyMs: 0,
            retries: 0,
        };
    }
    /**
     * Finalize a record: add finishedAt, totalMs and status, then store it.
     *
     * @param {object} telemetry - Record from `start`; `startedAt` must be a
     *   date string parseable by `Date`.
     * @param {string} status - Outcome label stored on the record.
     * @returns {object} The finalized copy that was appended to the timeline.
     */
    finish(telemetry, status) {
        const now = new Date();
        const finalized = {
            ...telemetry,
            finishedAt: now.toISOString(),
            totalMs: now.getTime() - new Date(telemetry.startedAt).getTime(),
            status,
        };
        this.timeline.push(finalized);
        return finalized;
    }
    /**
     * Return up to `limit` of the most recently finished records, oldest first.
     *
     * @param {number} [limit=50] - Maximum number of records. Zero, negative
     *   or non-numeric limits yield an empty array.
     * @returns {object[]} A new array holding the tail of the timeline.
     */
    getRecent(limit = 50) {
        const count = Math.floor(Number(limit));
        // slice(-0) is slice(0), which would return the whole timeline for a
        // limit of 0 (and NaN behaves the same), so handle those explicitly.
        if (!(count > 0)) {
            return [];
        }
        return this.timeline.slice(-count);
    }
}
|