screenhand 0.5.0 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/dist/mcp-desktop.js +463 -39
  2. package/dist/src/community/publisher.js +4 -2
  3. package/dist/src/context-tracker.js +62 -6
  4. package/dist/src/ingestion/reference-merger.js +33 -0
  5. package/dist/src/memory/recall.js +65 -1
  6. package/dist/src/memory/research.js +1 -1
  7. package/dist/src/memory/service.js +26 -5
  8. package/dist/src/memory/store.js +42 -23
  9. package/dist/src/native/bridge-client.js +3 -3
  10. package/dist/src/perception/coordinator.js +94 -15
  11. package/dist/src/perception/manager.js +65 -1
  12. package/dist/src/planner/executor.js +6 -2
  13. package/dist/src/planner/plan-refiner.js +213 -0
  14. package/dist/src/playbook/engine.js +18 -3
  15. package/dist/src/playbook/recorder.js +24 -8
  16. package/dist/src/playbook/runner.js +9 -3
  17. package/dist/src/playbook/store.js +8 -0
  18. package/dist/src/recovery/engine.js +9 -3
  19. package/dist/src/state/app-map.js +212 -2
  20. package/dist/src/state/state-watcher.js +144 -0
  21. package/dist/src/state/visual-mapper.js +325 -0
  22. package/dist/src/state/world-model.js +30 -1
  23. package/dist/src/supervisor/supervisor.js +1 -1
  24. package/dist-app-maps/com.apple.Notes.json +2328 -2201
  25. package/dist-app-maps/com.apple.Terminal.json +331 -343
  26. package/dist-app-maps/com.apple.iCal.json +3 -3
  27. package/dist-app-maps/com.apple.iphonesimulator.json +714 -223
  28. package/dist-app-maps/com.apple.mail.json +3 -3
  29. package/dist-app-maps/com.apple.reminders.json +2 -2
  30. package/dist-app-maps/net.whatsapp.WhatsApp.json +27 -27
  31. package/dist-references/notes.json +53 -16
  32. package/dist-references/simulator.json +48 -2
  33. package/package.json +1 -1
@@ -52,8 +52,10 @@ import { PlaybookStore } from "./src/playbook/store.js";
52
52
  import { ContextTracker } from "./src/context-tracker.js";
53
53
  import { McpPlaybookRecorder } from "./src/playbook/mcp-recorder.js";
54
54
  import { WorldModel } from "./src/state/index.js";
55
+ import { StateWatcher } from "./src/state/state-watcher.js";
55
56
  import { PerceptionManager } from "./src/perception/index.js";
56
57
  import { Planner, PlanExecutor, GoalStore, ToolRegistry } from "./src/planner/index.js";
58
+ import { PlanRefiner } from "./src/planner/plan-refiner.js";
57
59
  import { RecoveryEngine } from "./src/recovery/index.js";
58
60
  import { LearningEngine, LocatorPolicy } from "./src/learning/index.js";
59
61
  import { discoverWebElements, testWebElement, compileReference, saveExploreResult, discoverNativeElements } from "./src/platform/explorer.js";
@@ -70,6 +72,7 @@ import { MenuScanner } from "./src/ingestion/menu-scanner.js";
70
72
  import { DocParser } from "./src/ingestion/doc-parser.js";
71
73
  import { TutorialExtractor } from "./src/ingestion/tutorial-extractor.js";
72
74
  import { extractFeaturesFromHTML } from "./src/ingestion/feature-extractor.js";
75
+ import { quickScan, llmEnrich, buildVisualMeta, isSensitiveApp } from "./src/state/visual-mapper.js";
73
76
  import { CoverageAuditor } from "./src/ingestion/coverage-auditor.js";
74
77
  import { ReferenceMerger } from "./src/ingestion/reference-merger.js";
75
78
  import { PlaybookPublisher } from "./src/community/publisher.js";
@@ -609,6 +612,7 @@ catch { /* dir may not exist */ }
609
612
  const planner = new Planner(_executablePlaybookStore, memory, contextTracker, worldModel, learningEngine);
610
613
  const goalStore = new GoalStore(path.join(os.homedir(), ".screenhand", "planner"));
611
614
  goalStore.init();
615
+ const planRefiner = new PlanRefiner(path.join(os.homedir(), ".screenhand", "planner"));
612
616
  const toolRegistry = new ToolRegistry();
613
617
  const recoveryEngine = new RecoveryEngine(worldModel, toolRegistry.toExecutor(), memory);
614
618
  recoveryEngine.setLearningEngine(learningEngine);
@@ -616,6 +620,7 @@ recoveryEngine.setAppMap(appMap);
616
620
  planner.setToolRegistry(toolRegistry);
617
621
  planner.setAppMap(appMap);
618
622
  perceptionManager.setLearningEngine(learningEngine);
623
+ const stateWatcher = new StateWatcher(worldModel, toolRegistry.toExecutor(), 2_000);
619
624
  // ── Reactive event loop: wire perception events to automatic responses ──
620
625
  // These fire at perception speed (100-300ms), not LLM speed (~2-3s).
621
626
  perceptionManager.on("dialog_detected", (event) => {
@@ -649,6 +654,28 @@ perceptionManager.on("app_switched", (event) => {
649
654
  // Log for observability
650
655
  console.error(`[reactive] App switched to ${event.bundleId} (pid=${event.pid})`);
651
656
  });
657
+ // ── Perception-triggered recovery: focus loss, app crash, stall ──
658
+ perceptionManager.on("focus_lost", (event) => {
659
+ console.error(`[reactive] Focus lost: expected ${event.expectedBundleId}, got ${event.actualBundleId} — auto-refocusing`);
660
+ // Auto-refocus the expected app
661
+ toolRegistry.toExecutor()("focus", { bundleId: event.expectedBundleId }).catch((err) => {
662
+ console.error(`[reactive] Auto-refocus failed: ${err instanceof Error ? err.message : err}`);
663
+ });
664
+ });
665
+ perceptionManager.on("app_crash", (event) => {
666
+ console.error(`[reactive] App crash detected: ${event.bundleId} (pid=${event.pid}) — auto-relaunching`);
667
+ // Auto-relaunch the crashed app
668
+ toolRegistry.toExecutor()("launch", { bundleId: event.bundleId }).catch((err) => {
669
+ console.error(`[reactive] Auto-relaunch failed: ${err instanceof Error ? err.message : err}`);
670
+ });
671
+ });
672
+ perceptionManager.on("stall_detected", (event) => {
673
+ console.error(`[reactive] UI stall detected: ${event.bundleId} — no changes for ${(event.stallMs / 1000).toFixed(0)}s — taking screenshot for diagnosis`);
674
+ // Take a screenshot so the next LLM call can see what's on screen
675
+ toolRegistry.toExecutor()("screenshot", {}).catch((err) => {
676
+ console.error(`[reactive] Stall screenshot failed: ${err instanceof Error ? err.message : err}`);
677
+ });
678
+ });
652
679
  const mcpRecorder = new McpPlaybookRecorder(playbooksDir);
653
680
  const referenceMerger = new ReferenceMerger(referencesDir);
654
681
  const communityPublisher = new PlaybookPublisher();
@@ -681,6 +708,7 @@ const MEMORY_TOOLS = new Set([
681
708
  ]);
682
709
  // Track the strategy we're currently following (for feedback loop)
683
710
  let activeStrategyFingerprint = null;
711
+ let autoExecutionInProgress = false; // guard against concurrent auto-execution
684
712
  let currentAdaptiveBudget = null;
685
713
  // Intercept all tool registrations to auto-log + auto-recall
686
714
  const _rawOriginalTool = server.tool.bind(server);
@@ -770,7 +798,7 @@ server.tool = (...args) => {
770
798
  if (!perceptionManager.isRunning && bridgeReady) {
771
799
  const focusApp = worldModel.getState().focusedApp;
772
800
  if (focusApp?.bundleId && focusApp?.pid) {
773
- perceptionManager.tryAutoStart(focusApp, bridge).catch(() => { });
801
+ perceptionManager.tryAutoStart(focusApp, bridge).catch((e) => { process.stderr.write(`[screenhand] perception auto-start failed: ${e instanceof Error ? e.message : String(e)}\n`); });
774
802
  installSafariEnricher(focusApp.bundleId);
775
803
  }
776
804
  }
@@ -821,7 +849,7 @@ server.tool = (...args) => {
821
849
  "type_with_fallback", "select_with_fallback", "scroll_with_fallback",
822
850
  ]);
823
851
  try {
824
- const result = await originalHandler(params, extra);
852
+ let result = await originalHandler(params, extra);
825
853
  const durationMs = Date.now() - start;
826
854
  // ── POST-CALL: log action (async, non-blocking) ──
827
855
  const entry = {
@@ -873,7 +901,9 @@ server.tool = (...args) => {
873
901
  try {
874
902
  appMap.recordPageTransition(postBundleIdForCtx, pageTransition.from, pageTransition.to, toolName);
875
903
  }
876
- catch { /* non-critical — don't break tool execution for nav tracking */ }
904
+ catch (e) {
905
+ process.stderr.write(`[screenhand] nav tracking failed: ${e instanceof Error ? e.message : String(e)}\n`);
906
+ }
877
907
  }
878
908
  // ── POST-CALL: detect focus drift ──
879
909
  const postBundleId = worldModel.getState().focusedApp?.bundleId ?? null;
@@ -947,7 +977,9 @@ server.tool = (...args) => {
947
977
  }
948
978
  }
949
979
  }
950
- catch { /* non-fatal */ }
980
+ catch (e) {
981
+ process.stderr.write(`[screenhand] app map feature learning failed: ${e instanceof Error ? e.message : String(e)}\n`);
982
+ }
951
983
  }
952
984
  if (!resultIsError && learnBundleId !== "unknown") {
953
985
  try {
@@ -1369,7 +1401,9 @@ server.tool = (...args) => {
1369
1401
  }
1370
1402
  }
1371
1403
  }
1372
- catch { /* hierarchy extraction non-fatal */ }
1404
+ catch (e) {
1405
+ process.stderr.write(`[screenhand] hierarchy extraction failed: ${e instanceof Error ? e.message : String(e)}\n`);
1406
+ }
1373
1407
  }
1374
1408
  }
1375
1409
  // ── Conditional UI visibility tracking (throttled) ──
@@ -1417,7 +1451,9 @@ server.tool = (...args) => {
1417
1451
  }
1418
1452
  }
1419
1453
  }
1420
- catch { /* visibility tracking non-fatal */ }
1454
+ catch (e) {
1455
+ process.stderr.write(`[screenhand] visibility tracking failed: ${e instanceof Error ? e.message : String(e)}\n`);
1456
+ }
1421
1457
  }
1422
1458
  }
1423
1459
  // ── Timing recording: track tool response times per element ──
@@ -1511,25 +1547,65 @@ server.tool = (...args) => {
1511
1547
  if (knownError) {
1512
1548
  hints.push(`⚡ Memory: "${toolName}" has failed before: "${knownError.error}" (${knownError.occurrences}x). Fix: ${knownError.resolution}`);
1513
1549
  }
1514
- // Suggest next step if we're mid-strategy
1550
+ // ── Strategy matching: auto-execute proven strategies OR hint unproven ones ──
1515
1551
  const recentTools = memory.getRecentToolNames();
1516
- const strategyHint = memory.quickStrategyHint(recentTools, worldModel.getState().focusedApp?.bundleId);
1517
- if (strategyHint) {
1518
- activeStrategyFingerprint = strategyHint.fingerprint;
1519
- const nextParams = Object.keys(strategyHint.nextStep.params).length > 0
1520
- ? `(${JSON.stringify(strategyHint.nextStep.params)})`
1521
- : "";
1522
- hints.push(`💡 Memory: This matches strategy "${strategyHint.strategy.task}" (${strategyHint.strategy.successCount} wins, ${strategyHint.strategy.failCount ?? 0} fails). Next step: ${strategyHint.nextStep.tool}${nextParams}`);
1523
- // If this was the last step of the strategy, record success
1524
- if (recentTools.length === strategyHint.strategy.steps.length - 1) {
1525
- // Next call will be the final step but this call completing means we're on track
1552
+ const currentBundleForStrategy = worldModel.getState().focusedApp?.bundleId;
1553
+ // Try auto-execution first (10+ successes, 0 failures)
1554
+ // Guard: skip if another auto-execution is already in progress
1555
+ const autoExec = autoExecutionInProgress ? null : memory.getAutoExecutableStrategy(recentTools, currentBundleForStrategy);
1556
+ if (autoExec) {
1557
+ autoExecutionInProgress = true;
1558
+ activeStrategyFingerprint = autoExec.fingerprint;
1559
+ const autoResults = [];
1560
+ let allOk = true;
1561
+ hints.push(`🚀 Auto-executing proven strategy "${autoExec.strategy.task}" (${autoExec.strategy.successCount} wins)${autoExec.remainingSteps.length} steps remaining`);
1562
+ for (const step of autoExec.remainingSteps) {
1563
+ try {
1564
+ const stepResult = await toolRegistry.toExecutor()(step.tool, step.params);
1565
+ autoResults.push({ tool: step.tool, ...stepResult });
1566
+ // Record outcome for learning
1567
+ const target = typeof step.params.target === "string" ? step.params.target
1568
+ : typeof step.params.title === "string" ? step.params.title
1569
+ : typeof step.params.text === "string" ? step.params.text
1570
+ : null;
1571
+ contextTracker.recordOutcome(step.tool, { target, text: typeof step.params.text === "string" ? step.params.text : null }, stepResult.ok, stepResult.ok ? null : (stepResult.error ?? null));
1572
+ if (!stepResult.ok) {
1573
+ allOk = false;
1574
+ hints.push(` ✗ ${step.tool} failed: ${stepResult.error ?? "unknown"}`);
1575
+ break; // Stop auto-execution on first failure
1576
+ }
1577
+ hints.push(` ✓ ${step.tool} — ok`);
1578
+ }
1579
+ catch (err) {
1580
+ allOk = false;
1581
+ hints.push(` ✗ ${step.tool} threw: ${err instanceof Error ? err.message : String(err)}`);
1582
+ break;
1583
+ }
1526
1584
  }
1527
- }
1528
- else if (activeStrategyFingerprint && recentTools.length > 0) {
1529
- // We were following a strategy but the sequence diverged — record success
1530
- // (the agent completed the strategy or went its own way after it)
1531
- memory.recordStrategyOutcome(activeStrategyFingerprint, true);
1585
+ // Record strategy outcome
1586
+ memory.recordStrategyOutcome(autoExec.fingerprint, allOk);
1532
1587
  activeStrategyFingerprint = null;
1588
+ autoExecutionInProgress = false;
1589
+ // Append auto-execution results to the response
1590
+ const autoSummary = autoResults.map((r) => `${r.tool}: ${r.ok ? "ok" : r.error}`).join("\n");
1591
+ const resultContent = Array.isArray(result?.content) ? result.content : [];
1592
+ resultContent.push({ type: "text", text: `\n── AUTO-EXECUTED (${autoResults.length} steps) ──\n${autoSummary}` });
1593
+ result = { ...result, content: resultContent };
1594
+ }
1595
+ else {
1596
+ // Fall back to strategy hint (suggest but don't execute)
1597
+ const strategyHint = memory.quickStrategyHint(recentTools, currentBundleForStrategy);
1598
+ if (strategyHint) {
1599
+ activeStrategyFingerprint = strategyHint.fingerprint;
1600
+ const nextParams = Object.keys(strategyHint.nextStep.params).length > 0
1601
+ ? `(${JSON.stringify(strategyHint.nextStep.params)})`
1602
+ : "";
1603
+ hints.push(`💡 Memory: This matches strategy "${strategyHint.strategy.task}" (${strategyHint.strategy.successCount} wins, ${strategyHint.strategy.failCount ?? 0} fails). Next step: ${strategyHint.nextStep.tool}${nextParams}`);
1604
+ }
1605
+ else if (activeStrategyFingerprint && recentTools.length > 0) {
1606
+ memory.recordStrategyOutcome(activeStrategyFingerprint, true);
1607
+ activeStrategyFingerprint = null;
1608
+ }
1533
1609
  }
1534
1610
  // Attach hints in BOTH content (visible) and _meta (for programmatic access)
1535
1611
  if (hints.length > 0) {
@@ -1745,7 +1821,9 @@ server.tool("focus", "Focus/activate an application (or a specific window by win
1745
1821
  targetApp = { bundleId, name: appWin.appName, pid: appWin.pid || appWin.ownerPid };
1746
1822
  }
1747
1823
  }
1748
- catch { /* ignore */ }
1824
+ catch (e) {
1825
+ process.stderr.write(`[screenhand] focus window check for ${bundleId} failed: ${e instanceof Error ? e.message : String(e)}\n`);
1826
+ }
1749
1827
  if (!targetApp) {
1750
1828
  return { content: [{ type: "text", text: `Error: ${bundleId} is not running. Use launch("${bundleId}") first.` }], isError: true };
1751
1829
  }
@@ -1816,10 +1894,14 @@ server.tool("focus", "Focus/activate an application (or a specific window by win
1816
1894
  await perceptionManager.ensureStarted(ctx);
1817
1895
  installSafariEnricher(bundleId);
1818
1896
  }
1819
- catch { /* best-effort */ }
1897
+ catch (e) {
1898
+ process.stderr.write(`[screenhand] perception ensureStarted in focus failed: ${e instanceof Error ? e.message : String(e)}\n`);
1899
+ }
1820
1900
  }
1821
1901
  }
1822
- catch { /* app.list failed — world model update is best-effort */ }
1902
+ catch (e) {
1903
+ process.stderr.write(`[screenhand] focus world-model update failed: ${e instanceof Error ? e.message : String(e)}\n`);
1904
+ }
1823
1905
  return { content: [{ type: "text", text: focusMsg }] };
1824
1906
  }
1825
1907
  finally {
@@ -1883,7 +1965,9 @@ server.tool("launch", "Launch an application. Chrome/Chromium browsers are launc
1883
1965
  await perceptionManager.ensureStarted({ bundleId, appName: r.appName ?? bundleId, pid: r.pid, windowTitle: "", ...(windowId != null ? { windowId } : {}) });
1884
1966
  installSafariEnricher(bundleId);
1885
1967
  }
1886
- catch { /* perception start is best-effort */ }
1968
+ catch (e) {
1969
+ process.stderr.write(`[screenhand] perception start after launch failed: ${e instanceof Error ? e.message : String(e)}\n`);
1970
+ }
1887
1971
  let msg = `Launched ${r.appName} pid=${r.pid}`;
1888
1972
  if (chromeAppName) {
1889
1973
  const port = cdpPort ?? 9222;
@@ -2112,7 +2196,9 @@ server.tool("ui_press", "PREFERRED: Find and press/click a UI element by its tit
2112
2196
  return { content: [{ type: "text", text: `Element "${title}" not found in PID ${pid}. A system dialog from "${front.name}" (${front.bundleId}, PID ${front.pid}) may be blocking. Dismiss it first, or use click(x, y) to interact with the dialog directly.` }], isError: true };
2113
2197
  }
2114
2198
  }
2115
- catch { /* ignore frontmost check failure */ }
2199
+ catch (e) {
2200
+ process.stderr.write(`[screenhand] frontmost check in ui_press failed: ${e instanceof Error ? e.message : String(e)}\n`);
2201
+ }
2116
2202
  throw new Error(`Element "${title}" not found (searched title, value, and description)`);
2117
2203
  }
2118
2204
  }
@@ -2333,7 +2419,9 @@ server.tool("type_text", "Type text using the keyboard. Auto-detects Electron ap
2333
2419
  catch { /* not available on this port */ }
2334
2420
  }
2335
2421
  }
2336
- catch { /* auto-detect is best-effort */ }
2422
+ catch (e) {
2423
+ process.stderr.write(`[screenhand] CDP auto-detect failed: ${e instanceof Error ? e.message : String(e)}\n`);
2424
+ }
2337
2425
  }
2338
2426
  if (electronCdpPort) {
2339
2427
  // CDP path: click editor to ensure focus, then type via key events
@@ -2396,7 +2484,9 @@ server.tool("key", "Press a key combination", {
2396
2484
  const front = await bridge.call("app.frontmost", {});
2397
2485
  targetPid = front.pid;
2398
2486
  }
2399
- catch { /* fallback to global posting */ }
2487
+ catch (e) {
2488
+ process.stderr.write(`[screenhand] key frontmost PID resolve failed: ${e instanceof Error ? e.message : String(e)}\n`);
2489
+ }
2400
2490
  }
2401
2491
  const keys = combo.split("+");
2402
2492
  const hasModifier = keys.some(k => ["cmd", "ctrl", "alt", "shift"].includes(k.toLowerCase()));
@@ -2466,7 +2556,9 @@ async function getCDPClient(tabId, overridePort) {
2466
2556
  try {
2467
2557
  perceptionManager.activateCDP(client);
2468
2558
  }
2469
- catch { /* best-effort */ }
2559
+ catch (e) {
2560
+ process.stderr.write(`[screenhand] perception CDP activate failed: ${e instanceof Error ? e.message : String(e)}\n`);
2561
+ }
2470
2562
  return { client, targetId: targetId, CDP: cdp, port };
2471
2563
  }
2472
2564
  // ── Random delay helper ──
@@ -3403,6 +3495,12 @@ server.tool("platform_explore", "Autonomously explore an app or website. Maps al
3403
3495
  // Compile and save
3404
3496
  const result = compileReference(platform, "web", tested, url);
3405
3497
  const filePath = saveExploreResult(referencesDir, result);
3498
+ // Auto-merge explore selectors into main reference so data isn't fragmented
3499
+ if (result.selectors && Object.keys(result.selectors).length > 0) {
3500
+ referenceMerger.mergeExploreSelectors(result.selectors, result.errors, "", platform);
3501
+ }
3502
+ // Hot-reload: make new data immediately available to context tracker
3503
+ _playbookStoreForContext.reload();
3406
3504
  return { content: [{ type: "text", text: `Exploration complete: ${filePath}\n\nElements found: ${elements.length}\nTested: ${result.testedElements}\nWorking selectors: ${result.workingSelectors}\nErrors: ${result.errors.length}\n\nKey discoveries:\n${result.keyDiscoveries.map(d => ` - ${d}`).join("\n")}` }] };
3407
3505
  }
3408
3506
  else if (bundleId) {
@@ -3424,6 +3522,12 @@ server.tool("platform_explore", "Autonomously explore an app or website. Maps al
3424
3522
  ...el, clickWorked: true, result: "discovered_not_tested",
3425
3523
  })), undefined, bundleId);
3426
3524
  const filePath = saveExploreResult(referencesDir, result);
3525
+ // Auto-merge explore selectors into main reference so data isn't fragmented
3526
+ if (result.selectors && Object.keys(result.selectors).length > 0) {
3527
+ referenceMerger.mergeExploreSelectors(result.selectors, result.errors, bundleId, platform);
3528
+ }
3529
+ // Hot-reload: make new data immediately available to context tracker
3530
+ _playbookStoreForContext.reload();
3427
3531
  return { content: [{ type: "text", text: `Native app exploration complete: ${filePath}\n\nElements discovered: ${elements.length}\n(Native elements discovered but not auto-clicked for safety. Use playbook_record to test interactively.)` }] };
3428
3532
  }
3429
3533
  else {
@@ -5037,7 +5141,9 @@ server.tool("scroll_with_fallback", "Scroll within an element or the active wind
5037
5141
  return { content: [{ type: "text", text: `"${target}" is visible after ${i} scroll(s).` }] };
5038
5142
  }
5039
5143
  }
5040
- catch { /* OCR failed, keep scrolling */ }
5144
+ catch (e) {
5145
+ process.stderr.write(`[screenhand] OCR during scroll search failed: ${e instanceof Error ? e.message : String(e)}\n`);
5146
+ }
5041
5147
  // Scroll once
5042
5148
  const deltaX = direction === "left" ? -scrollAmount : direction === "right" ? scrollAmount : 0;
5043
5149
  const deltaY = direction === "up" ? -scrollAmount : direction === "down" ? scrollAmount : 0;
@@ -5164,7 +5270,9 @@ server.tool("wait_for_state", "Wait until a condition is met on screen: text app
5164
5270
  await client.close();
5165
5271
  }
5166
5272
  }
5167
- catch { /* CDP unavailable */ }
5273
+ catch (e) {
5274
+ process.stderr.write(`[screenhand] wait_for_state CDP check failed: ${e instanceof Error ? e.message : String(e)}\n`);
5275
+ }
5168
5276
  }
5169
5277
  const elapsed = Date.now() - (deadline - timeout);
5170
5278
  lastCheck = `${elapsed}ms`;
@@ -5450,6 +5558,18 @@ function getJobRunner() {
5450
5558
  timeout: 15000,
5451
5559
  }).trim();
5452
5560
  });
5561
+ // Wire learning feedback: PlaybookEngine reports step outcomes to context tracker + AppMap
5562
+ playbookEngine.setOutcomeCallback((step, success, error) => {
5563
+ const target = typeof step.target === "string" ? step.target : null;
5564
+ contextTracker.recordOutcome(step.action, { target, text: step.text }, success, error);
5565
+ const bid = worldModel.getState().focusedApp?.bundleId ?? lastKnownBundleId;
5566
+ if (bid && target) {
5567
+ try {
5568
+ appMap.recordElementOutcome(bid, "auto", target, success);
5569
+ }
5570
+ catch { /* non-critical */ }
5571
+ }
5572
+ });
5453
5573
  activeJobRunner = new JobRunner(bridge, jobManager, leaseManager, supervisor, (() => {
5454
5574
  const cfg = {
5455
5575
  hasCDP: cdpPort !== null,
@@ -5626,10 +5746,22 @@ originalTool("plan_execute", "Run a plan automatically. Known steps (from playbo
5626
5746
  if (!goal) {
5627
5747
  return { content: [{ type: "text", text: `Goal not found: ${goalId}` }] };
5628
5748
  }
5629
- const adaptiveBudget = learningEngine.getAdaptiveBudget(worldModel.getState().focusedApp?.bundleId ?? "unknown");
5749
+ const focusedBundleId = worldModel.getState().focusedApp?.bundleId ?? "unknown";
5750
+ const adaptiveBudget = learningEngine.getAdaptiveBudget(focusedBundleId);
5630
5751
  const executor = new PlanExecutor(worldModel, planner, toolRegistry.toExecutor(), { postconditionWaitMs: adaptiveBudget.verifyMs, defaultStepTimeout: Math.max(30_000, adaptiveBudget.actMs * 2) }, recoveryEngine, learningEngine);
5631
5752
  executor.setAppMap(appMap);
5632
- const result = await executor.executeGoal(goal);
5753
+ // Enable perception-triggered recovery during plan execution
5754
+ perceptionManager.setExpectedApp(focusedBundleId);
5755
+ perceptionManager.startStallDetection(30_000);
5756
+ let result;
5757
+ try {
5758
+ result = await executor.executeGoal(goal);
5759
+ }
5760
+ finally {
5761
+ // Disable reactive recovery after plan completes
5762
+ perceptionManager.setExpectedApp(null);
5763
+ perceptionManager.stopStallDetection();
5764
+ }
5633
5765
  goalStore.update(goalId, goal);
5634
5766
  // Check if paused at an LLM step
5635
5767
  if ("paused" in result) {
@@ -5668,7 +5800,25 @@ originalTool("plan_execute", "Run a plan automatically. Known steps (from playbo
5668
5800
  }
5669
5801
  }
5670
5802
  }
5671
- catch { /* strategy recording is best-effort */ }
5803
+ catch (e) {
5804
+ process.stderr.write(`[screenhand] strategy recording failed: ${e instanceof Error ? e.message : String(e)}\n`);
5805
+ }
5806
+ // Self-improving plans: refine and check for graduation
5807
+ try {
5808
+ const refinement = planRefiner.refine(goal, result);
5809
+ if (refinement.refinementCount > 0) {
5810
+ process.stderr.write(`[plan-refiner] Refined plan for "${goal.description}" (${refinement.refinementCount}x)\n`);
5811
+ }
5812
+ // Check graduation to playbook (3+ refinements)
5813
+ const playbook = planRefiner.checkGraduation(goal.description, focusedBundleId, worldModel.getState().focusedApp?.appName ?? focusedBundleId);
5814
+ if (playbook) {
5815
+ _playbookStoreForContext.save(playbook);
5816
+ process.stderr.write(`[plan-refiner] Plan GRADUATED to playbook: ${playbook.id}\n`);
5817
+ }
5818
+ }
5819
+ catch (e) {
5820
+ process.stderr.write(`[plan-refiner] Refinement failed: ${e instanceof Error ? e.message : String(e)}\n`);
5821
+ }
5672
5822
  }
5673
5823
  const lines = [
5674
5824
  result.success ? "Goal completed successfully." : `Goal failed: ${result.error}`,
@@ -6429,6 +6579,88 @@ server.tool("observer_ocr_roi", "Submit a targeted ROI OCR command to the runnin
6429
6579
  return { content: [{ type: "text", text: `ROI OCR command submitted: ${id}\nRegion: (${x}, ${y}, ${width}×${height})\nThe daemon will process this on its next cycle. Call observer_ocr_roi with commandId="${id}" to poll the result.` }] };
6430
6580
  });
6431
6581
  // ═══════════════════════════════════════════════
6582
+ // STATE WATCHER — Continuous observation event bus
6583
+ // ═══════════════════════════════════════════════
6584
+ server.tool("watch_start", "Start the state watcher polling loop. Evaluates registered watch rules every 2s against the world model.", {}, async () => {
6585
+ stateWatcher.start();
6586
+ const rules = stateWatcher.getRules();
6587
+ return { content: [{ type: "text", text: `State watcher started. ${rules.length} rules registered.` }] };
6588
+ });
6589
+ server.tool("watch_stop", "Stop the state watcher polling loop.", {}, async () => {
6590
+ stateWatcher.stop();
6591
+ return { content: [{ type: "text", text: "State watcher stopped." }] };
6592
+ });
6593
+ server.tool("watch_register", "Register a watch rule: when element with matching title appears, execute an action. Use for automated responses to known UI states.", {
6594
+ id: z.string().describe("Unique rule ID"),
6595
+ elementTitle: z.string().describe("UI element title/label to watch for (case-insensitive substring match)"),
6596
+ actionTool: z.string().describe("Tool to execute when element appears (e.g. click_text, key)"),
6597
+ actionParams: z.record(z.string(), z.unknown()).describe("Params for the action tool"),
6598
+ bundleId: z.string().optional().describe("Only match when this app is focused"),
6599
+ maxFires: z.number().optional().describe("Max times to fire (0=unlimited, default=1)"),
6600
+ }, async ({ id, elementTitle, actionTool, actionParams, bundleId, maxFires }) => {
6601
+ // Validate tool exists and is safe for automated execution
6602
+ const BLOCKED_WATCH_TOOLS = new Set(["applescript", "browser_js", "browser_stealth"]);
6603
+ if (BLOCKED_WATCH_TOOLS.has(actionTool)) {
6604
+ return { content: [{ type: "text", text: `Tool "${actionTool}" is not allowed in watch rules (security: prevents arbitrary code execution)` }], isError: true };
6605
+ }
6606
+ if (!toolRegistry.has(actionTool)) {
6607
+ return { content: [{ type: "text", text: `Unknown tool: "${actionTool}"` }], isError: true };
6608
+ }
6609
+ stateWatcher.watchForElement(id, elementTitle, { tool: actionTool, params: actionParams }, bundleId);
6610
+ if (maxFires !== undefined) {
6611
+ const rules = stateWatcher.getRules();
6612
+ const rule = rules.find((r) => r.id === id);
6613
+ if (rule) {
6614
+ // Update maxFires on the registered rule
6615
+ const ruleState = stateWatcher.rules.get(id);
6616
+ if (ruleState)
6617
+ ruleState.rule.maxFires = maxFires;
6618
+ }
6619
+ }
6620
+ return { content: [{ type: "text", text: `Watch rule "${id}" registered: when "${elementTitle}" appears → ${actionTool}(${JSON.stringify(actionParams)})` }] };
6621
+ });
6622
+ server.tool("watch_dialog", "Register a dialog watch rule: when a dialog matching the pattern appears, auto-execute an action.", {
6623
+ id: z.string().describe("Unique rule ID"),
6624
+ titlePattern: z.string().describe("Regex pattern to match dialog titles"),
6625
+ actionTool: z.string().describe("Tool to execute (e.g. click_text, key)"),
6626
+ actionParams: z.record(z.string(), z.unknown()).describe("Params for the action tool"),
6627
+ }, async ({ id, titlePattern, actionTool, actionParams }) => {
6628
+ // Validate regex — reject patterns that could cause ReDoS
6629
+ let regex;
6630
+ try {
6631
+ regex = new RegExp(titlePattern, "i");
6632
+ // Quick sanity check — if it takes >50ms on a test string, reject
6633
+ const testStr = "a".repeat(100);
6634
+ const t0 = Date.now();
6635
+ regex.test(testStr);
6636
+ if (Date.now() - t0 > 50) {
6637
+ return { content: [{ type: "text", text: `Rejected: regex pattern "${titlePattern}" is too expensive (potential ReDoS)` }], isError: true };
6638
+ }
6639
+ }
6640
+ catch (e) {
6641
+ return { content: [{ type: "text", text: `Invalid regex: ${e instanceof Error ? e.message : String(e)}` }], isError: true };
6642
+ }
6643
+ stateWatcher.watchForDialog(id, regex, { tool: actionTool, params: actionParams });
6644
+ return { content: [{ type: "text", text: `Dialog watch "${id}" registered: /${titlePattern}/i → ${actionTool}(${JSON.stringify(actionParams)})` }] };
6645
+ });
6646
+ server.tool("watch_unregister", "Remove a watch rule by ID.", {
6647
+ id: z.string().describe("Rule ID to remove"),
6648
+ }, async ({ id }) => {
6649
+ const removed = stateWatcher.unregister(id);
6650
+ return { content: [{ type: "text", text: removed ? `Rule "${id}" removed.` : `Rule "${id}" not found.` }] };
6651
+ });
6652
+ server.tool("watch_status", "Get all registered watch rules and their fire counts.", {}, async () => {
6653
+ const rules = stateWatcher.getRules();
6654
+ const running = stateWatcher.isRunning;
6655
+ const lines = [
6656
+ `State watcher: ${running ? "running" : "stopped"}`,
6657
+ `Rules: ${rules.length}`,
6658
+ "",
6659
+ ...rules.map((r) => ` [${r.id}] ${r.description} (fired ${r.fireCount}x)`),
6660
+ ];
6661
+ return { content: [{ type: "text", text: lines.join("\n") }] };
6662
+ });
6663
+ // ═══════════════════════════════════════════════
6432
6664
  // PHASE 6: TOOL MASTERY — Ingestion + Community
6433
6665
  // ═══════════════════════════════════════════════
6434
6666
  server.tool("scan_menu_bar", "Scan an app's menu bar via AX tree. Extracts all menu paths, keyboard shortcuts, and enabled/disabled states. Automatically merges discovered shortcuts into the reference file.", {
@@ -6490,6 +6722,8 @@ server.tool("scan_menu_bar", "Scan an app's menu bar via AX tree. Extracts all m
6490
6722
  });
6491
6723
  }
6492
6724
  }
6725
+ // Hot-reload: make new reference data immediately available to context tracker
6726
+ _playbookStoreForContext.reload();
6493
6727
  let output = lines.join("\n") + bootstrapInfo;
6494
6728
  output = redactUsername(output);
6495
6729
  output = output.replace(/Log Out [^\n:]+/g, "Log Out [USER]");
@@ -6553,6 +6787,8 @@ server.tool("ingest_documentation", "Parse a documentation page (HTML, markdown,
6553
6787
  }
6554
6788
  }
6555
6789
  }
6790
+ // Hot-reload: make new reference data immediately available to context tracker
6791
+ _playbookStoreForContext.reload();
6556
6792
  return { content: [{ type: "text", text: lines.join("\n") }] };
6557
6793
  });
6558
6794
  server.tool("ingest_tutorial", "Extract structured playbook steps from a video transcript (e.g. YouTube captions). Converts tutorial narration into actionable automation steps with tool mappings.", {
@@ -6643,6 +6879,190 @@ server.tool("discover_features", "Extract features from an app's official websit
6643
6879
  lines.push(` [${f.category}] ${f.name}: ${f.description}`);
6644
6880
  }
6645
6881
  }
6882
+ // Hot-reload: make new reference data immediately available to context tracker
6883
+ _playbookStoreForContext.reload();
6884
+ return { content: [{ type: "text", text: lines.join("\n") }] };
6885
+ });
6886
+ // ── Visual App Mapping (Phase 3) ─────────────────────────────────
6887
/**
 * Background LLM enrichment pass for `map_app`.
 *
 * Captures a screenshot through the bridge, optionally attaches a truncated
 * AX tree, asks `llmEnrich` for zones/elements, and merges the result into
 * the AppMap with capped confidences (LLM output is treated as a hypothesis).
 * Never throws: every failure is reported on stderr.
 *
 * @param {object} ctx
 * @param {number} ctx.pid - PID of the app being mapped.
 * @param {string} ctx.bundleId - macOS bundle ID of the app.
 * @param {string} ctx.appName - Human-readable app name.
 * @param {string} ctx.windowTitle - Title of the mapped window ("" if unknown).
 * @param {*} ctx.captureSize - Capture size reported by the quick scan.
 * @param {object} ctx.meta - Visual metadata built from the quick scan.
 * @returns {Promise<void>}
 */
async function enrichVisualMapWithLlm({ pid, bundleId, appName, windowTitle, captureSize, meta }) {
    // A missing or non-numeric confidence would turn Math.min into NaN and
    // poison the stored map; fall back to the ceiling in that case.
    const capConfidence = (value, ceiling) => (Number.isFinite(value) ? Math.min(value, ceiling) : ceiling);
    try {
        // NOTE(review): "cg.captureScreen" looks like a full-screen capture rather
        // than a capture of this app's window, and the file at `path` is not
        // removed here — confirm both are intended before shipping image data to the LLM.
        const screenshotShot = await bridge.call("cg.captureScreen", {});
        if (!screenshotShot?.path)
            return;
        // Async read: this runs while the server keeps serving other tool calls,
        // so the event loop must not be blocked on disk I/O.
        const { readFile } = await import("node:fs/promises");
        const screenshotBase64 = await readFile(screenshotShot.path, { encoding: "base64" });
        // Get AX tree for cross-reference (best effort, truncated to 3000 chars)
        let axTree = "";
        try {
            const tree = await bridge.call("ax.tree", { pid, depth: 3 });
            axTree = JSON.stringify(tree, null, 1).slice(0, 3000);
        }
        catch { /* proceed without AX */ }
        const enrichment = await llmEnrich(screenshotBase64, axTree, appName, bundleId, windowTitle, captureSize);
        if (!enrichment)
            return;
        // Merge LLM results into AppMap: per-element confidence capped at 0.5,
        // scan-level confidence capped at 0.6.
        const scanConfidence = capConfidence(enrichment.confidence, 0.6);
        const llmScan = {
            zones: enrichment.zones,
            elements: enrichment.elements.map((e) => ({
                ...e,
                confidence: capConfidence(e.confidence, 0.5),
            })),
            confidence: scanConfidence,
        };
        appMap.populateFromVisualScan(bundleId, appName, llmScan, {
            ...meta,
            confidence: scanConfidence,
        });
        process.stderr.write(`[visual-mapper] LLM enrichment complete for ${appName}: ${enrichment.elements.length} elements, ${enrichment.zones.length} zones\n`);
    }
    catch (err) {
        process.stderr.write(`[visual-mapper] Background enrichment failed: ${err instanceof Error ? err.message : String(err)}\n`);
    }
}
// Tool: map_app
// Runs an inline OCR quick scan of the app's window, stores the result in the
// AppMap, and (for depth "full" with ANTHROPIC_API_KEY set) starts an LLM
// enrichment pass in the background without delaying the response.
server.tool("map_app", "Visually map an app's UI by taking a screenshot, running OCR to identify interactive elements and zones with coordinates. Makes subsequent tool calls faster and more accurate. Runs quick scan (~500ms) inline, optional LLM enrichment in background if ANTHROPIC_API_KEY is set.", {
    bundleId: z.string().describe("macOS bundle ID (e.g. com.apple.Notes)"),
    appName: z.string().describe("Human-readable app name"),
    force: z.boolean().optional().describe("Re-map even if a recent map exists (default: false)"),
    depth: z.enum(["quick", "full"]).optional().describe("'quick' = OCR only (~500ms). 'full' = OCR + LLM enrichment (~15s). Default: quick"),
}, async ({ bundleId, appName, force, depth }) => {
    if (isSensitiveApp(bundleId)) {
        return { content: [{ type: "text", text: `Blocked: ${bundleId} is a sensitive app (password manager, banking, etc.). Visual mapping is not allowed for privacy reasons.` }] };
    }
    // Check if already mapped (unless force)
    if (!force) {
        const existingMeta = appMap.getVisualMeta(bundleId);
        if (existingMeta && !appMap.isVisualMapStale(bundleId)) {
            return { content: [{ type: "text", text: `Visual map for ${appName} already exists (${existingMeta.screensMapped.length} screens, confidence: ${existingMeta.confidence.toFixed(2)}). Use force: true to re-map.` }] };
        }
    }
    await ensureBridge();
    // Resolve the running app's PID. A non-array reply is treated the same as
    // "not running" instead of throwing out of the handler.
    const apps = await bridge.call("app.list", {});
    const matchedApp = Array.isArray(apps) ? apps.find((a) => a.bundleId === bundleId) : undefined;
    if (!matchedApp?.pid) {
        return { content: [{ type: "text", text: `App ${bundleId} is not running. Launch it first with focus("${bundleId}") or launch("${bundleId}").` }] };
    }
    const pid = matchedApp.pid;
    // Get window bounds: prefer the focused/frontmost/main window, else the first one
    let windowTitle = "";
    let windowBounds;
    try {
        const wins = await bridge.call("window.list", {});
        const appWins = wins?.filter((w) => w.pid === pid);
        const mainWin = appWins?.find((w) => w.focused || w.frontmost || w.isMain) ?? appWins?.[0];
        if (mainWin) {
            windowTitle = mainWin.title ?? "";
            windowBounds = mainWin.bounds ?? mainWin;
        }
    }
    catch { /* use defaults */ }
    // Phase A: Quick scan (OCR)
    const scanResult = await quickScan(bridge, pid, windowBounds);
    if (!scanResult) {
        return { content: [{ type: "text", text: `Failed to capture screenshot of ${appName}. Make sure the app window is visible.` }] };
    }
    // Get app version for staleness tracking
    let appVersion = "unknown";
    try {
        const infoResult = await bridge.call("app.info", { bundleId });
        appVersion = infoResult?.version ?? infoResult?.shortVersion ?? "unknown";
    }
    catch { /* use default */ }
    // Get display scale factor
    let scaleFactor = 2;
    try {
        const screenInfo = await bridge.call("screen.info", {});
        scaleFactor = screenInfo?.scaleFactor ?? 2;
    }
    catch { /* default to Retina */ }
    const meta = buildVisualMeta(scanResult.hash, scanResult.captureSize, windowTitle, appVersion, scanResult.scan.confidence, scaleFactor);
    // Populate into AppMap
    const { added, updated } = appMap.populateFromVisualScan(bundleId, appName, scanResult.scan, meta);
    const lines = [
        `Visual map for ${appName} (${bundleId}):`,
        ` Zones identified: ${scanResult.scan.zones.length}`,
        ` Elements mapped: ${scanResult.scan.elements.length} (${added} new, ${updated} updated)`,
        ` Map confidence: ${scanResult.scan.confidence.toFixed(2)}`,
        ` App version: ${appVersion}`,
    ];
    if (scanResult.scan.zones.length > 0) {
        lines.push(" Zones:");
        for (const z of scanResult.scan.zones) {
            const elCount = scanResult.scan.elements.filter((e) => e.zone === z.label).length;
            lines.push(` ${z.label} (${z.type}): ${elCount} elements`);
        }
    }
    // Phase B: LLM enrichment (background, only for depth=full)
    if (depth === "full") {
        if (process.env.ANTHROPIC_API_KEY) {
            lines.push(" LLM enrichment: starting in background...");
            // Fire and forget — don't block the response. The helper reports its own
            // failures on stderr; the trailing catch only guards that reporting itself.
            enrichVisualMapWithLlm({
                pid,
                bundleId,
                appName,
                windowTitle,
                captureSize: scanResult.captureSize,
                meta,
            }).catch(() => { });
        }
        else {
            lines.push(" LLM enrichment: skipped (no ANTHROPIC_API_KEY set)");
        }
    }
    return { content: [{ type: "text", text: lines.join("\n") }] };
});
7008
// Tool: map_status
// Summarises the stored visual map for one app: scan age, version, confidence,
// staleness, capture geometry, and (when map data is loaded) element counts by
// label source, the position match rate, and a per-zone breakdown (first 15 zones).
originalTool("map_status", "Check the health of an app's visual map. Shows zones, element counts, confidence, staleness, and failure rates. Useful for debugging click failures.", {
    bundleId: z.string().describe("macOS bundle ID"),
}, async ({ bundleId }) => {
    const visualMeta = appMap.getVisualMeta(bundleId);
    const mapData = appMap.getLoaded(bundleId) ?? appMap.load(bundleId);
    if (!visualMeta) {
        return { content: [{ type: "text", text: `No visual map exists for ${bundleId}. Run map_app to create one.` }] };
    }
    const stale = appMap.isVisualMapStale(bundleId);
    const elapsedMs = Date.now() - new Date(visualMeta.lastScannedAt).getTime();
    const hoursOld = Math.round(elapsedMs / 3_600_000);
    const report = [
        `Visual Map Status: ${bundleId}`,
        ` Last scanned: ${visualMeta.lastScannedAt} (${hoursOld}h ago)`,
        ` App version: ${visualMeta.appVersion}`,
        ` Confidence: ${visualMeta.confidence.toFixed(2)}`,
        ` Staleness: ${stale ? "STALE — consider re-mapping" : "fresh"}`,
        ` Screens mapped: ${visualMeta.screensMapped.join(", ") || "(none)"}`,
        ` Scale factor: ${visualMeta.scaleFactor}x`,
        ` Capture size: ${visualMeta.captureSize.w}x${visualMeta.captureSize.h}`,
    ];
    const respond = () => ({ content: [{ type: "text", text: report.join("\n") }] });
    // Without loaded map data only the metadata summary can be reported.
    if (!mapData) {
        return respond();
    }
    // Tally elements by where their label came from; validation statistics are
    // only accumulated for visually derived (OCR/LLM) elements.
    const tally = { visual: 0, ax: 0, matches: 0, mismatches: 0 };
    for (const { elements } of Object.values(mapData.zones)) {
        for (const el of elements) {
            switch (el.labelSource) {
                case "ocr":
                case "llm":
                    tally.visual += 1;
                    tally.matches += el.validationCount ?? 0;
                    tally.mismatches += el.mismatchCount ?? 0;
                    break;
                case "ax":
                case "manual":
                    tally.ax += 1;
                    break;
                default:
                    break;
            }
        }
    }
    report.push(` Visual-scan elements: ${tally.visual}`);
    report.push(` AX-confirmed elements: ${tally.ax}`);
    const observations = tally.matches + tally.mismatches;
    if (observations > 0) {
        const matchRate = tally.matches / observations;
        report.push(` Position match rate: ${(matchRate * 100).toFixed(1)}% (${tally.matches} matches, ${tally.mismatches} mismatches)`);
    }
    // Zone breakdown, truncated to the first 15 zones.
    const zoneNames = Object.keys(mapData.zones);
    if (zoneNames.length > 0) {
        report.push(" Zones:");
        zoneNames.slice(0, 15).forEach((name) => {
            const entry = mapData.zones[name];
            report.push(` ${name} (${entry.type}): ${entry.elements.length} elements`);
        });
        if (zoneNames.length > 15) {
            report.push(` ... and ${zoneNames.length - 15} more`);
        }
    }
    return respond();
});
6648
7068
  server.tool("coverage_report", "Check what ScreenHand knows about an app: shortcuts, selectors, flows, playbooks, error patterns, and stability %. Useful before complex workflows to decide strategy: learn first (if empty), go fast (if high coverage), or use fallback tools (if error patterns exist). Optional for quick actions.", {
@@ -6751,10 +7171,14 @@ originalTool("community_fetch", "Search community playbooks for a platform or wo
6751
7171
  // START
6752
7172
  // ═══════════════════════════════════════════════
6753
7173
  async function main() {
6754
- // Flush playbook learnings on graceful shutdown
6755
- process.on("SIGINT", () => { void perceptionManager.stop(); contextTracker.flush(); learningEngine.flush(); appMap.flush(); process.exit(0); });
6756
- process.on("SIGTERM", () => { void perceptionManager.stop(); contextTracker.flush(); learningEngine.flush(); appMap.flush(); process.exit(0); });
6757
- process.on("beforeExit", () => { void perceptionManager.stop(); contextTracker.flush(); learningEngine.flush(); appMap.flush(); });
7174
+ // Flush all learned state on shutdown (signals, stdin EOF, or normal exit)
7175
+ const flushAll = () => { void perceptionManager.stop(); perceptionManager.stopStallDetection(); stateWatcher.stop(); contextTracker.flush(); learningEngine.flush(); appMap.flush(); };
7176
+ process.on("SIGINT", () => { flushAll(); process.exit(0); });
7177
+ process.on("SIGTERM", () => { flushAll(); process.exit(0); });
7178
+ process.on("beforeExit", flushAll);
7179
+ // MCP clients often close stdin without sending a signal — flush on stdin end too
7180
+ process.stdin.on("end", () => { flushAll(); process.exit(0); });
7181
+ process.stdin.on("close", () => { flushAll(); process.exit(0); });
6758
7182
  const transport = new StdioServerTransport();
6759
7183
  await server.connect(transport);
6760
7184
  }