screenhand 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/mcp-desktop.js +572 -162
- package/dist/src/community/fetcher.js +32 -2
- package/dist/src/community/validator.js +15 -1
- package/dist/src/context-tracker.js +115 -43
- package/dist/src/ingestion/reference-merger.js +3 -1
- package/dist/src/learning/engine.js +225 -7
- package/dist/src/learning/locator-policy.js +16 -0
- package/dist/src/learning/pattern-policy.js +9 -0
- package/dist/src/learning/recovery-policy.js +16 -0
- package/dist/src/learning/sensor-policy.js +9 -0
- package/dist/src/learning/timing-model.js +62 -0
- package/dist/src/memory/research.js +7 -1
- package/dist/src/memory/store.js +18 -7
- package/dist/src/perception/coordinator.js +304 -4
- package/dist/src/perception/manager.js +13 -0
- package/dist/src/perception/vision-source.js +14 -4
- package/dist/src/planner/executor.js +125 -2
- package/dist/src/planner/planner.js +509 -10
- package/dist/src/playbook/engine.js +10 -0
- package/dist/src/recovery/engine.js +50 -3
- package/dist/src/runtime/execution-contract.js +67 -5
- package/dist/src/runtime/executor.js +41 -1
- package/dist/src/runtime/service.js +7 -0
- package/dist/src/state/app-map.js +307 -17
- package/dist/src/util/atomic-write.js +25 -4
- package/dist-references/reddit.json +2 -2
- package/package.json +1 -1
package/dist/mcp-desktop.js
CHANGED
|
@@ -55,7 +55,7 @@ import { WorldModel } from "./src/state/index.js";
|
|
|
55
55
|
import { PerceptionManager } from "./src/perception/index.js";
|
|
56
56
|
import { Planner, PlanExecutor, GoalStore, ToolRegistry } from "./src/planner/index.js";
|
|
57
57
|
import { RecoveryEngine } from "./src/recovery/index.js";
|
|
58
|
-
import { LearningEngine } from "./src/learning/index.js";
|
|
58
|
+
import { LearningEngine, LocatorPolicy } from "./src/learning/index.js";
|
|
59
59
|
import { discoverWebElements, testWebElement, compileReference, saveExploreResult, discoverNativeElements } from "./src/platform/explorer.js";
|
|
60
60
|
import { buildDocUrls, crawlPage, compileLearnResult, saveLearnResult } from "./src/platform/learner.js";
|
|
61
61
|
import { AccessibilityAdapter } from "./src/runtime/accessibility-adapter.js";
|
|
@@ -234,6 +234,10 @@ let CDP = null;
|
|
|
234
234
|
async function ensureCDP(overridePort) {
|
|
235
235
|
if (!CDP)
|
|
236
236
|
CDP = (await import("chrome-remote-interface")).default;
|
|
237
|
+
// Validate port range (defense in depth — Zod validates at MCP boundary, this catches internal callers)
|
|
238
|
+
if (overridePort && (overridePort < 9222 || overridePort > 9999)) {
|
|
239
|
+
throw new Error(`Invalid CDP port ${overridePort} — must be 9222-9999`);
|
|
240
|
+
}
|
|
237
241
|
// If caller specified a port, use it directly (e.g. 9333 for Electron apps)
|
|
238
242
|
if (overridePort) {
|
|
239
243
|
try {
|
|
@@ -263,84 +267,100 @@ async function ensureCDP(overridePort) {
|
|
|
263
267
|
throw new Error("Chrome not running with --remote-debugging-port. Launch with: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug");
|
|
264
268
|
}
|
|
265
269
|
const server = new McpServer({ name: "screenhand", version: "3.0.0" }, {
|
|
266
|
-
instructions: `ScreenHand gives you native desktop control on macOS/Windows. 111 tools
|
|
270
|
+
instructions: `ScreenHand gives you native desktop control on macOS/Windows. 111 tools. Never click blind — always follow: KNOW → SEE → NAVIGATE → ACT → VERIFY → STOP.
|
|
271
|
+
|
|
272
|
+
## The Golden Sequence (follow this order)
|
|
273
|
+
|
|
274
|
+
### 1. KNOW (before touching anything)
|
|
275
|
+
platform_guide("figma") → get selectors, flows, known errors for this app/site
|
|
276
|
+
memory_recall("figma export") → check if you've done this before — reuse past strategies
|
|
277
|
+
scan_menu_bar() → discover all menu items in the current app
|
|
278
|
+
|
|
279
|
+
If platform_guide() has no data: platform_explore("bundleId") to auto-discover the app, or platform_learn("domain") for websites.
|
|
280
|
+
|
|
281
|
+
### 2. SEE (understand current state)
|
|
282
|
+
apps() → what's running?
|
|
283
|
+
perception_start() → turn on continuous monitoring (3-rate: 100ms/300ms/1000ms)
|
|
284
|
+
world_state() → current app, windows, controls, dialogs
|
|
285
|
+
screenshot() → visual confirmation if needed
|
|
286
|
+
|
|
287
|
+
perception_start() keeps world_state() continuously updated. Use it for complex multi-step workflows.
|
|
267
288
|
|
|
268
|
-
|
|
289
|
+
### 3. NAVIGATE (get to the right place)
|
|
290
|
+
focus("com.figma.Desktop") → bring app to front
|
|
291
|
+
ui_tree() → see all clickable elements with roles and labels
|
|
292
|
+
ui_find("Export") → check if a specific target exists before clicking
|
|
269
293
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
294
|
+
### 4. ACT (do the thing)
|
|
295
|
+
click_with_fallback("Export") → click element (auto-tries AX → CDP → OCR → coordinates)
|
|
296
|
+
type_with_fallback("filename") → type text with auto-fallback
|
|
297
|
+
key("cmd+shift+e") → keyboard shortcuts
|
|
298
|
+
drag(fromX, fromY, toX, toY) → drag and drop
|
|
299
|
+
scroll(direction) → scroll up/down/left/right
|
|
275
300
|
|
|
276
|
-
|
|
301
|
+
Always prefer *_with_fallback tools over bare click/type — they auto-recover when one method fails.
|
|
277
302
|
|
|
278
|
-
###
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
- **perception_stop()** — turn off when done to save resources.
|
|
283
|
-
- Pattern: perception_start() → do work → world_state() to verify → perception_stop()
|
|
303
|
+
### 5. VERIFY (confirm it worked)
|
|
304
|
+
world_state() → did UI change as expected?
|
|
305
|
+
world_state_diff() → what exactly changed since last check?
|
|
306
|
+
screenshot() → visual proof
|
|
284
307
|
|
|
285
|
-
###
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
- **memory_recall(query)** — retrieve saved strategies, past errors, what worked before. ALWAYS recall before attempting unfamiliar platforms.
|
|
289
|
-
- **learning_status()** — see what ScreenHand has learned: locator preferences, recovery rankings, timing budgets per app.
|
|
290
|
-
- **learning_reset()** — nuclear option, clears all learning. Rarely needed.
|
|
291
|
-
- Pattern: memory_recall("instagram post") → use recalled strategy → if new approach works, memory_save() it
|
|
308
|
+
### 6. STOP (clean up)
|
|
309
|
+
perception_stop() → stop monitoring (save resources)
|
|
310
|
+
memory_save("figma_export", ...) → save successful strategy for next time
|
|
292
311
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
312
|
+
## For Web/Browser (Chrome, Electron apps)
|
|
313
|
+
browser_navigate("https://...") → go to URL
|
|
314
|
+
browser_stealth() → activate FIRST if site has bot detection
|
|
315
|
+
browser_dom() → read page structure (CSS selectors)
|
|
316
|
+
browser_click("#submit") → click element by CSS selector
|
|
317
|
+
browser_type("input", "text") → type into form field
|
|
318
|
+
browser_fill_form({...}) → fill multiple fields at once (human-like timing)
|
|
319
|
+
browser_js("return ...") → run JavaScript for complex extraction/actions
|
|
320
|
+
browser_wait("selector") → wait for element to appear
|
|
321
|
+
browser_human_click(x, y) → human-like click with randomized timing
|
|
299
322
|
|
|
300
|
-
|
|
301
|
-
- **platform_guide("figma")** — get selectors, flows, known errors for a platform. Call FIRST when automating any app/site.
|
|
302
|
-
- **platform_explore("bundleId")** — auto-discover an unknown app's UI structure.
|
|
303
|
-
- **platform_learn("domain")** — learn a website's structure by crawling.
|
|
304
|
-
- **scan_menu_bar()** — discover all menu items in the current app.
|
|
305
|
-
- Pattern: platform_guide() first → if not found, platform_explore() → then automate
|
|
323
|
+
All browser tools work in the background (~10ms) — no need to focus Chrome.
|
|
306
324
|
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
325
|
+
## For Complex Multi-Step Tasks (let ScreenHand plan it)
|
|
326
|
+
plan_goal("Export video as H.264") → describe WHAT you want — ScreenHand generates steps from playbooks, strategies, and references
|
|
327
|
+
plan_execute(goalId) → auto-run deterministic steps, pauses at LLM steps for your judgment
|
|
328
|
+
plan_step_resolve(goalId, tool, params) → you provide the tool+params for LLM steps
|
|
329
|
+
plan_status(goalId) → check progress
|
|
330
|
+
plan_cancel(goalId) → abort if needed
|
|
313
331
|
|
|
314
|
-
|
|
315
|
-
- **session_claim()** — claim exclusive access to an app window (lease-based).
|
|
316
|
-
- **session_heartbeat()** — keep your lease alive.
|
|
317
|
-
- **session_release()** — release when done.
|
|
318
|
-
- **supervisor_start()** — background daemon that detects stalled agents and recovers.
|
|
319
|
-
- Pattern: session_claim() → do work → session_heartbeat() periodically → session_release()
|
|
332
|
+
On success, the strategy is auto-saved to memory for future reuse.
|
|
320
333
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
- **plan_list()** — see all goals (active, completed, failed).
|
|
328
|
-
- **plan_cancel(goalId)** — abort a goal.
|
|
329
|
-
- Pattern: plan_goal("do X") → review steps → plan_execute() → resolve LLM steps as they pause → on success, strategy auto-saved to memory
|
|
334
|
+
## For Repeatable Workflows (automate once, run forever)
|
|
335
|
+
playbook_record() → start recording your actions
|
|
336
|
+
... do the work ...
|
|
337
|
+
export_playbook() → save as reusable playbook
|
|
338
|
+
job_create("daily post", steps) → make it a persistent job
|
|
339
|
+
worker_start() → background daemon runs jobs autonomously
|
|
330
340
|
|
|
331
|
-
|
|
332
|
-
1. **ui_tree + ui_press** for native app elements (fastest, most reliable)
|
|
333
|
-
2. **browser_* tools** for web content in Chrome/Electron
|
|
334
|
-
3. ***_with_fallback** when you're unsure which method will work
|
|
335
|
-
4. **screenshot + ocr** only for canvas apps or visual verification
|
|
336
|
-
5. **applescript** for macOS-specific automation (Finder, Mail, etc.)
|
|
341
|
+
Jobs survive MCP client restarts. worker_start() runs independently.
|
|
337
342
|
|
|
338
|
-
##
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
343
|
+
## For Multi-Agent Coordination
|
|
344
|
+
session_claim() → claim exclusive access to an app window (lease-based)
|
|
345
|
+
session_heartbeat() → keep your lease alive (call periodically)
|
|
346
|
+
session_release() → release when done
|
|
347
|
+
supervisor_start() → daemon that detects stalled agents and auto-recovers
|
|
348
|
+
|
|
349
|
+
## Self-Healing (automatic — no action needed)
|
|
350
|
+
When any tool fails, ScreenHand automatically tries alternative strategies (AX → CDP → OCR → coordinates). Learning is also automatic — every tool call teaches which selectors work, optimal timing, and recovery rankings per app. Check with:
|
|
351
|
+
- learning_status() → see learned preferences per app
|
|
352
|
+
- recovery_status() → see active cooldowns and cached strategies
|
|
353
|
+
- recovery_configure() → tune recovery budget (max time, max retries)
|
|
354
|
+
|
|
355
|
+
## Tool Speed Priority
|
|
356
|
+
1. **ui_tree + ui_press** — native Accessibility API, ~50ms (fastest, most reliable)
|
|
357
|
+
2. **browser_* tools** — Chrome DevTools Protocol, ~10ms (background, no focus needed)
|
|
358
|
+
3. ***_with_fallback** — auto-tries multiple methods (~100-500ms)
|
|
359
|
+
4. **screenshot + ocr** — visual capture, ~600ms (only for canvas apps)
|
|
360
|
+
5. **applescript** — macOS scripting (Finder, Mail, Safari, etc.)
|
|
361
|
+
|
|
362
|
+
## Key Rule
|
|
363
|
+
Never click blind. Always: KNOW → SEE → NAVIGATE → ACT → VERIFY.
|
|
344
364
|
`,
|
|
345
365
|
});
|
|
346
366
|
// ═══════════════════════════════════════════════
|
|
@@ -411,6 +431,17 @@ let lastSuccessfulToolName = "unknown";
|
|
|
411
431
|
let lastKnownBundleId = null;
|
|
412
432
|
contextTracker.setAppMap(appMap);
|
|
413
433
|
perceptionManager.setAppMap(appMap);
|
|
434
|
+
// Wire F10: connect ContextTracker to PerceptionCoordinator for per-app perception config
|
|
435
|
+
perceptionManager.setContextTracker(contextTracker);
|
|
436
|
+
// Wire #11: connect TopologyPolicy to AppMap for unified edge scoring
|
|
437
|
+
appMap.setTopologyPolicy(learningEngine.topology);
|
|
438
|
+
// Wire #14: seed TimingModel from AppMap's stored timing profiles (cold-start bootstrap)
|
|
439
|
+
learningEngine.seedTimingFromAppMap(appMap);
|
|
440
|
+
// Wire F5-F7: Cold-start bootstrap — seed all learning policies from AppMap data
|
|
441
|
+
learningEngine.seedLocatorsFromAppMap(appMap);
|
|
442
|
+
learningEngine.seedSensorsFromReadySignals(appMap);
|
|
443
|
+
learningEngine.seedPatternsFromAppMap(appMap);
|
|
444
|
+
learningEngine.seedRecoveryFromContracts(appMap);
|
|
414
445
|
const _executablePlaybookStore = new PlaybookStore(playbooksDir);
|
|
415
446
|
try {
|
|
416
447
|
_executablePlaybookStore.load();
|
|
@@ -422,7 +453,9 @@ goalStore.init();
|
|
|
422
453
|
const toolRegistry = new ToolRegistry();
|
|
423
454
|
const recoveryEngine = new RecoveryEngine(worldModel, toolRegistry.toExecutor(), memory);
|
|
424
455
|
recoveryEngine.setLearningEngine(learningEngine);
|
|
456
|
+
recoveryEngine.setAppMap(appMap);
|
|
425
457
|
planner.setToolRegistry(toolRegistry);
|
|
458
|
+
planner.setAppMap(appMap);
|
|
426
459
|
perceptionManager.setLearningEngine(learningEngine);
|
|
427
460
|
const mcpRecorder = new McpPlaybookRecorder(playbooksDir);
|
|
428
461
|
const referenceMerger = new ReferenceMerger(referencesDir);
|
|
@@ -513,6 +546,20 @@ server.tool = (...args) => {
|
|
|
513
546
|
perceptionManager.notifyToolCall();
|
|
514
547
|
// ── PRE-CALL: check for known error warnings (~0ms, in-memory) ──
|
|
515
548
|
const knownError = memory.quickErrorCheck(toolName);
|
|
549
|
+
// Wire F11: Block execution for tools that fail repeatedly with known resolution (L2→L1)
|
|
550
|
+
// Exclude playbook-seeded errors (id starts with pb_err_) — those are generic platform warnings,
|
|
551
|
+
// not errors observed in this session. Only block on real runtime failures.
|
|
552
|
+
// Also exclude errors injected via memory_record_error API (empty params) — only runtime errors
|
|
553
|
+
// from the intelligence wrapper (which always have populated params) should trigger blocks.
|
|
554
|
+
const isRuntimeError = knownError && typeof knownError.params === "object" && knownError.params !== null && Object.keys(knownError.params).length > 0;
|
|
555
|
+
if (knownError && knownError.occurrences >= 5 && knownError.resolution && !knownError.id.startsWith("pb_err_") && isRuntimeError) {
|
|
556
|
+
return {
|
|
557
|
+
content: [{
|
|
558
|
+
type: "text",
|
|
559
|
+
text: `⛔ Blocked: "${toolName}" has failed ${knownError.occurrences}x with: "${knownError.error}". Known fix: ${knownError.resolution}. Apply the fix first, then retry.`,
|
|
560
|
+
}],
|
|
561
|
+
};
|
|
562
|
+
}
|
|
516
563
|
// ── PRE-CALL: auto-start perception if not running ──
|
|
517
564
|
if (!perceptionManager.isRunning && bridgeReady) {
|
|
518
565
|
const focusApp = worldModel.getState().focusedApp;
|
|
@@ -548,6 +595,9 @@ server.tool = (...args) => {
|
|
|
548
595
|
else if (typeof paramBundleId === "string" && paramBundleId) {
|
|
549
596
|
lastKnownBundleId = paramBundleId;
|
|
550
597
|
}
|
|
598
|
+
// Snapshot the bundleId for this tool's POST-CALL, so concurrent PRE-CALL
|
|
599
|
+
// overwrites of lastKnownBundleId don't contaminate this tool's context
|
|
600
|
+
const postCallBundleId = preBundleId ?? lastKnownBundleId;
|
|
551
601
|
// Capture pre-call window title for navigation edge tracking
|
|
552
602
|
const preWindowTitle = worldModel.getFocusedWindow()?.title.value ?? null;
|
|
553
603
|
// Action tools = actually doing something. Navigation = just clicking around.
|
|
@@ -578,7 +628,7 @@ server.tool = (...args) => {
|
|
|
578
628
|
contextTracker.recordOutcome(toolName, safeParams, true, null);
|
|
579
629
|
// ── POST-CALL: Safari context gap + page context update ──
|
|
580
630
|
const postFocusApp = worldModel.getState().focusedApp;
|
|
581
|
-
const postBundleIdForCtx = postFocusApp?.bundleId ??
|
|
631
|
+
const postBundleIdForCtx = postFocusApp?.bundleId ?? postCallBundleId;
|
|
582
632
|
if (postBundleIdForCtx) {
|
|
583
633
|
lastKnownBundleId = postBundleIdForCtx;
|
|
584
634
|
// Try focused window first, then search all windows for matching bundleId
|
|
@@ -622,7 +672,7 @@ server.tool = (...args) => {
|
|
|
622
672
|
}
|
|
623
673
|
}
|
|
624
674
|
// ── POST-CALL: feed learning engine (timing + locator outcomes) ──
|
|
625
|
-
const learnBundleId = worldModel.getState().focusedApp?.bundleId ??
|
|
675
|
+
const learnBundleId = worldModel.getState().focusedApp?.bundleId ?? postCallBundleId ?? "unknown";
|
|
626
676
|
learningEngine.recordToolTiming({ tool: toolName, bundleId: learnBundleId, durationMs, success: true });
|
|
627
677
|
// Record locator outcome if the tool used a target/selector
|
|
628
678
|
const locatorTarget = safeParams.target ?? safeParams.selector ?? safeParams.locator
|
|
@@ -901,14 +951,17 @@ server.tool = (...args) => {
|
|
|
901
951
|
if (fromNode !== toNode) {
|
|
902
952
|
appMap.addNavNode(learnBundleId, fromNode, { type: "window", description: fromNode });
|
|
903
953
|
appMap.addNavNode(learnBundleId, toNode, { type: "window", description: toNode });
|
|
904
|
-
|
|
954
|
+
const locatorSlug = locatorTarget ? String(locatorTarget).slice(0, 80) : null;
|
|
955
|
+
const edgeAction = locatorSlug ? `${toolName}:${locatorSlug}` : toolName;
|
|
956
|
+
// Wire #11: record topology FIRST so AppMap can read the updated Bayesian score
|
|
905
957
|
learningEngine.recordTopologyOutcome({
|
|
906
958
|
bundleId: learnBundleId,
|
|
907
959
|
fromNode,
|
|
908
|
-
action:
|
|
960
|
+
action: edgeAction,
|
|
909
961
|
toNode,
|
|
910
962
|
success: true,
|
|
911
963
|
});
|
|
964
|
+
appMap.recordEdgeOutcome(learnBundleId, fromNode, edgeAction, toNode, true);
|
|
912
965
|
}
|
|
913
966
|
}
|
|
914
967
|
// ── State machine: detect state changes from tool results ──
|
|
@@ -1266,7 +1319,7 @@ server.tool = (...args) => {
|
|
|
1266
1319
|
// ── Record failure for playbook learning (in-memory only) ──
|
|
1267
1320
|
contextTracker.recordOutcome(toolName, safeParams, false, errorMsg);
|
|
1268
1321
|
// ── Feed learning engine (failure timing + locator) ──
|
|
1269
|
-
const learnBundleIdErr = worldModel.getState().focusedApp?.bundleId ??
|
|
1322
|
+
const learnBundleIdErr = worldModel.getState().focusedApp?.bundleId ?? postCallBundleId ?? "unknown";
|
|
1270
1323
|
learningEngine.recordToolTiming({ tool: toolName, bundleId: learnBundleIdErr, durationMs, success: false });
|
|
1271
1324
|
const failedLocator = safeParams.target ?? safeParams.selector ?? safeParams.locator
|
|
1272
1325
|
?? (toolName === "click_text" ? safeParams.text : undefined);
|
|
@@ -1422,7 +1475,7 @@ server.tool("windows", "List all visible windows with IDs, positions, and sizes"
|
|
|
1422
1475
|
return { content: [{ type: "text", text: lines.join("\n") }] };
|
|
1423
1476
|
});
|
|
1424
1477
|
server.tool("focus", "Focus/activate an application (or a specific window by windowId)", {
|
|
1425
|
-
bundleId: z.string().describe("App bundle ID, e.g. com.apple.Safari"),
|
|
1478
|
+
bundleId: z.string().regex(/^[a-zA-Z0-9._-]+$/, "Invalid bundleId format").describe("App bundle ID, e.g. com.apple.Safari"),
|
|
1426
1479
|
windowId: z.number().optional().describe("Specific window ID from windows() — raises that exact window. Use when multiple instances of the same app exist."),
|
|
1427
1480
|
}, async ({ bundleId, windowId }) => {
|
|
1428
1481
|
await ensureBridge();
|
|
@@ -1528,8 +1581,8 @@ server.tool("focus", "Focus/activate an application (or a specific window by win
|
|
|
1528
1581
|
}
|
|
1529
1582
|
});
|
|
1530
1583
|
server.tool("launch", "Launch an application. Chrome/Chromium browsers are launched with CDP enabled (port 9222) for browser_* tools.", {
|
|
1531
|
-
bundleId: z.string().describe("App bundle ID"),
|
|
1532
|
-
cdpPort: z.number().optional().describe("CDP port for Chrome/Chromium (default: 9222). Ignored for non-browser apps."),
|
|
1584
|
+
bundleId: z.string().regex(/^[a-zA-Z0-9._-]+$/, "Invalid bundleId format").describe("App bundle ID"),
|
|
1585
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port for Chrome/Chromium (default: 9222). Ignored for non-browser apps."),
|
|
1533
1586
|
}, async ({ bundleId, cdpPort }) => {
|
|
1534
1587
|
await ensureBridge();
|
|
1535
1588
|
const riskyBundleIds = {
|
|
@@ -1930,7 +1983,7 @@ server.tool("click_text", "SLOW fallback: Find text on screen via OCR and click
|
|
|
1930
1983
|
server.tool("type_text", "Type text using the keyboard. Auto-detects Electron apps and routes through CDP for reliable editor input.", {
|
|
1931
1984
|
text: z.string().describe("Text to type"),
|
|
1932
1985
|
pid: z.number().optional().describe("Target process ID for PID-targeted event delivery"),
|
|
1933
|
-
cdpPort: z.number().optional().describe("CDP port for Electron apps (e.g. 9229). When set, types via CDP instead of AX — fixes Copilot/panel focus theft."),
|
|
1986
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port for Electron apps (e.g. 9229). When set, types via CDP instead of AX — fixes Copilot/panel focus theft."),
|
|
1934
1987
|
}, async ({ text, pid, cdpPort: portOverride }) => {
|
|
1935
1988
|
await ensureBridge();
|
|
1936
1989
|
// Auto-resolve frontmost PID when none provided — global HID posting
|
|
@@ -2178,7 +2231,7 @@ function randomDelay(min, max) {
|
|
|
2178
2231
|
// BROWSER — control Chrome pages via CDP (10ms, not OCR)
|
|
2179
2232
|
// ═══════════════════════════════════════════════
|
|
2180
2233
|
server.tool("browser_tabs", "List all open Chrome/Electron tabs. Use cdpPort to connect to a specific app (e.g. 9333 for Codex Desktop).", {
|
|
2181
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps). Omit to auto-detect."),
|
|
2234
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps). Omit to auto-detect."),
|
|
2182
2235
|
}, async ({ cdpPort: portOverride }) => {
|
|
2183
2236
|
const { CDP: cdp, port } = await ensureCDP(portOverride);
|
|
2184
2237
|
const targets = await cdp.List({ port });
|
|
@@ -2188,7 +2241,7 @@ server.tool("browser_tabs", "List all open Chrome/Electron tabs. Use cdpPort to
|
|
|
2188
2241
|
});
|
|
2189
2242
|
server.tool("browser_open", "Open a URL in Chrome/Electron (creates new tab)", {
|
|
2190
2243
|
url: z.string().describe("URL to open"),
|
|
2191
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2244
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2192
2245
|
}, async ({ url, cdpPort: portOverride }) => {
|
|
2193
2246
|
// L2-71 fix: Block dangerous URL protocols
|
|
2194
2247
|
const BLOCKED_PROTOCOLS = ["javascript:", "data:", "blob:", "vbscript:"];
|
|
@@ -2212,7 +2265,7 @@ server.tool("browser_open", "Open a URL in Chrome/Electron (creates new tab)", {
|
|
|
2212
2265
|
server.tool("browser_navigate", "Navigate the active Chrome/Electron tab to a URL", {
|
|
2213
2266
|
url: z.string().describe("URL to navigate to"),
|
|
2214
2267
|
tabId: z.string().optional().describe("Tab ID (from browser_tabs). Omit for most recent tab."),
|
|
2215
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2268
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2216
2269
|
}, async ({ url, tabId, cdpPort: portOverride }) => {
|
|
2217
2270
|
// L2-71 fix: Block dangerous URL protocols that could execute arbitrary code
|
|
2218
2271
|
const BLOCKED_PROTOCOLS = ["javascript:", "data:", "blob:", "vbscript:"];
|
|
@@ -2257,7 +2310,7 @@ server.tool("browser_navigate", "Navigate the active Chrome/Electron tab to a UR
|
|
|
2257
2310
|
server.tool("browser_js", "Execute JavaScript in a Chrome/Electron tab. Returns the result. WARNING: This runs arbitrary JS in the browser context — avoid on sensitive pages (banking, email). All executions are audit-logged.", {
|
|
2258
2311
|
code: z.string().describe("JavaScript to execute. Must be an expression that returns a value. Use (() => { ... })() for multi-line."),
|
|
2259
2312
|
tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
|
|
2260
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2313
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2261
2314
|
}, async ({ code, tabId, cdpPort: portOverride }) => {
|
|
2262
2315
|
auditLog("browser_js", { code, tabId });
|
|
2263
2316
|
const { CDP: cdp, port } = await ensureCDP(portOverride);
|
|
@@ -2291,7 +2344,7 @@ server.tool("browser_dom", "Query the DOM of a Chrome/Electron page. Returns mat
|
|
|
2291
2344
|
selector: z.string().describe("CSS selector, e.g. 'button', '.nav a', '#main h2'"),
|
|
2292
2345
|
tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
|
|
2293
2346
|
limit: z.number().optional().describe("Max results (default 20)"),
|
|
2294
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2347
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2295
2348
|
}, async ({ selector, tabId, limit, cdpPort: portOverride }) => {
|
|
2296
2349
|
// Capture bundleId before any async CDP calls to avoid race condition
|
|
2297
2350
|
const browserBundleId = worldModel.getState().focusedApp?.bundleId ?? "com.google.Chrome";
|
|
@@ -2342,7 +2395,7 @@ server.tool("browser_dom", "Query the DOM of a Chrome/Electron page. Returns mat
|
|
|
2342
2395
|
server.tool("browser_click", "Click an element in Chrome/Electron by CSS selector. Uses CDP Input.dispatchMouseEvent for realistic mouse events.", {
|
|
2343
2396
|
selector: z.string().describe("CSS selector of element to click"),
|
|
2344
2397
|
tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
|
|
2345
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2398
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2346
2399
|
}, async ({ selector, tabId, cdpPort: portOverride }) => {
|
|
2347
2400
|
const { client } = await getCDPClient(tabId, portOverride);
|
|
2348
2401
|
await client.Runtime.enable();
|
|
@@ -2375,7 +2428,7 @@ server.tool("browser_type", "Type into an input field in Chrome/Electron. Uses C
|
|
|
2375
2428
|
text: z.string().describe("Text to type"),
|
|
2376
2429
|
clear: z.boolean().optional().describe("Clear field first (default true)"),
|
|
2377
2430
|
tabId: z.string().optional().describe("Tab ID"),
|
|
2378
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2431
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2379
2432
|
}, async ({ selector, text, clear, tabId, cdpPort: portOverride }) => {
|
|
2380
2433
|
const { client } = await getCDPClient(tabId, portOverride);
|
|
2381
2434
|
await client.Runtime.enable();
|
|
@@ -2416,7 +2469,7 @@ server.tool("browser_wait", "Wait for a condition on a Chrome/Electron page", {
|
|
|
2416
2469
|
condition: z.string().describe("JS expression that returns truthy when ready. e.g. 'document.querySelector(\".loaded\")'"),
|
|
2417
2470
|
timeoutMs: z.number().optional().describe("Timeout in ms (default 10000)"),
|
|
2418
2471
|
tabId: z.string().optional().describe("Tab ID"),
|
|
2419
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2472
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2420
2473
|
}, async ({ condition, timeoutMs, tabId, cdpPort: portOverride }) => {
|
|
2421
2474
|
const { CDP: cdp, port } = await ensureCDP(portOverride);
|
|
2422
2475
|
let targetId = tabId;
|
|
@@ -2444,7 +2497,7 @@ server.tool("browser_wait", "Wait for a condition on a Chrome/Electron page", {
|
|
|
2444
2497
|
});
|
|
2445
2498
|
server.tool("browser_page_info", "Get current page title, URL, and text content summary", {
|
|
2446
2499
|
tabId: z.string().optional().describe("Tab ID"),
|
|
2447
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2500
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2448
2501
|
}, async ({ tabId, cdpPort: portOverride }) => {
|
|
2449
2502
|
// Capture bundleId BEFORE CDP call to prevent focus-change race
|
|
2450
2503
|
const browserBundleId = worldModel.getState().focusedApp?.bundleId ?? "com.google.Chrome";
|
|
@@ -2519,7 +2572,7 @@ if (origQuery) {
|
|
|
2519
2572
|
`;
|
|
2520
2573
|
server.tool("browser_stealth", "Inject anti-detection patches into Chrome/Electron page. Call once after navigating to a protected site. Hides webdriver flag, patches plugins/languages/permissions.", {
|
|
2521
2574
|
tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
|
|
2522
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2575
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2523
2576
|
}, async ({ tabId, cdpPort: portOverride }) => {
|
|
2524
2577
|
const { client } = await getCDPClient(tabId, portOverride);
|
|
2525
2578
|
await client.Page.enable();
|
|
@@ -2539,7 +2592,7 @@ server.tool("browser_fill_form", "Fill a form field with human-like typing (anti
|
|
|
2539
2592
|
clear: z.boolean().optional().describe("Clear field first (default true)"),
|
|
2540
2593
|
delayMs: z.number().optional().describe("Avg delay between keystrokes in ms (default 50)"),
|
|
2541
2594
|
tabId: z.string().optional().describe("Tab ID"),
|
|
2542
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2595
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2543
2596
|
}, async ({ selector, text, clear, delayMs, tabId, cdpPort: portOverride }) => {
|
|
2544
2597
|
const { client } = await getCDPClient(tabId, portOverride);
|
|
2545
2598
|
await client.Runtime.enable();
|
|
@@ -2583,7 +2636,7 @@ server.tool("browser_fill_form", "Fill a form field with human-like typing (anti
|
|
|
2583
2636
|
server.tool("browser_human_click", "Alias for browser_click — both use realistic mouseMoved → mousePressed → mouseReleased events. Prefer browser_click directly.", {
|
|
2584
2637
|
selector: z.string().describe("CSS selector of element to click"),
|
|
2585
2638
|
tabId: z.string().optional().describe("Tab ID. Omit for most recent tab."),
|
|
2586
|
-
cdpPort: z.number().optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2639
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port override (e.g. 9333 for Electron apps)"),
|
|
2587
2640
|
}, async ({ selector, tabId, cdpPort: portOverride }) => {
|
|
2588
2641
|
const { client } = await getCDPClient(tabId, portOverride);
|
|
2589
2642
|
await client.Runtime.enable();
|
|
@@ -2991,7 +3044,7 @@ server.tool("playbook_record", "Macro recorder: start recording, do the flow, st
|
|
|
2991
3044
|
platform: z.string().optional().describe("Platform name (required for start)"),
|
|
2992
3045
|
name: z.string().optional().describe("Playbook name (required for stop)"),
|
|
2993
3046
|
description: z.string().optional().describe("Playbook description (for stop)"),
|
|
2994
|
-
cdpPort: z.number().optional().describe("CDP port if needed for browser_js steps (e.g. 9333 for Codex)"),
|
|
3047
|
+
cdpPort: z.number().min(9222).max(9999).optional().describe("CDP port if needed for browser_js steps (e.g. 9333 for Codex)"),
|
|
2995
3048
|
}, async ({ action, platform, name, description, cdpPort }) => {
|
|
2996
3049
|
switch (action) {
|
|
2997
3050
|
case "start": {
|
|
@@ -3162,6 +3215,27 @@ server.tool("applescript", "Run an AppleScript command. For controlling Finder,
|
|
|
3162
3215
|
if (process.platform === "win32") {
|
|
3163
3216
|
return { content: [{ type: "text", text: "AppleScript is not supported on Windows. Use ui_tree, ui_press, and other accessibility tools instead." }] };
|
|
3164
3217
|
}
|
|
3218
|
+
// Block shell execution vectors in AppleScript — denylist approach: reject scripts matching known-dangerous command patterns
|
|
3219
|
+
const scriptLower = script.toLowerCase();
|
|
3220
|
+
const BLOCKED_PATTERNS = [
|
|
3221
|
+
/do\s+shell\s+script/i, // direct shell execution
|
|
3222
|
+
/run\s+shell\s+script/i, // variant
|
|
3223
|
+
/run\s+script/i, // dynamic AppleScript eval (can construct blocked commands)
|
|
3224
|
+
/do\s+script/i, // Terminal.app shell execution
|
|
3225
|
+
/«class\s/i, // raw Apple Event codes (bypass text-level blocks)
|
|
3226
|
+
/system\s+events.*process/i, // process spawning via System Events
|
|
3227
|
+
/NSAppleScript/i, // Objective-C bridge
|
|
3228
|
+
/ObjC\.import/i, // JXA Objective-C bridge
|
|
3229
|
+
/\bshell\b/i, // catch-all for shell-related commands
|
|
3230
|
+
/do\s+JavaScript/i, // JXA execution
|
|
3231
|
+
];
|
|
3232
|
+
if (BLOCKED_PATTERNS.some(p => p.test(script))) {
|
|
3233
|
+
return { content: [{ type: "text", text: "Blocked: this AppleScript contains a restricted command (shell execution, dynamic eval, or process spawning). Use the Bash tool for shell commands." }] };
|
|
3234
|
+
}
|
|
3235
|
+
// Block string concatenation that could reassemble blocked commands
|
|
3236
|
+
if (/&/.test(script) && (/script/i.test(script) || /shell/i.test(script))) {
|
|
3237
|
+
return { content: [{ type: "text", text: "Blocked: AppleScript with string concatenation containing 'script' or 'shell' — potential bypass attempt." }] };
|
|
3238
|
+
}
|
|
3165
3239
|
try {
|
|
3166
3240
|
const result = execSync(`osascript -e '${script.replace(/'/g, "'\\''")}'`, {
|
|
3167
3241
|
encoding: "utf-8",
|
|
@@ -3722,7 +3796,7 @@ import { METHOD_CAPABILITIES, DEFAULT_RETRY_POLICY, planExecution, executeWithFa
|
|
|
3722
3796
|
server.tool("execution_plan", "Show the execution plan for an action type. Returns the ordered fallback chain based on available infrastructure.", {
|
|
3723
3797
|
action: z.enum(["click", "type", "read", "locate", "select", "scroll"]).describe("Action type"),
|
|
3724
3798
|
}, async ({ action }) => {
|
|
3725
|
-
const plan = planExecution(action, { hasBridge: true, hasCDP: cdpPort !== null });
|
|
3799
|
+
const plan = planExecution(action, { hasBridge: true, hasCDP: cdpPort !== null }, getSensorRanking());
|
|
3726
3800
|
const lines = plan.map((method, i) => {
|
|
3727
3801
|
const cap = METHOD_CAPABILITIES[method];
|
|
3728
3802
|
return `${i + 1}. ${method} (~${cap.avgLatencyMs}ms)${i === 0 ? " ← primary" : ""}`;
|
|
@@ -3764,26 +3838,161 @@ function infra() {
|
|
|
3764
3838
|
return { hasBridge: true, hasCDP: cdpPort !== null };
|
|
3765
3839
|
}
|
|
3766
3840
|
/**
 * Look up the learned sensor ranking for an app.
 *
 * The bundle ID is taken from the first non-nullish source, in order:
 * the explicit override, the world model's focused app, then the
 * module-level `lastKnownBundleId`.
 *
 * @param {string} [overrideBundleId] - Bundle ID supplied by the caller (e.g. from tool params).
 * @returns {Array|undefined} The result of `learningEngine.rankSensors(bundleId)` when it is
 *   non-empty; `undefined` when no bundle ID is known or the ranking is empty, so that
 *   callers can fall back to their canonical ordering.
 */
function getSensorRanking(overrideBundleId) {
    let appId = overrideBundleId;
    if (appId == null) {
        appId = worldModel.getState().focusedApp?.bundleId;
    }
    if (appId == null) {
        appId = lastKnownBundleId;
    }
    if (!appId) {
        return undefined;
    }
    const ranking = learningEngine.rankSensors(appId);
    if (ranking.length > 0) {
        return ranking;
    }
    return undefined;
}
|
|
3853
|
+
/**
 * Build a retry policy whose delay is adapted to how fast the target app
 * has been observed to respond.
 *
 * Source priority for the "typical" duration:
 *   1. AppMap timing profiles for the app whose key is `<action>` or starts
 *      with `<action>::` (the `_with_fallback` suffix is stripped from
 *      `toolName`). The median `avgMs` of `element_response` entries is used
 *      when any exist; otherwise the first matching entry with a usable `avgMs`.
 *   2. `currentAdaptiveBudget` — the larger of `locateMs` and `actMs`.
 *   3. Nothing usable — `DEFAULT_RETRY_POLICY` is returned unchanged.
 *
 * Non-finite durations (NaN, Infinity, non-numbers) are ignored at every
 * step so that the resulting `delayBetweenRetriesMs` can never become NaN.
 *
 * @param {string} [toolName] - Tool name such as "click_with_fallback".
 * @param {string} [overrideBundleId] - Bundle ID to use instead of the focused / last-known app.
 * @returns {object} `DEFAULT_RETRY_POLICY` itself, or a copy with a shorter `delayBetweenRetriesMs`.
 */
function getAdaptedRetryPolicy(toolName, overrideBundleId) {
    let typicalMs = null;
    const bundleId = overrideBundleId ?? worldModel.getState().focusedApp?.bundleId ?? lastKnownBundleId;
    if (bundleId && toolName) {
        // Timing keys look like "click::Submit"; fallback tools pass
        // "click_with_fallback", so strip the suffix to get the action prefix.
        const actionPrefix = toolName.replace(/_with_fallback$/, "");
        const allTimings = appMap.getTimingProfile(bundleId);
        const matchingTimings = allTimings.filter((t) => t.key.startsWith(actionPrefix + "::") || t.key === actionPrefix);
        const responseTimes = matchingTimings
            .filter((t) => t.type === "element_response")
            .map((t) => t.avgMs)
            .filter((ms) => Number.isFinite(ms))
            .sort((a, b) => a - b);
        if (responseTimes.length > 0) {
            // Median of the usable element_response averages.
            const mid = Math.floor(responseTimes.length / 2);
            typicalMs = responseTimes.length % 2 === 1
                ? responseTimes[mid]
                : (responseTimes[mid - 1] + responseTimes[mid]) / 2;
        }
        else {
            // No element_response data: take the first matching entry that has a usable average.
            const firstUsable = matchingTimings.find((t) => Number.isFinite(t.avgMs));
            if (firstUsable) {
                typicalMs = firstUsable.avgMs;
            }
        }
    }
    // Fall back to the adaptive budget when the AppMap gave nothing usable.
    if (typicalMs == null && currentAdaptiveBudget) {
        typicalMs = Math.max(currentAdaptiveBudget.locateMs, currentAdaptiveBudget.actMs);
    }
    // Covers both "no data" and a budget with missing/NaN fields.
    if (!Number.isFinite(typicalMs)) {
        return DEFAULT_RETRY_POLICY;
    }
    // Retry delay = max(100ms, typical * 1.5), capped at the default.
    const adaptedDelay = Math.min(DEFAULT_RETRY_POLICY.delayBetweenRetriesMs, Math.max(100, Math.ceil(typicalMs * 1.5)));
    if (adaptedDelay === DEFAULT_RETRY_POLICY.delayBetweenRetriesMs) {
        return DEFAULT_RETRY_POLICY;
    }
    return { ...DEFAULT_RETRY_POLICY, delayBetweenRetriesMs: adaptedDelay };
}
|
|
3781
|
-
function formatResult(action, target, result) {
|
|
3901
|
+
/**
 * Render the outcome of a fallback-chain action as an MCP text response.
 *
 * @param {string} action - Past-tense verb phrase shown to the user (e.g. "Clicked").
 * @param {string} target - The target the caller asked for.
 * @param {object} result - Execution result; reads `ok`, `target`, `method`,
 *   `fallbackFrom`, `durationMs` and `error`.
 * @param {string[]} [preCheckWarnings] - Advisory lines placed before the
 *   message, one per line.
 * @returns {{content: Array<{type: string, text: string}>}} A single text content item.
 */
function formatResult(action, target, result, preCheckWarnings) {
    let warningBlock = "";
    if (preCheckWarnings && preCheckWarnings.length > 0) {
        warningBlock = preCheckWarnings.join("\n") + "\n";
    }
    let message;
    if (!result.ok) {
        message = `${warningBlock}Failed to ${action} "${target}" — all methods exhausted. Last error: ${result.error}`;
    }
    else {
        // Prefer the target the executor actually resolved, if it reported one.
        const shownTarget = result.target ?? target;
        const fallbackNote = result.fallbackFrom ? ` (fell back from ${result.fallbackFrom})` : "";
        message = `${warningBlock}${action} "${shownTarget}" via ${result.method}${fallbackNote} in ${result.durationMs}ms`;
    }
    return { content: [{ type: "text", text: message }] };
}
|
|
3911
|
+
/**
 * L3→L1: Best-effort pre-execution check against the world model.
 *
 * Resolves the target app (explicit `bundleId`, then `lastKnownBundleId`,
 * then the currently focused app) and:
 *   - asks the bridge to focus it (`app.focus`) when another app is focused;
 *   - reports dialogs that belong to the target app;
 *   - reports target-app windows that are not on screen;
 *   - reports a stale, low-confidence world state (>10s old and confidence < 0.5).
 *
 * All checks after the focus attempt read a freshly fetched state snapshot,
 * so they describe the situation after auto-focus rather than before it.
 *
 * Callers match on substrings of the warnings ("dialog", "off-screen",
 * "not frontmost") to decide whether unverified coordinate clicks are safe;
 * the auto-focus failure warning carries the "not frontmost" marker for
 * that purpose.
 *
 * @param {string} [bundleId] - Target app bundle ID.
 * @returns {Promise<string[]>} Warnings to prepend to the tool result. Never
 *   rejects: any internal error ends the checks and returns what was collected.
 */
async function preExecutionCheck(bundleId) {
    const warnings = [];
    try {
        const state = worldModel.getState();
        const targetBundleId = bundleId ?? lastKnownBundleId ?? state.focusedApp?.bundleId;
        if (!targetBundleId) {
            return warnings;
        }
        // Bring the target app forward if something else currently has focus.
        if (state.focusedApp && state.focusedApp.bundleId !== targetBundleId) {
            warnings.push(`[L3→L1] Target app ${targetBundleId} is not focused (current: ${state.focusedApp.bundleId}). Auto-focusing...`);
            try {
                await bridge.call("app.focus", { bundleId: targetBundleId });
            }
            catch {
                // "not frontmost" is the marker callers look for to skip coordinate clicks.
                warnings.push(`[L3→L1] Auto-focus failed — target app is not frontmost; proceeding anyway`);
            }
        }
        // Re-fetch state after the focus attempt; every check below uses this snapshot.
        const postFocusState = worldModel.getState();
        // Dialogs scoped to the target app. Dialogs with windowId 0 carry no real
        // window ID, so attribute them to whichever app is focused.
        const relevantDialogs = postFocusState.activeDialogs.filter((d) => {
            if (d.windowId === 0) {
                return postFocusState.focusedApp?.bundleId === targetBundleId;
            }
            const win = postFocusState.windows.get(d.windowId);
            return win?.bundleId === targetBundleId;
        });
        if (relevantDialogs.length > 0) {
            const dialogTitles = relevantDialogs
                .map((d) => d.title || d.type)
                .join(", ");
            warnings.push(`[L3→L1] Active dialog(s) detected: ${dialogTitles} — may block interaction`);
        }
        // Target-app windows that are not on screen.
        for (const [, win] of postFocusState.windows) {
            if (win.bundleId === targetBundleId && !win.isOnScreen) {
                // A window without a title must not abort the remaining checks.
                warnings.push(`[L3→L1] Window "${win.title?.value ?? ""}" is off-screen or minimized`);
            }
        }
        // World state older than 10s with low confidence.
        const staleThresholdMs = 10_000;
        const lastUpdate = new Date(postFocusState.updatedAt).getTime();
        if (!Number.isNaN(lastUpdate) && Date.now() - lastUpdate > staleThresholdMs && postFocusState.confidence < 0.5) {
            warnings.push(`[L3→L1] World state is stale (${Math.round((Date.now() - lastUpdate) / 1000)}s old, confidence ${postFocusState.confidence.toFixed(2)}) — screen may have changed`);
        }
    }
    catch {
        // Pre-check is best-effort advisory — never crash the tool call
    }
    return warnings;
}
|
|
3970
|
+
/**
 * L7→L1: Resolve an element's position from the AppMap.
 *
 * Delegates to `appMap.resolvePosition(bundleId, target, bounds)` using the
 * bounds of the currently focused window, but only when those bounds can be
 * trusted. Returns null when:
 *   - no bundle ID is known (explicit, focused app, or `lastKnownBundleId`);
 *   - there is no focused window, or it belongs to a different app;
 *   - the bounds are missing, or their timestamp is missing, unparseable,
 *     in the future, or older than 5 seconds;
 *   - width or height is not a number of at least 50 (uninitialised or NaN bounds).
 *
 * @param {string} target - Element label to look up.
 * @param {string} [bundleId] - App bundle ID; defaults to the focused / last-known app.
 * @returns {*} Whatever `appMap.resolvePosition` returns, or null.
 */
function resolveMapPosition(target, bundleId) {
    const state = worldModel.getState();
    const bid = bundleId ?? state.focusedApp?.bundleId ?? lastKnownBundleId;
    if (!bid) {
        return null;
    }
    // Window bounds come from the focused window, which must belong to the target app.
    const focusedWinId = state.focusedWindowId;
    if (focusedWinId == null) {
        return null;
    }
    const win = state.windows.get(focusedWinId);
    if (!win || win.bundleId !== bid) {
        return null;
    }
    const bounds = win.bounds?.value;
    if (!bounds) {
        return null;
    }
    // Guard: reject stale bounds (>5s old) so a moved window cannot cause a misplaced click.
    // An invalid timestamp yields NaN, which must be rejected explicitly because
    // NaN fails both range comparisons.
    const boundsAge = Date.now() - new Date(win.bounds.updatedAt).getTime();
    if (!Number.isFinite(boundsAge) || boundsAge > 5000 || boundsAge < 0) {
        return null;
    }
    // Guard: reject uninitialised, tiny, or non-numeric bounds to avoid clicking near (0,0).
    if (!(bounds.width >= 50 && bounds.height >= 50)) {
        return null;
    }
    return appMap.resolvePosition(bid, target, bounds);
}
|
|
3788
3997
|
// ── click_with_fallback ──
|
|
3789
3998
|
server.tool("click_with_fallback", "Click a target by text using the canonical fallback chain: AX → CDP → OCR. Automatically retries and falls through methods.", {
|
|
@@ -3791,10 +4000,37 @@ server.tool("click_with_fallback", "Click a target by text using the canonical f
|
|
|
3791
4000
|
bundleId: z.string().optional().describe("App bundle ID (for AX path)"),
|
|
3792
4001
|
}, async ({ target, bundleId }) => {
|
|
3793
4002
|
await ensureBridge();
|
|
3794
|
-
const
|
|
4003
|
+
const preCheckWarnings = await preExecutionCheck(bundleId);
|
|
4004
|
+
// L7→L1: If AppMap knows this element's position, try coordinates first.
|
|
4005
|
+
// WARNING: Coordinate clicks are unverified — if the window moved or a modal
|
|
4006
|
+
// appeared, the click may hit the wrong target. On failure, falls through to
|
|
4007
|
+
// the standard AX/CDP/OCR chain which verifies element identity.
|
|
4008
|
+
// Skip map-guided shortcut if precheck detected blocking conditions (dialogs, off-screen)
|
|
4009
|
+
const hasBlockingCondition = preCheckWarnings.some((w) => w.includes("dialog") || w.includes("off-screen") || w.includes("not frontmost"));
|
|
4010
|
+
const mapPos = !hasBlockingCondition ? resolveMapPosition(target, bundleId) : null;
|
|
4011
|
+
if (mapPos) {
|
|
4012
|
+
try {
|
|
4013
|
+
const start = Date.now();
|
|
4014
|
+
await bridge.call("cg.mouseClick", { x: mapPos.x, y: mapPos.y });
|
|
4015
|
+
preCheckWarnings.push(`[L7→L1] Used map position (${mapPos.x}, ${mapPos.y}) for "${target}" — UNVERIFIED coordinate click`);
|
|
4016
|
+
return formatResult("Clicked", target, {
|
|
4017
|
+
ok: true, method: "coordinates", durationMs: Date.now() - start,
|
|
4018
|
+
fallbackFrom: null, retries: 0, error: null, target: `${target} at (${mapPos.x},${mapPos.y}) [map-guided, unverified]`,
|
|
4019
|
+
}, preCheckWarnings);
|
|
4020
|
+
}
|
|
4021
|
+
catch {
|
|
4022
|
+
preCheckWarnings.push(`[L7→L1] Map position click failed — falling back to standard chain`);
|
|
4023
|
+
}
|
|
4024
|
+
}
|
|
4025
|
+
const plan = planExecution("click", infra(), getSensorRanking())
|
|
3795
4026
|
.filter((m) => m !== "coordinates");
|
|
3796
4027
|
const targetPid = await resolvePid(bundleId);
|
|
3797
|
-
|
|
4028
|
+
// L2→L1: Resolve known selector from references for direct injection
|
|
4029
|
+
const knownSelector = contextTracker.getSelector(target);
|
|
4030
|
+
if (knownSelector) {
|
|
4031
|
+
preCheckWarnings.push(`[L2→L1] Injecting known selector: ${knownSelector}`);
|
|
4032
|
+
}
|
|
4033
|
+
const result = await executeWithFallback("click", plan, getAdaptedRetryPolicy("click_with_fallback"), async (method, attempt) => {
|
|
3798
4034
|
const start = Date.now();
|
|
3799
4035
|
try {
|
|
3800
4036
|
switch (method) {
|
|
@@ -3829,15 +4065,28 @@ server.tool("click_with_fallback", "Click a target by text using the canonical f
|
|
|
3829
4065
|
const client = await CDPClient({ port });
|
|
3830
4066
|
try {
|
|
3831
4067
|
const { Runtime } = client;
|
|
3832
|
-
|
|
3833
|
-
|
|
3834
|
-
|
|
4068
|
+
// L2→L1: Try known selector first (wrapped in try/catch to handle
|
|
4069
|
+
// invalid selectors gracefully), then fall back to text search.
|
|
4070
|
+
const textSearchExpr = `Array.from(document.querySelectorAll('*')).find(e =>
|
|
3835
4071
|
e.textContent?.trim() === ${JSON.stringify(target)} ||
|
|
3836
|
-
e.getAttribute('aria-label') === ${JSON.stringify(target)}
|
|
3837
|
-
|
|
4072
|
+
e.getAttribute('aria-label') === ${JSON.stringify(target)})`;
|
|
4073
|
+
const selectorExpr = knownSelector
|
|
4074
|
+
? `(() => {
|
|
4075
|
+
try {
|
|
4076
|
+
const el = document.querySelector(${JSON.stringify(knownSelector)});
|
|
4077
|
+
if (el) { el.click(); return 'clicked'; }
|
|
4078
|
+
} catch(e) { /* invalid selector — fall through to text search */ }
|
|
4079
|
+
const fallback = ${textSearchExpr};
|
|
4080
|
+
if (fallback) { fallback.click(); return 'clicked'; }
|
|
4081
|
+
return null;
|
|
4082
|
+
})()`
|
|
4083
|
+
: `(() => {
|
|
4084
|
+
const el = ${textSearchExpr};
|
|
3838
4085
|
if (el) { el.click(); return 'clicked'; }
|
|
3839
4086
|
return null;
|
|
3840
|
-
})()
|
|
4087
|
+
})()`;
|
|
4088
|
+
const evalResult = await Runtime.evaluate({
|
|
4089
|
+
expression: selectorExpr,
|
|
3841
4090
|
returnByValue: true,
|
|
3842
4091
|
});
|
|
3843
4092
|
if (evalResult.result?.value === "clicked") {
|
|
@@ -3872,7 +4121,7 @@ server.tool("click_with_fallback", "Click a target by text using the canonical f
|
|
|
3872
4121
|
return { ok: false, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: err instanceof Error ? err.message : String(err), target };
|
|
3873
4122
|
}
|
|
3874
4123
|
});
|
|
3875
|
-
return formatResult("Clicked", target, result);
|
|
4124
|
+
return formatResult("Clicked", target, result, preCheckWarnings);
|
|
3876
4125
|
});
|
|
3877
4126
|
// ── type_with_fallback ──
|
|
3878
4127
|
server.tool("type_with_fallback", "Type text into a target field using the canonical fallback chain: AX → CDP → coordinates. Finds the field by label/placeholder, focuses it, then types.", {
|
|
@@ -3882,9 +4131,12 @@ server.tool("type_with_fallback", "Type text into a target field using the canon
|
|
|
3882
4131
|
clearFirst: z.boolean().optional().describe("Select-all and clear the field before typing (default: false)"),
|
|
3883
4132
|
}, async ({ target, text, bundleId, clearFirst }) => {
|
|
3884
4133
|
await ensureBridge();
|
|
3885
|
-
const
|
|
4134
|
+
const preCheckWarnings = await preExecutionCheck(bundleId);
|
|
4135
|
+
const plan = planExecution("type", infra(), getSensorRanking());
|
|
3886
4136
|
const targetPid = await resolvePid(bundleId);
|
|
3887
|
-
|
|
4137
|
+
// L2→L1: Resolve known selector for direct injection
|
|
4138
|
+
const knownSelector = contextTracker.getSelector(target);
|
|
4139
|
+
const result = await executeWithFallback("type", plan, getAdaptedRetryPolicy("type_with_fallback"), async (method, attempt) => {
|
|
3888
4140
|
const start = Date.now();
|
|
3889
4141
|
try {
|
|
3890
4142
|
switch (method) {
|
|
@@ -3972,17 +4224,30 @@ server.tool("type_with_fallback", "Type text into a target field using the canon
|
|
|
3972
4224
|
const client = await CDPClient({ port });
|
|
3973
4225
|
try {
|
|
3974
4226
|
const { Runtime, DOM, Input } = client;
|
|
3975
|
-
|
|
3976
|
-
|
|
3977
|
-
|
|
4227
|
+
// L2→L1: Try known selector first (with try/catch for invalid selectors),
|
|
4228
|
+
// then fall back to attribute search.
|
|
4229
|
+
const fieldSearchExpr = `Array.from(document.querySelectorAll('input, textarea, [contenteditable]')).find(e =>
|
|
3978
4230
|
e.getAttribute('placeholder') === ${JSON.stringify(target)} ||
|
|
3979
4231
|
e.getAttribute('aria-label') === ${JSON.stringify(target)} ||
|
|
3980
4232
|
e.getAttribute('name') === ${JSON.stringify(target)} ||
|
|
3981
|
-
(e.labels && Array.from(e.labels).some(l => l.textContent?.trim() === ${JSON.stringify(target)}))
|
|
3982
|
-
|
|
4233
|
+
(e.labels && Array.from(e.labels).some(l => l.textContent?.trim() === ${JSON.stringify(target)})))`;
|
|
4234
|
+
const fieldExpr = knownSelector
|
|
4235
|
+
? `(() => {
|
|
4236
|
+
try {
|
|
4237
|
+
const el = document.querySelector(${JSON.stringify(knownSelector)});
|
|
4238
|
+
if (el) { el.focus(); return true; }
|
|
4239
|
+
} catch(e) { /* invalid selector — fall through */ }
|
|
4240
|
+
const fallback = ${fieldSearchExpr};
|
|
4241
|
+
if (fallback) { fallback.focus(); return true; }
|
|
4242
|
+
return false;
|
|
4243
|
+
})()`
|
|
4244
|
+
: `(() => {
|
|
4245
|
+
const el = ${fieldSearchExpr};
|
|
3983
4246
|
if (el) { el.focus(); return true; }
|
|
3984
4247
|
return false;
|
|
3985
|
-
})()
|
|
4248
|
+
})()`;
|
|
4249
|
+
const evalResult = await Runtime.evaluate({
|
|
4250
|
+
expression: fieldExpr,
|
|
3986
4251
|
returnByValue: true,
|
|
3987
4252
|
});
|
|
3988
4253
|
if (!evalResult.result?.value)
|
|
@@ -4009,7 +4274,7 @@ server.tool("type_with_fallback", "Type text into a target field using the canon
|
|
|
4009
4274
|
return { ok: false, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: err instanceof Error ? err.message : String(err), target };
|
|
4010
4275
|
}
|
|
4011
4276
|
});
|
|
4012
|
-
return formatResult("Typed into", target, result);
|
|
4277
|
+
return formatResult("Typed into", target, result, preCheckWarnings);
|
|
4013
4278
|
});
|
|
4014
4279
|
// ── read_with_fallback ──
|
|
4015
4280
|
server.tool("read_with_fallback", "Read text content from the screen or a specific element using the canonical fallback chain: AX → CDP → OCR. Returns the text found.", {
|
|
@@ -4017,9 +4282,15 @@ server.tool("read_with_fallback", "Read text content from the screen or a specif
|
|
|
4017
4282
|
bundleId: z.string().optional().describe("App bundle ID"),
|
|
4018
4283
|
}, async ({ target, bundleId }) => {
|
|
4019
4284
|
await ensureBridge();
|
|
4020
|
-
const
|
|
4285
|
+
const preCheckWarnings = await preExecutionCheck(bundleId);
|
|
4286
|
+
const plan = planExecution("read", infra(), getSensorRanking());
|
|
4021
4287
|
const targetPid = await resolvePid(bundleId);
|
|
4022
|
-
|
|
4288
|
+
// L2→L1: Resolve known selector from references for direct injection
|
|
4289
|
+
const knownSelector = target ? contextTracker.getSelector(target) : null;
|
|
4290
|
+
if (knownSelector) {
|
|
4291
|
+
preCheckWarnings.push(`[L2→L1] Injecting known selector: ${knownSelector}`);
|
|
4292
|
+
}
|
|
4293
|
+
const result = await executeWithFallback("read", plan, getAdaptedRetryPolicy("read_with_fallback"), async (method, attempt) => {
|
|
4023
4294
|
const start = Date.now();
|
|
4024
4295
|
try {
|
|
4025
4296
|
switch (method) {
|
|
@@ -4126,14 +4397,25 @@ server.tool("read_with_fallback", "Read text content from the screen or a specif
|
|
|
4126
4397
|
try {
|
|
4127
4398
|
const { Runtime } = client;
|
|
4128
4399
|
if (target) {
|
|
4129
|
-
|
|
4130
|
-
|
|
4131
|
-
const el = Array.from(document.querySelectorAll('*')).find(e =>
|
|
4400
|
+
// L2→L1: Try known selector first, then fall back to text search
|
|
4401
|
+
const textSearch = `Array.from(document.querySelectorAll('*')).find(e =>
|
|
4132
4402
|
e.getAttribute('aria-label') === ${JSON.stringify(target)} ||
|
|
4133
|
-
e.textContent?.trim() === ${JSON.stringify(target)}
|
|
4134
|
-
|
|
4135
|
-
|
|
4136
|
-
|
|
4403
|
+
e.textContent?.trim() === ${JSON.stringify(target)})`;
|
|
4404
|
+
const expr = knownSelector
|
|
4405
|
+
? `(() => {
|
|
4406
|
+
try {
|
|
4407
|
+
const el = document.querySelector(${JSON.stringify(knownSelector)});
|
|
4408
|
+
if (el) return (el.value ?? el.textContent ?? '').trim();
|
|
4409
|
+
} catch(e) {}
|
|
4410
|
+
const fallback = ${textSearch};
|
|
4411
|
+
return fallback ? (fallback.value ?? fallback.textContent ?? '').trim() : null;
|
|
4412
|
+
})()`
|
|
4413
|
+
: `(() => {
|
|
4414
|
+
const el = ${textSearch};
|
|
4415
|
+
return el ? (el.value ?? el.textContent ?? '').trim() : null;
|
|
4416
|
+
})()`;
|
|
4417
|
+
const evalResult = await Runtime.evaluate({
|
|
4418
|
+
expression: expr,
|
|
4137
4419
|
returnByValue: true,
|
|
4138
4420
|
});
|
|
4139
4421
|
if (evalResult.result?.value == null)
|
|
@@ -4173,11 +4455,13 @@ server.tool("read_with_fallback", "Read text content from the screen or a specif
|
|
|
4173
4455
|
return { ok: false, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: err instanceof Error ? err.message : String(err), target: null };
|
|
4174
4456
|
}
|
|
4175
4457
|
});
|
|
4458
|
+
// Custom format (not formatResult) — read results include content inline
|
|
4459
|
+
const prefix = preCheckWarnings.length > 0 ? preCheckWarnings.join("\n") + "\n" : "";
|
|
4176
4460
|
if (result.ok) {
|
|
4177
4461
|
const fallbackNote = result.fallbackFrom ? ` (fell back from ${result.fallbackFrom})` : "";
|
|
4178
|
-
return { content: [{ type: "text", text:
|
|
4462
|
+
return { content: [{ type: "text", text: `${prefix}Read via ${result.method}${fallbackNote} in ${result.durationMs}ms:\n\n${result.target}` }] };
|
|
4179
4463
|
}
|
|
4180
|
-
return { content: [{ type: "text", text:
|
|
4464
|
+
return { content: [{ type: "text", text: `${prefix}Failed to read${target ? ` "${target}"` : ""} — all methods exhausted. Last error: ${result.error}` }] };
|
|
4181
4465
|
});
|
|
4182
4466
|
// ── locate_with_fallback ──
|
|
4183
4467
|
server.tool("locate_with_fallback", "Find an element's position on screen using the canonical fallback chain: AX → CDP → OCR. Returns bounds (x, y, width, height).", {
|
|
@@ -4185,9 +4469,22 @@ server.tool("locate_with_fallback", "Find an element's position on screen using
|
|
|
4185
4469
|
bundleId: z.string().optional().describe("App bundle ID"),
|
|
4186
4470
|
}, async ({ target, bundleId }) => {
|
|
4187
4471
|
await ensureBridge();
|
|
4188
|
-
const
|
|
4472
|
+
const preCheckWarnings = await preExecutionCheck(bundleId);
|
|
4473
|
+
// L7→L1: If AppMap knows this element's position, return it immediately
|
|
4474
|
+
const mapPos = resolveMapPosition(target, bundleId);
|
|
4475
|
+
if (mapPos) {
|
|
4476
|
+
// Map provides center point only — use as hint, not authoritative bounds.
|
|
4477
|
+
// Fall through to full locate chain for accurate bounds.
|
|
4478
|
+
preCheckWarnings.push(`[L7→L1] Map hint: "${target}" expected near (${mapPos.x}, ${mapPos.y}) — verifying via locate chain`);
|
|
4479
|
+
}
|
|
4480
|
+
const plan = planExecution("locate", infra(), getSensorRanking());
|
|
4189
4481
|
const targetPid = await resolvePid(bundleId);
|
|
4190
|
-
|
|
4482
|
+
// L2→L1: Resolve known selector from references for direct injection
|
|
4483
|
+
const knownSelector = contextTracker.getSelector(target);
|
|
4484
|
+
if (knownSelector) {
|
|
4485
|
+
preCheckWarnings.push(`[L2→L1] Injecting known selector: ${knownSelector}`);
|
|
4486
|
+
}
|
|
4487
|
+
const result = await executeWithFallback("locate", plan, getAdaptedRetryPolicy("locate_with_fallback"), async (method, attempt) => {
|
|
4191
4488
|
const start = Date.now();
|
|
4192
4489
|
try {
|
|
4193
4490
|
switch (method) {
|
|
@@ -4220,16 +4517,29 @@ server.tool("locate_with_fallback", "Find an element's position on screen using
|
|
|
4220
4517
|
const client = await CDPClient({ port });
|
|
4221
4518
|
try {
|
|
4222
4519
|
const { Runtime } = client;
|
|
4223
|
-
|
|
4224
|
-
|
|
4225
|
-
const el = Array.from(document.querySelectorAll('*')).find(e =>
|
|
4520
|
+
// L2→L1: Try known selector first, then fall back to text search
|
|
4521
|
+
const textSearch = `Array.from(document.querySelectorAll('*')).find(e =>
|
|
4226
4522
|
e.textContent?.trim() === ${JSON.stringify(target)} ||
|
|
4227
|
-
e.getAttribute('aria-label') === ${JSON.stringify(target)}
|
|
4228
|
-
|
|
4229
|
-
|
|
4230
|
-
|
|
4231
|
-
|
|
4232
|
-
|
|
4523
|
+
e.getAttribute('aria-label') === ${JSON.stringify(target)})`;
|
|
4524
|
+
const expr = knownSelector
|
|
4525
|
+
? `(() => {
|
|
4526
|
+
try {
|
|
4527
|
+
const el = document.querySelector(${JSON.stringify(knownSelector)});
|
|
4528
|
+
if (el) { const r = el.getBoundingClientRect(); return { x: Math.round(r.x), y: Math.round(r.y), width: Math.round(r.width), height: Math.round(r.height) }; }
|
|
4529
|
+
} catch(e) {}
|
|
4530
|
+
const fallback = ${textSearch};
|
|
4531
|
+
if (!fallback) return null;
|
|
4532
|
+
const r = fallback.getBoundingClientRect();
|
|
4533
|
+
return { x: Math.round(r.x), y: Math.round(r.y), width: Math.round(r.width), height: Math.round(r.height) };
|
|
4534
|
+
})()`
|
|
4535
|
+
: `(() => {
|
|
4536
|
+
const el = ${textSearch};
|
|
4537
|
+
if (!el) return null;
|
|
4538
|
+
const r = el.getBoundingClientRect();
|
|
4539
|
+
return { x: Math.round(r.x), y: Math.round(r.y), width: Math.round(r.width), height: Math.round(r.height) };
|
|
4540
|
+
})()`;
|
|
4541
|
+
const evalResult = await Runtime.evaluate({
|
|
4542
|
+
expression: expr,
|
|
4233
4543
|
returnByValue: true,
|
|
4234
4544
|
});
|
|
4235
4545
|
const bounds = evalResult.result?.value;
|
|
@@ -4260,7 +4570,7 @@ server.tool("locate_with_fallback", "Find an element's position on screen using
|
|
|
4260
4570
|
return { ok: false, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: err instanceof Error ? err.message : String(err), target: null };
|
|
4261
4571
|
}
|
|
4262
4572
|
});
|
|
4263
|
-
return formatResult("Located", target, result);
|
|
4573
|
+
return formatResult("Located", target, result, preCheckWarnings);
|
|
4264
4574
|
});
|
|
4265
4575
|
// ── select_with_fallback ──
|
|
4266
4576
|
server.tool("select_with_fallback", "Select an option from a dropdown/menu using the canonical fallback chain: AX → CDP. Finds the control, opens it, and picks the specified option.", {
|
|
@@ -4269,9 +4579,15 @@ server.tool("select_with_fallback", "Select an option from a dropdown/menu using
|
|
|
4269
4579
|
bundleId: z.string().optional().describe("App bundle ID"),
|
|
4270
4580
|
}, async ({ target, option, bundleId }) => {
|
|
4271
4581
|
await ensureBridge();
|
|
4272
|
-
const
|
|
4582
|
+
const preCheckWarnings = await preExecutionCheck(bundleId);
|
|
4583
|
+
const plan = planExecution("select", infra(), getSensorRanking());
|
|
4273
4584
|
const targetPid = await resolvePid(bundleId);
|
|
4274
|
-
|
|
4585
|
+
// L2→L1: Resolve known selector from references for direct injection
|
|
4586
|
+
const knownSelector = contextTracker.getSelector(target);
|
|
4587
|
+
if (knownSelector) {
|
|
4588
|
+
preCheckWarnings.push(`[L2→L1] Injecting known selector: ${knownSelector}`);
|
|
4589
|
+
}
|
|
4590
|
+
const result = await executeWithFallback("select", plan, getAdaptedRetryPolicy("select_with_fallback"), async (method, attempt) => {
|
|
4275
4591
|
const start = Date.now();
|
|
4276
4592
|
try {
|
|
4277
4593
|
switch (method) {
|
|
@@ -4301,20 +4617,34 @@ server.tool("select_with_fallback", "Select an option from a dropdown/menu using
|
|
|
4301
4617
|
const client = await CDPClient({ port });
|
|
4302
4618
|
try {
|
|
4303
4619
|
const { Runtime } = client;
|
|
4304
|
-
|
|
4305
|
-
|
|
4306
|
-
const sel = Array.from(document.querySelectorAll('select')).find(s =>
|
|
4620
|
+
// L2→L1: Try known selector first for the select element
|
|
4621
|
+
const textSearch = `Array.from(document.querySelectorAll('select')).find(s =>
|
|
4307
4622
|
s.getAttribute('aria-label') === ${JSON.stringify(target)} ||
|
|
4308
4623
|
s.getAttribute('name') === ${JSON.stringify(target)} ||
|
|
4309
|
-
(s.labels && Array.from(s.labels).some(l => l.textContent?.trim() === ${JSON.stringify(target)}))
|
|
4310
|
-
|
|
4311
|
-
|
|
4312
|
-
|
|
4313
|
-
|
|
4314
|
-
|
|
4315
|
-
|
|
4316
|
-
|
|
4317
|
-
|
|
4624
|
+
(s.labels && Array.from(s.labels).some(l => l.textContent?.trim() === ${JSON.stringify(target)})))`;
|
|
4625
|
+
const selectExpr = knownSelector
|
|
4626
|
+
? `(() => {
|
|
4627
|
+
let sel = null;
|
|
4628
|
+
try { sel = document.querySelector(${JSON.stringify(knownSelector)}); } catch(e) {}
|
|
4629
|
+
if (!sel || sel.tagName !== 'SELECT') sel = ${textSearch};
|
|
4630
|
+
if (!sel) return null;
|
|
4631
|
+
const opt = Array.from(sel.options).find(o => o.text.trim() === ${JSON.stringify(option)} || o.value === ${JSON.stringify(option)});
|
|
4632
|
+
if (!opt) return 'no_option';
|
|
4633
|
+
sel.value = opt.value;
|
|
4634
|
+
sel.dispatchEvent(new Event('change', { bubbles: true }));
|
|
4635
|
+
return 'selected';
|
|
4636
|
+
})()`
|
|
4637
|
+
: `(() => {
|
|
4638
|
+
const sel = ${textSearch};
|
|
4639
|
+
if (!sel) return null;
|
|
4640
|
+
const opt = Array.from(sel.options).find(o => o.text.trim() === ${JSON.stringify(option)} || o.value === ${JSON.stringify(option)});
|
|
4641
|
+
if (!opt) return 'no_option';
|
|
4642
|
+
sel.value = opt.value;
|
|
4643
|
+
sel.dispatchEvent(new Event('change', { bubbles: true }));
|
|
4644
|
+
return 'selected';
|
|
4645
|
+
})()`;
|
|
4646
|
+
const evalResult = await Runtime.evaluate({
|
|
4647
|
+
expression: selectExpr,
|
|
4318
4648
|
returnByValue: true,
|
|
4319
4649
|
});
|
|
4320
4650
|
if (evalResult.result?.value === "selected") {
|
|
@@ -4335,7 +4665,7 @@ server.tool("select_with_fallback", "Select an option from a dropdown/menu using
|
|
|
4335
4665
|
return { ok: false, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: err instanceof Error ? err.message : String(err), target: null };
|
|
4336
4666
|
}
|
|
4337
4667
|
});
|
|
4338
|
-
return formatResult("Selected", `${target} → ${option}`, result);
|
|
4668
|
+
return formatResult("Selected", `${target} → ${option}`, result, preCheckWarnings);
|
|
4339
4669
|
});
|
|
4340
4670
|
// ── scroll_with_fallback ──
|
|
4341
4671
|
server.tool("scroll_with_fallback", "Scroll within an element or the active window using the canonical fallback chain: AX → CDP → coordinates. Scrolls until target text is visible, or by a fixed amount.", {
|
|
@@ -4345,9 +4675,15 @@ server.tool("scroll_with_fallback", "Scroll within an element or the active wind
|
|
|
4345
4675
|
bundleId: z.string().optional().describe("App bundle ID"),
|
|
4346
4676
|
}, async ({ direction, amount, target, bundleId }) => {
|
|
4347
4677
|
await ensureBridge();
|
|
4348
|
-
const
|
|
4678
|
+
const preCheckWarnings = await preExecutionCheck(bundleId);
|
|
4679
|
+
const plan = planExecution("scroll", infra(), getSensorRanking());
|
|
4349
4680
|
const targetPid = await resolvePid(bundleId);
|
|
4350
4681
|
const scrollAmount = amount ?? 300;
|
|
4682
|
+
// L2→L1: Resolve known selector from references for scroll container
|
|
4683
|
+
const knownSelector = target ? contextTracker.getSelector(target) : null;
|
|
4684
|
+
if (knownSelector) {
|
|
4685
|
+
preCheckWarnings.push(`[L2→L1] Injecting known selector: ${knownSelector}`);
|
|
4686
|
+
}
|
|
4351
4687
|
// Resolve scroll coordinates — center of the frontmost window
|
|
4352
4688
|
let scrollX = 400, scrollY = 400;
|
|
4353
4689
|
try {
|
|
@@ -4383,7 +4719,7 @@ server.tool("scroll_with_fallback", "Scroll within an element or the active wind
|
|
|
4383
4719
|
return { content: [{ type: "text", text: `Scrolled ${direction} 10 times but "${target}" not found.` }] };
|
|
4384
4720
|
}
|
|
4385
4721
|
// Fixed-amount scroll via fallback chain
|
|
4386
|
-
const result = await executeWithFallback("scroll", plan, getAdaptedRetryPolicy(), async (method, attempt) => {
|
|
4722
|
+
const result = await executeWithFallback("scroll", plan, getAdaptedRetryPolicy("scroll_with_fallback"), async (method, attempt) => {
|
|
4387
4723
|
const start = Date.now();
|
|
4388
4724
|
try {
|
|
4389
4725
|
const deltaX = direction === "left" ? -scrollAmount : direction === "right" ? scrollAmount : 0;
|
|
@@ -4401,9 +4737,18 @@ server.tool("scroll_with_fallback", "Scroll within an element or the active wind
|
|
|
4401
4737
|
const client = await CDPClient({ port });
|
|
4402
4738
|
try {
|
|
4403
4739
|
const { Runtime } = client;
|
|
4404
|
-
|
|
4405
|
-
|
|
4406
|
-
|
|
4740
|
+
// L2→L1: Try scrolling known selector container first
|
|
4741
|
+
const scrollExpr = knownSelector
|
|
4742
|
+
? `(() => {
|
|
4743
|
+
try {
|
|
4744
|
+
const el = document.querySelector(${JSON.stringify(knownSelector)});
|
|
4745
|
+
if (el) { el.scrollBy(${deltaX}, ${deltaY}); return 'scrolled'; }
|
|
4746
|
+
} catch(e) {}
|
|
4747
|
+
window.scrollBy(${deltaX}, ${deltaY});
|
|
4748
|
+
return 'scrolled';
|
|
4749
|
+
})()`
|
|
4750
|
+
: `window.scrollBy(${deltaX}, ${deltaY})`;
|
|
4751
|
+
await Runtime.evaluate({ expression: scrollExpr });
|
|
4407
4752
|
return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: `${direction} ${scrollAmount}px` };
|
|
4408
4753
|
}
|
|
4409
4754
|
finally {
|
|
@@ -4421,7 +4766,7 @@ server.tool("scroll_with_fallback", "Scroll within an element or the active wind
|
|
|
4421
4766
|
return { ok: false, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: err instanceof Error ? err.message : String(err), target: null };
|
|
4422
4767
|
}
|
|
4423
4768
|
});
|
|
4424
|
-
return formatResult("Scrolled", `${direction} ${scrollAmount}px`, result);
|
|
4769
|
+
return formatResult("Scrolled", `${direction} ${scrollAmount}px`, result, preCheckWarnings);
|
|
4425
4770
|
});
|
|
4426
4771
|
// ── wait_for_state ──
|
|
4427
4772
|
server.tool("wait_for_state", "Wait until a condition is met on screen: text appears, text disappears, or element becomes available. Polls at intervals using the fallback chain.", {
|
|
@@ -4751,6 +5096,8 @@ function getJobRunner() {
|
|
|
4751
5096
|
const locCache = new LocatorCache();
|
|
4752
5097
|
locCache.setLearningEngine(learningEngine);
|
|
4753
5098
|
const runtimeService = new AutomationRuntimeService(adapter, logger, locCache);
|
|
5099
|
+
// Wire #15: connect AppMap to Executor for skip-verify optimization
|
|
5100
|
+
runtimeService.setAppMap(appMap);
|
|
4754
5101
|
const playbookEngine = new PlaybookEngine(runtimeService);
|
|
4755
5102
|
activePlaybookEngine = playbookEngine;
|
|
4756
5103
|
// Wire CDP into playbook engine for browser_js / cdp_key_event steps
|
|
@@ -4943,6 +5290,7 @@ originalTool("plan_execute", "Run a plan automatically. Known steps (from playbo
|
|
|
4943
5290
|
}
|
|
4944
5291
|
const adaptiveBudget = learningEngine.getAdaptiveBudget(worldModel.getState().focusedApp?.bundleId ?? "unknown");
|
|
4945
5292
|
const executor = new PlanExecutor(worldModel, planner, toolRegistry.toExecutor(), { postconditionWaitMs: adaptiveBudget.verifyMs, defaultStepTimeout: Math.max(30_000, adaptiveBudget.actMs * 2) }, recoveryEngine, learningEngine);
|
|
5293
|
+
executor.setAppMap(appMap);
|
|
4946
5294
|
const result = await executor.executeGoal(goal);
|
|
4947
5295
|
goalStore.update(goalId, goal);
|
|
4948
5296
|
// Check if paused at an LLM step
|
|
@@ -5004,6 +5352,7 @@ originalTool("plan_step", "Execute the next single step of a goal. For increment
|
|
|
5004
5352
|
}
|
|
5005
5353
|
const adaptiveBudget = learningEngine.getAdaptiveBudget(worldModel.getState().focusedApp?.bundleId ?? "unknown");
|
|
5006
5354
|
const executor = new PlanExecutor(worldModel, planner, toolRegistry.toExecutor(), { postconditionWaitMs: adaptiveBudget.verifyMs, defaultStepTimeout: Math.max(30_000, adaptiveBudget.actMs * 2) }, recoveryEngine, learningEngine);
|
|
5355
|
+
executor.setAppMap(appMap);
|
|
5007
5356
|
const result = await executor.executeNextStep(goal);
|
|
5008
5357
|
goalStore.update(goalId, goal);
|
|
5009
5358
|
if ("paused" in result) {
|
|
@@ -5047,6 +5396,7 @@ originalTool("plan_step_resolve", "Resolve a paused LLM step by providing the to
|
|
|
5047
5396
|
}
|
|
5048
5397
|
const adaptiveBudget = learningEngine.getAdaptiveBudget(worldModel.getState().focusedApp?.bundleId ?? "unknown");
|
|
5049
5398
|
const executor = new PlanExecutor(worldModel, planner, toolRegistry.toExecutor(), { postconditionWaitMs: adaptiveBudget.verifyMs, defaultStepTimeout: Math.max(30_000, adaptiveBudget.actMs * 2) }, recoveryEngine, learningEngine);
|
|
5399
|
+
executor.setAppMap(appMap);
|
|
5050
5400
|
const result = await executor.resolveStep(goal, tool, params ?? {});
|
|
5051
5401
|
goalStore.update(goalId, goal);
|
|
5052
5402
|
return {
|
|
@@ -5287,6 +5637,10 @@ originalTool("perception_start", "Start continuous screen monitoring — ScreenH
|
|
|
5287
5637
|
return { content: [{ type: "text", text: `Perception already running (started ${stats.startedAt}). Use perception_stop first to restart, or pass bundleId to switch target.` }] };
|
|
5288
5638
|
}
|
|
5289
5639
|
let app = worldModel.getState().focusedApp;
|
|
5640
|
+
// Validate bundleId format before it touches AppleScript/exec
|
|
5641
|
+
if (overrideBundleId && !/^[a-zA-Z0-9._-]+$/.test(overrideBundleId)) {
|
|
5642
|
+
return { content: [{ type: "text", text: "Error: Invalid bundleId format. Only alphanumeric characters, dots, hyphens, and underscores are allowed." }] };
|
|
5643
|
+
}
|
|
5290
5644
|
// If bundleId override provided, try to resolve app info via bridge or AppleScript
|
|
5291
5645
|
if (overrideBundleId && (!app || app.bundleId !== overrideBundleId)) {
|
|
5292
5646
|
try {
|
|
@@ -5768,7 +6122,37 @@ server.tool("scan_menu_bar", "Scan an app's menu bar via AX tree. Extracts all m
|
|
|
5768
6122
|
safePath = safePath.replace(/Log Out [^\n:]+/g, "Log Out [USER]");
|
|
5769
6123
|
lines.push(` ${safePath}: ${keys}`);
|
|
5770
6124
|
}
|
|
5771
|
-
|
|
6125
|
+
// Wire #12: L6→L7 — bootstrap AppMap zones from menu scan
|
|
6126
|
+
let bootstrapInfo = "";
|
|
6127
|
+
if (appMap) {
|
|
6128
|
+
const bootstrapped = appMap.bootstrapFromMenuScan(bundleId, appName, result);
|
|
6129
|
+
// Clear hint unconditionally — the scan was attempted regardless of bootstrap outcome
|
|
6130
|
+
contextTracker.clearMenuScanHint();
|
|
6131
|
+
if (bootstrapped) {
|
|
6132
|
+
bootstrapInfo = `\nAppMap: bootstrapped zones from menu structure (new app)`;
|
|
6133
|
+
}
|
|
6134
|
+
}
|
|
6135
|
+
// Wire F8: Seed learning from menu scan shortcuts (L6→L5)
|
|
6136
|
+
// Use successCount=5 and score=0.6 so seeds pass recommend() thresholds
|
|
6137
|
+
// (minSamples=5 for locators, score > 0.5 for patterns)
|
|
6138
|
+
if (learningEngine && result.shortcuts) {
|
|
6139
|
+
for (const [menuPath, keys] of Object.entries(result.shortcuts)) {
|
|
6140
|
+
const key = LocatorPolicy.makeKey(bundleId, "key");
|
|
6141
|
+
learningEngine.locators.seedEntry({
|
|
6142
|
+
key, locator: keys, method: "ax",
|
|
6143
|
+
successCount: 5, failCount: 0, score: 0.6,
|
|
6144
|
+
lastUsed: new Date().toISOString(),
|
|
6145
|
+
});
|
|
6146
|
+
// Also seed as pattern: menu_click with the menu path
|
|
6147
|
+
learningEngine.patterns.seedEntry({
|
|
6148
|
+
key: `${bundleId}::menu_click::${menuPath}`,
|
|
6149
|
+
bundleId, tool: "menu_click", locator: menuPath,
|
|
6150
|
+
method: "ax", successCount: 3, failCount: 0, score: 0.6,
|
|
6151
|
+
lastSeen: new Date().toISOString(),
|
|
6152
|
+
});
|
|
6153
|
+
}
|
|
6154
|
+
}
|
|
6155
|
+
let output = lines.join("\n") + bootstrapInfo;
|
|
5772
6156
|
output = redactUsername(output);
|
|
5773
6157
|
output = output.replace(/Log Out [^\n:]+/g, "Log Out [USER]");
|
|
5774
6158
|
return { content: [{ type: "text", text: output }] };
|
|
@@ -5813,6 +6197,24 @@ server.tool("ingest_documentation", "Parse a documentation page (HTML, markdown,
|
|
|
5813
6197
|
lines.push(` - ${t}`);
|
|
5814
6198
|
}
|
|
5815
6199
|
}
|
|
6200
|
+
// Wire F8: Seed learning from ingested documentation flows (L6→L5)
|
|
6201
|
+
if (learningEngine && result.flows) {
|
|
6202
|
+
for (const flow of result.flows) {
|
|
6203
|
+
for (const step of flow.steps) {
|
|
6204
|
+
if (!step.tool)
|
|
6205
|
+
continue;
|
|
6206
|
+
const target = (step.params?.text ?? step.params?.title ?? step.params?.target ?? step.description);
|
|
6207
|
+
if (target) {
|
|
6208
|
+
learningEngine.patterns.seedEntry({
|
|
6209
|
+
key: `${bundleId}::${step.tool}::${target}`,
|
|
6210
|
+
bundleId, tool: step.tool, locator: String(target),
|
|
6211
|
+
method: "ax", successCount: 3, failCount: 0, score: 0.6,
|
|
6212
|
+
lastSeen: new Date().toISOString(),
|
|
6213
|
+
});
|
|
6214
|
+
}
|
|
6215
|
+
}
|
|
6216
|
+
}
|
|
6217
|
+
}
|
|
5816
6218
|
return { content: [{ type: "text", text: lines.join("\n") }] };
|
|
5817
6219
|
});
|
|
5818
6220
|
server.tool("ingest_tutorial", "Extract structured playbook steps from a video transcript (e.g. YouTube captions). Converts tutorial narration into actionable automation steps with tool mappings.", {
|
|
@@ -5937,6 +6339,14 @@ originalTool("community_fetch", "Search community playbooks for a platform or wo
|
|
|
5937
6339
|
lines.push(` Score: ${pb.ratings.score} | By: ${pb.metadata.author}`);
|
|
5938
6340
|
lines.push("");
|
|
5939
6341
|
}
|
|
6342
|
+
// Wire F9: Import community playbooks into AppMap (L6→L7)
|
|
6343
|
+
if (appMap) {
|
|
6344
|
+
for (const pb of results) {
|
|
6345
|
+
if (pb.bundleId && pb.steps.length > 0) {
|
|
6346
|
+
appMap.importFromPlaybook(pb.bundleId, pb.name, pb.steps);
|
|
6347
|
+
}
|
|
6348
|
+
}
|
|
6349
|
+
}
|
|
5940
6350
|
return { content: [{ type: "text", text: lines.join("\n") }] };
|
|
5941
6351
|
});
|
|
5942
6352
|
// ═══════════════════════════════════════════════
|