@mseep/clawdcursor 1.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +2264 -0
- package/LICENSE +21 -0
- package/README.md +385 -0
- package/SECURITY.md +44 -0
- package/SKILL.md +503 -0
- package/dist/core/agent-loop/agent.d.ts +42 -0
- package/dist/core/agent-loop/agent.js +1023 -0
- package/dist/core/agent-loop/agent.js.map +1 -0
- package/dist/core/agent-loop/batch-tool.d.ts +25 -0
- package/dist/core/agent-loop/batch-tool.js +218 -0
- package/dist/core/agent-loop/batch-tool.js.map +1 -0
- package/dist/core/agent-loop/coord-scale.d.ts +72 -0
- package/dist/core/agent-loop/coord-scale.js +89 -0
- package/dist/core/agent-loop/coord-scale.js.map +1 -0
- package/dist/core/agent-loop/focus-guard.d.ts +24 -0
- package/dist/core/agent-loop/focus-guard.js +29 -0
- package/dist/core/agent-loop/focus-guard.js.map +1 -0
- package/dist/core/agent-loop/project-mcp.d.ts +97 -0
- package/dist/core/agent-loop/project-mcp.js +253 -0
- package/dist/core/agent-loop/project-mcp.js.map +1 -0
- package/dist/core/agent-loop/prompt.d.ts +45 -0
- package/dist/core/agent-loop/prompt.js +426 -0
- package/dist/core/agent-loop/prompt.js.map +1 -0
- package/dist/core/agent-loop/tool-meta.d.ts +93 -0
- package/dist/core/agent-loop/tool-meta.js +651 -0
- package/dist/core/agent-loop/tool-meta.js.map +1 -0
- package/dist/core/agent-loop/tools.d.ts +38 -0
- package/dist/core/agent-loop/tools.js +2134 -0
- package/dist/core/agent-loop/tools.js.map +1 -0
- package/dist/core/agent-loop/types.d.ts +170 -0
- package/dist/core/agent-loop/types.js +12 -0
- package/dist/core/agent-loop/types.js.map +1 -0
- package/dist/core/agent.d.ts +51 -0
- package/dist/core/agent.js +245 -0
- package/dist/core/agent.js.map +1 -0
- package/dist/core/app-categories.d.ts +67 -0
- package/dist/core/app-categories.js +108 -0
- package/dist/core/app-categories.js.map +1 -0
- package/dist/core/banner.d.ts +70 -0
- package/dist/core/banner.js +245 -0
- package/dist/core/banner.js.map +1 -0
- package/dist/core/classify/capability.d.ts +45 -0
- package/dist/core/classify/capability.js +78 -0
- package/dist/core/classify/capability.js.map +1 -0
- package/dist/core/decompose/llm-decomposer.d.ts +35 -0
- package/dist/core/decompose/llm-decomposer.js +156 -0
- package/dist/core/decompose/llm-decomposer.js.map +1 -0
- package/dist/core/decompose/parser.d.ts +27 -0
- package/dist/core/decompose/parser.js +101 -0
- package/dist/core/decompose/parser.js.map +1 -0
- package/dist/core/observability/correlation.d.ts +19 -0
- package/dist/core/observability/correlation.js +36 -0
- package/dist/core/observability/correlation.js.map +1 -0
- package/dist/core/observability/cost-meter.d.ts +51 -0
- package/dist/core/observability/cost-meter.js +134 -0
- package/dist/core/observability/cost-meter.js.map +1 -0
- package/dist/core/observability/logger.d.ts +61 -0
- package/dist/core/observability/logger.js +550 -0
- package/dist/core/observability/logger.js.map +1 -0
- package/dist/core/router/aliases.d.ts +50 -0
- package/dist/core/router/aliases.js +104 -0
- package/dist/core/router/aliases.js.map +1 -0
- package/dist/core/router/normalize.d.ts +41 -0
- package/dist/core/router/normalize.js +80 -0
- package/dist/core/router/normalize.js.map +1 -0
- package/dist/core/safety.d.ts +126 -0
- package/dist/core/safety.js +568 -0
- package/dist/core/safety.js.map +1 -0
- package/dist/core/sense/a11y-resolver.d.ts +73 -0
- package/dist/core/sense/a11y-resolver.js +76 -0
- package/dist/core/sense/a11y-resolver.js.map +1 -0
- package/dist/core/sense/fingerprint.d.ts +41 -0
- package/dist/core/sense/fingerprint.js +123 -0
- package/dist/core/sense/fingerprint.js.map +1 -0
- package/dist/core/sense/rank.d.ts +70 -0
- package/dist/core/sense/rank.js +192 -0
- package/dist/core/sense/rank.js.map +1 -0
- package/dist/core/sense/reactive-check.d.ts +40 -0
- package/dist/core/sense/reactive-check.js +48 -0
- package/dist/core/sense/reactive-check.js.map +1 -0
- package/dist/core/sense/snapshot.d.ts +19 -0
- package/dist/core/sense/snapshot.js +100 -0
- package/dist/core/sense/snapshot.js.map +1 -0
- package/dist/core/sense/types.d.ts +66 -0
- package/dist/core/sense/types.js +9 -0
- package/dist/core/sense/types.js.map +1 -0
- package/dist/core/sense/ui-map-anchors.d.ts +7 -0
- package/dist/core/sense/ui-map-anchors.js +24 -0
- package/dist/core/sense/ui-map-anchors.js.map +1 -0
- package/dist/core/sense/ui-map-elements.d.ts +5 -0
- package/dist/core/sense/ui-map-elements.js +33 -0
- package/dist/core/sense/ui-map-elements.js.map +1 -0
- package/dist/core/sense/ui-map-find.d.ts +56 -0
- package/dist/core/sense/ui-map-find.js +153 -0
- package/dist/core/sense/ui-map-find.js.map +1 -0
- package/dist/core/sense/ui-map-fuse.d.ts +4 -0
- package/dist/core/sense/ui-map-fuse.js +44 -0
- package/dist/core/sense/ui-map-fuse.js.map +1 -0
- package/dist/core/sense/ui-map-geom.d.ts +3 -0
- package/dist/core/sense/ui-map-geom.js +16 -0
- package/dist/core/sense/ui-map-geom.js.map +1 -0
- package/dist/core/sense/ui-map-holder.d.ts +58 -0
- package/dist/core/sense/ui-map-holder.js +87 -0
- package/dist/core/sense/ui-map-holder.js.map +1 -0
- package/dist/core/sense/ui-map-normalize.d.ts +19 -0
- package/dist/core/sense/ui-map-normalize.js +65 -0
- package/dist/core/sense/ui-map-normalize.js.map +1 -0
- package/dist/core/sense/ui-map-render.d.ts +4 -0
- package/dist/core/sense/ui-map-render.js +34 -0
- package/dist/core/sense/ui-map-render.js.map +1 -0
- package/dist/core/sense/ui-map-resolve.d.ts +41 -0
- package/dist/core/sense/ui-map-resolve.js +59 -0
- package/dist/core/sense/ui-map-resolve.js.map +1 -0
- package/dist/core/sense/ui-map-types.d.ts +66 -0
- package/dist/core/sense/ui-map-types.js +11 -0
- package/dist/core/sense/ui-map-types.js.map +1 -0
- package/dist/core/sense/ui-map.d.ts +29 -0
- package/dist/core/sense/ui-map.js +113 -0
- package/dist/core/sense/ui-map.js.map +1 -0
- package/dist/core/verify/assertions.d.ts +132 -0
- package/dist/core/verify/assertions.js +284 -0
- package/dist/core/verify/assertions.js.map +1 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +24 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/browser-config.d.ts +36 -0
- package/dist/llm/browser-config.js +83 -0
- package/dist/llm/browser-config.js.map +1 -0
- package/dist/llm/client.d.ts +268 -0
- package/dist/llm/client.js +1094 -0
- package/dist/llm/client.js.map +1 -0
- package/dist/llm/config.d.ts +79 -0
- package/dist/llm/config.js +375 -0
- package/dist/llm/config.js.map +1 -0
- package/dist/llm/credentials.d.ts +35 -0
- package/dist/llm/credentials.js +491 -0
- package/dist/llm/credentials.js.map +1 -0
- package/dist/llm/external-creds.d.ts +42 -0
- package/dist/llm/external-creds.js +169 -0
- package/dist/llm/external-creds.js.map +1 -0
- package/dist/llm/providers.d.ts +123 -0
- package/dist/llm/providers.js +717 -0
- package/dist/llm/providers.js.map +1 -0
- package/dist/paths.d.ts +31 -0
- package/dist/paths.js +147 -0
- package/dist/paths.js.map +1 -0
- package/dist/platform/accessibility.d.ts +139 -0
- package/dist/platform/accessibility.js +670 -0
- package/dist/platform/accessibility.js.map +1 -0
- package/dist/platform/cdp-driver.d.ts +318 -0
- package/dist/platform/cdp-driver.js +1179 -0
- package/dist/platform/cdp-driver.js.map +1 -0
- package/dist/platform/index.d.ts +11 -0
- package/dist/platform/index.js +69 -0
- package/dist/platform/index.js.map +1 -0
- package/dist/platform/keys.d.ts +17 -0
- package/dist/platform/keys.js +129 -0
- package/dist/platform/keys.js.map +1 -0
- package/dist/platform/launch-poll.d.ts +101 -0
- package/dist/platform/launch-poll.js +177 -0
- package/dist/platform/launch-poll.js.map +1 -0
- package/dist/platform/linux.d.ts +173 -0
- package/dist/platform/linux.js +1253 -0
- package/dist/platform/linux.js.map +1 -0
- package/dist/platform/macos.d.ts +136 -0
- package/dist/platform/macos.js +976 -0
- package/dist/platform/macos.js.map +1 -0
- package/dist/platform/native-desktop.d.ts +145 -0
- package/dist/platform/native-desktop.js +936 -0
- package/dist/platform/native-desktop.js.map +1 -0
- package/dist/platform/native-helper.d.ts +130 -0
- package/dist/platform/native-helper.js +592 -0
- package/dist/platform/native-helper.js.map +1 -0
- package/dist/platform/ocr-engine.d.ts +78 -0
- package/dist/platform/ocr-engine.js +363 -0
- package/dist/platform/ocr-engine.js.map +1 -0
- package/dist/platform/ps-runner.d.ts +28 -0
- package/dist/platform/ps-runner.js +228 -0
- package/dist/platform/ps-runner.js.map +1 -0
- package/dist/platform/types.d.ts +397 -0
- package/dist/platform/types.js +15 -0
- package/dist/platform/types.js.map +1 -0
- package/dist/platform/uri-handler.d.ts +75 -0
- package/dist/platform/uri-handler.js +273 -0
- package/dist/platform/uri-handler.js.map +1 -0
- package/dist/platform/wayland-backend.d.ts +53 -0
- package/dist/platform/wayland-backend.js +348 -0
- package/dist/platform/wayland-backend.js.map +1 -0
- package/dist/platform/windows.d.ts +232 -0
- package/dist/platform/windows.js +1210 -0
- package/dist/platform/windows.js.map +1 -0
- package/dist/postbuild.d.ts +10 -0
- package/dist/postbuild.js +98 -0
- package/dist/postbuild.js.map +1 -0
- package/dist/schema/snapshot.d.ts +33 -0
- package/dist/schema/snapshot.js +90 -0
- package/dist/schema/snapshot.js.map +1 -0
- package/dist/shortcuts.d.ts +30 -0
- package/dist/shortcuts.js +261 -0
- package/dist/shortcuts.js.map +1 -0
- package/dist/surface/cli.d.ts +7 -0
- package/dist/surface/cli.js +1556 -0
- package/dist/surface/cli.js.map +1 -0
- package/dist/surface/dashboard.d.ts +8 -0
- package/dist/surface/dashboard.js +1193 -0
- package/dist/surface/dashboard.js.map +1 -0
- package/dist/surface/doctor.d.ts +29 -0
- package/dist/surface/doctor.js +1514 -0
- package/dist/surface/doctor.js.map +1 -0
- package/dist/surface/format.d.ts +10 -0
- package/dist/surface/format.js +37 -0
- package/dist/surface/format.js.map +1 -0
- package/dist/surface/http-utility.d.ts +65 -0
- package/dist/surface/http-utility.js +336 -0
- package/dist/surface/http-utility.js.map +1 -0
- package/dist/surface/mcp-server.d.ts +91 -0
- package/dist/surface/mcp-server.js +280 -0
- package/dist/surface/mcp-server.js.map +1 -0
- package/dist/surface/onboarding.d.ts +15 -0
- package/dist/surface/onboarding.js +184 -0
- package/dist/surface/onboarding.js.map +1 -0
- package/dist/surface/pidfile.d.ts +79 -0
- package/dist/surface/pidfile.js +263 -0
- package/dist/surface/pidfile.js.map +1 -0
- package/dist/surface/readiness.d.ts +45 -0
- package/dist/surface/readiness.js +230 -0
- package/dist/surface/readiness.js.map +1 -0
- package/dist/surface/report.d.ts +68 -0
- package/dist/surface/report.js +341 -0
- package/dist/surface/report.js.map +1 -0
- package/dist/surface/skill-register.d.ts +14 -0
- package/dist/surface/skill-register.js +150 -0
- package/dist/surface/skill-register.js.map +1 -0
- package/dist/surface/version.d.ts +6 -0
- package/dist/surface/version.js +27 -0
- package/dist/surface/version.js.map +1 -0
- package/dist/tools/a11y.d.ts +8 -0
- package/dist/tools/a11y.js +545 -0
- package/dist/tools/a11y.js.map +1 -0
- package/dist/tools/a11y_depth.d.ts +19 -0
- package/dist/tools/a11y_depth.js +455 -0
- package/dist/tools/a11y_depth.js.map +1 -0
- package/dist/tools/agent.d.ts +15 -0
- package/dist/tools/agent.js +248 -0
- package/dist/tools/agent.js.map +1 -0
- package/dist/tools/batch.d.ts +46 -0
- package/dist/tools/batch.js +230 -0
- package/dist/tools/batch.js.map +1 -0
- package/dist/tools/cdp.d.ts +8 -0
- package/dist/tools/cdp.js +233 -0
- package/dist/tools/cdp.js.map +1 -0
- package/dist/tools/compact.d.ts +63 -0
- package/dist/tools/compact.js +418 -0
- package/dist/tools/compact.js.map +1 -0
- package/dist/tools/cost-class.d.ts +38 -0
- package/dist/tools/cost-class.js +117 -0
- package/dist/tools/cost-class.js.map +1 -0
- package/dist/tools/desktop.d.ts +9 -0
- package/dist/tools/desktop.js +346 -0
- package/dist/tools/desktop.js.map +1 -0
- package/dist/tools/electron_bridge.d.ts +41 -0
- package/dist/tools/electron_bridge.js +261 -0
- package/dist/tools/electron_bridge.js.map +1 -0
- package/dist/tools/extras.d.ts +22 -0
- package/dist/tools/extras.js +942 -0
- package/dist/tools/extras.js.map +1 -0
- package/dist/tools/favorites.d.ts +13 -0
- package/dist/tools/favorites.js +137 -0
- package/dist/tools/favorites.js.map +1 -0
- package/dist/tools/introspection.d.ts +13 -0
- package/dist/tools/introspection.js +55 -0
- package/dist/tools/introspection.js.map +1 -0
- package/dist/tools/ocr.d.ts +8 -0
- package/dist/tools/ocr.js +66 -0
- package/dist/tools/ocr.js.map +1 -0
- package/dist/tools/orchestration.d.ts +7 -0
- package/dist/tools/orchestration.js +377 -0
- package/dist/tools/orchestration.js.map +1 -0
- package/dist/tools/playbooks/extract-compose.d.ts +22 -0
- package/dist/tools/playbooks/extract-compose.js +85 -0
- package/dist/tools/playbooks/extract-compose.js.map +1 -0
- package/dist/tools/playbooks/find-replace.d.ts +11 -0
- package/dist/tools/playbooks/find-replace.js +56 -0
- package/dist/tools/playbooks/find-replace.js.map +1 -0
- package/dist/tools/playbooks/index.d.ts +63 -0
- package/dist/tools/playbooks/index.js +70 -0
- package/dist/tools/playbooks/index.js.map +1 -0
- package/dist/tools/playbooks/keys-blocklist.d.ts +24 -0
- package/dist/tools/playbooks/keys-blocklist.js +89 -0
- package/dist/tools/playbooks/keys-blocklist.js.map +1 -0
- package/dist/tools/registry.d.ts +40 -0
- package/dist/tools/registry.js +560 -0
- package/dist/tools/registry.js.map +1 -0
- package/dist/tools/safety-gate.d.ts +16 -0
- package/dist/tools/safety-gate.js +70 -0
- package/dist/tools/safety-gate.js.map +1 -0
- package/dist/tools/scheduler.d.ts +76 -0
- package/dist/tools/scheduler.js +413 -0
- package/dist/tools/scheduler.js.map +1 -0
- package/dist/tools/shortcuts.d.ts +13 -0
- package/dist/tools/shortcuts.js +205 -0
- package/dist/tools/shortcuts.js.map +1 -0
- package/dist/tools/smart.d.ts +15 -0
- package/dist/tools/smart.js +785 -0
- package/dist/tools/smart.js.map +1 -0
- package/dist/tools/types.d.ts +174 -0
- package/dist/tools/types.js +67 -0
- package/dist/tools/types.js.map +1 -0
- package/dist/tools/window-text.d.ts +15 -0
- package/dist/tools/window-text.js +39 -0
- package/dist/tools/window-text.js.map +1 -0
- package/dist/types.d.ts +122 -0
- package/dist/types.js +41 -0
- package/dist/types.js.map +1 -0
- package/native/Package.swift +38 -0
- package/native/README.md +113 -0
- package/native/Sources/ClawdCursorHelper/main.swift +602 -0
- package/native/Sources/ClawdCursorHost/main.swift +182 -0
- package/native/Sources/PermissionCheck/main.swift +53 -0
- package/native/Sources/ScreenshotHelper/main.swift +219 -0
- package/native/build.sh +139 -0
- package/native/entitlements.plist +12 -0
- package/package.json +115 -0
- package/scripts/banner.ps1 +112 -0
- package/scripts/coord-accuracy.ps1 +140 -0
- package/scripts/coord-uwp.ps1 +80 -0
- package/scripts/edge-glow.ps1 +180 -0
- package/scripts/find-element.ps1 +198 -0
- package/scripts/get-foreground-window.ps1 +71 -0
- package/scripts/get-screen-context.ps1 +183 -0
- package/scripts/get-windows.ps1 +66 -0
- package/scripts/install-panic-hotkey.ps1 +46 -0
- package/scripts/interact-element.ps1 +431 -0
- package/scripts/invoke-element.ps1 +314 -0
- package/scripts/linux/atspi-bridge.py +356 -0
- package/scripts/linux/ocr-recognize.py +154 -0
- package/scripts/mac/_window-picker.jxa +163 -0
- package/scripts/mac/find-element.jxa +0 -0
- package/scripts/mac/find-element.sh +161 -0
- package/scripts/mac/focus-window.jxa +284 -0
- package/scripts/mac/get-focused-element.jxa +102 -0
- package/scripts/mac/get-foreground-window.jxa +173 -0
- package/scripts/mac/get-screen-context.jxa +197 -0
- package/scripts/mac/get-ui-tree.sh +141 -0
- package/scripts/mac/get-windows.jxa +117 -0
- package/scripts/mac/interact-element.sh +235 -0
- package/scripts/mac/invoke-element.jxa +408 -0
- package/scripts/mac/ocr-recognize.swift +124 -0
- package/scripts/ocr-recognize.ps1 +102 -0
- package/scripts/postinstall-native.js +48 -0
- package/scripts/ps-bridge.ps1 +830 -0
- package/scripts/smoke-mcp.ps1 +119 -0
- package/scripts/sync-version.ts +178 -0
- package/scripts/verify-install.js +81 -0
|
@@ -0,0 +1,2134 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Unified-agent tool catalog.
|
|
4
|
+
*
|
|
5
|
+
* ONE tool vocabulary across blind / hybrid / vision modes. The only
|
|
6
|
+
* difference between modes: in `blind`, the `screenshot` tool is removed
|
|
7
|
+
* from the catalog before the LLM sees it.
|
|
8
|
+
*
|
|
9
|
+
* Design rules:
|
|
10
|
+
* - Every mutation goes through PlatformAdapter (OS-agnostic).
|
|
11
|
+
* - NO ctx.platform call happens outside a tool's `execute()` — the agent
|
|
12
|
+
* loop never touches the adapter directly.
|
|
13
|
+
* - Terminal actions (`done` / `give_up` / `cannot_read`) just return
|
|
14
|
+
* `stop: true` with a terminalExit tag; the agent loop decides the
|
|
15
|
+
* AgentResult.
|
|
16
|
+
* - a11y-first wording. `invoke_element` and `set_field_value` are the
|
|
17
|
+
* preferred targeting tools; coord clicks are the fallback.
|
|
18
|
+
*
|
|
19
|
+
* Zero app-specific rules. A new LOB app works because a11y roles + the
|
|
20
|
+
* rank-before-truncate sense layer surface its buttons.
|
|
21
|
+
*/
|
|
22
|
+
// Compiler-emitted module-interop helpers. Each one reuses an implementation
// already present on `this` (when one exists) and otherwise defines its own.

/**
 * Re-export `source[key]` on `target` under `alias` (defaults to `key`).
 * When the source property is missing, is an accessor on a non-ES module, or
 * is a writable/configurable data property, an enumerable getter is installed
 * so the binding stays live; otherwise the original descriptor is copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (target, source, key, alias) {
    if (alias === undefined) alias = key;
    var descriptor = Object.getOwnPropertyDescriptor(source, key);
    var needsGetter = !descriptor
        || ("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
    if (needsGetter) {
        descriptor = { enumerable: true, get: function () { return source[key]; } };
    }
    Object.defineProperty(target, alias, descriptor);
}) : (function (target, source, key, alias) {
    if (alias === undefined) alias = key;
    target[alias] = source[key];
}));

/** Attach `value` as the enumerable `default` property of `target`. */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function (target, value) {
    Object.defineProperty(target, "default", { enumerable: true, value: value });
}) : function (target, value) {
    target["default"] = value;
});

/**
 * Wrap a required module as a namespace object: modules flagged `__esModule`
 * are returned untouched; anything else gets every own key except "default"
 * re-bound onto a fresh object whose `default` is the module itself.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use, then replaces itself with the chosen implementation.
    var ownKeys = function (subject) {
        ownKeys = Object.getOwnPropertyNames || function (obj) {
            var names = [];
            for (var prop in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, prop)) {
                    names[names.length] = prop;
                }
            }
            return names;
        };
        return ownKeys(subject);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = ownKeys(mod);
            for (var idx = 0; idx < keys.length; idx++) {
                if (keys[idx] !== "default") {
                    __createBinding(namespace, mod, keys[idx]);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
|
|
55
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
56
|
+
exports.buildUnifiedTools = buildUnifiedTools;
|
|
57
|
+
exports.coerceCoord = coerceCoord;
|
|
58
|
+
const batch_tool_1 = require("./batch-tool");
|
|
59
|
+
const coord_scale_1 = require("./coord-scale");
|
|
60
|
+
const focus_guard_1 = require("./focus-guard");
|
|
61
|
+
const aliases_1 = require("../router/aliases");
|
|
62
|
+
const uri_handler_1 = require("../../platform/uri-handler");
|
|
63
|
+
const ocr_engine_1 = require("../../platform/ocr-engine");
|
|
64
|
+
const browser_config_1 = require("../../llm/browser-config");
|
|
65
|
+
const assertions_1 = require("../verify/assertions");
|
|
66
|
+
const ui_map_1 = require("../sense/ui-map");
|
|
67
|
+
const ui_map_render_1 = require("../sense/ui-map-render");
|
|
68
|
+
const prompt_1 = require("./prompt");
|
|
69
|
+
const ui_map_resolve_1 = require("../sense/ui-map-resolve");
|
|
70
|
+
const ui_map_find_1 = require("../sense/ui-map-find");
|
|
71
|
+
/**
 * Shared OCR engine for the agent-loop perception tools (read_text,
 * smart_click). The instance is created on first request and cached in
 * `_agentOcr`, so every later call returns the same object. Availability is
 * checked separately via the engine's isAvailable(), not here.
 */
let _agentOcr = null;
function getAgentOcr() {
    if (_agentOcr) {
        return _agentOcr;
    }
    _agentOcr = new ocr_engine_1.OcrEngine();
    return _agentOcr;
}
|
|
80
|
+
/**
 * Case-insensitive matcher for "I'm guessing" language in the evidence an
 * agent supplies when it claims a task is finished (e.g. "the email should
 * have been sent"). Used by the `done` tool to turn away speculative claims
 * that were not actually observed on screen.
 *
 * Every alternative is anchored with `\b`, so longer words that merely
 * contain a trigger ("shoulder") do not match, and multi-word phrases accept
 * any run of whitespace between words. Alternatives are tried left to right,
 * so the narrower `should have|be|now` phrases win the reported match text
 * over the broad "should <word>" form listed after them.
 *
 * The vocabulary is intentionally small: plain observation words such as
 * "looks", "shown" or "displayed" are not hedges and are not listed.
 */
const HEDGING_PATTERN = (() => {
    // Modal verbs of uncertainty
    const modalVerbs = [
        '\\bshould\\s+(?:have|be|now)\\b',
        '\\bshould\\s+(?:have\\s+been|be|now)\\b',
        '\\bshould\\b(?=\\s+\\w)',
        '\\bmight\\s+(?:have|be)\\b',
        '\\bmay\\s+have\\b',
        '\\bcould\\s+have\\b',
        '\\bprobably\\b',
        '\\blikely\\s+(?:has|have|is|was)\\b',
    ];
    // Speaker-uncertainty phrasings
    const speakerDoubt = [
        '\\bI\\s+think\\b',
        '\\bI\\s+believe\\b',
        '\\bI\\s+assume\\b',
        '\\bassuming\\b',
        '\\bif\\s+(?:successful|it\\s+worked|the\\s+\\w+\\s+worked)\\b',
    ];
    // Approximate observation
    const vagueObservation = [
        '\\bappears?\\s+to\\b',
        '\\bseems?\\s+to\\b',
        '\\bpresumably\\b',
    ];
    const alternatives = [...modalVerbs, ...speakerDoubt, ...vagueObservation];
    return new RegExp(alternatives.join('|'), 'i');
})();
|
|
116
|
+
/**
|
|
117
|
+
* Build the unified tool catalog per mode + capability.
|
|
118
|
+
*
|
|
119
|
+
* Modes:
|
|
120
|
+
* - 'blind' → text-LLM; no `screenshot` tool in catalog
|
|
121
|
+
* - 'hybrid' → text-LLM; `screenshot` tool available on demand
|
|
122
|
+
* - 'vision' → vision-LLM; COMPOUND TOOL FORM (mouse/keyboard/window
|
|
123
|
+
* as action-discriminated schemas à la Anthropic
|
|
124
|
+
* computer_20250124) + perception + a11y + terminals
|
|
125
|
+
*
|
|
126
|
+
* Capability (text modes only):
|
|
127
|
+
* - When supplied and non-'general', filter to the scoped palette
|
|
128
|
+
* defined in `palettes.ts`. Typical palette ≈ 6–10 tools.
|
|
129
|
+
* - 'general' / undefined → full text-agent catalog (back-compat).
|
|
130
|
+
*
|
|
131
|
+
* Terminal actions (`done`, `give_up`, `cannot_read`) are always
|
|
132
|
+
* present regardless of mode/capability — the agent must always have
|
|
133
|
+
* an exit door.
|
|
134
|
+
*/
|
|
135
|
+
/**
 * Get a UIMap for the current tool call: reuse the holder's current map when
 * it satisfies the requested cost tier, otherwise compile a new one and store
 * it in the holder.
 *
 * One `Date.now()` reading is taken at the start of the call and shared by
 * the reuse lookup, the compile dependencies and the `put`.
 *
 * @param ctx        Tool context; reads `ctx.uiMaps` (the holder) and `ctx.platform`.
 * @param rawMaxCost Requested tier: 'cheap', 'ocr_ok' or 'vision_ok'. Any
 *                   other value is treated as 'ocr_ok'.
 * @returns The reused or freshly compiled map, or null when the context has no holder.
 */
async function finderMap(ctx, rawMaxCost) {
    const { uiMaps } = ctx;
    if (!uiMaps) {
        return null;
    }
    const knownTiers = ['cheap', 'ocr_ok', 'vision_ok'];
    const tier = knownTiers.includes(rawMaxCost) ? rawMaxCost : 'ocr_ok';
    const stamp = Date.now();
    const cached = uiMaps.currentIfCost(tier, stamp);
    if (cached) {
        return cached;
    }
    const snapshotId = uiMaps.nextId();
    const deps = (0, ui_map_1.defaultCompileDeps)(ctx.platform, stamp, snapshotId);
    const compiled = await (0, ui_map_1.compileUIMap)(deps, { max_cost: tier });
    uiMaps.put(compiled, stamp, tier);
    return compiled;
}
|
|
152
|
+
function buildUnifiedTools() {
|
|
153
|
+
const tools = [
|
|
154
|
+
// ─── PERCEPTION ─────────────────────────────────────────────
|
|
155
|
+
{
|
|
156
|
+
name: 'read_screen',
|
|
157
|
+
description: 'START HERE — cheapest perception. Read the accessibility tree of the focused window: buttons, inputs, text elements with coordinates. The snapshot is auto-attached each turn; call this again only when you expect the screen changed since the last turn. If the tree is empty, escalate to read_text (OCR) next, then screenshot only as a last resort.',
|
|
158
|
+
inputSchema: {
|
|
159
|
+
type: 'object',
|
|
160
|
+
properties: {
|
|
161
|
+
processId: { type: 'number', description: 'Optional: limit to a specific process' },
|
|
162
|
+
},
|
|
163
|
+
additionalProperties: false,
|
|
164
|
+
},
|
|
165
|
+
changesScreen: false,
|
|
166
|
+
async execute(args, ctx) {
|
|
167
|
+
const pid = typeof args.processId === 'number' ? args.processId : undefined;
|
|
168
|
+
const tree = await ctx.platform.getUiTree(pid);
|
|
169
|
+
if (tree.length === 0) {
|
|
170
|
+
return { success: true, text: '(empty a11y tree — app may be custom-canvas)' };
|
|
171
|
+
}
|
|
172
|
+
const lines = tree.slice(0, 60).map(el => `[${el.controlType || 'Element'}] "${el.name || ''}" @${el.bounds.x},${el.bounds.y} ${el.bounds.width}×${el.bounds.height}${el.value ? ` value="${el.value.slice(0, 40)}"` : ''}${el.focused ? ' [FOCUSED]' : ''}`);
|
|
173
|
+
const more = tree.length > 60 ? `\n… +${tree.length - 60} more` : '';
|
|
174
|
+
return { success: true, text: `Fresh a11y (${tree.length} els):\n${(0, prompt_1.wrapUntrustedScreenContent)(lines.join('\n') + more)}` };
|
|
175
|
+
},
|
|
176
|
+
},
|
|
177
|
+
{
|
|
178
|
+
name: 'list_windows',
|
|
179
|
+
description: 'List visible top-level windows with title, process, and bounds. Useful when the active window is wrong or missing.',
|
|
180
|
+
inputSchema: { type: 'object', properties: {}, additionalProperties: false },
|
|
181
|
+
changesScreen: false,
|
|
182
|
+
async execute(_args, ctx) {
|
|
183
|
+
const windows = await ctx.platform.listWindows();
|
|
184
|
+
const active = await ctx.platform.getActiveWindow();
|
|
185
|
+
const lines = windows.slice(0, 20).map(w => {
|
|
186
|
+
const isActive = active && w.processId === active.processId && w.title === active.title;
|
|
187
|
+
return `${isActive ? '→' : ' '} [${w.processName}] "${w.title}" pid=${w.processId} ${w.bounds.width}×${w.bounds.height}`;
|
|
188
|
+
});
|
|
189
|
+
const more = windows.length > 20 ? `\n… +${windows.length - 20} more windows` : '';
|
|
190
|
+
return { success: true, text: `Windows (${windows.length}):\n${lines.join('\n')}${more}` };
|
|
191
|
+
},
|
|
192
|
+
},
|
|
193
|
+
// ─── A11Y ACTIONS (preferred) ───────────────────────────────
|
|
194
|
+
{
|
|
195
|
+
name: 'invoke_element',
|
|
196
|
+
description: 'Click/activate a UI element by its accessibility name. MORE RELIABLE than coord clicks — use this when the snapshot shows a named target.',
|
|
197
|
+
inputSchema: {
|
|
198
|
+
type: 'object',
|
|
199
|
+
properties: {
|
|
200
|
+
name: { type: 'string', description: 'Accessibility name of the element' },
|
|
201
|
+
automationId: { type: 'string', description: 'Element automation ID (more precise than name)' },
|
|
202
|
+
controlType: { type: 'string', description: 'Optional role filter (Button, MenuItem, Tab, etc.)' },
|
|
203
|
+
processId: { type: 'number', description: 'Optional: limit to a specific process' },
|
|
204
|
+
action: {
|
|
205
|
+
type: 'string',
|
|
206
|
+
enum: ['click', 'set-value', 'get-value', 'focus', 'expand', 'collapse'],
|
|
207
|
+
description: 'Action to perform (default: "click")',
|
|
208
|
+
},
|
|
209
|
+
value: { type: 'string', description: 'Value for set-value action' },
|
|
210
|
+
element_id: { type: 'string', description: 'Target a compiled element from compile_ui (requires snapshot_id)' },
|
|
211
|
+
snapshot_id: { type: 'string', description: 'The compile_ui snapshot the element_id came from (requires element_id)' },
|
|
212
|
+
expect: EXPECT_SCHEMA,
|
|
213
|
+
},
|
|
214
|
+
// `name` OR `automationId` must be supplied; neither is required at
|
|
215
|
+
// the JSON-schema level — the execute() body guards the total absence.
|
|
216
|
+
additionalProperties: false,
|
|
217
|
+
},
|
|
218
|
+
changesScreen: true,
|
|
219
|
+
async execute(args, ctx) {
|
|
220
|
+
const refIds = { element_id: typeof args.element_id === 'string' ? args.element_id : undefined,
|
|
221
|
+
snapshot_id: typeof args.snapshot_id === 'string' ? args.snapshot_id : undefined };
|
|
222
|
+
if (refIds.element_id || refIds.snapshot_id) {
|
|
223
|
+
const aw = await ctx.platform.getActiveWindow().catch(() => null);
|
|
224
|
+
const plan = (0, ui_map_resolve_1.resolveRef)(refIds, ctx.uiMaps, Date.now(), 'click', aw);
|
|
225
|
+
if (!plan.ok)
|
|
226
|
+
return { success: false, text: `invoke_element ref rejected: ${plan.error}`, isError: true };
|
|
227
|
+
if (plan.via === 'name') {
|
|
228
|
+
// Mirror the by-name activation CASCADE: click → select → toggle.
|
|
229
|
+
// A ref to a ListItem / combo-item / checkbox may not support
|
|
230
|
+
// InvokePattern, so we try the three activation verbs in order and
|
|
231
|
+
// stop at the first success — identical logic to the by-name path above.
|
|
232
|
+
const refLadder = ['click', 'select', 'toggle'];
|
|
233
|
+
let refRes = await ctx.platform.invokeElement({ name: plan.name, action: refLadder[0] });
|
|
234
|
+
let refUsed = refLadder[0];
|
|
235
|
+
for (let i = 1; i < refLadder.length && !refRes.success; i++) {
|
|
236
|
+
refUsed = refLadder[i];
|
|
237
|
+
refRes = await ctx.platform.invokeElement({ name: plan.name, action: refUsed });
|
|
238
|
+
}
|
|
239
|
+
await sleep(150);
|
|
240
|
+
return { success: refRes.success, text: refRes.success ? `Invoked "${plan.name}" via a11y${refUsed !== 'click' ? ` (${refUsed})` : ''} (via ${plan.element.id}).` : `a11y invoke of ${plan.element.id} missed.`, targetLabel: plan.name };
|
|
241
|
+
}
|
|
242
|
+
const [bx, by, bw, bh] = plan.bounds;
|
|
243
|
+
await ctx.platform.mouseClick(Math.round(bx + bw / 2), Math.round(by + bh / 2));
|
|
244
|
+
await sleep(150);
|
|
245
|
+
return { success: true, text: `Clicked ${plan.element.id} at its bounds center.`, targetLabel: plan.element.id };
|
|
246
|
+
}
|
|
247
|
+
// `automationId` is accepted for MCP backward-compat but the PlatformAdapter
|
|
248
|
+
// invokeElement interface does not expose automationId filtering — it is used
|
|
249
|
+
// only as a name alias when name is absent.
|
|
250
|
+
const rawName = typeof args.name === 'string' ? args.name : '';
|
|
251
|
+
const automationId = typeof args.automationId === 'string' ? args.automationId : undefined;
|
|
252
|
+
const name = rawName || automationId || '';
|
|
253
|
+
if (!name) {
|
|
254
|
+
return { success: false, text: 'invoke_element: "name" or "automationId" is required (the accessibility name of the element to invoke).' };
|
|
255
|
+
}
|
|
256
|
+
const controlType = typeof args.controlType === 'string' ? args.controlType : undefined;
|
|
257
|
+
const processId = typeof args.processId === 'number' ? args.processId : undefined;
|
|
258
|
+
const VALID_ACTIONS = ['click', 'set-value', 'get-value', 'focus', 'expand', 'collapse'];
|
|
259
|
+
const rawAction = typeof args.action === 'string' ? args.action : 'click';
|
|
260
|
+
const action = VALID_ACTIONS.includes(rawAction)
|
|
261
|
+
? rawAction
|
|
262
|
+
: 'click';
|
|
263
|
+
const value = typeof args.value === 'string' ? args.value : undefined;
|
|
264
|
+
// OS-AGNOSTIC ACTIVATION CASCADE. "click" is the generic "activate this
|
|
265
|
+
// element" intent — but a named target can be a Button (InvokePattern),
|
|
266
|
+
// a checkbox (TogglePattern), or a ListItem / combo-item
|
|
267
|
+
// (SelectionItemPattern), and the agent operating BLIND can't see which.
|
|
268
|
+
// Live regression 2026-06-07: invoke "Cool blue" (a ListItem) failed
|
|
269
|
+
// because only SelectionItemPattern fit, forcing a coord-click fallback
|
|
270
|
+
// that needs a screenshot — the exact token cost clawdcursor avoids. So
|
|
271
|
+
// for the activate intent we try the activation verbs in order until one
|
|
272
|
+
// takes. EXPLICIT verbs (expand/collapse/get-value/set-value/focus) stay
|
|
273
|
+
// strict — the agent that asked to expand never silently gets a select.
|
|
274
|
+
// Pure adapter-string retries → works on every OS with zero per-OS code,
|
|
275
|
+
// and only the failing path pays the extra round-trips.
|
|
276
|
+
const ladder = action === 'click' ? ['click', 'select', 'toggle'] : [action];
|
|
277
|
+
let res = await ctx.platform.invokeElement({ name, controlType, processId, action: ladder[0], value });
|
|
278
|
+
let used = ladder[0];
|
|
279
|
+
for (let i = 1; i < ladder.length && !res.success; i++) {
|
|
280
|
+
used = ladder[i];
|
|
281
|
+
res = await ctx.platform.invokeElement({ name, controlType, processId, action: used, value });
|
|
282
|
+
}
|
|
283
|
+
await sleep(150);
|
|
284
|
+
return {
|
|
285
|
+
success: res.success,
|
|
286
|
+
text: res.success
|
|
287
|
+
? (res.data && 'value' in res.data
|
|
288
|
+
? `Invoked "${name}" (${used}) → value: "${res.data.value}"`
|
|
289
|
+
: `Invoked "${name}" via a11y${used !== 'click' ? ` (${used})` : ''}.`)
|
|
290
|
+
: `a11y invoke "${name}" missed — element not found or not actionable.`,
|
|
291
|
+
targetLabel: name,
|
|
292
|
+
};
|
|
293
|
+
},
|
|
294
|
+
},
|
|
295
|
+
{
|
|
296
|
+
name: 'set_field_value',
|
|
297
|
+
description: 'Set an editable field\'s value directly via accessibility (more reliable than click+type for forms).',
|
|
298
|
+
inputSchema: {
|
|
299
|
+
type: 'object',
|
|
300
|
+
properties: {
|
|
301
|
+
name: { type: 'string', description: 'Accessibility name of the field' },
|
|
302
|
+
value: { type: 'string' },
|
|
303
|
+
controlType: { type: 'string', description: 'Optional role filter (e.g. "Edit")' },
|
|
304
|
+
processId: { type: 'number' },
|
|
305
|
+
element_id: { type: 'string', description: 'Target a compiled element from compile_ui (requires snapshot_id)' },
|
|
306
|
+
snapshot_id: { type: 'string', description: 'The compile_ui snapshot the element_id came from (requires element_id)' },
|
|
307
|
+
expect: EXPECT_SCHEMA,
|
|
308
|
+
},
|
|
309
|
+
required: ['value'],
|
|
310
|
+
additionalProperties: false,
|
|
311
|
+
},
|
|
312
|
+
changesScreen: true,
|
|
313
|
+
async execute(args, ctx) {
|
|
314
|
+
const refIds = { element_id: typeof args.element_id === 'string' ? args.element_id : undefined,
|
|
315
|
+
snapshot_id: typeof args.snapshot_id === 'string' ? args.snapshot_id : undefined };
|
|
316
|
+
if (refIds.element_id || refIds.snapshot_id) {
|
|
317
|
+
const fillValue = String(args.value ?? '');
|
|
318
|
+
const aw = await ctx.platform.getActiveWindow().catch(() => null);
|
|
319
|
+
const plan = (0, ui_map_resolve_1.resolveRef)(refIds, ctx.uiMaps, Date.now(), 'fill', aw);
|
|
320
|
+
if (!plan.ok)
|
|
321
|
+
return { success: false, text: `set_field_value ref rejected: ${plan.error}`, isError: true };
|
|
322
|
+
if (plan.via === 'name') {
|
|
323
|
+
const res = await ctx.platform.invokeElement({ name: plan.name, action: 'set-value', value: fillValue });
|
|
324
|
+
await sleep(150);
|
|
325
|
+
return { success: res.success, text: res.success ? `Set "${plan.name}" = ${fillValue.length} chars (via ${plan.element.id}).` : `Set of ${plan.element.id} failed.`, targetLabel: plan.name };
|
|
326
|
+
}
|
|
327
|
+
const [bx, by, bw, bh] = plan.bounds;
|
|
328
|
+
await ctx.platform.mouseClick(Math.round(bx + bw / 2), Math.round(by + bh / 2));
|
|
329
|
+
await ctx.platform.typeText(fillValue);
|
|
330
|
+
await sleep(150);
|
|
331
|
+
return { success: true, text: `Filled ${plan.element.id} via bounds + type (${fillValue.length} chars).`, targetLabel: plan.element.id };
|
|
332
|
+
}
|
|
333
|
+
const name = String(args.name ?? '');
|
|
334
|
+
const value = String(args.value ?? '');
|
|
335
|
+
const controlType = typeof args.controlType === 'string' ? args.controlType : undefined;
|
|
336
|
+
const processId = typeof args.processId === 'number' ? args.processId : undefined;
|
|
337
|
+
const res = await ctx.platform.invokeElement({ name, controlType, processId, action: 'set-value', value });
|
|
338
|
+
await sleep(150);
|
|
339
|
+
return {
|
|
340
|
+
success: res.success,
|
|
341
|
+
text: res.success ? `Set "${name}" = ${value.length} chars` : `Set "${name}" failed.`,
|
|
342
|
+
targetLabel: name,
|
|
343
|
+
};
|
|
344
|
+
},
|
|
345
|
+
},
|
|
346
|
+
// ─── A11Y DEPTH (Tranche 2) ────────────────────────────────
|
|
347
|
+
{
|
|
348
|
+
name: 'a11y_expand',
|
|
349
|
+
description: 'Expand a tree node / combo / disclosure by a11y name (UIA ExpandCollapsePattern, AX AXExpanded).',
|
|
350
|
+
inputSchema: {
|
|
351
|
+
type: 'object',
|
|
352
|
+
properties: {
|
|
353
|
+
name: { type: 'string' },
|
|
354
|
+
controlType: { type: 'string' },
|
|
355
|
+
processId: { type: 'number' },
|
|
356
|
+
},
|
|
357
|
+
required: ['name'],
|
|
358
|
+
additionalProperties: false,
|
|
359
|
+
},
|
|
360
|
+
changesScreen: true,
|
|
361
|
+
async execute(args, ctx) {
|
|
362
|
+
const name = String(args.name ?? '');
|
|
363
|
+
const res = await ctx.platform.invokeElement({
|
|
364
|
+
name,
|
|
365
|
+
controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
|
|
366
|
+
processId: await resolveAgentPid(args, ctx),
|
|
367
|
+
action: 'expand',
|
|
368
|
+
});
|
|
369
|
+
return {
|
|
370
|
+
success: res.success,
|
|
371
|
+
text: res.success ? `Expanded "${name}".` : `Could not expand "${name}".`,
|
|
372
|
+
targetLabel: name,
|
|
373
|
+
};
|
|
374
|
+
},
|
|
375
|
+
},
|
|
376
|
+
{
|
|
377
|
+
name: 'a11y_collapse',
|
|
378
|
+
description: 'Collapse a tree node / combo / disclosure by a11y name.',
|
|
379
|
+
inputSchema: {
|
|
380
|
+
type: 'object',
|
|
381
|
+
properties: {
|
|
382
|
+
name: { type: 'string' },
|
|
383
|
+
controlType: { type: 'string' },
|
|
384
|
+
processId: { type: 'number' },
|
|
385
|
+
},
|
|
386
|
+
required: ['name'],
|
|
387
|
+
additionalProperties: false,
|
|
388
|
+
},
|
|
389
|
+
changesScreen: true,
|
|
390
|
+
async execute(args, ctx) {
|
|
391
|
+
const name = String(args.name ?? '');
|
|
392
|
+
const res = await ctx.platform.invokeElement({
|
|
393
|
+
name,
|
|
394
|
+
controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
|
|
395
|
+
processId: await resolveAgentPid(args, ctx),
|
|
396
|
+
action: 'collapse',
|
|
397
|
+
});
|
|
398
|
+
return {
|
|
399
|
+
success: res.success,
|
|
400
|
+
text: res.success ? `Collapsed "${name}".` : `Could not collapse "${name}".`,
|
|
401
|
+
targetLabel: name,
|
|
402
|
+
};
|
|
403
|
+
},
|
|
404
|
+
},
|
|
405
|
+
{
|
|
406
|
+
name: 'a11y_toggle',
|
|
407
|
+
description: 'Toggle a checkbox / switch / toggle-button by a11y name. Returns new state (On/Off/Indeterminate).',
|
|
408
|
+
inputSchema: {
|
|
409
|
+
type: 'object',
|
|
410
|
+
properties: {
|
|
411
|
+
name: { type: 'string' },
|
|
412
|
+
controlType: { type: 'string' },
|
|
413
|
+
processId: { type: 'number' },
|
|
414
|
+
},
|
|
415
|
+
required: ['name'],
|
|
416
|
+
additionalProperties: false,
|
|
417
|
+
},
|
|
418
|
+
changesScreen: true,
|
|
419
|
+
async execute(args, ctx) {
|
|
420
|
+
const name = String(args.name ?? '');
|
|
421
|
+
const res = await ctx.platform.invokeElement({
|
|
422
|
+
name,
|
|
423
|
+
controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
|
|
424
|
+
processId: await resolveAgentPid(args, ctx),
|
|
425
|
+
action: 'toggle',
|
|
426
|
+
});
|
|
427
|
+
if (!res.success)
|
|
428
|
+
return { success: false, text: `Could not toggle "${name}".`, targetLabel: name };
|
|
429
|
+
const state = res.data?.toggleState ?? 'unknown';
|
|
430
|
+
return { success: true, text: `Toggled "${name}" → ${state}.`, targetLabel: name };
|
|
431
|
+
},
|
|
432
|
+
},
|
|
433
|
+
{
|
|
434
|
+
name: 'a11y_select',
|
|
435
|
+
description: 'Select a list item / tab / radio by a11y name (UIA SelectionItemPattern, AX AXSelected).',
|
|
436
|
+
inputSchema: {
|
|
437
|
+
type: 'object',
|
|
438
|
+
properties: {
|
|
439
|
+
name: { type: 'string' },
|
|
440
|
+
controlType: { type: 'string' },
|
|
441
|
+
processId: { type: 'number' },
|
|
442
|
+
},
|
|
443
|
+
required: ['name'],
|
|
444
|
+
additionalProperties: false,
|
|
445
|
+
},
|
|
446
|
+
changesScreen: true,
|
|
447
|
+
async execute(args, ctx) {
|
|
448
|
+
const name = String(args.name ?? '');
|
|
449
|
+
const res = await ctx.platform.invokeElement({
|
|
450
|
+
name,
|
|
451
|
+
controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
|
|
452
|
+
processId: await resolveAgentPid(args, ctx),
|
|
453
|
+
action: 'select',
|
|
454
|
+
});
|
|
455
|
+
return {
|
|
456
|
+
success: res.success,
|
|
457
|
+
text: res.success ? `Selected "${name}".` : `Could not select "${name}".`,
|
|
458
|
+
targetLabel: name,
|
|
459
|
+
};
|
|
460
|
+
},
|
|
461
|
+
},
|
|
462
|
+
{
|
|
463
|
+
name: 'a11y_get_value',
|
|
464
|
+
description: 'Read the current value of a named field (UIA ValuePattern / AX AXValue). Useful to verify before typing.',
|
|
465
|
+
inputSchema: {
|
|
466
|
+
type: 'object',
|
|
467
|
+
properties: {
|
|
468
|
+
name: { type: 'string' },
|
|
469
|
+
controlType: { type: 'string' },
|
|
470
|
+
processId: { type: 'number' },
|
|
471
|
+
},
|
|
472
|
+
required: ['name'],
|
|
473
|
+
additionalProperties: false,
|
|
474
|
+
},
|
|
475
|
+
changesScreen: false,
|
|
476
|
+
async execute(args, ctx) {
|
|
477
|
+
const name = String(args.name ?? '');
|
|
478
|
+
const res = await ctx.platform.invokeElement({
|
|
479
|
+
name,
|
|
480
|
+
controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
|
|
481
|
+
processId: await resolveAgentPid(args, ctx),
|
|
482
|
+
action: 'get-value',
|
|
483
|
+
});
|
|
484
|
+
if (!res.success)
|
|
485
|
+
return { success: false, text: `"${name}" has no readable value.` };
|
|
486
|
+
const value = res.data?.value ?? '';
|
|
487
|
+
return { success: true, text: (0, prompt_1.wrapUntrustedScreenContent)(`"${name}" = "${truncate(String(value), 120)}"`) };
|
|
488
|
+
},
|
|
489
|
+
},
|
|
490
|
+
{
|
|
491
|
+
name: 'verify',
|
|
492
|
+
description: 'Deterministically check CURRENT state against machine-checkable assertions — the harness executes them, no guessing. Types: window_title_contains{value}, app_running{name}, element_exists{name}, element_value_contains{name,value}, clipboard_contains{value}, file_exists{path}, file_contains{path,value}, ocr_contains{value}, file_changed_since_start{path} (proves a file was written during THIS task). Cheaper and more reliable than a screenshot — use after a critical step or before done().',
|
|
493
|
+
inputSchema: {
|
|
494
|
+
type: 'object',
|
|
495
|
+
properties: {
|
|
496
|
+
assertions: {
|
|
497
|
+
type: 'array',
|
|
498
|
+
description: 'Up to 8 assertions, each {type, ...fields} per the types listed in the tool description.',
|
|
499
|
+
items: { type: 'object' },
|
|
500
|
+
},
|
|
501
|
+
},
|
|
502
|
+
required: ['assertions'],
|
|
503
|
+
additionalProperties: false,
|
|
504
|
+
},
|
|
505
|
+
changesScreen: false,
|
|
506
|
+
async execute(args, ctx) {
|
|
507
|
+
const parsed = (0, assertions_1.parseAssertions)(args.assertions);
|
|
508
|
+
if ('error' in parsed)
|
|
509
|
+
return { success: false, text: `verify rejected: ${parsed.error}` };
|
|
510
|
+
const report = await (0, assertions_1.checkAssertions)(parsed.assertions, {
|
|
511
|
+
adapter: ctx.platform,
|
|
512
|
+
ocrText: async () => (await getAgentOcr().recognizeScreen()).fullText ?? '',
|
|
513
|
+
});
|
|
514
|
+
return {
|
|
515
|
+
success: report.ok,
|
|
516
|
+
text: `${report.ok ? 'VERIFIED' : `FAILED ${report.failed}/${report.outcomes.length}`}:\n${(0, assertions_1.renderReport)(report)}`,
|
|
517
|
+
};
|
|
518
|
+
},
|
|
519
|
+
},
|
|
520
|
+
{
|
|
521
|
+
name: 'get_element_state',
|
|
522
|
+
description: 'Get state flags of a named element (focused/enabled/disabled/selected/busy/offscreen/expandable/expanded).',
|
|
523
|
+
inputSchema: {
|
|
524
|
+
type: 'object',
|
|
525
|
+
properties: {
|
|
526
|
+
name: { type: 'string' },
|
|
527
|
+
controlType: { type: 'string' },
|
|
528
|
+
processId: { type: 'number' },
|
|
529
|
+
},
|
|
530
|
+
required: ['name'],
|
|
531
|
+
additionalProperties: false,
|
|
532
|
+
},
|
|
533
|
+
changesScreen: false,
|
|
534
|
+
async execute(args, ctx) {
|
|
535
|
+
const name = String(args.name ?? '');
|
|
536
|
+
const hits = await ctx.platform.findElements({
|
|
537
|
+
name,
|
|
538
|
+
controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
|
|
539
|
+
processId: await resolveAgentPid(args, ctx),
|
|
540
|
+
});
|
|
541
|
+
if (hits.length === 0)
|
|
542
|
+
return { success: false, text: `No element named "${name}".` };
|
|
543
|
+
const el = hits[0];
|
|
544
|
+
return {
|
|
545
|
+
success: true,
|
|
546
|
+
text: JSON.stringify({
|
|
547
|
+
name: el.name,
|
|
548
|
+
controlType: el.controlType,
|
|
549
|
+
focused: el.focused ?? false,
|
|
550
|
+
enabled: el.enabled ?? true,
|
|
551
|
+
disabled: el.disabled ?? false,
|
|
552
|
+
selected: el.selected ?? false,
|
|
553
|
+
busy: el.busy ?? false,
|
|
554
|
+
offscreen: el.offscreen ?? false,
|
|
555
|
+
expandable: el.expandable ?? false,
|
|
556
|
+
expanded: el.expanded ?? false,
|
|
557
|
+
}),
|
|
558
|
+
};
|
|
559
|
+
},
|
|
560
|
+
},
|
|
561
|
+
// ─── INPUT (mouse) ──────────────────────────────────────────
|
|
562
|
+
{
|
|
563
|
+
name: 'click',
|
|
564
|
+
description: 'Click at (x,y). The default coordinate space follows context (image-space while a screenshot is in your context, else screen-space) — pass `space` explicitly when mixing sources: space:"screen" for a11y/@x,y map coords, space:"image" for coords read off the screenshot. Prefer invoke_element when the target has an a11y name.',
|
|
565
|
+
inputSchema: {
|
|
566
|
+
type: 'object',
|
|
567
|
+
properties: {
|
|
568
|
+
x: { type: 'number' },
|
|
569
|
+
y: { type: 'number' },
|
|
570
|
+
button: { type: 'string', enum: ['left', 'right'] },
|
|
571
|
+
count: { type: 'number', description: '1=single, 2=double' },
|
|
572
|
+
space: COORD_SPACE_SCHEMA,
|
|
573
|
+
expect: EXPECT_SCHEMA,
|
|
574
|
+
},
|
|
575
|
+
required: ['x', 'y'],
|
|
576
|
+
additionalProperties: false,
|
|
577
|
+
},
|
|
578
|
+
changesScreen: true,
|
|
579
|
+
async execute(args, ctx) {
|
|
580
|
+
const { x: ix, y: iy, warning } = coerceCoord(args.x, args.y);
|
|
581
|
+
if (!Number.isFinite(ix) || !Number.isFinite(iy)) {
|
|
582
|
+
return { success: false, isError: true, text: `click: x/y must be finite numbers, got x=${JSON.stringify(args.x)} y=${JSON.stringify(args.y)}` };
|
|
583
|
+
}
|
|
584
|
+
const button = args.button === 'right' ? 'right' : 'left';
|
|
585
|
+
const count = args.count === 2 ? 2 : 1;
|
|
586
|
+
// SCALE: 'image' coords (read off the 1280-wide screenshot) → physical;
|
|
587
|
+
// 'screen'/default (a11y coords, already physical) → pass through.
|
|
588
|
+
// Explicit space wins; else use ctx.coordSpaceDefault (set to 'image' on
|
|
589
|
+
// vision turns by the agent loop); fall back to 'screen'.
|
|
590
|
+
const space = args.space === 'image' ? 'image' : args.space === 'screen' ? 'screen' : (ctx.coordSpaceDefault ?? 'screen');
|
|
591
|
+
const scale = space === 'image' ? (0, coord_scale_1.imageScale)(ctx) : 1;
|
|
592
|
+
const x = (0, coord_scale_1.scaleCoord)(ix, scale);
|
|
593
|
+
const y = (0, coord_scale_1.scaleCoord)(iy, scale);
|
|
594
|
+
const fg0 = await ctx.platform.getActiveWindow().catch(() => null);
|
|
595
|
+
const raised = await (0, focus_guard_1.ensureTargetForeground)(ctx, fg0);
|
|
596
|
+
const before = raised ? await ctx.platform.getActiveWindow().catch(() => null) : fg0;
|
|
597
|
+
const activation = await ctx.platform.mouseClick(x, y, { button, count });
|
|
598
|
+
await sleep(150);
|
|
599
|
+
const after = await ctx.platform.getActiveWindow().catch(() => null);
|
|
600
|
+
const note = warning ? ` (${warning})` : '';
|
|
601
|
+
const focusWarn = focusTheftWarning(activation, before, after);
|
|
602
|
+
return { success: true, text: `Clicked ${button} x${count} at ${coordBreadcrumb(ix, iy, x, y, space, scale, ctx)}${raised}${focusBreadcrumb(before, after)}${note}${focusWarn}` };
|
|
603
|
+
},
|
|
604
|
+
},
|
|
605
|
+
{
|
|
606
|
+
name: 'drag',
|
|
607
|
+
description: 'Drag the mouse from (startX,startY) to (endX,endY) — select text, draw, resize. To TRACE A CURVE/PATH (gesture, curved track, drawing), pass `path` = an array of 12–20 {x,y} points instead: press at the first point, move through each, release at the last. The default coordinate space follows context; if you read coords off the SCREENSHOT, pass space:"image" so the tool scales them.',
|
|
608
|
+
inputSchema: {
|
|
609
|
+
type: 'object',
|
|
610
|
+
properties: {
|
|
611
|
+
startX: { type: 'number' },
|
|
612
|
+
startY: { type: 'number' },
|
|
613
|
+
endX: { type: 'number' },
|
|
614
|
+
endY: { type: 'number' },
|
|
615
|
+
path: {
|
|
616
|
+
type: 'array',
|
|
617
|
+
description: 'Stepped drag path: array of {x,y} points (min 2). When given, startX/startY/endX/endY are ignored. Press at first point, release at last.',
|
|
618
|
+
items: { type: 'object', properties: { x: { type: 'number' }, y: { type: 'number' } }, required: ['x', 'y'] },
|
|
619
|
+
},
|
|
620
|
+
space: COORD_SPACE_SCHEMA,
|
|
621
|
+
expect: EXPECT_SCHEMA,
|
|
622
|
+
},
|
|
623
|
+
additionalProperties: false,
|
|
624
|
+
},
|
|
625
|
+
changesScreen: true,
|
|
626
|
+
async execute(args, ctx) {
|
|
627
|
+
const space = args.space === 'image' ? 'image' : args.space === 'screen' ? 'screen' : (ctx.coordSpaceDefault ?? 'screen');
|
|
628
|
+
const scale = space === 'image' ? (0, coord_scale_1.imageScale)(ctx) : 1;
|
|
629
|
+
// Stepped path variant: press at the first point, walk the rest,
|
|
630
|
+
// release at the last (canvas tracing — same gesture the MCP-side
|
|
631
|
+
// mouse_drag_stepped performs).
|
|
632
|
+
if (args.path !== undefined) {
|
|
633
|
+
let pts;
|
|
634
|
+
try {
|
|
635
|
+
pts = typeof args.path === 'string' ? JSON.parse(args.path) : args.path;
|
|
636
|
+
}
|
|
637
|
+
catch {
|
|
638
|
+
return { success: false, isError: true, text: 'drag: `path` must be an array of {x,y} points' };
|
|
639
|
+
}
|
|
640
|
+
if (!Array.isArray(pts) || pts.length < 2 || !pts.every(p => p && Number.isFinite(Number(p.x)) && Number.isFinite(Number(p.y)))) {
|
|
641
|
+
return { success: false, isError: true, text: 'drag: `path` needs at least 2 {x,y} points with finite coords' };
|
|
642
|
+
}
|
|
643
|
+
const scaled = pts.map(p => ({ x: (0, coord_scale_1.scaleCoord)(Number(p.x), scale), y: (0, coord_scale_1.scaleCoord)(Number(p.y), scale) }));
|
|
644
|
+
const fg0p = await ctx.platform.getActiveWindow().catch(() => null);
|
|
645
|
+
const raisedP = await (0, focus_guard_1.ensureTargetForeground)(ctx, fg0p);
|
|
646
|
+
const beforeP = raisedP ? await ctx.platform.getActiveWindow().catch(() => null) : fg0p;
|
|
647
|
+
await ctx.platform.mouseMove(scaled[0].x, scaled[0].y);
|
|
648
|
+
await ctx.platform.mouseDown('left');
|
|
649
|
+
try {
|
|
650
|
+
for (let i = 1; i < scaled.length; i++) {
|
|
651
|
+
await ctx.platform.mouseMove(scaled[i].x, scaled[i].y);
|
|
652
|
+
await sleep(16); // let the app register the motion between segments
|
|
653
|
+
}
|
|
654
|
+
}
|
|
655
|
+
finally {
|
|
656
|
+
await ctx.platform.mouseUp('left');
|
|
657
|
+
}
|
|
658
|
+
await sleep(200);
|
|
659
|
+
const afterP = await ctx.platform.getActiveWindow().catch(() => null);
|
|
660
|
+
return { success: true, text: `Stepped-drag through ${pts.length} ${space} points → screen (${scaled[0].x},${scaled[0].y})…(${scaled[scaled.length - 1].x},${scaled[scaled.length - 1].y}) [×${scale}]${raisedP}${focusBreadcrumb(beforeP, afterP)}` };
|
|
661
|
+
}
|
|
662
|
+
const start = coerceCoord(args.startX, args.startY);
|
|
663
|
+
const end = coerceCoord(args.endX, args.endY);
|
|
664
|
+
if (![start.x, start.y, end.x, end.y].every(Number.isFinite)) {
|
|
665
|
+
return { success: false, isError: true, text: `drag: startX/startY/endX/endY must be finite numbers (or pass \`path\`), got ${JSON.stringify(args)}` };
|
|
666
|
+
}
|
|
667
|
+
const sx = (0, coord_scale_1.scaleCoord)(start.x, scale), sy = (0, coord_scale_1.scaleCoord)(start.y, scale);
|
|
668
|
+
const ex = (0, coord_scale_1.scaleCoord)(end.x, scale), ey = (0, coord_scale_1.scaleCoord)(end.y, scale);
|
|
669
|
+
const fg0 = await ctx.platform.getActiveWindow().catch(() => null);
|
|
670
|
+
const raised = await (0, focus_guard_1.ensureTargetForeground)(ctx, fg0);
|
|
671
|
+
const before = raised ? await ctx.platform.getActiveWindow().catch(() => null) : fg0;
|
|
672
|
+
await ctx.platform.mouseDrag(sx, sy, ex, ey);
|
|
673
|
+
await sleep(200);
|
|
674
|
+
const after = await ctx.platform.getActiveWindow().catch(() => null);
|
|
675
|
+
return { success: true, text: `Dragged ${space} (${start.x},${start.y})→(${end.x},${end.y}) → screen (${sx},${sy})→(${ex},${ey}) [×${scale}]${raised}${focusBreadcrumb(before, after)}` };
|
|
676
|
+
},
|
|
677
|
+
},
|
|
678
|
+
{
|
|
679
|
+
name: 'move',
|
|
680
|
+
description: 'Move the cursor to (x,y) WITHOUT clicking — hover/dwell over a target (pair with wait(ms) for a required dwell time). The default coordinate space follows context; pass space:"image" for coords read off the screenshot.',
|
|
681
|
+
inputSchema: {
|
|
682
|
+
type: 'object',
|
|
683
|
+
properties: {
|
|
684
|
+
x: { type: 'number' },
|
|
685
|
+
y: { type: 'number' },
|
|
686
|
+
space: COORD_SPACE_SCHEMA,
|
|
687
|
+
},
|
|
688
|
+
required: ['x', 'y'],
|
|
689
|
+
additionalProperties: false,
|
|
690
|
+
},
|
|
691
|
+
changesScreen: false,
|
|
692
|
+
async execute(args, ctx) {
|
|
693
|
+
const c = coerceCoord(args.x, args.y);
|
|
694
|
+
if (!Number.isFinite(c.x) || !Number.isFinite(c.y)) {
|
|
695
|
+
return { success: false, isError: true, text: `move: x/y must be finite numbers, got x=${JSON.stringify(args.x)} y=${JSON.stringify(args.y)}` };
|
|
696
|
+
}
|
|
697
|
+
const space = args.space === 'image' ? 'image' : args.space === 'screen' ? 'screen' : (ctx.coordSpaceDefault ?? 'screen');
|
|
698
|
+
const scale = space === 'image' ? (0, coord_scale_1.imageScale)(ctx) : 1;
|
|
699
|
+
const x = (0, coord_scale_1.scaleCoord)(c.x, scale), y = (0, coord_scale_1.scaleCoord)(c.y, scale);
|
|
700
|
+
await ctx.platform.mouseMove(x, y);
|
|
701
|
+
return { success: true, text: `Cursor moved (hover) to ${space} (${c.x},${c.y}) → screen (${x},${y}) [×${scale}]` };
|
|
702
|
+
},
|
|
703
|
+
},
|
|
704
|
+
{
|
|
705
|
+
name: 'scroll',
|
|
706
|
+
description: 'Scroll at (x,y) in a direction. Omit x,y to scroll at the screen center. If you read x,y off the SCREENSHOT, pass space:"image".',
|
|
707
|
+
inputSchema: {
|
|
708
|
+
type: 'object',
|
|
709
|
+
properties: {
|
|
710
|
+
x: { type: 'number' },
|
|
711
|
+
y: { type: 'number' },
|
|
712
|
+
direction: { type: 'string', enum: ['up', 'down'] },
|
|
713
|
+
amount: { type: 'number', description: 'Wheel ticks (default 3)' },
|
|
714
|
+
space: COORD_SPACE_SCHEMA,
|
|
715
|
+
},
|
|
716
|
+
required: ['direction'],
|
|
717
|
+
additionalProperties: false,
|
|
718
|
+
},
|
|
719
|
+
changesScreen: true,
|
|
720
|
+
async execute(args, ctx) {
|
|
721
|
+
const dir = args.direction === 'up' ? 'up' : 'down';
|
|
722
|
+
const amount = typeof args.amount === 'number' ? args.amount : 3;
|
|
723
|
+
// Default to screen-center when x/y missing; coerce strings via the helper.
|
|
724
|
+
const hasXY = args.x !== undefined || args.y !== undefined;
|
|
725
|
+
const space = args.space === 'image' ? 'image' : args.space === 'screen' ? 'screen' : (ctx.coordSpaceDefault ?? 'screen');
|
|
726
|
+
const scale = space === 'image' ? (0, coord_scale_1.imageScale)(ctx) : 1;
|
|
727
|
+
// No-coordinate default: center of the screen IN THE DRIVER'S SPACE
|
|
728
|
+
// (logical points on macOS, physical px elsewhere) — physicalWidth/2
|
|
729
|
+
// mislanded 2× off on Retina (audit 2026-06-11, M3).
|
|
730
|
+
const center = (0, coord_scale_1.screenCenter)(ctx);
|
|
731
|
+
let x = center.x;
|
|
732
|
+
let y = center.y;
|
|
733
|
+
if (hasXY) {
|
|
734
|
+
const c = coerceCoord(args.x, args.y);
|
|
735
|
+
if (Number.isFinite(c.x) && Number.isFinite(c.y)) {
|
|
736
|
+
x = (0, coord_scale_1.scaleCoord)(c.x, scale);
|
|
737
|
+
y = (0, coord_scale_1.scaleCoord)(c.y, scale);
|
|
738
|
+
}
|
|
739
|
+
}
|
|
740
|
+
await ctx.platform.mouseScroll(x, y, dir, amount);
|
|
741
|
+
await sleep(150);
|
|
742
|
+
return { success: true, text: `Scrolled ${dir} ${amount} at (${x},${y})` };
|
|
743
|
+
},
|
|
744
|
+
},
|
|
745
|
+
// ─── INPUT (keyboard) ───────────────────────────────────────
|
|
746
|
+
{
|
|
747
|
+
name: 'type',
|
|
748
|
+
description: 'Type text into the currently focused input. Prefer set_field_value when a field has an a11y name.',
|
|
749
|
+
inputSchema: {
|
|
750
|
+
type: 'object',
|
|
751
|
+
properties: {
|
|
752
|
+
text: { type: 'string' },
|
|
753
|
+
expect: EXPECT_SCHEMA,
|
|
754
|
+
},
|
|
755
|
+
required: ['text'],
|
|
756
|
+
additionalProperties: false,
|
|
757
|
+
},
|
|
758
|
+
changesScreen: true,
|
|
759
|
+
async execute(args, ctx) {
|
|
760
|
+
const text = String(args.text ?? '');
|
|
761
|
+
if (!text)
|
|
762
|
+
return { success: true, text: 'Typed 0 chars' };
|
|
763
|
+
// FAST PATH: paste via the clipboard (one Ctrl/Cmd+V — instant) instead
|
|
764
|
+
// of per-keystroke typing, which is visibly slow on anything longer than
|
|
765
|
+
// a few chars (~20ms/char). This is the legacy smart_type mechanism.
|
|
766
|
+
// Save + restore the prior clipboard so a pending copy isn't clobbered
|
|
767
|
+
// (e.g. a copy→paste→type flow). mod+v is portable across OSes.
|
|
768
|
+
// Char-by-char is kept as a fallback for fields that reject paste.
|
|
769
|
+
try {
|
|
770
|
+
const prior = await ctx.platform.readClipboard().catch(() => '');
|
|
771
|
+
await ctx.platform.writeClipboard(text);
|
|
772
|
+
await sleep(40);
|
|
773
|
+
await ctx.platform.keyPress('mod+v');
|
|
774
|
+
await sleep(150);
|
|
775
|
+
await ctx.platform.writeClipboard(prior).catch(() => { });
|
|
776
|
+
return { success: true, text: `Typed ${text.length} chars (paste): "${truncate(text, 60)}"` };
|
|
777
|
+
}
|
|
778
|
+
catch {
|
|
779
|
+
await ctx.platform.typeText(text);
|
|
780
|
+
await sleep(200);
|
|
781
|
+
return { success: true, text: `Typed ${text.length} chars: "${truncate(text, 60)}"` };
|
|
782
|
+
}
|
|
783
|
+
},
|
|
784
|
+
},
|
|
785
|
+
{
|
|
786
|
+
name: 'key',
|
|
787
|
+
description: 'Press a key or key combo. Use "mod" for Ctrl/Cmd. Use "+" for a chord (e.g. "mod+s", "shift+Tab"). Space-separate for a sequence ("Down Down End"). Examples: "Return", "Tab", "Escape", "F5", "ctrl+a".',
|
|
788
|
+
inputSchema: {
|
|
789
|
+
type: 'object',
|
|
790
|
+
properties: {
|
|
791
|
+
// `combo` is the canonical System B name. `key` is accepted as a
|
|
792
|
+
// backward-compatible alias (matches the MCP surface param name
|
|
793
|
+
// `key_press.key` and the compound surface alias).
|
|
794
|
+
combo: { type: 'string', description: 'Key/combo to press (e.g. "Return", "mod+s"). Space-separate for a sequence.' },
|
|
795
|
+
key: { type: 'string', description: 'Alias for combo — accepted for MCP/compound backward-compatibility.' },
|
|
796
|
+
expect: EXPECT_SCHEMA,
|
|
797
|
+
},
|
|
798
|
+
// Neither is required at the JSON-Schema level so the validator passes
|
|
799
|
+
// when only one is provided; the execute() guard catches a total absence.
|
|
800
|
+
additionalProperties: false,
|
|
801
|
+
},
|
|
802
|
+
changesScreen: true,
|
|
803
|
+
async execute(args, ctx) {
|
|
804
|
+
// (b) Accept `key` as a backward-compatible alias for `combo`.
|
|
805
|
+
const raw = (args.combo ?? args.key);
|
|
806
|
+
// (a) Guard: missing or empty argument → actionable error instead of crash.
|
|
807
|
+
if (raw === undefined || raw === null || String(raw).trim() === '') {
|
|
808
|
+
return {
|
|
809
|
+
success: false,
|
|
810
|
+
isError: true,
|
|
811
|
+
text: 'key: "combo" is required — the key or combo to press, e.g. "Return" or "mod+s". (The MCP surface alias is "key".)',
|
|
812
|
+
};
|
|
813
|
+
}
|
|
814
|
+
const input = String(raw).trim();
|
|
815
|
+
// Dangerous key combos that are blocked (mirrors System A BLOCKED_KEYS).
|
|
816
|
+
const BLOCKED = ['alt+f4', 'ctrl+alt+delete', 'ctrl+alt+del'];
|
|
817
|
+
// (b) "+" joins a chord; whitespace separates combos pressed in sequence.
|
|
818
|
+
const combos = input.split(/\s+/);
|
|
819
|
+
// (c) BLOCKED_KEYS guard — check every combo in the sequence.
|
|
820
|
+
for (const c of combos) {
|
|
821
|
+
const norm = c.toLowerCase().replace(/\s+/g, '');
|
|
822
|
+
if (BLOCKED.some(b => norm === b)) {
|
|
823
|
+
return { success: false, isError: true, text: `BLOCKED: "${c}" is a dangerous key combo.` };
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
for (const c of combos) {
|
|
827
|
+
await ctx.platform.keyPress(c);
|
|
828
|
+
if (combos.length > 1)
|
|
829
|
+
await sleep(50); // brief gap between sequence steps
|
|
830
|
+
}
|
|
831
|
+
await sleep(150);
|
|
832
|
+
return { success: true, text: `Pressed ${input}` };
|
|
833
|
+
},
|
|
834
|
+
},
|
|
835
|
+
// ─── APPS & WINDOWS ─────────────────────────────────────────
|
|
836
|
+
{
|
|
837
|
+
name: 'open_app',
|
|
838
|
+
description: 'Open an application by name (e.g. "Notepad", "TextEdit", "Safari").',
|
|
839
|
+
inputSchema: {
|
|
840
|
+
type: 'object',
|
|
841
|
+
properties: { name: { type: 'string' } },
|
|
842
|
+
required: ['name'],
|
|
843
|
+
additionalProperties: false,
|
|
844
|
+
},
|
|
845
|
+
changesScreen: true,
|
|
846
|
+
async execute(args, ctx) {
|
|
847
|
+
const name = String(args.name ?? '');
|
|
848
|
+
// Alias resolution lives at the agent-tool layer (PR1 of v0.9):
|
|
849
|
+
// the platform adapter is alias-data-agnostic, so we look up the
|
|
850
|
+
// canonical row here and forward the launch hints through
|
|
851
|
+
// `launchApp` opts. Cross-OS name mapping (Windows "Notepad" → mac
|
|
852
|
+
// "TextEdit") and UWP / executable / searchTerm details all flow
|
|
853
|
+
// through this single resolution point.
|
|
854
|
+
const alias = (0, aliases_1.resolveAlias)(name);
|
|
855
|
+
const platform = ctx.platform.platform;
|
|
856
|
+
// Pick the right name to hand to the platform launcher per OS.
|
|
857
|
+
// Falls back to the raw `name` when no alias matches.
|
|
858
|
+
let launchName = name;
|
|
859
|
+
if (alias) {
|
|
860
|
+
if (platform === 'darwin') {
|
|
861
|
+
launchName = alias.macOSAppName ?? name;
|
|
862
|
+
}
|
|
863
|
+
else if (platform === 'win32') {
|
|
864
|
+
launchName = alias.executable ?? name;
|
|
865
|
+
}
|
|
866
|
+
else {
|
|
867
|
+
// Linux: use the alias's executable but strip any `.exe`
|
|
868
|
+
// suffix that's there for the Windows path.
|
|
869
|
+
launchName = alias.executable?.replace(/\.exe$/i, '') ?? name;
|
|
870
|
+
}
|
|
871
|
+
}
|
|
872
|
+
const res = await ctx.platform.launchApp(launchName, {
|
|
873
|
+
alwaysNewInstance: alias?.alwaysNewInstance,
|
|
874
|
+
uwpAppId: alias?.uwpAppId,
|
|
875
|
+
// Pick the searchTerm that gives the OS native launcher (Start
|
|
876
|
+
// Menu / Spotlight) the best chance of resolving to the right
|
|
877
|
+
// app — alias.searchTerm wins when present, mac falls back to
|
|
878
|
+
// the bundle name.
|
|
879
|
+
searchTerm: alias?.searchTerm
|
|
880
|
+
?? (platform === 'darwin' ? alias?.macOSAppName : undefined),
|
|
881
|
+
});
|
|
882
|
+
await sleep(800);
|
|
883
|
+
return {
|
|
884
|
+
success: true,
|
|
885
|
+
text: res.title ? `Opened "${name}" (pid=${res.pid}, window="${res.title}")` : `Launched "${name}" (no window surfaced yet)`,
|
|
886
|
+
};
|
|
887
|
+
},
|
|
888
|
+
},
|
|
889
|
+
{
|
|
890
|
+
name: 'focus_window',
|
|
891
|
+
description: 'Bring a window to the foreground. Match by processName, pid, or title substring.',
|
|
892
|
+
inputSchema: {
|
|
893
|
+
type: 'object',
|
|
894
|
+
properties: {
|
|
895
|
+
processName: { type: 'string' },
|
|
896
|
+
processId: { type: 'number' },
|
|
897
|
+
title: { type: 'string' },
|
|
898
|
+
},
|
|
899
|
+
additionalProperties: false,
|
|
900
|
+
},
|
|
901
|
+
changesScreen: true,
|
|
902
|
+
async execute(args, ctx) {
|
|
903
|
+
const q = {};
|
|
904
|
+
if (typeof args.processName === 'string')
|
|
905
|
+
q.processName = args.processName;
|
|
906
|
+
if (typeof args.processId === 'number')
|
|
907
|
+
q.processId = args.processId;
|
|
908
|
+
if (typeof args.title === 'string')
|
|
909
|
+
q.title = args.title;
|
|
910
|
+
const ok = await ctx.platform.focusWindow(q);
|
|
911
|
+
await sleep(250);
|
|
912
|
+
return { success: ok, text: ok ? 'Focused matching window.' : 'No matching window found.' };
|
|
913
|
+
},
|
|
914
|
+
},
|
|
915
|
+
// ─── WINDOW STATE + BOUNDS (Tranche 1B primitives) ──────────
|
|
916
|
+
{
|
|
917
|
+
name: 'maximize_window',
|
|
918
|
+
description: 'Maximize the foreground window (or a matched window). Polite request; WM may interpret.',
|
|
919
|
+
inputSchema: {
|
|
920
|
+
type: 'object',
|
|
921
|
+
properties: {
|
|
922
|
+
processName: { type: 'string' },
|
|
923
|
+
processId: { type: 'number' },
|
|
924
|
+
title: { type: 'string' },
|
|
925
|
+
},
|
|
926
|
+
additionalProperties: false,
|
|
927
|
+
},
|
|
928
|
+
changesScreen: true,
|
|
929
|
+
async execute(args, ctx) {
|
|
930
|
+
const q = buildWinQuery(args);
|
|
931
|
+
const ok = await ctx.platform.setWindowState('maximize', q);
|
|
932
|
+
return { success: ok, text: ok ? 'Maximized window.' : 'Maximize request ignored.' };
|
|
933
|
+
},
|
|
934
|
+
},
|
|
935
|
+
{
|
|
936
|
+
name: 'minimize_window',
|
|
937
|
+
description: 'Minimize the foreground or matched window to the taskbar / Dock.',
|
|
938
|
+
inputSchema: {
|
|
939
|
+
type: 'object',
|
|
940
|
+
properties: {
|
|
941
|
+
processName: { type: 'string' },
|
|
942
|
+
processId: { type: 'number' },
|
|
943
|
+
title: { type: 'string' },
|
|
944
|
+
},
|
|
945
|
+
additionalProperties: false,
|
|
946
|
+
},
|
|
947
|
+
changesScreen: true,
|
|
948
|
+
async execute(args, ctx) {
|
|
949
|
+
const q = buildWinQuery(args);
|
|
950
|
+
const ok = await ctx.platform.setWindowState('minimize', q);
|
|
951
|
+
return { success: ok, text: ok ? 'Minimized window.' : 'Minimize request failed.' };
|
|
952
|
+
},
|
|
953
|
+
},
|
|
954
|
+
{
|
|
955
|
+
name: 'restore_window',
|
|
956
|
+
description: 'Restore a minimized or maximized window to its previous bounds.',
|
|
957
|
+
inputSchema: {
|
|
958
|
+
type: 'object',
|
|
959
|
+
properties: {
|
|
960
|
+
processName: { type: 'string' },
|
|
961
|
+
processId: { type: 'number' },
|
|
962
|
+
title: { type: 'string' },
|
|
963
|
+
},
|
|
964
|
+
additionalProperties: false,
|
|
965
|
+
},
|
|
966
|
+
changesScreen: true,
|
|
967
|
+
async execute(args, ctx) {
|
|
968
|
+
const q = buildWinQuery(args);
|
|
969
|
+
const ok = await ctx.platform.setWindowState('normal', q);
|
|
970
|
+
return { success: ok, text: ok ? 'Restored window.' : 'Restore request failed.' };
|
|
971
|
+
},
|
|
972
|
+
},
|
|
973
|
+
{
|
|
974
|
+
name: 'close_window',
|
|
975
|
+
description: 'Polite close request (WM_CLOSE / AXCloseAction / _NET_CLOSE_WINDOW). App may prompt.',
|
|
976
|
+
inputSchema: {
|
|
977
|
+
type: 'object',
|
|
978
|
+
properties: {
|
|
979
|
+
processName: { type: 'string' },
|
|
980
|
+
processId: { type: 'number' },
|
|
981
|
+
title: { type: 'string' },
|
|
982
|
+
},
|
|
983
|
+
additionalProperties: false,
|
|
984
|
+
},
|
|
985
|
+
changesScreen: true,
|
|
986
|
+
async execute(args, ctx) {
|
|
987
|
+
const q = buildWinQuery(args);
|
|
988
|
+
const ok = await ctx.platform.setWindowState('close', q);
|
|
989
|
+
return { success: ok, text: ok ? 'Close request posted.' : 'Close request failed.', targetLabel: 'close_window' };
|
|
990
|
+
},
|
|
991
|
+
},
|
|
992
|
+
{
|
|
993
|
+
name: 'resize_window',
|
|
994
|
+
description: 'Set the foreground (or matched) window bounds in logical pixels. Omitted fields preserved.',
|
|
995
|
+
inputSchema: {
|
|
996
|
+
type: 'object',
|
|
997
|
+
properties: {
|
|
998
|
+
x: { type: 'number' }, y: { type: 'number' },
|
|
999
|
+
width: { type: 'number' }, height: { type: 'number' },
|
|
1000
|
+
processName: { type: 'string' },
|
|
1001
|
+
processId: { type: 'number' },
|
|
1002
|
+
title: { type: 'string' },
|
|
1003
|
+
},
|
|
1004
|
+
additionalProperties: false,
|
|
1005
|
+
},
|
|
1006
|
+
changesScreen: true,
|
|
1007
|
+
async execute(args, ctx) {
|
|
1008
|
+
const q = buildWinQuery(args);
|
|
1009
|
+
const x = typeof args.x === 'number' ? args.x : undefined;
|
|
1010
|
+
const y = typeof args.y === 'number' ? args.y : undefined;
|
|
1011
|
+
const width = typeof args.width === 'number' ? args.width : undefined;
|
|
1012
|
+
const height = typeof args.height === 'number' ? args.height : undefined;
|
|
1013
|
+
const ok = await ctx.platform.setWindowBounds({ x, y, width, height }, q);
|
|
1014
|
+
return { success: ok, text: ok ? `Resized window (x=${x ?? '-'}, y=${y ?? '-'}, w=${width ?? '-'}, h=${height ?? '-'}).` : 'Resize failed.' };
|
|
1015
|
+
},
|
|
1016
|
+
},
|
|
1017
|
+
{
|
|
1018
|
+
name: 'list_displays',
|
|
1019
|
+
description: 'Enumerate connected displays with logical bounds + DPI ratio. Use before display-specific screenshots.',
|
|
1020
|
+
inputSchema: { type: 'object', properties: {}, additionalProperties: false },
|
|
1021
|
+
changesScreen: false,
|
|
1022
|
+
async execute(_args, ctx) {
|
|
1023
|
+
const displays = await ctx.platform.listDisplays();
|
|
1024
|
+
return { success: true, text: JSON.stringify(displays) };
|
|
1025
|
+
},
|
|
1026
|
+
},
|
|
1027
|
+
{
|
|
1028
|
+
name: 'switch_tab_os',
|
|
1029
|
+
description: 'Cycle next/previous browser tab (mod+Tab / mod+Shift+Tab) or jump to tab N (mod+1..9).',
|
|
1030
|
+
inputSchema: {
|
|
1031
|
+
type: 'object',
|
|
1032
|
+
properties: {
|
|
1033
|
+
index: { type: 'number', description: '1-9 for direct tab jump' },
|
|
1034
|
+
direction: { type: 'string', enum: ['next', 'previous'] },
|
|
1035
|
+
},
|
|
1036
|
+
additionalProperties: false,
|
|
1037
|
+
},
|
|
1038
|
+
changesScreen: true,
|
|
1039
|
+
async execute(args, ctx) {
|
|
1040
|
+
if (typeof args.index === 'number') {
|
|
1041
|
+
const n = Math.max(1, Math.min(9, Math.floor(args.index)));
|
|
1042
|
+
await ctx.platform.keyPress(`mod+${n}`);
|
|
1043
|
+
return { success: true, text: `Switched to tab ${n}` };
|
|
1044
|
+
}
|
|
1045
|
+
const dir = args.direction === 'previous' ? 'previous' : 'next';
|
|
1046
|
+
await ctx.platform.keyPress(dir === 'next' ? 'mod+Tab' : 'mod+shift+Tab');
|
|
1047
|
+
return { success: true, text: `Cycled to ${dir} tab` };
|
|
1048
|
+
},
|
|
1049
|
+
},
|
|
1050
|
+
// ─── ACCESSIBILITY DEPTH (Tranche 1B) ───────────────────────
|
|
1051
|
+
{
|
|
1052
|
+
name: 'focus_element',
|
|
1053
|
+
description: 'Keyboard-focus an element by a11y name. Does NOT raise window — use focus_window first if needed.',
|
|
1054
|
+
inputSchema: {
|
|
1055
|
+
type: 'object',
|
|
1056
|
+
properties: {
|
|
1057
|
+
name: { type: 'string' },
|
|
1058
|
+
controlType: { type: 'string' },
|
|
1059
|
+
processId: { type: 'number' },
|
|
1060
|
+
},
|
|
1061
|
+
required: ['name'],
|
|
1062
|
+
additionalProperties: false,
|
|
1063
|
+
},
|
|
1064
|
+
changesScreen: true,
|
|
1065
|
+
async execute(args, ctx) {
|
|
1066
|
+
const name = String(args.name ?? '');
|
|
1067
|
+
const result = await ctx.platform.invokeElement({
|
|
1068
|
+
name,
|
|
1069
|
+
controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
|
|
1070
|
+
processId: typeof args.processId === 'number' ? args.processId : undefined,
|
|
1071
|
+
action: 'focus',
|
|
1072
|
+
});
|
|
1073
|
+
return {
|
|
1074
|
+
success: result.success,
|
|
1075
|
+
text: result.success ? `Focused "${name}" via a11y.` : `Could not focus "${name}".`,
|
|
1076
|
+
targetLabel: name,
|
|
1077
|
+
};
|
|
1078
|
+
},
|
|
1079
|
+
},
|
|
1080
|
+
{
|
|
1081
|
+
name: 'wait_for_element',
|
|
1082
|
+
description: 'Poll the a11y tree until an element matching name/controlType appears. Useful after an action spawns a dialog.',
|
|
1083
|
+
inputSchema: {
|
|
1084
|
+
type: 'object',
|
|
1085
|
+
properties: {
|
|
1086
|
+
name: { type: 'string' },
|
|
1087
|
+
controlType: { type: 'string' },
|
|
1088
|
+
processId: { type: 'number' },
|
|
1089
|
+
timeoutMs: { type: 'number', description: 'Default 5000', maximum: 30000 },
|
|
1090
|
+
intervalMs: { type: 'number', description: 'Default 250' },
|
|
1091
|
+
},
|
|
1092
|
+
additionalProperties: false,
|
|
1093
|
+
},
|
|
1094
|
+
changesScreen: false,
|
|
1095
|
+
async execute(args, ctx) {
|
|
1096
|
+
const timeout = typeof args.timeoutMs === 'number' ? Math.min(30000, args.timeoutMs) : 5000;
|
|
1097
|
+
const element = await ctx.platform.waitForElement({
|
|
1098
|
+
name: typeof args.name === 'string' ? args.name : undefined,
|
|
1099
|
+
controlType: typeof args.controlType === 'string' ? args.controlType : undefined,
|
|
1100
|
+
processId: typeof args.processId === 'number' ? args.processId : undefined,
|
|
1101
|
+
intervalMs: typeof args.intervalMs === 'number' ? args.intervalMs : 250,
|
|
1102
|
+
}, timeout);
|
|
1103
|
+
if (!element)
|
|
1104
|
+
return { success: false, text: `wait_for_element: timed out after ${timeout}ms` };
|
|
1105
|
+
return { success: true, text: `Found element: ${element.name} [${element.controlType}] @${element.bounds.x},${element.bounds.y}` };
|
|
1106
|
+
},
|
|
1107
|
+
},
|
|
1108
|
+
// ─── SYSTEM OPEN HELPERS (Tranche 1B) ───────────────────────
|
|
1109
|
+
{
|
|
1110
|
+
name: 'open_file',
|
|
1111
|
+
description: 'Open a file or folder in the OS default app (explorer / open / xdg-open).',
|
|
1112
|
+
inputSchema: {
|
|
1113
|
+
type: 'object',
|
|
1114
|
+
properties: { path: { type: 'string' } },
|
|
1115
|
+
required: ['path'],
|
|
1116
|
+
additionalProperties: false,
|
|
1117
|
+
},
|
|
1118
|
+
changesScreen: true,
|
|
1119
|
+
async execute(args, ctx) {
|
|
1120
|
+
const p = String(args.path ?? '');
|
|
1121
|
+
try {
|
|
1122
|
+
if (ctx.platform.platform === 'darwin')
|
|
1123
|
+
await ctx.platform.launchApp('open', { url: p });
|
|
1124
|
+
else if (ctx.platform.platform === 'linux')
|
|
1125
|
+
await ctx.platform.launchApp('xdg-open', { url: p });
|
|
1126
|
+
else
|
|
1127
|
+
await ctx.platform.launchApp('explorer.exe', { url: p });
|
|
1128
|
+
await sleep(500);
|
|
1129
|
+
return { success: true, text: `Opened: ${p}` };
|
|
1130
|
+
}
|
|
1131
|
+
catch (err) {
|
|
1132
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
1133
|
+
return { success: false, text: `open_file failed: ${msg}` };
|
|
1134
|
+
}
|
|
1135
|
+
},
|
|
1136
|
+
},
|
|
1137
|
+
{
|
|
1138
|
+
name: 'open_url',
|
|
1139
|
+
description: 'Open a URL in the default browser. Use instead of navigate_browser when you don\'t care which browser.',
|
|
1140
|
+
inputSchema: {
|
|
1141
|
+
type: 'object',
|
|
1142
|
+
properties: { url: { type: 'string' } },
|
|
1143
|
+
required: ['url'],
|
|
1144
|
+
additionalProperties: false,
|
|
1145
|
+
},
|
|
1146
|
+
changesScreen: true,
|
|
1147
|
+
async execute(args, ctx) {
|
|
1148
|
+
const u = String(args.url ?? '');
|
|
1149
|
+
if (!/^https?:\/\//i.test(u))
|
|
1150
|
+
return { success: false, text: 'open_url: URL must start with http(s)://' };
|
|
1151
|
+
try {
|
|
1152
|
+
if (ctx.platform.platform === 'darwin') {
|
|
1153
|
+
await ctx.platform.launchApp('open', { url: u });
|
|
1154
|
+
}
|
|
1155
|
+
else if (ctx.platform.platform === 'linux') {
|
|
1156
|
+
await ctx.platform.launchApp('xdg-open', { url: u });
|
|
1157
|
+
}
|
|
1158
|
+
else {
|
|
1159
|
+
// Windows: launch the REGISTERED https handler directly (e.g.
|
|
1160
|
+
// msedge.exe), not `explorer.exe <url>`. explorer drops the URL in a
|
|
1161
|
+
// background tab and opens no explorer window, so launchApp's
|
|
1162
|
+
// window-find misses and falls back to a Start-menu search that
|
|
1163
|
+
// presses Win and types — spurious "searching" that derails the run.
|
|
1164
|
+
// The resolved browser exe HAS a findable window, so launchApp
|
|
1165
|
+
// foregrounds it cleanly with no fallback.
|
|
1166
|
+
const { resolveSchemeHandlerExecutable } = await Promise.resolve().then(() => __importStar(require('../../platform/uri-handler')));
|
|
1167
|
+
const exe = await resolveSchemeHandlerExecutable('https').catch(() => null);
|
|
1168
|
+
await ctx.platform.launchApp(exe ?? 'explorer.exe', { url: u });
|
|
1169
|
+
}
|
|
1170
|
+
await sleep(800);
|
|
1171
|
+
return { success: true, text: `Opened URL: ${u}` };
|
|
1172
|
+
}
|
|
1173
|
+
catch (err) {
|
|
1174
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
1175
|
+
return { success: false, text: `open_url failed: ${msg}` };
|
|
1176
|
+
}
|
|
1177
|
+
},
|
|
1178
|
+
},
|
|
1179
|
+
{
|
|
1180
|
+
// open_uri — the general OS protocol-handler escape route.
|
|
1181
|
+
//
|
|
1182
|
+
// Every OS ships a protocol-handler registry. Windows uses
|
|
1183
|
+
// HKCR\\<scheme>\\shell\\open\\command. macOS uses LaunchServices.
|
|
1184
|
+
// Linux uses xdg-mime + .desktop files. The user's installed apps
|
|
1185
|
+
// register themselves as handlers and the OS routes for us:
|
|
1186
|
+
// mailto: → default mail client (Outlook, Mail.app, Thunderbird, Spark...)
|
|
1187
|
+
// tel: → default phone app (Skype, FaceTime, dialer...)
|
|
1188
|
+
// sms: → default messaging app
|
|
1189
|
+
// webcal: → default calendar
|
|
1190
|
+
// slack: → Slack
|
|
1191
|
+
// vscode: → VS Code
|
|
1192
|
+
// obsidian: → Obsidian
|
|
1193
|
+
// spotify: → Spotify
|
|
1194
|
+
// zoommtg: → Zoom
|
|
1195
|
+
// discord: → Discord
|
|
1196
|
+
// file: → OS file-association dispatcher
|
|
1197
|
+
// http(s): → default browser
|
|
1198
|
+
//
|
|
1199
|
+
// This is THE app-agnostic escape route. ONE tool, every app that
|
|
1200
|
+
// registers a protocol handler. Zero vision, zero a11y, zero
|
|
1201
|
+
// app-specific code. The agent picks the scheme; we just dispatch.
|
|
1202
|
+
name: 'open_uri',
|
|
1203
|
+
description: 'Open ANY registered URI scheme via the OS protocol-handler registry. ONE tool replaces dozens of app-specific shortcuts. Examples: mailto:bob@example.com?subject=hi&body=hello (mail), tel:+15551234 (phone), slack://channel?team=T123&id=C456 (Slack), vscode://file/path (VS Code), webcal://server/cal.ics (calendar), spotify:track:ID (Spotify), https://example.com (browser). Must be properly URL-encoded — pair with build_uri when you have semantic fields.',
|
|
1204
|
+
inputSchema: {
|
|
1205
|
+
type: 'object',
|
|
1206
|
+
properties: {
|
|
1207
|
+
uri: { type: 'string', description: 'A full URI with scheme (e.g. "mailto:bob@example.com?subject=hi&body=hello").' },
|
|
1208
|
+
},
|
|
1209
|
+
required: ['uri'],
|
|
1210
|
+
additionalProperties: false,
|
|
1211
|
+
},
|
|
1212
|
+
changesScreen: true,
|
|
1213
|
+
async execute(args, ctx) {
|
|
1214
|
+
const u = String(args.uri ?? '').trim();
|
|
1215
|
+
if (!u)
|
|
1216
|
+
return { success: false, isError: true, text: 'open_uri: uri is required' };
|
|
1217
|
+
const schemeMatch = u.match(/^([a-z][a-z0-9+.-]*):/i);
|
|
1218
|
+
if (!schemeMatch) {
|
|
1219
|
+
return { success: false, isError: true, text: 'open_uri: argument must be a URI with a scheme (e.g. mailto:, tel:, https:, slack:)' };
|
|
1220
|
+
}
|
|
1221
|
+
const scheme = schemeMatch[1].toLowerCase();
|
|
1222
|
+
try {
|
|
1223
|
+
if (ctx.platform.platform === 'darwin') {
|
|
1224
|
+
await ctx.platform.launchApp('open', { url: u });
|
|
1225
|
+
await sleep(1500);
|
|
1226
|
+
return {
|
|
1227
|
+
success: true,
|
|
1228
|
+
text: `Dispatched ${scheme}: URI to the OS default handler. The configured app for ${scheme}: should now be focused. Verify with read_screen / list_windows. To complete (e.g. send a composed mail), use one more keystroke (cmd+enter on macOS).`,
|
|
1229
|
+
};
|
|
1230
|
+
}
|
|
1231
|
+
if (ctx.platform.platform === 'linux') {
|
|
1232
|
+
await ctx.platform.launchApp('xdg-open', { url: u });
|
|
1233
|
+
await sleep(1500);
|
|
1234
|
+
return {
|
|
1235
|
+
success: true,
|
|
1236
|
+
text: `Dispatched ${scheme}: URI to the OS default handler. The configured app for ${scheme}: should now be focused. Verify with read_screen / list_windows. To complete (e.g. send a composed mail), use one more keystroke (ctrl+enter on Linux).`,
|
|
1237
|
+
};
|
|
1238
|
+
}
|
|
1239
|
+
// Windows: shell-routed dispatch (explorer.exe mailto:, rundll32
|
|
1240
|
+
// url.dll, cmd /c start) silently fails for New Outlook and other
|
|
1241
|
+
// UWP-packaged handlers — the handler returns without opening a
|
|
1242
|
+
// new window. The reliable path is to resolve the registered
|
|
1243
|
+
// handler executable and invoke IT directly with the URI, then
|
|
1244
|
+
// VERIFY a new visible window appeared. Without verification
|
|
1245
|
+
// open_uri returned "success" while nothing actually happened on
|
|
1246
|
+
// screen, sending the agent into stagnation loops.
|
|
1247
|
+
const exe = await (0, uri_handler_1.resolveSchemeHandlerExecutable)(scheme);
|
|
1248
|
+
if (!exe) {
|
|
1249
|
+
return {
|
|
1250
|
+
success: false,
|
|
1251
|
+
isError: true,
|
|
1252
|
+
text: `open_uri: no registered Windows handler found for "${scheme}:". Try a different scheme or drive the app's UI directly.`,
|
|
1253
|
+
};
|
|
1254
|
+
}
|
|
1255
|
+
const launchResult = await (0, uri_handler_1.launchHandlerAndVerify)(exe, u, { waitMs: 5000 });
|
|
1256
|
+
if (!launchResult.success) {
|
|
1257
|
+
return {
|
|
1258
|
+
success: false,
|
|
1259
|
+
isError: true,
|
|
1260
|
+
text: `open_uri: failed to launch handler "${exe}" for ${scheme}: — ${launchResult.error ?? 'unknown error'}`,
|
|
1261
|
+
};
|
|
1262
|
+
}
|
|
1263
|
+
if (!launchResult.windowOpened) {
|
|
1264
|
+
return {
|
|
1265
|
+
success: false,
|
|
1266
|
+
isError: true,
|
|
1267
|
+
text: `open_uri: handler "${exe}" was launched with ${scheme}: but no new window appeared within 5s. The handler probably routed the URI into an existing instance silently. Drive the app's UI directly (focus_window + click + type_text) instead of relying on the protocol dispatch.`,
|
|
1268
|
+
};
|
|
1269
|
+
}
|
|
1270
|
+
return {
|
|
1271
|
+
success: true,
|
|
1272
|
+
text: `Opened ${scheme}: in the registered handler. New window appeared: "${launchResult.hwndLabel ?? '(handle unknown)'}". To complete (e.g. send a composed mail), use one more keystroke (ctrl+enter).`,
|
|
1273
|
+
};
|
|
1274
|
+
}
|
|
1275
|
+
catch (err) {
|
|
1276
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
1277
|
+
return { success: false, isError: true, text: `open_uri failed: ${msg}` };
|
|
1278
|
+
}
|
|
1279
|
+
},
|
|
1280
|
+
},
|
|
1281
|
+
{
|
|
1282
|
+
// build_uri — pure helper that converts semantic fields to an
|
|
1283
|
+
// encoded URI. No I/O. Pair with open_uri to dispatch.
|
|
1284
|
+
name: 'build_uri',
|
|
1285
|
+
description: 'Build a properly-encoded URI from a scheme + path + query JSON. Returns the URI text; pair with open_uri to dispatch. Examples: scheme="mailto" path="bob@example.com" query={"subject":"hi","body":"hello"} → "mailto:bob@example.com?subject=hi&body=hello".',
|
|
1286
|
+
inputSchema: {
|
|
1287
|
+
type: 'object',
|
|
1288
|
+
properties: {
|
|
1289
|
+
scheme: { type: 'string', description: 'URI scheme without the colon (mailto, tel, sms, slack, ...).' },
|
|
1290
|
+
path: { type: 'string', description: 'Scheme-specific path. Encoded for you; @ and , are preserved for mailto, + for tel.' },
|
|
1291
|
+
query: { type: 'string', description: 'JSON object of query params, e.g. {"subject":"hi"}. Each value URL-encoded.' },
|
|
1292
|
+
},
|
|
1293
|
+
required: ['scheme'],
|
|
1294
|
+
additionalProperties: false,
|
|
1295
|
+
},
|
|
1296
|
+
changesScreen: false,
|
|
1297
|
+
async execute(args) {
|
|
1298
|
+
const s = String(args.scheme ?? '').trim().toLowerCase();
|
|
1299
|
+
if (!s || !/^[a-z][a-z0-9+.-]*$/.test(s)) {
|
|
1300
|
+
return { success: false, isError: true, text: 'build_uri: scheme must match /^[a-z][a-z0-9+.-]*$/' };
|
|
1301
|
+
}
|
|
1302
|
+
const safe = (v) => encodeURIComponent(v).replace(/'/g, '%27').replace(/"/g, '%22');
|
|
1303
|
+
const encodedPath = args.path
|
|
1304
|
+
? safe(String(args.path))
|
|
1305
|
+
.replace(/%40/g, '@')
|
|
1306
|
+
.replace(/%2C/g, ',')
|
|
1307
|
+
.replace(/%2B/g, '+')
|
|
1308
|
+
.replace(/%2F/g, '/')
|
|
1309
|
+
: '';
|
|
1310
|
+
let queryStr = '';
|
|
1311
|
+
if (args.query) {
|
|
1312
|
+
let obj;
|
|
1313
|
+
try {
|
|
1314
|
+
obj = typeof args.query === 'string' ? JSON.parse(String(args.query)) : args.query;
|
|
1315
|
+
}
|
|
1316
|
+
catch {
|
|
1317
|
+
return { success: false, isError: true, text: 'build_uri: query must be valid JSON' };
|
|
1318
|
+
}
|
|
1319
|
+
const parts = [];
|
|
1320
|
+
for (const [k, v] of Object.entries(obj)) {
|
|
1321
|
+
if (v === undefined || v === null)
|
|
1322
|
+
continue;
|
|
1323
|
+
parts.push(`${safe(k)}=${safe(String(v))}`);
|
|
1324
|
+
}
|
|
1325
|
+
if (parts.length)
|
|
1326
|
+
queryStr = '?' + parts.join('&');
|
|
1327
|
+
}
|
|
1328
|
+
return { success: true, text: `${s}:${encodedPath}${queryStr}` };
|
|
1329
|
+
},
|
|
1330
|
+
},
|
|
1331
|
+
{
|
|
1332
|
+
name: 'get_system_time',
|
|
1333
|
+
description: 'Return current system time (ISO, epoch, timezone). Zero I/O.',
|
|
1334
|
+
inputSchema: { type: 'object', properties: {}, additionalProperties: false },
|
|
1335
|
+
changesScreen: false,
|
|
1336
|
+
async execute() {
|
|
1337
|
+
const now = new Date();
|
|
1338
|
+
return {
|
|
1339
|
+
success: true,
|
|
1340
|
+
text: JSON.stringify({
|
|
1341
|
+
iso: now.toISOString(),
|
|
1342
|
+
epochMs: now.getTime(),
|
|
1343
|
+
timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
|
|
1344
|
+
}),
|
|
1345
|
+
};
|
|
1346
|
+
},
|
|
1347
|
+
},
|
|
1348
|
+
// ─── MOUSE + KEYBOARD EXTENDED (Tranche 1B) ────────────────
|
|
1349
|
+
{
|
|
1350
|
+
name: 'mouse_move_relative',
|
|
1351
|
+
description: 'Move cursor by a relative offset (dx, dy). Wayland-safe via cursor cache.',
|
|
1352
|
+
inputSchema: {
|
|
1353
|
+
type: 'object',
|
|
1354
|
+
properties: { dx: { type: 'number' }, dy: { type: 'number' } },
|
|
1355
|
+
required: ['dx', 'dy'],
|
|
1356
|
+
additionalProperties: false,
|
|
1357
|
+
},
|
|
1358
|
+
changesScreen: false,
|
|
1359
|
+
async execute(args, ctx) {
|
|
1360
|
+
await ctx.platform.mouseMoveRelative(Number(args.dx ?? 0), Number(args.dy ?? 0));
|
|
1361
|
+
return { success: true, text: `Cursor moved by (${args.dx}, ${args.dy})` };
|
|
1362
|
+
},
|
|
1363
|
+
},
|
|
1364
|
+
{
|
|
1365
|
+
name: 'mouse_down',
|
|
1366
|
+
description: 'Press a mouse button without releasing. Pair with mouse_up. Enables hold-and-drag + modifier clicks.',
|
|
1367
|
+
inputSchema: {
|
|
1368
|
+
type: 'object',
|
|
1369
|
+
properties: { button: { type: 'string', enum: ['left', 'right', 'middle'] } },
|
|
1370
|
+
additionalProperties: false,
|
|
1371
|
+
},
|
|
1372
|
+
changesScreen: true,
|
|
1373
|
+
async execute(args, ctx) {
|
|
1374
|
+
const b = args.button ?? 'left';
|
|
1375
|
+
await ctx.platform.mouseDown(b);
|
|
1376
|
+
return { success: true, text: `Mouse ${b} down.` };
|
|
1377
|
+
},
|
|
1378
|
+
},
|
|
1379
|
+
{
|
|
1380
|
+
name: 'mouse_up',
|
|
1381
|
+
description: 'Release a mouse button previously pressed with mouse_down.',
|
|
1382
|
+
inputSchema: {
|
|
1383
|
+
type: 'object',
|
|
1384
|
+
properties: { button: { type: 'string', enum: ['left', 'right', 'middle'] } },
|
|
1385
|
+
additionalProperties: false,
|
|
1386
|
+
},
|
|
1387
|
+
changesScreen: true,
|
|
1388
|
+
async execute(args, ctx) {
|
|
1389
|
+
const b = args.button ?? 'left';
|
|
1390
|
+
await ctx.platform.mouseUp(b);
|
|
1391
|
+
return { success: true, text: `Mouse ${b} up.` };
|
|
1392
|
+
},
|
|
1393
|
+
},
|
|
1394
|
+
{
|
|
1395
|
+
name: 'key_down',
|
|
1396
|
+
description: 'Press a key without releasing. Pair with key_up. Use to hold modifiers (shift, ctrl) during clicks.',
|
|
1397
|
+
inputSchema: {
|
|
1398
|
+
type: 'object',
|
|
1399
|
+
properties: { key: { type: 'string' } },
|
|
1400
|
+
required: ['key'],
|
|
1401
|
+
additionalProperties: false,
|
|
1402
|
+
},
|
|
1403
|
+
changesScreen: false,
|
|
1404
|
+
async execute(args, ctx) {
|
|
1405
|
+
await ctx.platform.keyDown(String(args.key ?? ''));
|
|
1406
|
+
return { success: true, text: `Key down: ${args.key}` };
|
|
1407
|
+
},
|
|
1408
|
+
},
|
|
1409
|
+
{
|
|
1410
|
+
name: 'key_up',
|
|
1411
|
+
description: 'Release a key previously pressed with key_down.',
|
|
1412
|
+
inputSchema: {
|
|
1413
|
+
type: 'object',
|
|
1414
|
+
properties: { key: { type: 'string' } },
|
|
1415
|
+
required: ['key'],
|
|
1416
|
+
additionalProperties: false,
|
|
1417
|
+
},
|
|
1418
|
+
changesScreen: false,
|
|
1419
|
+
async execute(args, ctx) {
|
|
1420
|
+
await ctx.platform.keyUp(String(args.key ?? ''));
|
|
1421
|
+
return { success: true, text: `Key up: ${args.key}` };
|
|
1422
|
+
},
|
|
1423
|
+
},
|
|
1424
|
+
{
|
|
1425
|
+
name: 'undo_last',
|
|
1426
|
+
description: 'Send the OS Undo keystroke (mod+Z).',
|
|
1427
|
+
inputSchema: { type: 'object', properties: {}, additionalProperties: false },
|
|
1428
|
+
changesScreen: true,
|
|
1429
|
+
async execute(_args, ctx) {
|
|
1430
|
+
await ctx.platform.keyPress('mod+z');
|
|
1431
|
+
return { success: true, text: 'Sent undo.' };
|
|
1432
|
+
},
|
|
1433
|
+
},
|
|
1434
|
+
// ─── CLIPBOARD ─────────────────────────────────────────────
|
|
1435
|
+
{
|
|
1436
|
+
name: 'read_clipboard',
|
|
1437
|
+
description: 'Read the OS clipboard.',
|
|
1438
|
+
inputSchema: { type: 'object', properties: {}, additionalProperties: false },
|
|
1439
|
+
changesScreen: false,
|
|
1440
|
+
async execute(_args, ctx) {
|
|
1441
|
+
const text = await ctx.platform.readClipboard();
|
|
1442
|
+
return { success: true, text: `Clipboard (${text.length} chars):\n${(0, prompt_1.wrapUntrustedScreenContent)(truncate(text, 500))}` };
|
|
1443
|
+
},
|
|
1444
|
+
},
|
|
1445
|
+
{
|
|
1446
|
+
name: 'write_clipboard',
|
|
1447
|
+
description: 'Write text to the OS clipboard.',
|
|
1448
|
+
inputSchema: {
|
|
1449
|
+
type: 'object',
|
|
1450
|
+
properties: { text: { type: 'string' } },
|
|
1451
|
+
required: ['text'],
|
|
1452
|
+
additionalProperties: false,
|
|
1453
|
+
},
|
|
1454
|
+
changesScreen: false,
|
|
1455
|
+
async execute(args, ctx) {
|
|
1456
|
+
const text = String(args.text ?? '');
|
|
1457
|
+
await ctx.platform.writeClipboard(text);
|
|
1458
|
+
return { success: true, text: `Wrote ${text.length} chars to clipboard.` };
|
|
1459
|
+
},
|
|
1460
|
+
},
|
|
1461
|
+
// ─── FLOW CONTROL ───────────────────────────────────────────
|
|
1462
|
+
{
|
|
1463
|
+
name: 'wait',
|
|
1464
|
+
description: 'Pause for N milliseconds (max 5000). Use after actions that trigger animations or page loads.',
|
|
1465
|
+
inputSchema: {
|
|
1466
|
+
type: 'object',
|
|
1467
|
+
properties: { ms: { type: 'number', maximum: 5000 } },
|
|
1468
|
+
required: ['ms'],
|
|
1469
|
+
additionalProperties: false,
|
|
1470
|
+
},
|
|
1471
|
+
changesScreen: false,
|
|
1472
|
+
async execute(args) {
|
|
1473
|
+
const ms = Math.min(5000, Math.max(0, Number(args.ms ?? 0)));
|
|
1474
|
+
await sleep(ms);
|
|
1475
|
+
return { success: true, text: `Waited ${ms}ms.` };
|
|
1476
|
+
},
|
|
1477
|
+
},
|
|
1478
|
+
// ─── VISION (hybrid + vision modes only) ────────────────────
|
|
1479
|
+
{
|
|
1480
|
+
name: 'screenshot',
|
|
1481
|
+
description: 'LAST RESORT — expensive: sends image bytes into LLM context. Escalation order: read_screen (a11y tree, free) → read_text (OCR, cheap) → screenshot (this, expensive). Only call this when both a11y and OCR failed to provide what you need (canvas-only app, icon-only UI, pixel-level verification).',
|
|
1482
|
+
inputSchema: { type: 'object', properties: {}, additionalProperties: false },
|
|
1483
|
+
changesScreen: false,
|
|
1484
|
+
async execute(_args, ctx) {
|
|
1485
|
+
const shot = await ctx.platform.screenshot({ maxWidth: 1280 });
|
|
1486
|
+
ctx.screenshotsCaptured.n += 1;
|
|
1487
|
+
return {
|
|
1488
|
+
success: true,
|
|
1489
|
+
text: `Captured ${shot.width}×${shot.height}.`,
|
|
1490
|
+
screenshot: shot,
|
|
1491
|
+
};
|
|
1492
|
+
},
|
|
1493
|
+
},
|
|
1494
|
+
// ─── OCR PERCEPTION (webview / canvas, cheap — no vision model) ──────
|
|
1495
|
+
// When the a11y tree is empty (browser page, Electron, canvas, game), OCR
|
|
1496
|
+
// reads the visible TEXT so the TEXT model can keep driving — no screenshot
|
|
1497
|
+
// bytes, no escalation to the vision model. This is the cheap path: it keeps
|
|
1498
|
+
// haiku as the brain instead of handing the whole subtask to sonnet.
|
|
1499
|
+
{
|
|
1500
|
+
name: 'read_text',
|
|
1501
|
+
description: 'OCR the screen and return visible text + positions. Use when the a11y snapshot is empty/sparse (webview, canvas, PDF, game) to READ on-screen content. Cheaper than a screenshot (no image bytes). May take 1–3s.',
|
|
1502
|
+
inputSchema: {
|
|
1503
|
+
type: 'object',
|
|
1504
|
+
properties: {
|
|
1505
|
+
filter: { type: 'string', description: 'Optional: keep only lines containing this text (case-insensitive).' },
|
|
1506
|
+
},
|
|
1507
|
+
additionalProperties: false,
|
|
1508
|
+
},
|
|
1509
|
+
changesScreen: false,
|
|
1510
|
+
async execute(args, _ctx) {
|
|
1511
|
+
const ocr = getAgentOcr();
|
|
1512
|
+
if (!ocr.isAvailable())
|
|
1513
|
+
return { success: false, text: 'read_text: OCR not available on this platform — fall back to screenshot/vision.' };
|
|
1514
|
+
const result = await ocr.recognizeScreen();
|
|
1515
|
+
if (result.elements.length === 0)
|
|
1516
|
+
return { success: true, text: '(read_text: OCR found no text — screen may be blank, or OCR unavailable.)' };
|
|
1517
|
+
const lineMap = new Map();
|
|
1518
|
+
for (const el of result.elements) {
|
|
1519
|
+
const arr = lineMap.get(el.line) ?? [];
|
|
1520
|
+
arr.push(el);
|
|
1521
|
+
lineMap.set(el.line, arr);
|
|
1522
|
+
}
|
|
1523
|
+
const filter = typeof args.filter === 'string' ? args.filter.toLowerCase() : null;
|
|
1524
|
+
const lines = [];
|
|
1525
|
+
for (const [, toks] of [...lineMap.entries()].sort((a, b) => a[0] - b[0])) {
|
|
1526
|
+
const sorted = [...toks].sort((a, b) => a.x - b.x);
|
|
1527
|
+
const lineText = sorted.map(t => t.text).join(' ');
|
|
1528
|
+
if (filter && !lineText.toLowerCase().includes(filter))
|
|
1529
|
+
continue;
|
|
1530
|
+
const minX = Math.min(...sorted.map(t => t.x));
|
|
1531
|
+
const minY = Math.min(...sorted.map(t => t.y));
|
|
1532
|
+
lines.push(`@${minX},${minY} "${lineText}"`);
|
|
1533
|
+
}
|
|
1534
|
+
if (lines.length === 0)
|
|
1535
|
+
return { success: true, text: `(read_text: no lines match "${filter}")` };
|
|
1536
|
+
return { success: true, text: `OCR (${result.elements.length} words, ${result.durationMs}ms):\n${(0, prompt_1.wrapUntrustedScreenContent)(lines.join('\n'))}` };
|
|
1537
|
+
},
|
|
1538
|
+
},
|
|
1539
|
+
{
|
|
1540
|
+
name: 'compile_ui',
|
|
1541
|
+
description: 'Compile the current screen into one fused UI map (a11y + OCR + lazy vision) of elements with stable ids, roles, confidence and sources. Returns a ranked element list with a snapshot id; act on a specific element via invoke_element/set_field_value with {element_id, snapshot_id}. a11y-first; pulls OCR only when a11y is sparse or target_text is missing; pass max_cost:\'cheap\' to forbid OCR, or \'vision_ok\' to allow screenshots.',
|
|
1542
|
+
inputSchema: {
|
|
1543
|
+
type: 'object',
|
|
1544
|
+
properties: {
|
|
1545
|
+
purpose: { type: 'string', enum: ['general', 'find_text', 'act'], description: 'What the compile is for' },
|
|
1546
|
+
target_text: { type: 'string', description: 'If set and absent from a11y, pull OCR to find it' },
|
|
1547
|
+
max_cost: { type: 'string', enum: ['cheap', 'ocr_ok', 'vision_ok'], description: 'Hard ceiling on perception cost (default ocr_ok)' },
|
|
1548
|
+
},
|
|
1549
|
+
additionalProperties: false,
|
|
1550
|
+
},
|
|
1551
|
+
changesScreen: false,
|
|
1552
|
+
async execute(args, ctx) {
|
|
1553
|
+
const holder = ctx.uiMaps;
|
|
1554
|
+
if (!holder)
|
|
1555
|
+
return { success: false, text: 'compile_ui: no UIMap holder on this context.' };
|
|
1556
|
+
const now = Date.now();
|
|
1557
|
+
const id = holder.nextId();
|
|
1558
|
+
const hints = {
|
|
1559
|
+
purpose: typeof args.purpose === 'string' ? args.purpose : undefined,
|
|
1560
|
+
target_text: typeof args.target_text === 'string' ? args.target_text : undefined,
|
|
1561
|
+
max_cost: typeof args.max_cost === 'string' ? args.max_cost : undefined,
|
|
1562
|
+
};
|
|
1563
|
+
const map = await (0, ui_map_1.compileUIMap)((0, ui_map_1.defaultCompileDeps)(ctx.platform, now, id), hints);
|
|
1564
|
+
holder.put(map, now, hints.max_cost ?? 'ocr_ok');
|
|
1565
|
+
return { success: true, text: (0, prompt_1.wrapUntrustedScreenContent)((0, ui_map_render_1.renderUIMap)(map)) };
|
|
1566
|
+
},
|
|
1567
|
+
},
|
|
1568
|
+
{
|
|
1569
|
+
name: 'find_action_button',
|
|
1570
|
+
description: 'Semantically locate the best clickable element for an intent (e.g. "submit", "cancel", "search") over the compiled UI. Returns JSON {status:"ok"|"ambiguous"|"none", snapshot_id, best?, candidates}. On "ok", act with invoke_element({element_id: best.element_id, snapshot_id}). Deterministic synonym + text + confidence match.',
|
|
1571
|
+
inputSchema: { type: 'object', properties: {
|
|
1572
|
+
intent: { type: 'string', description: 'What you want to do (submit/cancel/search/login/...)' },
|
|
1573
|
+
max_cost: { type: 'string', enum: ['cheap', 'ocr_ok', 'vision_ok'], description: 'Perception cost ceiling (default ocr_ok)' },
|
|
1574
|
+
}, required: ['intent'], additionalProperties: false },
|
|
1575
|
+
changesScreen: false,
|
|
1576
|
+
async execute(args, ctx) {
|
|
1577
|
+
const map = await finderMap(ctx, args.max_cost);
|
|
1578
|
+
if (!map)
|
|
1579
|
+
return { success: false, text: 'find_action_button: no UIMap holder on this context.' };
|
|
1580
|
+
const r = (0, ui_map_find_1.findActionButton)(map.elements, map.snapshot_id, String(args.intent ?? ''));
|
|
1581
|
+
return { success: r.status === 'ok', text: JSON.stringify(r) };
|
|
1582
|
+
},
|
|
1583
|
+
},
|
|
1584
|
+
{
|
|
1585
|
+
name: 'find_input_field',
|
|
1586
|
+
description: 'Semantically locate the best editable field for a purpose (e.g. "recipient", "subject", "body", "search") over the compiled UI, including label-less fields via their adjacent label. Returns JSON {status, snapshot_id, best?, candidates}. On "ok", fill with set_field_value({element_id: best.element_id, snapshot_id, value}). Deterministic.',
|
|
1587
|
+
inputSchema: { type: 'object', properties: {
|
|
1588
|
+
purpose: { type: 'string', description: 'What the field is for (recipient/subject/body/search/...)' },
|
|
1589
|
+
max_cost: { type: 'string', enum: ['cheap', 'ocr_ok', 'vision_ok'], description: 'Perception cost ceiling (default ocr_ok)' },
|
|
1590
|
+
}, required: ['purpose'], additionalProperties: false },
|
|
1591
|
+
changesScreen: false,
|
|
1592
|
+
async execute(args, ctx) {
|
|
1593
|
+
const map = await finderMap(ctx, args.max_cost);
|
|
1594
|
+
if (!map)
|
|
1595
|
+
return { success: false, text: 'find_input_field: no UIMap holder on this context.' };
|
|
1596
|
+
const r = (0, ui_map_find_1.findInputField)(map.elements, map.snapshot_id, String(args.purpose ?? ''));
|
|
1597
|
+
return { success: r.status === 'ok', text: JSON.stringify(r) };
|
|
1598
|
+
},
|
|
1599
|
+
},
|
|
1600
|
+
{
|
|
1601
|
+
name: 'smart_click',
|
|
1602
|
+
description: 'OCR-locate visible text on screen and click its center. Use when the a11y tree is empty and invoke_element fails (webview/canvas). Pass the exact visible text (e.g. "Search", a video title, "Sign in").',
|
|
1603
|
+
inputSchema: {
|
|
1604
|
+
type: 'object',
|
|
1605
|
+
properties: {
|
|
1606
|
+
target: { type: 'string', description: 'The visible text to click.' },
|
|
1607
|
+
button: { type: 'string', enum: ['left', 'right'] },
|
|
1608
|
+
},
|
|
1609
|
+
required: ['target'],
|
|
1610
|
+
additionalProperties: false,
|
|
1611
|
+
},
|
|
1612
|
+
changesScreen: true,
|
|
1613
|
+
async execute(args, ctx) {
|
|
1614
|
+
const target = String(args.target ?? '').trim();
|
|
1615
|
+
if (!target)
|
|
1616
|
+
return { success: false, isError: true, text: 'smart_click: target required.' };
|
|
1617
|
+
const button = args.button === 'right' ? 'right' : 'left';
|
|
1618
|
+
const ocr = getAgentOcr();
|
|
1619
|
+
if (!ocr.isAvailable())
|
|
1620
|
+
return { success: false, text: 'smart_click: OCR not available — escalate to vision.' };
|
|
1621
|
+
const result = await ocr.recognizeScreen();
|
|
1622
|
+
if (result.elements.length === 0)
|
|
1623
|
+
return { success: false, text: 'smart_click: OCR found no text — escalate to vision.' };
|
|
1624
|
+
const hit = locateByOcr(target, result.elements);
|
|
1625
|
+
if (!hit)
|
|
1626
|
+
return { success: false, text: `smart_click: no match for "${target}". Call read_text to see visible text, then retry with exact text.` };
|
|
1627
|
+
// OCR coords are screen-space — pass straight to mouseClick, same as the
|
|
1628
|
+
// `click` tool does with a11y coords (no imageScale; that's image-space only).
|
|
1629
|
+
const fg0 = await ctx.platform.getActiveWindow().catch(() => null);
|
|
1630
|
+
const raised = await (0, focus_guard_1.ensureTargetForeground)(ctx, fg0);
|
|
1631
|
+
const before = await ctx.platform.getActiveWindow().catch(() => null);
|
|
1632
|
+
const activation = await ctx.platform.mouseClick(hit.x, hit.y, { button, count: 1 });
|
|
1633
|
+
await sleep(150);
|
|
1634
|
+
getAgentOcr().invalidateCache();
|
|
1635
|
+
const after = await ctx.platform.getActiveWindow().catch(() => null);
|
|
1636
|
+
const focusWarn = focusTheftWarning(activation, before, after);
|
|
1637
|
+
return { success: true, text: `smart_click: clicked "${hit.label}" (score ${hit.score.toFixed(2)}) at (${hit.x},${hit.y})${raised}${focusWarn}`, targetLabel: hit.label };
|
|
1638
|
+
},
|
|
1639
|
+
},
|
|
1640
|
+
// ─── BROWSER (CDP / DOM — reliable web automation, no pixels) ────────
|
|
1641
|
+
// For web pages, driving the DOM by selector/text is far more reliable
|
|
1642
|
+
// than OCR + coordinate clicks: no occlusion, no focus-stealing, no
|
|
1643
|
+
// image scaling. These tools operate a DEDICATED, agent-owned browser
|
|
1644
|
+
// instance (separate profile + debug port) so they never disturb the
|
|
1645
|
+
// user's own windows. They DEGRADE GRACEFULLY: if CDP isn't wired or a
|
|
1646
|
+
// browser can't be launched, they say so and the agent falls back to
|
|
1647
|
+
// read_text / smart_click. Haiku stays the brain — it reads DOM text and
|
|
1648
|
+
// decides; no vision model needed.
|
|
1649
|
+
{
|
|
1650
|
+
name: 'browser_connect',
|
|
1651
|
+
description: 'Open/attach a dedicated browser the agent controls via the DOM (reliable for web pages — no pixels). Call this FIRST for any website task, then use browser_navigate/read/click/type. If it fails, fall back to read_text/smart_click.',
|
|
1652
|
+
inputSchema: { type: 'object', properties: {}, additionalProperties: false },
|
|
1653
|
+
changesScreen: true,
|
|
1654
|
+
async execute(_args, ctx) {
|
|
1655
|
+
if (!ctx.cdp)
|
|
1656
|
+
return { success: false, text: 'browser_connect: CDP not available in this build — use read_text/smart_click for the page instead.' };
|
|
1657
|
+
// CLAWD_AGENT_CDP_OFF=1 → attach-only (never launch a new instance).
|
|
1658
|
+
const allowLaunch = !/^(1|true)$/i.test(process.env.CLAWD_AGENT_CDP_OFF ?? '');
|
|
1659
|
+
const ok = await ctx.cdp.ensureConnected({ launch: allowLaunch, exePaths: [...(0, browser_config_1.getEdgePaths)(), ...(0, browser_config_1.getChromePaths)()] }).catch(() => false);
|
|
1660
|
+
if (!ok)
|
|
1661
|
+
return { success: false, text: `browser_connect: could not ${allowLaunch ? 'launch or attach to' : 'attach to'} a CDP browser — fall back to read_text/smart_click.` };
|
|
1662
|
+
const url = await ctx.cdp.getUrl().catch(() => null);
|
|
1663
|
+
const title = await ctx.cdp.getTitle().catch(() => null);
|
|
1664
|
+
// Disclose provenance honestly: 'attached' means we connected to a
|
|
1665
|
+
// browser already on the user debug port — likely the USER'S own
|
|
1666
|
+
// session. Navigation is mechanically redirected into the agent's own
|
|
1667
|
+
// tab by the driver (root-cause fix 2026-06-11), so their tabs are
|
|
1668
|
+
// never navigated away; reads still see their current page.
|
|
1669
|
+
const mode = ctx.cdp.getConnectionMode?.() ?? 'unknown';
|
|
1670
|
+
const provenance = mode === 'attached'
|
|
1671
|
+
? ' ⚠ ATTACHED to an EXISTING browser (likely the user\'s own session). browser_navigate automatically works in the agent\'s OWN tab — the user\'s tabs are never navigated away; reads before navigating still see their current page. Do not close their tabs/windows.'
|
|
1672
|
+
: mode === 'dedicated'
|
|
1673
|
+
? ' (dedicated agent-owned instance — safe to drive freely). NOTE: this browser has its OWN profile — login state may DIFFER from the window you were driving. If a site demands login here but the on-screen window looked logged in, drive the on-screen window instead (keyboard/OCR) or use relaunch_with_cdp.'
|
|
1674
|
+
: '';
|
|
1675
|
+
return { success: true, text: `browser_connect: connected to "${title ?? '(blank)'}" at ${url ?? 'about:blank'}.${provenance} Use browser_navigate to open a URL, browser_read to see the page, browser_click/browser_type to interact.` };
|
|
1676
|
+
},
|
|
1677
|
+
},
|
|
1678
|
+
{
|
|
1679
|
+
name: 'browser_navigate',
|
|
1680
|
+
description: 'Navigate the agent-owned browser to a URL (waits for load). Requires browser_connect first.',
|
|
1681
|
+
inputSchema: {
|
|
1682
|
+
type: 'object',
|
|
1683
|
+
properties: { url: { type: 'string', description: 'The URL to open (e.g. https://www.youtube.com).' } },
|
|
1684
|
+
required: ['url'],
|
|
1685
|
+
additionalProperties: false,
|
|
1686
|
+
},
|
|
1687
|
+
changesScreen: true,
|
|
1688
|
+
async execute(args, ctx) {
|
|
1689
|
+
if (!ctx.cdp || !(await ctx.cdp.isConnected()))
|
|
1690
|
+
return { success: false, text: 'browser_navigate: not connected — call browser_connect first.' };
|
|
1691
|
+
const url = String(args.url ?? '').trim();
|
|
1692
|
+
if (!url)
|
|
1693
|
+
return { success: false, isError: true, text: 'browser_navigate: url required.' };
|
|
1694
|
+
const r = await ctx.cdp.navigate(url);
|
|
1695
|
+
return r.success ? { success: true, text: `browser_navigate: loaded ${r.value ?? url}` } : { success: false, text: `browser_navigate failed: ${r.error}` };
|
|
1696
|
+
},
|
|
1697
|
+
},
|
|
1698
|
+
{
|
|
1699
|
+
name: 'browser_read',
|
|
1700
|
+
description: 'Read the current page as structured DOM: interactive elements (links/buttons/inputs with selectors), or text for a CSS selector. Use instead of read_text on web pages. Requires browser_connect first.',
|
|
1701
|
+
inputSchema: {
|
|
1702
|
+
type: 'object',
|
|
1703
|
+
properties: {
|
|
1704
|
+
selector: { type: 'string', description: 'Optional CSS selector to read text from (default: structured interactive-element list for the whole page).' },
|
|
1705
|
+
},
|
|
1706
|
+
additionalProperties: false,
|
|
1707
|
+
},
|
|
1708
|
+
changesScreen: false,
|
|
1709
|
+
async execute(args, ctx) {
|
|
1710
|
+
if (!ctx.cdp || !(await ctx.cdp.isConnected()))
|
|
1711
|
+
return { success: false, text: 'browser_read: not connected — call browser_connect first.' };
|
|
1712
|
+
const selector = typeof args.selector === 'string' ? args.selector.trim() : '';
|
|
1713
|
+
const text = selector ? await ctx.cdp.readText(selector, 3000) : await ctx.cdp.getPageContext();
|
|
1714
|
+
// Page content is the highest-risk injection surface — always delimited.
|
|
1715
|
+
return { success: true, text: (0, prompt_1.wrapUntrustedScreenContent)(text) };
|
|
1716
|
+
},
|
|
1717
|
+
},
|
|
1718
|
+
{
|
|
1719
|
+
name: 'browser_click',
|
|
1720
|
+
description: 'Click a page element by visible text or CSS selector (DOM click — no coordinates). Requires browser_connect first.',
|
|
1721
|
+
inputSchema: {
|
|
1722
|
+
type: 'object',
|
|
1723
|
+
properties: {
|
|
1724
|
+
text: { type: 'string', description: 'Visible text of the element to click (preferred).' },
|
|
1725
|
+
selector: { type: 'string', description: 'CSS selector (alternative to text).' },
|
|
1726
|
+
},
|
|
1727
|
+
additionalProperties: false,
|
|
1728
|
+
},
|
|
1729
|
+
changesScreen: true,
|
|
1730
|
+
async execute(args, ctx) {
|
|
1731
|
+
if (!ctx.cdp || !(await ctx.cdp.isConnected()))
|
|
1732
|
+
return { success: false, text: 'browser_click: not connected — call browser_connect first.' };
|
|
1733
|
+
const text = typeof args.text === 'string' ? args.text.trim() : '';
|
|
1734
|
+
const selector = typeof args.selector === 'string' ? args.selector.trim() : '';
|
|
1735
|
+
if (!text && !selector)
|
|
1736
|
+
return { success: false, isError: true, text: 'browser_click: provide text or selector.' };
|
|
1737
|
+
const r = text ? await ctx.cdp.clickByText(text) : await ctx.cdp.click(selector);
|
|
1738
|
+
return r.success ? { success: true, text: `browser_click: clicked ${text ? `"${text}"` : selector} (${r.method})` } : { success: false, text: `browser_click failed: ${r.error}. Call browser_read to see the actual elements, then retry.` };
|
|
1739
|
+
},
|
|
1740
|
+
},
|
|
1741
|
+
{
|
|
1742
|
+
name: 'browser_type',
|
|
1743
|
+
description: 'Type text into a page input by CSS selector or associated label (DOM input — no coordinates). Requires browser_connect first.',
|
|
1744
|
+
inputSchema: {
|
|
1745
|
+
type: 'object',
|
|
1746
|
+
properties: {
|
|
1747
|
+
text: { type: 'string', description: 'Text to type.' },
|
|
1748
|
+
selector: { type: 'string', description: 'CSS selector for the input.' },
|
|
1749
|
+
label: { type: 'string', description: 'Label text associated with the input (alternative to selector).' },
|
|
1750
|
+
},
|
|
1751
|
+
required: ['text'],
|
|
1752
|
+
additionalProperties: false,
|
|
1753
|
+
},
|
|
1754
|
+
changesScreen: true,
|
|
1755
|
+
async execute(args, ctx) {
|
|
1756
|
+
if (!ctx.cdp || !(await ctx.cdp.isConnected()))
|
|
1757
|
+
return { success: false, text: 'browser_type: not connected — call browser_connect first.' };
|
|
1758
|
+
const text = String(args.text ?? '');
|
|
1759
|
+
const selector = typeof args.selector === 'string' ? args.selector.trim() : '';
|
|
1760
|
+
const label = typeof args.label === 'string' ? args.label.trim() : '';
|
|
1761
|
+
if (!selector && !label)
|
|
1762
|
+
return { success: false, isError: true, text: 'browser_type: provide selector or label.' };
|
|
1763
|
+
const r = label ? await ctx.cdp.typeByLabel(label, text) : await ctx.cdp.typeInField(selector, text);
|
|
1764
|
+
return r.success ? { success: true, text: `browser_type: typed into ${selector || `label "${label}"`}` } : { success: false, text: `browser_type failed: ${r.error}` };
|
|
1765
|
+
},
|
|
1766
|
+
},
|
|
1767
|
+
// ─── BATCHED PLANNING ───────────────────────────────────────
|
|
1768
|
+
// Run several known next actions in one turn (saves LLM round-trips).
|
|
1769
|
+
(0, batch_tool_1.buildBatchTool)(),
|
|
1770
|
+
// ─── TERMINAL ACTIONS ──────────────────────────────────────
|
|
1771
|
+
{
|
|
1772
|
+
name: 'done',
|
|
1773
|
+
description: 'Declare the task complete. Provide SPECIFIC screen evidence — a window title, a value visible in the document, a status bar message. Do NOT use hedging words ("should", "might", "probably", "I think", "I believe") — that means you are guessing. If the task CHANGED anything you MUST pass `assertions` (same types as the verify tool, plus `file_changed_since_start` for a file you wrote) that prove the RESULT — and the proof must reflect your change, not state that was already there (an ambient clock, an already-open window). The harness re-checks them against the live screen and rejects done if any fail or none is discriminating. If you can\'t see concrete evidence, take a screenshot or read_screen first.',
|
|
1774
|
+
inputSchema: {
|
|
1775
|
+
type: 'object',
|
|
1776
|
+
properties: {
|
|
1777
|
+
evidence: { type: 'string' },
|
|
1778
|
+
assertions: {
|
|
1779
|
+
type: 'array',
|
|
1780
|
+
description: 'Optional machine-checkable proofs (verify-tool types). The harness executes them; done is rejected if any fail.',
|
|
1781
|
+
items: { type: 'object' },
|
|
1782
|
+
},
|
|
1783
|
+
},
|
|
1784
|
+
required: ['evidence'],
|
|
1785
|
+
additionalProperties: false,
|
|
1786
|
+
},
|
|
1787
|
+
changesScreen: false,
|
|
1788
|
+
terminal: true,
|
|
1789
|
+
async execute(args, ctx) {
|
|
1790
|
+
const evidence = String(args.evidence ?? '').trim();
|
|
1791
|
+
// Guard 1: evidence must be present and non-trivial. An empty string
|
|
1792
|
+
// or "ok" / "done" gives the verifier nothing to work with.
|
|
1793
|
+
if (evidence.length < 8) {
|
|
1794
|
+
return {
|
|
1795
|
+
success: false,
|
|
1796
|
+
text: 'done rejected: evidence is empty or too short. Look at the screen and report a SPECIFIC concrete observation (window title, on-screen text, focused element) before declaring done.',
|
|
1797
|
+
isError: true,
|
|
1798
|
+
};
|
|
1799
|
+
}
|
|
1800
|
+
// Guard 2: hedging-language detection. Phrases like "should have
|
|
1801
|
+
// been sent", "might be open", "I think it worked" are speculative
|
|
1802
|
+
// — they signal the agent guessed instead of verifying. Force a
|
|
1803
|
+
// re-check by rejecting the call. The agent's next turn will see
|
|
1804
|
+
// this rejection and either take a screenshot/read_screen or
|
|
1805
|
+
// rephrase with concrete observations.
|
|
1806
|
+
//
|
|
1807
|
+
// Pattern is intentionally narrow: words must appear as standalone
|
|
1808
|
+
// tokens (or first-letter-of-token), not as part of larger words
|
|
1809
|
+
// like "shoulder" or "mighty". Word-boundary anchored.
|
|
1810
|
+
if (HEDGING_PATTERN.test(evidence)) {
|
|
1811
|
+
return {
|
|
1812
|
+
success: false,
|
|
1813
|
+
text: `done rejected: evidence contains hedging language ("should", "might", "probably", "I think", "I believe", "appears to", "seems to", "if successful"…). That means you are GUESSING, not observing. Take a screenshot or call read_screen, then describe what you actually see — concrete strings, not predictions.`,
|
|
1814
|
+
isError: true,
|
|
1815
|
+
};
|
|
1816
|
+
}
|
|
1817
|
+
// Guard 3 (the strong one): harness-executed assertions. The model's
|
|
1818
|
+
// prose is a CLAIM; these checks are PROOF — run against live ground
|
|
1819
|
+
// truth (UIA values, window list, clipboard, fs, OCR). A model that
|
|
1820
|
+
// hallucinates a result (live Outlook run 2026-06-06: "verified" a
|
|
1821
|
+
// recipient that was never committed) gets caught HERE, at done-time,
|
|
1822
|
+
// instead of the task silently failing after the run ends.
|
|
1823
|
+
const mutated = ctx.mutatedScreen === true;
|
|
1824
|
+
// NB (P1): hard-requiring `assertions` for EVERY mutating task (the
|
|
1825
|
+
// strictest anti-false-success gate) is intentionally NOT enforced here.
|
|
1826
|
+
// It would force every screen-changing task to carry a discriminating
|
|
1827
|
+
// proof — but real apps are frequently already open (the only cheap
|
|
1828
|
+
// proofs, window_title/app_running, are then non-discriminating), so it
|
|
1829
|
+
// both over-constrains agents and can't be satisfied against a static
|
|
1830
|
+
// app. Left as STRONG guidance in the `done` description; flagged for
|
|
1831
|
+
// Fable review as the stricter option (needs the run-agent suite to
|
|
1832
|
+
// model post-action state). The discriminating gate below + the
|
|
1833
|
+
// file_changed_since_start proof are the deployable 80%.
|
|
1834
|
+
if (args.assertions !== undefined) {
|
|
1835
|
+
const parsed = (0, assertions_1.parseAssertions)(args.assertions);
|
|
1836
|
+
if ('error' in parsed) {
|
|
1837
|
+
return { success: false, text: `done rejected: ${parsed.error}`, isError: true };
|
|
1838
|
+
}
|
|
1839
|
+
const report = await (0, assertions_1.checkAssertions)(parsed.assertions, {
|
|
1840
|
+
adapter: ctx.platform,
|
|
1841
|
+
ocrText: async () => (await getAgentOcr().recognizeScreen()).fullText ?? '',
|
|
1842
|
+
taskStartedAt: ctx.taskStartedAt,
|
|
1843
|
+
});
|
|
1844
|
+
if (!report.ok) {
|
|
1845
|
+
return {
|
|
1846
|
+
success: false,
|
|
1847
|
+
isError: true,
|
|
1848
|
+
text: `done rejected: ${report.failed} of ${report.outcomes.length} assertion(s) FAILED — the live screen does not back your claim:\n${(0, assertions_1.renderReport)(report)}\nFix the failing condition (the detail shows the actual state), or give_up with the reason.`,
|
|
1849
|
+
};
|
|
1850
|
+
}
|
|
1851
|
+
// Guard 3b (P1): for a mutating task, at least one PASSING proof must
|
|
1852
|
+
// be discriminating — not already true before the task acted.
|
|
1853
|
+
// Otherwise the "proof" demonstrates nothing changed because of you
|
|
1854
|
+
// (asserting an ambient clock / a window that was already open).
|
|
1855
|
+
if (mutated && ctx.taskBaseline && !(0, assertions_1.hasDiscriminatingEvidence)(parsed.assertions, report, ctx.taskBaseline)) {
|
|
1856
|
+
return {
|
|
1857
|
+
success: false,
|
|
1858
|
+
isError: true,
|
|
1859
|
+
text: `done rejected: every proof you gave was ALREADY true before you acted — none of them shows your change:\n${(0, assertions_1.renderReport)(report)}\nAssert the NEW state your action produced (file_changed_since_start for a file you wrote, element_value_contains for text you typed, a window title that wasn't open before), or give_up.`,
|
|
1860
|
+
};
|
|
1861
|
+
}
|
|
1862
|
+
return {
|
|
1863
|
+
success: true,
|
|
1864
|
+
text: `done: ${evidence}\nVERIFIED:\n${(0, assertions_1.renderReport)(report)}`,
|
|
1865
|
+
stop: true,
|
|
1866
|
+
terminalExit: 'done',
|
|
1867
|
+
};
|
|
1868
|
+
}
|
|
1869
|
+
return { success: true, text: `done: ${evidence}`, stop: true, terminalExit: 'done' };
|
|
1870
|
+
},
|
|
1871
|
+
},
|
|
1872
|
+
{
|
|
1873
|
+
name: 'give_up',
|
|
1874
|
+
description: 'Abandon the task when it\'s impossible from here (credentials missing, captcha, destructive action needs user confirm, stuck after retries).',
|
|
1875
|
+
inputSchema: {
|
|
1876
|
+
type: 'object',
|
|
1877
|
+
properties: { reason: { type: 'string' } },
|
|
1878
|
+
required: ['reason'],
|
|
1879
|
+
additionalProperties: false,
|
|
1880
|
+
},
|
|
1881
|
+
changesScreen: false,
|
|
1882
|
+
terminal: true,
|
|
1883
|
+
async execute(args) {
|
|
1884
|
+
const reason = String(args.reason ?? 'unknown');
|
|
1885
|
+
return { success: false, text: `give_up: ${reason}`, stop: true, terminalExit: 'give_up' };
|
|
1886
|
+
},
|
|
1887
|
+
},
|
|
1888
|
+
{
|
|
1889
|
+
name: 'cannot_read',
|
|
1890
|
+
description: 'Escalate from blind mode to vision — the a11y snapshot doesn\'t contain what you need. Only available in blind mode.',
|
|
1891
|
+
inputSchema: {
|
|
1892
|
+
type: 'object',
|
|
1893
|
+
properties: { reason: { type: 'string' } },
|
|
1894
|
+
required: ['reason'],
|
|
1895
|
+
additionalProperties: false,
|
|
1896
|
+
},
|
|
1897
|
+
changesScreen: false,
|
|
1898
|
+
terminal: true,
|
|
1899
|
+
async execute(args) {
|
|
1900
|
+
const reason = String(args.reason ?? 'a11y snapshot insufficient');
|
|
1901
|
+
return { success: false, text: `cannot_read: ${reason}`, stop: true, terminalExit: 'cannot_read' };
|
|
1902
|
+
},
|
|
1903
|
+
},
|
|
1904
|
+
];
|
|
1905
|
+
// A/B toggle: CLAWD_AGENT_NO_BATCH=1 removes the batch tool so the SAME task
|
|
1906
|
+
// can be run per-call (one tool per turn) vs batched, for measurement.
|
|
1907
|
+
if (/^(1|true)$/i.test(process.env.CLAWD_AGENT_NO_BATCH ?? '')) {
|
|
1908
|
+
const bi = tools.findIndex(t => t.name === 'batch');
|
|
1909
|
+
if (bi >= 0)
|
|
1910
|
+
tools.splice(bi, 1);
|
|
1911
|
+
}
|
|
1912
|
+
// Full flat catalog. `screenshot` is available so the agent can call it
|
|
1913
|
+
// when a11y is insufficient. `cannot_read` is excluded — the model runs
|
|
1914
|
+
// in hybrid mode with direct screenshot access; there is no blind→vision
|
|
1915
|
+
// escalation path to trigger.
|
|
1916
|
+
return tools.filter(t => t.name !== 'cannot_read');
|
|
1917
|
+
}
|
|
1918
|
+
/**
 * Pick the process id a tool call should be scoped to.
 *
 * An explicit numeric `args.processId` wins. Otherwise the pid of the
 * currently active window is used, because an unscoped UIA / AX search walks
 * the entire system tree and either takes 10-20 seconds or hangs outright;
 * the focused app is almost always what the agent wants.
 *
 * Used by the agent-internal tools that call `findElements` or
 * `invokeElement` with an optional `processId` arg.
 *
 * @param args tool arguments; `processId` is honoured only when it is a number.
 * @param ctx  tool context exposing `platform.getActiveWindow()`.
 * @returns the resolved pid, or `undefined` when there is no active window or
 *          the lookup throws.
 */
async function resolveAgentPid(args, ctx) {
    const { processId: requestedPid } = args;
    if (typeof requestedPid === 'number') {
        return requestedPid;
    }
    try {
        // A missing window (null/undefined) yields an undefined pid.
        const { processId } = (await ctx.platform.getActiveWindow()) ?? {};
        return processId;
    }
    catch {
        // Best effort: a failed lookup just means "no scoping available".
        return undefined;
    }
}
|
|
1938
|
+
/**
 * Build a window query from tool arguments.
 *
 * Copies `processName` (string), `processId` (number) and `title` (string)
 * when present with the expected type; anything else is ignored.
 *
 * @param args tool arguments.
 * @returns the query object, or `undefined` when no usable field was given.
 */
function buildWinQuery(args) {
    // Field name → required typeof, in the order the keys are emitted.
    const accepted = [
        ['processName', 'string'],
        ['processId', 'number'],
        ['title', 'string'],
    ];
    const query = {};
    for (const [field, expectedType] of accepted) {
        const value = args[field];
        if (typeof value === expectedType) {
            query[field] = value;
        }
    }
    return Object.keys(query).length ? query : undefined;
}
|
|
1948
|
+
/**
 * Shared `expect` argument schema for consequential tools.
 *
 * The agent loop (and the batch executor) verify these post-conditions
 * reactively after the action; a failure surfaces as a DEVIATION (Layer C).
 * The schema is exposed on every tool the model uses for
 * send/save/submit-class actions, including the OCR/coordinate fallbacks
 * (click/smart_click/open_uri/browser_click) where verification matters most
 * (audit 2026-06-10, finding C2/M3).
 */
const EXPECT_SCHEMA = (() => {
    // Single list of assertion types: it feeds both the enum and the
    // human-readable description so the two cannot drift apart.
    const assertionTypes = [
        'window_title_contains',
        'app_running',
        'element_exists',
        'element_value_contains',
        'clipboard_contains',
        'file_exists',
        'file_contains',
        'ocr_contains',
        'file_changed_since_start',
    ];
    return {
        type: 'array',
        description: `Optional post-conditions to verify after this action (same assertion types as the verify tool: ${assertionTypes.join(', ')}). If any FAIL the action returns a DEVIATION and you must adapt. State an OUTCOME you can observe (a window title, a rendered element/chip, a status) — NOT the raw text you typed.`,
        items: {
            type: 'object',
            properties: {
                type: { type: 'string', enum: assertionTypes },
            },
            required: ['type'],
        },
    };
})();
|
|
1965
|
+
/**
 * Shared `space` argument schema for the granular pointer tools
 * (click/drag/scroll). The description is assembled from one sentence per
 * entry so each rule can be read (and diffed) on its own line.
 */
const COORD_SPACE_SCHEMA = {
    type: 'string',
    enum: ['screen', 'image'],
    description: [
        'Coordinate space of the x/y you pass.',
        '"screen" = accessibility/COMPILED-UI coords (@x,y), already correct for the real screen.',
        '"image" = coords you read off the SCREENSHOT (downscaled to 1280px wide); the tool scales them up to the real screen.',
        'When omitted, the DEFAULT FOLLOWS CONTEXT: "image" while a screenshot is in your context, "screen" otherwise.',
        'So pass space:"screen" explicitly when clicking an @x,y map coord on a screenshot turn, and space:"image" when you read coords off the picture.',
    ].join(' '),
};
|
|
1971
|
+
/**
 * One-line coordinate breadcrumb for tool-result text. It shows the input
 * space, the scaled screen coords and the scale factor, so a wrong-window
 * click is diagnosable from logs alone (no screenshot needed).
 *
 * @param ix    x as passed by the caller.
 * @param iy    y as passed by the caller.
 * @param sx    x after scaling to the screen.
 * @param sy    y after scaling to the screen.
 * @param space name of the input coordinate space.
 * @param scale scale factor; the "→ screen" part is omitted when it is 1.
 * @param ctx   context providing `screen.physicalWidth` / `screen.physicalHeight`.
 * @returns the breadcrumb string.
 */
function coordBreadcrumb(ix, iy, sx, sy, space, scale, ctx) {
    const { physicalWidth, physicalHeight } = ctx.screen;
    const pieces = [`${space} (${ix},${iy})`];
    if (scale !== 1) {
        pieces.push(` → screen (${sx},${sy})`);
    }
    pieces.push(` [×${scale}, screen ${physicalWidth}×${physicalHeight}]`);
    return pieces.join('');
}
|
|
1978
|
+
/**
 * Foreground-window before→after breadcrumb, so focus theft (a click landing
 * on the wrong window) is visible in the result text.
 *
 * @param before window info captured before the action (may be nullish).
 * @param after  window info captured after the action (may be nullish).
 * @returns '' when both titles are equal (a missing title counts as '?'),
 *          otherwise a suffix showing both truncated titles.
 */
function focusBreadcrumb(before, after) {
    const [fromTitle, toTitle] = [before, after].map((win) => win?.title ?? '?');
    if (fromTitle === toTitle) {
        return '';
    }
    return ` · focus "${truncateTitle(fromTitle)}"→"${truncateTitle(toTitle)}"`;
}
|
|
1987
|
+
/**
 * Shorten a window title for one-line breadcrumbs.
 *
 * Titles of up to 32 UTF-16 code units are returned unchanged. Longer titles
 * are cut to at most 31 code units and suffixed with '…'. The cut never
 * leaves a dangling high surrogate: if the 31st unit is the first half of a
 * surrogate pair (e.g. an emoji in a browser tab title) it is dropped too, so
 * the result is always well-formed text.
 *
 * @param s the title string.
 * @returns the original or truncated title.
 */
function truncateTitle(s) {
    const MAX_LENGTH = 32;
    if (s.length <= MAX_LENGTH) {
        return s;
    }
    let cut = MAX_LENGTH - 1;
    const lastKept = s.charCodeAt(cut - 1);
    // 0xD800–0xDBFF is the high-surrogate range; keeping one as the final
    // unit would emit half a character.
    if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
        cut -= 1;
    }
    return `${s.slice(0, cut)}…`;
}
|
|
1990
|
+
/**
 * Build a warning suffix for a coordinate click that could not be confirmed
 * to land on the intended window — the cause of a keystroke leak where an OTP
 * typed after a missed click went into the wrong window (session 2026-06-11).
 *
 * Two signals trigger the warning:
 *   (a) the platform reported that activation FAILED (Windows foreground-lock
 *       kept a different window in front), or
 *   (b) the foreground window CHANGED across the click (both titles known and
 *       different), which for a click aimed at the already-focused window
 *       means the click hit something else.
 *
 * @param activation result of the click; only `activated === false` counts
 *                   as a failure.
 * @param before     foreground window info before the click (may be nullish).
 * @param after      foreground window info after the click (may be nullish).
 * @returns a loud, actionable suffix telling the agent to verify focus before
 *          typing, or '' when the click looks clean.
 */
function focusTheftWarning(activation, before, after) {
    const activationFailed = activation?.activated === false;
    const titleBefore = before?.title;
    const titleAfter = after?.title;
    const foregroundChanged = Boolean(titleBefore && titleAfter && titleBefore !== titleAfter);
    if (!(activationFailed || foregroundChanged)) {
        return '';
    }
    const landed = titleAfter ? `"${truncateTitle(titleAfter)}"` : 'an unknown window';
    return [
        ` ⚠ FOCUS NOT CONFIRMED — the click may have landed on ${landed} instead of your target`,
        ` (Windows foreground-lock or coords over a different window). DO NOT type next:`,
        ` re-focus the intended window first (focus_window / window.focus by processId),`,
        ` or act on an a11y/el_NN target instead of coordinates.`,
    ].join('');
}
|
|
2013
|
+
/**
 * Locate a target string among OCR elements and return the click point (center
 * of the best-matching contiguous span) in SCREEN pixels — element coordinates
 * are passed through unchanged. Ported from the scoring in src/tools/smart.ts:
 * exact > substring-ratio > token-overlap, with a penalty for a single token
 * matching a multi-word target (stops "begin" in body text beating the
 * "Begin Exam" button).
 *
 * Text is compared after normalization: lower-cased, every character that is
 * not a letter, combining mark, digit, underscore or whitespace (in any
 * script) replaced by a space, whitespace collapsed.
 *
 * @param {string} target   Text to find.
 * @param {Iterable<{text: string, line: *, x: number, y: number, width: number, height: number}>} elements
 *        OCR tokens; tokens sharing the same `line` value are grouped and
 *        ordered left-to-right by `x`.
 * @returns {{x: number, y: number, label: string, score: number} | null}
 *        Center of the best span's bounding box plus its raw text and score,
 *        or null when nothing scores ≥0.4 or the target has no matchable
 *        characters.
 */
function locateByOcr(target, elements) {
    // Unicode-aware on purpose: an ASCII-only \w would reduce non-Latin text to
    // the empty string, making unrelated tokens compare as an "exact" match.
    const norm = (s) => String(s)
        .toLowerCase()
        .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
        .replace(/\s+/g, ' ')
        .trim();
    const targetNorm = norm(target ?? '');
    // A target with nothing matchable (empty, punctuation only) would otherwise
    // equal every punctuation-only OCR token and score 1.0.
    if (!targetNorm) {
        return null;
    }
    const targetWords = targetNorm.split(' ').filter(Boolean);
    // Multiplicity of each target word, so a word repeated in a candidate
    // phrase cannot be counted more often than the target contains it.
    const targetWordCounts = new Map();
    for (const word of targetWords) {
        targetWordCounts.set(word, (targetWordCounts.get(word) ?? 0) + 1);
    }
    // Group tokens by OCR line; spans never cross lines.
    const lineMap = new Map();
    for (const el of elements ?? []) {
        if (!el?.text) {
            continue;
        }
        const bucket = lineMap.get(el.line);
        if (bucket) {
            bucket.push(el);
        }
        else {
            lineMap.set(el.line, [el]);
        }
    }
    let best = null;
    let bestScore = 0;
    // Candidate spans may be slightly longer than the target, capped at 8 tokens.
    const MAX_N = Math.min(8, targetWords.length + 2);
    for (const toks of lineMap.values()) {
        const sorted = [...toks].sort((a, b) => a.x - b.x);
        for (let i = 0; i < sorted.length; i++) {
            for (let n = 1; n <= MAX_N && i + n <= sorted.length; n++) {
                const span = sorted.slice(i, i + n);
                // Reject spans with a horizontal gap wider than 1.5× the
                // previous token's height (minimum 30): those tokens are
                // treated as not belonging to the same phrase.
                let contiguous = true;
                for (let k = 1; k < span.length; k++) {
                    const gap = span[k].x - (span[k - 1].x + span[k - 1].width);
                    if (gap > Math.max(span[k - 1].height * 1.5, 30)) {
                        contiguous = false;
                        break;
                    }
                }
                if (!contiguous) {
                    continue;
                }
                const label = span.map((t) => t.text).join(' ');
                const phrase = norm(label);
                let score = 0;
                if (phrase === targetNorm) {
                    score = 1.0;
                }
                else if (phrase.includes(targetNorm) || targetNorm.includes(phrase)) {
                    // Substring match, scaled by how much of the longer string is covered.
                    score = Math.min(phrase.length, targetNorm.length) / Math.max(phrase.length, targetNorm.length) * 0.9;
                }
                else {
                    // Token overlap, counted as a multiset intersection.
                    const remaining = new Map(targetWordCounts);
                    let overlap = 0;
                    for (const word of phrase.split(' ')) {
                        const left = remaining.get(word) ?? 0;
                        if (left > 0) {
                            remaining.set(word, left - 1);
                            overlap++;
                        }
                    }
                    const cov = overlap / Math.max(targetWords.length, 1);
                    if (cov >= 1) {
                        score = 0.85;
                    }
                    else if (cov >= 0.5) {
                        score = 0.5 * cov;
                    }
                }
                // A lone token standing in for a multi-word target is penalized
                // unless it is a (near-)exact match.
                if (targetWords.length > 1 && n === 1 && score < 0.95) {
                    score *= 0.55;
                }
                if (score > bestScore) {
                    bestScore = score;
                    const minX = Math.min(...span.map((t) => t.x));
                    const minY = Math.min(...span.map((t) => t.y));
                    const maxX = Math.max(...span.map((t) => t.x + t.width));
                    const maxY = Math.max(...span.map((t) => t.y + t.height));
                    best = {
                        x: Math.round((minX + maxX) / 2),
                        y: Math.round((minY + maxY) / 2),
                        label,
                        score,
                    };
                }
            }
        }
    }
    return best && bestScore >= 0.4 ? best : null;
}
|
|
2087
|
+
/** Return a promise that fulfills (with no value) once a `ms`-millisecond timer fires. */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
|
|
2090
|
+
/**
 * Shorten `s` to at most `max` UTF-16 code units, replacing the cut-off tail
 * with a single "…" (which counts toward the limit).
 *
 * @param {string} s    Text to shorten.
 * @param {number} max  Maximum length of the result.
 * @returns {string} `s` unchanged when it already fits (or when `max` is not a
 *          comparable number); otherwise the first `max - 1` units plus "…".
 *          A `max` below 1 leaves no room even for the ellipsis and yields ''.
 */
function truncate(s, max) {
    if (!(s.length > max)) {
        return s;
    }
    // Without this guard, slice(0, max - 1) receives a negative end index and
    // counts back from the end of the string, returning nearly all of `s`.
    if (max < 1) {
        return '';
    }
    return s.slice(0, max - 1) + '…';
}
|
|
2093
|
+
/**
 * Coerce an LLM-supplied coordinate argument into a clean `{ x, y }` pair.
 * Models occasionally smush both axes into one field (e.g. `x="390, 79"`,
 * `x="(390, 79)"`, or `x="390 79"`). A plain `Number(...)` on that yields NaN,
 * which then becomes a click at (NaN, y) — a crash disguised as a no-op. This
 * helper splits the smushed form when present and falls back to a clean
 * per-axis parse otherwise.
 *
 * App-agnostic, OS-agnostic, model-agnostic. Used by every coordinate-taking
 * tool (click, drag, scroll, hover, move).
 *
 * @param {*} rawX  Number, numeric string, or a string holding both axes.
 * @param {*} rawY  Number or numeric string; ignored when `rawX` held a pair.
 * @returns {{x: number, y: number, warning?: string}} Parsed axes. An axis
 *          that cannot be parsed — including an empty or whitespace-only
 *          string — is NaN. `warning` is present only when `rawX` was split
 *          into both axes.
 */
function coerceCoord(rawX, rawY) {
    const parseOne = (v) => {
        if (typeof v === 'number') {
            return v;
        }
        if (typeof v !== 'string') {
            return NaN;
        }
        // Strip parens, brackets and all whitespace.
        const cleaned = v.replace(/[()[\]\s]/g, '');
        // Number('') is 0: without this check a blank argument would silently
        // turn into coordinate 0 instead of being reported as unparseable.
        if (cleaned === '') {
            return NaN;
        }
        const n = Number(cleaned);
        return Number.isFinite(n) ? n : NaN;
    };
    // Case A: x is a string containing a comma or pair-like "390, 79" / "390 79" / "(390,79)".
    if (typeof rawX === 'string' && /[\s,]/.test(rawX)) {
        const parts = rawX.replace(/[()[\]]/g, '').split(/[,\s]+/).filter(Boolean);
        if (parts.length >= 2) {
            const x = Number(parts[0]);
            const y = Number(parts[1]);
            if (Number.isFinite(x) && Number.isFinite(y)) {
                return {
                    x, y,
                    warning: `coord parser: x came in as "${rawX}" — split into x=${x},y=${y}. Pass x and y as SEPARATE numeric args next time.`,
                };
            }
        }
    }
    // Case B: each axis supplied separately.
    const x = parseOne(rawX);
    const y = parseOne(rawY);
    return { x, y };
}
|
|
2134
|
+
//# sourceMappingURL=tools.js.map
|