screenhand 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +193 -109
- package/bin/darwin-arm64/macos-bridge +0 -0
- package/dist/mcp-desktop.js +5876 -0
- package/dist/scripts/codex-monitor-daemon.js +335 -0
- package/dist/scripts/export-help-center.js +112 -0
- package/dist/scripts/marketing-loop.js +117 -0
- package/dist/scripts/observer-daemon.js +288 -0
- package/dist/scripts/orchestrator-daemon.js +399 -0
- package/dist/scripts/supervisor-daemon.js +272 -0
- package/dist/scripts/threads-campaign.js +208 -0
- package/dist/scripts/worker-daemon.js +228 -0
- package/dist/src/agent/cli.js +82 -0
- package/dist/src/agent/loop.js +274 -0
- package/dist/src/community/fetcher.js +109 -0
- package/dist/src/community/index.js +6 -0
- package/dist/src/community/publisher.js +191 -0
- package/dist/src/community/remote-api.js +121 -0
- package/dist/src/community/types.js +3 -0
- package/dist/src/community/validator.js +95 -0
- package/{src/config.ts → dist/src/config.js} +5 -10
- package/dist/src/context-tracker.js +489 -0
- package/{src/index.ts → dist/src/index.js} +32 -52
- package/dist/src/ingestion/coverage-auditor.js +233 -0
- package/dist/src/ingestion/doc-parser.js +164 -0
- package/dist/src/ingestion/index.js +8 -0
- package/dist/src/ingestion/menu-scanner.js +152 -0
- package/dist/src/ingestion/reference-merger.js +186 -0
- package/dist/src/ingestion/shortcut-extractor.js +180 -0
- package/dist/src/ingestion/tutorial-extractor.js +170 -0
- package/dist/src/ingestion/types.js +3 -0
- package/dist/src/jobs/manager.js +305 -0
- package/dist/src/jobs/runner.js +806 -0
- package/dist/src/jobs/store.js +102 -0
- package/dist/src/jobs/types.js +30 -0
- package/dist/src/jobs/worker.js +97 -0
- package/dist/src/learning/engine.js +356 -0
- package/dist/src/learning/index.js +9 -0
- package/dist/src/learning/locator-policy.js +120 -0
- package/dist/src/learning/pattern-policy.js +89 -0
- package/dist/src/learning/recovery-policy.js +116 -0
- package/dist/src/learning/sensor-policy.js +115 -0
- package/dist/src/learning/timing-model.js +204 -0
- package/dist/src/learning/topology-policy.js +90 -0
- package/dist/src/learning/types.js +9 -0
- package/dist/src/logging/timeline-logger.js +48 -0
- package/dist/src/mcp/mcp-stdio-server.js +464 -0
- package/dist/src/mcp/server.js +363 -0
- package/dist/src/mcp-entry.js +60 -0
- package/dist/src/memory/playbook-seeds.js +200 -0
- package/dist/src/memory/recall.js +222 -0
- package/dist/src/memory/research.js +104 -0
- package/dist/src/memory/seeds.js +101 -0
- package/dist/src/memory/service.js +446 -0
- package/dist/src/memory/session.js +169 -0
- package/dist/src/memory/store.js +451 -0
- package/{src/runtime/locator-cache.ts → dist/src/memory/types.js} +1 -17
- package/dist/src/monitor/codex-monitor.js +382 -0
- package/dist/src/monitor/task-queue.js +97 -0
- package/dist/src/monitor/types.js +62 -0
- package/dist/src/native/bridge-client.js +412 -0
- package/{src/native/macos-bridge-client.ts → dist/src/native/macos-bridge-client.js} +0 -1
- package/dist/src/observer/state.js +199 -0
- package/dist/src/observer/types.js +43 -0
- package/dist/src/orchestrator/state.js +68 -0
- package/dist/src/orchestrator/types.js +22 -0
- package/dist/src/perception/ax-source.js +162 -0
- package/dist/src/perception/cdp-source.js +162 -0
- package/dist/src/perception/coordinator.js +771 -0
- package/dist/src/perception/frame-differ.js +287 -0
- package/dist/src/perception/index.js +22 -0
- package/dist/src/perception/manager.js +199 -0
- package/dist/src/perception/types.js +47 -0
- package/dist/src/perception/vision-source.js +399 -0
- package/dist/src/planner/deterministic.js +298 -0
- package/dist/src/planner/executor.js +870 -0
- package/dist/src/planner/goal-store.js +92 -0
- package/dist/src/planner/index.js +21 -0
- package/dist/src/planner/planner.js +520 -0
- package/dist/src/planner/tool-registry.js +71 -0
- package/dist/src/planner/types.js +22 -0
- package/dist/src/platform/explorer.js +213 -0
- package/dist/src/platform/help-center-markdown.js +527 -0
- package/dist/src/platform/learner.js +257 -0
- package/dist/src/playbook/engine.js +486 -0
- package/dist/src/playbook/index.js +20 -0
- package/dist/src/playbook/mcp-recorder.js +204 -0
- package/dist/src/playbook/recorder.js +536 -0
- package/dist/src/playbook/runner.js +408 -0
- package/dist/src/playbook/store.js +312 -0
- package/dist/src/playbook/types.js +17 -0
- package/dist/src/recovery/detectors.js +156 -0
- package/dist/src/recovery/engine.js +327 -0
- package/dist/src/recovery/index.js +20 -0
- package/dist/src/recovery/strategies.js +274 -0
- package/dist/src/recovery/types.js +20 -0
- package/dist/src/runtime/accessibility-adapter.js +430 -0
- package/dist/src/runtime/app-adapter.js +64 -0
- package/dist/src/runtime/applescript-adapter.js +305 -0
- package/dist/src/runtime/ax-role-map.js +96 -0
- package/dist/src/runtime/browser-adapter.js +52 -0
- package/dist/src/runtime/cdp-chrome-adapter.js +521 -0
- package/dist/src/runtime/composite-adapter.js +221 -0
- package/dist/src/runtime/execution-contract.js +159 -0
- package/dist/src/runtime/executor.js +286 -0
- package/dist/src/runtime/locator-cache.js +50 -0
- package/dist/src/runtime/planning-loop.js +63 -0
- package/dist/src/runtime/service.js +432 -0
- package/dist/src/runtime/session-manager.js +63 -0
- package/dist/src/runtime/state-observer.js +121 -0
- package/dist/src/runtime/vision-adapter.js +225 -0
- package/dist/src/state/app-map-types.js +72 -0
- package/dist/src/state/app-map.js +1974 -0
- package/dist/src/state/entity-tracker.js +108 -0
- package/dist/src/state/fusion.js +96 -0
- package/dist/src/state/index.js +21 -0
- package/dist/src/state/ladder-generator.js +236 -0
- package/dist/src/state/persistence.js +156 -0
- package/dist/src/state/types.js +17 -0
- package/dist/src/state/world-model.js +1456 -0
- package/dist/src/supervisor/locks.js +186 -0
- package/dist/src/supervisor/supervisor.js +403 -0
- package/dist/src/supervisor/types.js +30 -0
- package/dist/src/test-mcp-protocol.js +154 -0
- package/dist/src/types.js +17 -0
- package/dist/src/util/atomic-write.js +133 -0
- package/dist/src/util/sanitize.js +146 -0
- package/dist-app-maps/com.figma.Desktop.json +959 -0
- package/dist-app-maps/com.hnc.Discord.json +1146 -0
- package/dist-app-maps/notion.id.json +2831 -0
- package/dist-playbooks/canva-screenhand-carousel.json +445 -0
- package/dist-playbooks/codex-desktop.json +76 -0
- package/dist-playbooks/competitor-research-stack.json +122 -0
- package/dist-playbooks/davinci-color-grade.json +153 -0
- package/dist-playbooks/davinci-edit-timeline.json +162 -0
- package/dist-playbooks/davinci-render.json +114 -0
- package/dist-playbooks/devto.json +52 -0
- package/dist-playbooks/discord.json +41 -0
- package/dist-playbooks/google-flow-create-project.json +59 -0
- package/dist-playbooks/google-flow-edit-image.json +90 -0
- package/dist-playbooks/google-flow-edit-video.json +90 -0
- package/dist-playbooks/google-flow-generate-image.json +68 -0
- package/dist-playbooks/google-flow-generate-video.json +191 -0
- package/dist-playbooks/google-flow-open-project.json +48 -0
- package/dist-playbooks/google-flow-open-scenebuilder.json +64 -0
- package/dist-playbooks/google-flow-search-assets.json +64 -0
- package/dist-playbooks/instagram.json +57 -0
- package/dist-playbooks/linkedin.json +52 -0
- package/dist-playbooks/n8n.json +43 -0
- package/dist-playbooks/reddit.json +52 -0
- package/dist-playbooks/threads.json +59 -0
- package/dist-playbooks/x-twitter.json +59 -0
- package/dist-playbooks/youtube.json +59 -0
- package/dist-references/canva.json +646 -0
- package/dist-references/codex-desktop.json +305 -0
- package/dist-references/davinci-resolve-keyboard.json +594 -0
- package/dist-references/davinci-resolve-menu-map.json +1139 -0
- package/dist-references/davinci-resolve-menus-batch1.json +116 -0
- package/dist-references/davinci-resolve-menus-batch2.json +372 -0
- package/dist-references/davinci-resolve-menus-batch3.json +330 -0
- package/dist-references/davinci-resolve-menus-batch4.json +297 -0
- package/dist-references/davinci-resolve-shortcuts.json +333 -0
- package/dist-references/devto.json +317 -0
- package/dist-references/discord.json +549 -0
- package/dist-references/figma.json +1186 -0
- package/dist-references/finder.json +146 -0
- package/dist-references/google-ads-transparency.json +95 -0
- package/dist-references/google-flow.json +649 -0
- package/dist-references/instagram.json +341 -0
- package/dist-references/linkedin.json +324 -0
- package/dist-references/meta-ad-library.json +86 -0
- package/dist-references/n8n.json +387 -0
- package/dist-references/notes.json +27 -0
- package/dist-references/notion.json +163 -0
- package/dist-references/reddit.json +341 -0
- package/dist-references/threads.json +337 -0
- package/dist-references/x-twitter.json +403 -0
- package/dist-references/youtube.json +373 -0
- package/native/macos-bridge/Package.swift +1 -0
- package/native/macos-bridge/Sources/AccessibilityBridge.swift +257 -36
- package/native/macos-bridge/Sources/AppManagement.swift +212 -2
- package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +348 -53
- package/native/macos-bridge/Sources/StreamCapture.swift +136 -0
- package/native/macos-bridge/Sources/VisionBridge.swift +165 -7
- package/native/macos-bridge/Sources/main.swift +169 -16
- package/native/windows-bridge/Program.cs +5 -0
- package/native/windows-bridge/ScreenCapture.cs +124 -0
- package/package.json +29 -4
- package/scripts/postinstall.cjs +127 -0
- package/.claude/commands/automate.md +0 -28
- package/.claude/commands/debug-ui.md +0 -19
- package/.claude/commands/screenshot.md +0 -15
- package/.github/FUNDING.yml +0 -1
- package/.github/ISSUE_TEMPLATE/bug_report.md +0 -27
- package/.github/ISSUE_TEMPLATE/feature_request.md +0 -20
- package/.mcp.json +0 -8
- package/DESKTOP_MCP_GUIDE.md +0 -92
- package/SECURITY.md +0 -44
- package/docs/architecture.md +0 -47
- package/install-skills.sh +0 -19
- package/mcp-bridge.ts +0 -271
- package/mcp-desktop.ts +0 -1221
- package/playbooks/instagram.json +0 -41
- package/playbooks/instagram_v2.json +0 -201
- package/playbooks/x_v1.json +0 -211
- package/scripts/devpost-live-loop.mjs +0 -421
- package/src/logging/timeline-logger.ts +0 -55
- package/src/mcp/server.ts +0 -449
- package/src/memory/recall.ts +0 -191
- package/src/memory/research.ts +0 -146
- package/src/memory/seeds.ts +0 -123
- package/src/memory/session.ts +0 -201
- package/src/memory/store.ts +0 -434
- package/src/memory/types.ts +0 -69
- package/src/native/bridge-client.ts +0 -239
- package/src/runtime/accessibility-adapter.ts +0 -487
- package/src/runtime/app-adapter.ts +0 -169
- package/src/runtime/applescript-adapter.ts +0 -376
- package/src/runtime/ax-role-map.ts +0 -102
- package/src/runtime/browser-adapter.ts +0 -129
- package/src/runtime/cdp-chrome-adapter.ts +0 -676
- package/src/runtime/composite-adapter.ts +0 -274
- package/src/runtime/executor.ts +0 -396
- package/src/runtime/planning-loop.ts +0 -81
- package/src/runtime/service.ts +0 -448
- package/src/runtime/session-manager.ts +0 -50
- package/src/runtime/state-observer.ts +0 -136
- package/src/runtime/vision-adapter.ts +0 -297
- package/src/types.ts +0 -297
- package/tests/bridge-client.test.ts +0 -176
- package/tests/browser-stealth.test.ts +0 -210
- package/tests/composite-adapter.test.ts +0 -64
- package/tests/mcp-server.test.ts +0 -151
- package/tests/memory-recall.test.ts +0 -339
- package/tests/memory-research.test.ts +0 -159
- package/tests/memory-seeds.test.ts +0 -120
- package/tests/memory-store.test.ts +0 -392
- package/tests/types.test.ts +0 -92
- package/tsconfig.check.json +0 -17
- package/tsconfig.json +0 -19
- package/vitest.config.ts +0 -8
- /package/{playbooks → dist-references}/devpost.json +0 -0
|
@@ -0,0 +1,870 @@
|
|
|
1
|
+
// Copyright (C) 2025 Clazro Technology Private Limited
|
|
2
|
+
// SPDX-License-Identifier: AGPL-3.0-only
|
|
3
|
+
//
|
|
4
|
+
// This file is part of ScreenHand.
|
|
5
|
+
//
|
|
6
|
+
// ScreenHand is free software: you can redistribute it and/or modify
|
|
7
|
+
// it under the terms of the GNU Affero General Public License as
|
|
8
|
+
// published by the Free Software Foundation, version 3.
|
|
9
|
+
//
|
|
10
|
+
// ScreenHand is distributed in the hope that it will be useful,
|
|
11
|
+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12
|
+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13
|
+
// GNU Affero General Public License for more details.
|
|
14
|
+
//
|
|
15
|
+
// You should have received a copy of the GNU Affero General Public License
|
|
16
|
+
// along with ScreenHand. If not, see <https://www.gnu.org/licenses/>.
|
|
17
|
+
import { DEFAULT_RECOVERY_BUDGET } from "../recovery/types.js";
|
|
18
|
+
import { DEFAULT_PLANNER_CONFIG } from "./types.js";
|
|
19
|
+
/**
|
|
20
|
+
* PlanExecutor — runs ActionPlans step by step, verifying postconditions
|
|
21
|
+
* against the world model after each step.
|
|
22
|
+
*
|
|
23
|
+
* On failure, delegates to the Planner for replanning.
|
|
24
|
+
* On LLM steps, pauses and returns control to the client.
|
|
25
|
+
*/
|
|
26
|
+
export class PlanExecutor {
|
|
27
|
+
worldModel;
|
|
28
|
+
planner;
|
|
29
|
+
executeTool;
|
|
30
|
+
recovery;
|
|
31
|
+
learningEngine;
|
|
32
|
+
config;
|
|
33
|
+
/** Accumulated execution trace for current goal — reset on each executeGoal() call */
|
|
34
|
+
log = [];
|
|
35
|
+
constructor(worldModel, planner, executeTool, config, recovery, learningEngine) {
|
|
36
|
+
this.worldModel = worldModel;
|
|
37
|
+
this.planner = planner;
|
|
38
|
+
this.executeTool = executeTool;
|
|
39
|
+
this.recovery = recovery;
|
|
40
|
+
this.learningEngine = learningEngine;
|
|
41
|
+
this.config = { ...DEFAULT_PLANNER_CONFIG, ...config };
|
|
42
|
+
}
|
|
43
|
+
dbg(msg) {
|
|
44
|
+
const line = `[${new Date().toISOString().substring(11, 23)}] ${msg}`;
|
|
45
|
+
this.log.push(line);
|
|
46
|
+
process.stderr.write(line + "\n");
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Execute a full goal: iterate subgoals, execute plans, replan on failure.
|
|
50
|
+
* Pauses at LLM steps and returns an ExecutionPause for the client to resolve.
|
|
51
|
+
*/
|
|
52
|
+
async executeGoal(goal) {
|
|
53
|
+
const start = Date.now();
|
|
54
|
+
let stepsExecuted = 0;
|
|
55
|
+
let replans = 0;
|
|
56
|
+
this.log = []; // reset log for this goal
|
|
57
|
+
// Capture the expected app at goal start for app_switched detection
|
|
58
|
+
const expectedBundleId = this.worldModel.getState().focusedApp?.bundleId ?? null;
|
|
59
|
+
this.dbg(`═══ GOAL START: "${goal.description}" ═══`);
|
|
60
|
+
this.dbg(`Focused app: ${expectedBundleId ?? "none"} | Windows: ${this.worldModel.getState().windows.size} | Controls: ${[...this.worldModel.getState().windows.values()].reduce((n, w) => n + w.controls.size, 0)}`);
|
|
61
|
+
// Recovery budget for the entire goal lifetime
|
|
62
|
+
const recoveryBudget = {
|
|
63
|
+
...DEFAULT_RECOVERY_BUDGET,
|
|
64
|
+
usedStrategyIds: new Set(),
|
|
65
|
+
};
|
|
66
|
+
// Plan any unplanned subgoals
|
|
67
|
+
await this.planner.planGoal(goal);
|
|
68
|
+
// Resume from pausedAt if set
|
|
69
|
+
const startSubgoalIdx = goal.pausedAt?.subgoalIndex ?? 0;
|
|
70
|
+
delete goal.pausedAt;
|
|
71
|
+
for (let sgIdx = startSubgoalIdx; sgIdx < goal.subgoals.length; sgIdx++) {
|
|
72
|
+
const subgoal = goal.subgoals[sgIdx];
|
|
73
|
+
if (subgoal.status === "completed" || subgoal.status === "skipped")
|
|
74
|
+
continue;
|
|
75
|
+
this.dbg(`── Subgoal ${sgIdx + 1}/${goal.subgoals.length}: "${subgoal.description}"`);
|
|
76
|
+
subgoal.status = "active";
|
|
77
|
+
while (subgoal.status === "active" &&
|
|
78
|
+
subgoal.attempts < subgoal.maxAttempts) {
|
|
79
|
+
if (!subgoal.plan) {
|
|
80
|
+
this.dbg(` ✗ No plan available`);
|
|
81
|
+
subgoal.status = "failed";
|
|
82
|
+
subgoal.lastError = "No plan available";
|
|
83
|
+
break;
|
|
84
|
+
}
|
|
85
|
+
this.dbg(` Plan: ${subgoal.plan.source} | ${subgoal.plan.steps.length} steps | attempt ${subgoal.attempts + 1}/${subgoal.maxAttempts}`);
|
|
86
|
+
const result = await this.executePlan(subgoal.plan, recoveryBudget);
|
|
87
|
+
// Check if we hit an LLM pause
|
|
88
|
+
if ("paused" in result) {
|
|
89
|
+
this.dbg(` ⏸ Paused at step ${result.stepIndex}: ${result.stepDescription}`);
|
|
90
|
+
// Save resume point on the goal
|
|
91
|
+
goal.pausedAt = {
|
|
92
|
+
subgoalIndex: sgIdx,
|
|
93
|
+
stepIndex: result.stepIndex,
|
|
94
|
+
};
|
|
95
|
+
goal.status = "active";
|
|
96
|
+
return {
|
|
97
|
+
...result,
|
|
98
|
+
subgoalIndex: sgIdx,
|
|
99
|
+
};
|
|
100
|
+
}
|
|
101
|
+
stepsExecuted += result.stepsExecuted;
|
|
102
|
+
if (result.success) {
|
|
103
|
+
this.dbg(` ✓ Subgoal completed`);
|
|
104
|
+
subgoal.status = "completed";
|
|
105
|
+
break;
|
|
106
|
+
}
|
|
107
|
+
this.dbg(` ✗ Plan failed: ${result.error}`);
|
|
108
|
+
// Plan failed — try replanning
|
|
109
|
+
replans++;
|
|
110
|
+
const reason = this.diagnoseFailure(result, expectedBundleId);
|
|
111
|
+
this.dbg(` → Replan #${replans}, reason: ${reason}`);
|
|
112
|
+
const newPlan = await this.planner.replan(subgoal, reason, result.error ?? undefined);
|
|
113
|
+
if (!newPlan) {
|
|
114
|
+
this.dbg(` ✗ No replan available — giving up`);
|
|
115
|
+
break;
|
|
116
|
+
}
|
|
117
|
+
this.dbg(` → New plan: ${newPlan.source} | ${newPlan.steps.length} steps`);
|
|
118
|
+
subgoal.plan = newPlan;
|
|
119
|
+
subgoal.status = "active";
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
this.planner.evaluateGoal(goal);
|
|
123
|
+
const finalError = goal.status === "failed"
|
|
124
|
+
? goal.subgoals.find((sg) => sg.status === "failed")?.lastError ?? "Unknown error"
|
|
125
|
+
: null;
|
|
126
|
+
this.dbg(`═══ GOAL ${goal.status.toUpperCase()} in ${Date.now() - start}ms | steps=${stepsExecuted} replans=${replans}${finalError ? ` error="${finalError}"` : ""} ═══`);
|
|
127
|
+
return {
|
|
128
|
+
goalId: goal.id,
|
|
129
|
+
success: goal.status === "completed",
|
|
130
|
+
subgoalsCompleted: goal.subgoals.filter((sg) => sg.status === "completed").length,
|
|
131
|
+
totalSubgoals: goal.subgoals.length,
|
|
132
|
+
stepsExecuted,
|
|
133
|
+
replans,
|
|
134
|
+
durationMs: Date.now() - start,
|
|
135
|
+
error: finalError,
|
|
136
|
+
executionLog: [...this.log],
|
|
137
|
+
};
|
|
138
|
+
}
|
|
139
|
+
/**
|
|
140
|
+
* Execute the next single step of a goal. Returns the step result,
|
|
141
|
+
* or an ExecutionPause if the next step requires LLM interpretation.
|
|
142
|
+
*/
|
|
143
|
+
async executeNextStep(goal) {
|
|
144
|
+
// Find the current active subgoal and step
|
|
145
|
+
for (let sgIdx = 0; sgIdx < goal.subgoals.length; sgIdx++) {
|
|
146
|
+
const subgoal = goal.subgoals[sgIdx];
|
|
147
|
+
if (subgoal.status === "completed" || subgoal.status === "skipped" || subgoal.status === "failed")
|
|
148
|
+
continue;
|
|
149
|
+
if (!subgoal.plan) {
|
|
150
|
+
subgoal.plan = await this.planner.planSubgoal(subgoal);
|
|
151
|
+
}
|
|
152
|
+
subgoal.status = "active";
|
|
153
|
+
const plan = subgoal.plan;
|
|
154
|
+
if (plan.currentStepIndex >= plan.steps.length) {
|
|
155
|
+
subgoal.status = "completed";
|
|
156
|
+
continue;
|
|
157
|
+
}
|
|
158
|
+
const step = plan.steps[plan.currentStepIndex];
|
|
159
|
+
// If step requires LLM and has no tool assigned, pause
|
|
160
|
+
if (step.requiresLLM && !step.tool) {
|
|
161
|
+
return {
|
|
162
|
+
paused: true,
|
|
163
|
+
reason: "requires_llm",
|
|
164
|
+
stepIndex: plan.currentStepIndex,
|
|
165
|
+
stepDescription: step.description,
|
|
166
|
+
subgoalIndex: sgIdx,
|
|
167
|
+
completedSteps: plan.currentStepIndex,
|
|
168
|
+
totalSteps: plan.steps.length,
|
|
169
|
+
};
|
|
170
|
+
}
|
|
171
|
+
const nextStep = findNextMeaningfulStep(plan.steps, plan.currentStepIndex);
|
|
172
|
+
const result = await this.executeStepInternal(step, nextStep);
|
|
173
|
+
if (result.success) {
|
|
174
|
+
step.status = "completed";
|
|
175
|
+
step.resolvedBy = "auto";
|
|
176
|
+
plan.currentStepIndex++;
|
|
177
|
+
// Check if subgoal is complete
|
|
178
|
+
if (plan.currentStepIndex >= plan.steps.length) {
|
|
179
|
+
subgoal.status = "completed";
|
|
180
|
+
this.planner.evaluateGoal(goal);
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
else {
|
|
184
|
+
step.status = "failed";
|
|
185
|
+
}
|
|
186
|
+
return result;
|
|
187
|
+
}
|
|
188
|
+
// All subgoals done
|
|
189
|
+
this.planner.evaluateGoal(goal);
|
|
190
|
+
return {
|
|
191
|
+
goalId: goal.id,
|
|
192
|
+
success: goal.status === "completed",
|
|
193
|
+
subgoalsCompleted: goal.subgoals.filter((sg) => sg.status === "completed").length,
|
|
194
|
+
totalSubgoals: goal.subgoals.length,
|
|
195
|
+
stepsExecuted: 0,
|
|
196
|
+
replans: 0,
|
|
197
|
+
durationMs: 0,
|
|
198
|
+
error: goal.status === "failed"
|
|
199
|
+
? goal.subgoals.find((sg) => sg.status === "failed")?.lastError ?? "Unknown error"
|
|
200
|
+
: null,
|
|
201
|
+
executionLog: [...this.log],
|
|
202
|
+
};
|
|
203
|
+
}
|
|
204
|
+
/**
|
|
205
|
+
* Resolve an LLM step: the client provides the tool + params to use.
|
|
206
|
+
* Executes the tool, advances the plan, and returns the result.
|
|
207
|
+
*/
|
|
208
|
+
async resolveStep(goal, tool, params) {
|
|
209
|
+
// Find the paused step
|
|
210
|
+
const sgIdx = goal.pausedAt?.subgoalIndex ?? 0;
|
|
211
|
+
const stepIdx = goal.pausedAt?.stepIndex ?? 0;
|
|
212
|
+
const subgoal = goal.subgoals[sgIdx];
|
|
213
|
+
if (!subgoal?.plan) {
|
|
214
|
+
return {
|
|
215
|
+
step: { tool: "", params: {}, expectedPostcondition: null, timeout: 0, fallbackTool: null, requiresLLM: true, status: "failed", description: "No plan" },
|
|
216
|
+
success: false,
|
|
217
|
+
durationMs: 0,
|
|
218
|
+
postconditionMet: false,
|
|
219
|
+
error: "No active plan to resolve",
|
|
220
|
+
usedFallback: false,
|
|
221
|
+
};
|
|
222
|
+
}
|
|
223
|
+
const plan = subgoal.plan;
|
|
224
|
+
const step = plan.steps[stepIdx];
|
|
225
|
+
if (!step) {
|
|
226
|
+
return {
|
|
227
|
+
step: { tool: "", params: {}, expectedPostcondition: null, timeout: 0, fallbackTool: null, requiresLLM: true, status: "failed", description: "No step" },
|
|
228
|
+
success: false,
|
|
229
|
+
durationMs: 0,
|
|
230
|
+
postconditionMet: false,
|
|
231
|
+
error: "Step not found at pause index",
|
|
232
|
+
usedFallback: false,
|
|
233
|
+
};
|
|
234
|
+
}
|
|
235
|
+
// Resolve the LLM step with client-provided tool+params
|
|
236
|
+
step.tool = tool;
|
|
237
|
+
step.params = params;
|
|
238
|
+
step.resolvedBy = "client";
|
|
239
|
+
const result = await this.executeStepInternal(step);
|
|
240
|
+
if (result.success) {
|
|
241
|
+
step.status = "completed";
|
|
242
|
+
plan.currentStepIndex = stepIdx + 1;
|
|
243
|
+
delete goal.pausedAt;
|
|
244
|
+
if (plan.currentStepIndex >= plan.steps.length) {
|
|
245
|
+
subgoal.status = "completed";
|
|
246
|
+
this.planner.evaluateGoal(goal);
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
else {
|
|
250
|
+
step.status = "failed";
|
|
251
|
+
}
|
|
252
|
+
return result;
|
|
253
|
+
}
|
|
254
|
+
/**
|
|
255
|
+
* Execute a single ActionPlan's steps sequentially.
|
|
256
|
+
* Pauses at LLM steps instead of failing.
|
|
257
|
+
*/
|
|
258
|
+
async executePlan(plan, recoveryBudget) {
|
|
259
|
+
const stepResults = [];
|
|
260
|
+
for (let i = plan.currentStepIndex; i < plan.steps.length; i++) {
|
|
261
|
+
const step = plan.steps[i];
|
|
262
|
+
plan.currentStepIndex = i;
|
|
263
|
+
// Pause at LLM-required steps for client resolution
|
|
264
|
+
if (step.requiresLLM && !step.tool) {
|
|
265
|
+
return {
|
|
266
|
+
paused: true,
|
|
267
|
+
reason: "requires_llm",
|
|
268
|
+
stepIndex: i,
|
|
269
|
+
stepDescription: step.description,
|
|
270
|
+
subgoalIndex: 0,
|
|
271
|
+
completedSteps: stepResults.length,
|
|
272
|
+
totalSteps: plan.steps.length,
|
|
273
|
+
};
|
|
274
|
+
}
|
|
275
|
+
// Find the next meaningful step (skip screenshots/OCR — they have no target)
|
|
276
|
+
const nextStep = findNextMeaningfulStep(plan.steps, i);
|
|
277
|
+
const result = await this.executeStepInternal(step, nextStep);
|
|
278
|
+
stepResults.push(result);
|
|
279
|
+
if (!result.success) {
|
|
280
|
+
step.status = "failed";
|
|
281
|
+
// Attempt recovery before reporting failure
|
|
282
|
+
if (this.recovery && recoveryBudget) {
|
|
283
|
+
const expectedBundleId = this.worldModel.getState().focusedApp?.bundleId ?? null;
|
|
284
|
+
const recoveryOutcome = await this.recovery.attemptRecovery(result.error ?? "unknown failure", expectedBundleId, recoveryBudget);
|
|
285
|
+
if (recoveryOutcome.recovered) {
|
|
286
|
+
// Retry the failed step once after recovery
|
|
287
|
+
step.status = "pending";
|
|
288
|
+
const retryResult = await this.executeStepInternal(step);
|
|
289
|
+
stepResults.push(retryResult);
|
|
290
|
+
if (retryResult.success) {
|
|
291
|
+
step.status = "completed";
|
|
292
|
+
continue;
|
|
293
|
+
}
|
|
294
|
+
step.status = "failed";
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
return {
|
|
298
|
+
success: false,
|
|
299
|
+
stepsExecuted: stepResults.length,
|
|
300
|
+
error: result.error,
|
|
301
|
+
stepResults,
|
|
302
|
+
};
|
|
303
|
+
}
|
|
304
|
+
step.status = "completed";
|
|
305
|
+
}
|
|
306
|
+
return {
|
|
307
|
+
success: true,
|
|
308
|
+
stepsExecuted: stepResults.length,
|
|
309
|
+
error: null,
|
|
310
|
+
stepResults,
|
|
311
|
+
};
|
|
312
|
+
}
|
|
313
|
+
/**
|
|
314
|
+
* Execute a single PlanStep and verify its postcondition.
|
|
315
|
+
*/
|
|
316
|
+
/**
|
|
317
|
+
* Execute a single PlanStep with world-model-aware LOOK → ACT → VERIFY loop.
|
|
318
|
+
* Uses the world model (0ms reads) for awareness — NOT screenshots.
|
|
319
|
+
*/
|
|
320
|
+
async executeStepInternal(step, nextStep) {
|
|
321
|
+
const start = Date.now();
|
|
322
|
+
step.status = "executing";
|
|
323
|
+
let usedFallback = false;
|
|
324
|
+
// ── LOOK: Pre-step awareness from world model (0ms, consistent snapshot) ──
|
|
325
|
+
const preState = this.worldModel.getConsistentSnapshot();
|
|
326
|
+
const preControls = [...preState.windows.values()].reduce((n, w) => n + w.controls.size, 0);
|
|
327
|
+
const paramStr = Object.entries(step.params)
|
|
328
|
+
.filter(([k]) => !k.startsWith("_"))
|
|
329
|
+
.map(([k, v]) => `${k}=${JSON.stringify(v)}`)
|
|
330
|
+
.join(", ");
|
|
331
|
+
this.dbg(` ▶ ${step.tool}(${paramStr})`);
|
|
332
|
+
this.dbg(` PRE | app=${preState.focusedApp?.bundleId ?? "none"} | win=${preState.focusedWindowId ?? "none"} | controls=${preControls} | dialogs=${preState.activeDialogs.length}`);
|
|
333
|
+
if (nextStep) {
|
|
334
|
+
const npStr = Object.entries(nextStep.params).filter(([k]) => !k.startsWith("_")).map(([k, v]) => `${k}=${JSON.stringify(v)}`).join(", ");
|
|
335
|
+
this.dbg(` NEXT | ${nextStep.tool}(${npStr})`);
|
|
336
|
+
}
|
|
337
|
+
// 1. Dialog check: if a dialog is blocking, fail fast so recovery can handle it
|
|
338
|
+
if (preState.activeDialogs.length > 0) {
|
|
339
|
+
const dialog = preState.activeDialogs[0];
|
|
340
|
+
const err = `Dialog blocking: ${dialog.type} "${dialog.title ?? dialog.message ?? "unknown"}" [buttons: ${dialog.buttons.join(", ")}]`;
|
|
341
|
+
this.dbg(` BLOCK| ${err}`);
|
|
342
|
+
return {
|
|
343
|
+
step,
|
|
344
|
+
success: false,
|
|
345
|
+
durationMs: Date.now() - start,
|
|
346
|
+
postconditionMet: false,
|
|
347
|
+
error: err,
|
|
348
|
+
usedFallback: false,
|
|
349
|
+
};
|
|
350
|
+
}
|
|
351
|
+
// 2. Target validation: for ANY tool that targets a UI element, verify it exists
|
|
352
|
+
if (INTERACTION_TOOLS.has(step.tool)) {
|
|
353
|
+
const target = (step.params.title ?? step.params.text ?? step.params.name ?? step.params.selector);
|
|
354
|
+
if (typeof target === "string") {
|
|
355
|
+
const focusedWinId = preState.focusedWindowId;
|
|
356
|
+
if (focusedWinId !== null) {
|
|
357
|
+
const win = preState.windows.get(focusedWinId);
|
|
358
|
+
if (win && win.controls.size > 5) {
|
|
359
|
+
const targetLower = target.toLowerCase();
|
|
360
|
+
const found = [...win.controls.values()].some((c) => c.label.value?.toLowerCase().includes(targetLower));
|
|
361
|
+
if (!found) {
|
|
362
|
+
const available = [...win.controls.values()]
|
|
363
|
+
.slice(0, 15)
|
|
364
|
+
.map((c) => `${c.role}:"${c.label.value ?? ""}"`)
|
|
365
|
+
.filter((s) => s.length > 3)
|
|
366
|
+
.join(", ");
|
|
367
|
+
const err = `Pre-check: "${target}" not found in world model (${win.controls.size} controls tracked). Available: ${available}`;
|
|
368
|
+
this.dbg(` MISS | ${err}`);
|
|
369
|
+
return {
|
|
370
|
+
step,
|
|
371
|
+
success: false,
|
|
372
|
+
durationMs: Date.now() - start,
|
|
373
|
+
postconditionMet: false,
|
|
374
|
+
error: err,
|
|
375
|
+
usedFallback: false,
|
|
376
|
+
};
|
|
377
|
+
}
|
|
378
|
+
this.dbg(` FOUND| "${target}" in world model ✓`);
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
// 3. Focus validation: for type_text, verify a text field is focused
|
|
384
|
+
if (step.tool === "type_text") {
|
|
385
|
+
const focusedWinId = preState.focusedWindowId;
|
|
386
|
+
if (focusedWinId !== null) {
|
|
387
|
+
const win = preState.windows.get(focusedWinId);
|
|
388
|
+
if (win?.focusedElement) {
|
|
389
|
+
const role = win.focusedElement.role.toLowerCase();
|
|
390
|
+
const isTextInput = role.includes("text") || role.includes("search") ||
|
|
391
|
+
role.includes("combobox") || role.includes("field") || role.includes("area");
|
|
392
|
+
if (!isTextInput && win.controls.size > 5) {
|
|
393
|
+
// Focused element isn't a text field — warn but don't block
|
|
394
|
+
// (some apps use non-standard roles)
|
|
395
|
+
step.description = `${step.description} [⚠ focused: ${win.focusedElement.role}:"${win.focusedElement.label.value ?? ""}"]`;
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
}
|
|
400
|
+
// ── ACT: Execute the tool ──
|
|
401
|
+
// Auto-upgrade click_text → ui_press when world model already has the target via AX.
|
|
402
|
+
// click_text uses cg.captureWindow (crashes on GPU-heavy pages) + OCR (slow, sometimes wrong tab).
|
|
403
|
+
// ui_press uses AX directly — 10x faster, no screenshots, no crash risk.
|
|
404
|
+
if (step.tool === "click_text" && preState.focusedApp) {
|
|
405
|
+
const clickTarget = step.params.text;
|
|
406
|
+
if (clickTarget) {
|
|
407
|
+
for (const win of preState.windows.values()) {
|
|
408
|
+
const match = [...win.controls.values()].find((c) => c.label.value?.toLowerCase().includes(clickTarget.toLowerCase()));
|
|
409
|
+
if (match) {
|
|
410
|
+
// Get real pid: window.pid (from AX scan) > focusedApp.pid (often 0 from feedWorldModel)
|
|
411
|
+
const pid = win.pid || preState.focusedApp.pid;
|
|
412
|
+
if (pid) {
|
|
413
|
+
this.dbg(` SWAP | click_text → ui_press (found "${match.label.value}" as ${match.role} in AX, pid=${pid})`);
|
|
414
|
+
step.tool = "ui_press";
|
|
415
|
+
step.params = { pid, title: clickTarget };
|
|
416
|
+
}
|
|
417
|
+
else {
|
|
418
|
+
this.dbg(` SWAP | pid=0, cannot use ui_press — failing to avoid bridge crash`);
|
|
419
|
+
return {
|
|
420
|
+
step, success: false, durationMs: Date.now() - start, postconditionMet: false,
|
|
421
|
+
error: `Cannot click "${clickTarget}": element found in AX but pid unknown. Use focus first.`,
|
|
422
|
+
usedFallback: false,
|
|
423
|
+
};
|
|
424
|
+
}
|
|
425
|
+
break;
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
// Skip screenshot steps for browser apps when world model is populated.
|
|
431
|
+
// CGWindowListCreateImage crashes on GPU-heavy pages (WebGL/canvas).
|
|
432
|
+
// The world model already has full UI visibility — screenshot adds nothing.
|
|
433
|
+
if (SCREENSHOT_TOOLS.has(step.tool) && preState.focusedApp) {
|
|
434
|
+
const appDomain = preState.appDomains?.get(preState.focusedApp.bundleId);
|
|
435
|
+
const family = appDomain?.family;
|
|
436
|
+
const hasControls = preState.windows.size > 0 &&
|
|
437
|
+
[...preState.windows.values()].some((w) => w.controls.size > 10);
|
|
438
|
+
if (family === "browser" && hasControls) {
|
|
439
|
+
this.dbg(` SKIP | ${step.tool} — browser+world model active (${preControls} controls)`);
|
|
440
|
+
step.description = `${step.description} [skipped — browser+world model active]`;
|
|
441
|
+
return {
|
|
442
|
+
step,
|
|
443
|
+
success: true,
|
|
444
|
+
durationMs: Date.now() - start,
|
|
445
|
+
postconditionMet: true,
|
|
446
|
+
error: null,
|
|
447
|
+
usedFallback: false,
|
|
448
|
+
};
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
const params = { ...step.params };
|
|
452
|
+
// Auto-inject windowId for click_text/screenshot_file/ocr when not provided by plan.
|
|
453
|
+
// These tools require windowId but strategies often omit it — use focused window.
|
|
454
|
+
if (WINDOW_ID_TOOLS.has(step.tool) && !params.windowId) {
|
|
455
|
+
const winId = preState.focusedWindowId ?? [...preState.windows.keys()][0];
|
|
456
|
+
if (winId != null) {
|
|
457
|
+
params.windowId = winId;
|
|
458
|
+
this.dbg(` INJ | windowId=${winId} injected for ${step.tool}`);
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
if (this.learningEngine && !params._budget) {
|
|
462
|
+
const bundleId = preState.focusedApp?.bundleId;
|
|
463
|
+
if (bundleId) {
|
|
464
|
+
params._budget = this.learningEngine.getAdaptiveBudget(bundleId);
|
|
465
|
+
}
|
|
466
|
+
}
|
|
467
|
+
const stepTimeout = Math.max(step.timeout || 0, this.config.defaultStepTimeout);
|
|
468
|
+
this.dbg(` ACT | calling ${step.tool} (timeout=${stepTimeout}ms)`);
|
|
469
|
+
let result = await this.tryToolWithTimeout(step.tool, params, stepTimeout);
|
|
470
|
+
this.dbg(` ACT | ok=${result.ok}${result.ok ? "" : ` error="${result.error}"`}`);
|
|
471
|
+
if (!result.ok && step.fallbackTool) {
|
|
472
|
+
this.dbg(` ACT | trying fallback: ${step.fallbackTool}`);
|
|
473
|
+
result = await this.tryToolWithTimeout(step.fallbackTool, params, stepTimeout);
|
|
474
|
+
this.dbg(` ACT | fallback ok=${result.ok}${result.ok ? "" : ` error="${result.error}"`}`);
|
|
475
|
+
usedFallback = true;
|
|
476
|
+
}
|
|
477
|
+
if (!result.ok) {
|
|
478
|
+
const durationMs = Date.now() - start;
|
|
479
|
+
this.dbg(` FAIL | ${step.tool} failed in ${durationMs}ms: ${result.error}`);
|
|
480
|
+
this.recordLearningOutcomes(usedFallback ? (step.fallbackTool ?? step.tool) : step.tool, params, false, durationMs);
|
|
481
|
+
return {
|
|
482
|
+
step,
|
|
483
|
+
success: false,
|
|
484
|
+
durationMs,
|
|
485
|
+
postconditionMet: false,
|
|
486
|
+
error: result.error ?? "Tool execution failed",
|
|
487
|
+
usedFallback,
|
|
488
|
+
};
|
|
489
|
+
}
|
|
490
|
+
this.feedWorldModel(usedFallback ? step.fallbackTool : step.tool, params, result);
|
|
491
|
+
this.recordLearningOutcomes(usedFallback ? step.fallbackTool : step.tool, params, true, Date.now() - start);
|
|
492
|
+
// ── VERIFY: Post-step awareness from world model ──
|
|
493
|
+
// For state-changing tools, wait briefly for perception to update the world model
|
|
494
|
+
if (STATE_CHANGING_TOOLS.has(step.tool)) {
|
|
495
|
+
await sleep(150); // AX refreshes in ~50ms, 150ms gives margin
|
|
496
|
+
}
|
|
497
|
+
const postState = this.worldModel.getConsistentSnapshot();
|
|
498
|
+
const postControls = [...postState.windows.values()].reduce((n, w) => n + w.controls.size, 0);
|
|
499
|
+
this.dbg(` POST | app=${postState.focusedApp?.bundleId ?? "none"} | win=${postState.focusedWindowId ?? "none"} | controls=${postControls} | dialogs=${postState.activeDialogs.length}`);
|
|
500
|
+
// 1. Check if a dialog appeared after the action
|
|
501
|
+
if (postState.activeDialogs.length > 0 && preState.activeDialogs.length === 0) {
|
|
502
|
+
const dialog = postState.activeDialogs[0];
|
|
503
|
+
const err = `Dialog appeared after ${step.tool}: ${dialog.type} "${dialog.title ?? dialog.message ?? "unknown"}" [buttons: ${dialog.buttons.join(", ")}]`;
|
|
504
|
+
this.dbg(` FAIL | ${err}`);
|
|
505
|
+
return {
|
|
506
|
+
step,
|
|
507
|
+
success: false,
|
|
508
|
+
durationMs: Date.now() - start,
|
|
509
|
+
postconditionMet: false,
|
|
510
|
+
error: err,
|
|
511
|
+
usedFallback,
|
|
512
|
+
};
|
|
513
|
+
}
|
|
514
|
+
// 2. Check if focus was lost (app switched unexpectedly)
|
|
515
|
+
if (preState.focusedApp?.bundleId && postState.focusedApp?.bundleId &&
|
|
516
|
+
preState.focusedApp.bundleId !== postState.focusedApp.bundleId &&
|
|
517
|
+
!FOCUS_TOOLS.has(step.tool)) {
|
|
518
|
+
const err = `Focus lost: was ${preState.focusedApp.bundleId}, now ${postState.focusedApp.bundleId}`;
|
|
519
|
+
this.dbg(` FAIL | ${err}`);
|
|
520
|
+
return {
|
|
521
|
+
step,
|
|
522
|
+
success: false,
|
|
523
|
+
durationMs: Date.now() - start,
|
|
524
|
+
postconditionMet: false,
|
|
525
|
+
error: err,
|
|
526
|
+
usedFallback,
|
|
527
|
+
};
|
|
528
|
+
}
|
|
529
|
+
// 3. For navigation tools, wait for world model to reflect new page
|
|
530
|
+
const isNavigation = NAVIGATION_TOOLS.has(step.tool) ||
|
|
531
|
+
(step.tool === "key" && isEnterKey(step.params));
|
|
532
|
+
if (isNavigation) {
|
|
533
|
+
const nextTarget = nextStep
|
|
534
|
+
? (nextStep.params.text ?? nextStep.params.title ?? nextStep.params.name)
|
|
535
|
+
: undefined;
|
|
536
|
+
const maxWait = nextTarget ? 8000 : 3000;
|
|
537
|
+
const pollMs = 200;
|
|
538
|
+
this.dbg(` NAV | waiting for ${nextTarget ? `"${nextTarget}"` : "any state change"} (max ${maxWait}ms)`);
|
|
539
|
+
const waited = await this.waitForWorldModelChange(preState.focusedWindowId, preState, maxWait, pollMs, nextTarget);
|
|
540
|
+
this.dbg(` NAV | waited=${waited}`);
|
|
541
|
+
if (!waited && nextTarget) {
|
|
542
|
+
const err = `Navigation failed: "${nextTarget}" not found after ${maxWait}ms. Page may not have loaded or URL was wrong.`;
|
|
543
|
+
this.dbg(` FAIL | ${err}`);
|
|
544
|
+
return {
|
|
545
|
+
step,
|
|
546
|
+
success: false,
|
|
547
|
+
durationMs: Date.now() - start,
|
|
548
|
+
postconditionMet: false,
|
|
549
|
+
error: err,
|
|
550
|
+
usedFallback,
|
|
551
|
+
};
|
|
552
|
+
}
|
|
553
|
+
if (!waited) {
|
|
554
|
+
step.description = `${step.description} [⚠ no state change detected after ${maxWait}ms]`;
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
// 4. Verify postcondition
|
|
558
|
+
let postconditionMet = true;
|
|
559
|
+
let postconditionActual = null;
|
|
560
|
+
let postconditionSoft = false;
|
|
561
|
+
const assertion = step.expectedPostcondition ?? this.inferPostcondition(step, nextStep);
|
|
562
|
+
if (!step.expectedPostcondition && assertion) {
|
|
563
|
+
postconditionSoft = true;
|
|
564
|
+
}
|
|
565
|
+
if (assertion) {
|
|
566
|
+
this.dbg(` PC | checking ${assertion.type}="${assertion.target}" (${postconditionSoft ? "soft" : "hard"})`);
|
|
567
|
+
await sleep(Math.min(this.config.postconditionWaitMs, 500));
|
|
568
|
+
if (this.worldModel.getState().windows.size > 0) {
|
|
569
|
+
const pcResult = this.worldModel.assertStateDetailed(assertion);
|
|
570
|
+
postconditionMet = pcResult.matched;
|
|
571
|
+
postconditionActual = pcResult.actual;
|
|
572
|
+
this.dbg(` PC | matched=${postconditionMet} actual="${postconditionActual ?? "nothing"}"`);
|
|
573
|
+
}
|
|
574
|
+
else {
|
|
575
|
+
this.dbg(` PC | skipped — no windows in world model`);
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
if (!postconditionMet && postconditionSoft) {
|
|
579
|
+
this.dbg(` WARN | soft postcondition miss: "${assertion.target}" not visible (continuing)`);
|
|
580
|
+
step.description = `${step.description} [⚠ soft postcondition: "${assertion.target}" not yet visible]`;
|
|
581
|
+
return {
|
|
582
|
+
step,
|
|
583
|
+
success: true,
|
|
584
|
+
durationMs: Date.now() - start,
|
|
585
|
+
postconditionMet: false,
|
|
586
|
+
error: null,
|
|
587
|
+
usedFallback,
|
|
588
|
+
};
|
|
589
|
+
}
|
|
590
|
+
const finalDuration = Date.now() - start;
|
|
591
|
+
if (postconditionMet) {
|
|
592
|
+
this.dbg(` ✓ OK in ${finalDuration}ms`);
|
|
593
|
+
}
|
|
594
|
+
else {
|
|
595
|
+
this.dbg(` FAIL | postcondition not met: expected ${assertion?.type}="${assertion?.target}", got "${postconditionActual ?? "nothing"}"`);
|
|
596
|
+
}
|
|
597
|
+
return {
|
|
598
|
+
step,
|
|
599
|
+
success: postconditionMet,
|
|
600
|
+
durationMs: finalDuration,
|
|
601
|
+
postconditionMet,
|
|
602
|
+
error: postconditionMet ? null : `Postcondition not met: expected ${assertion?.type}="${assertion?.target}", got ${postconditionActual ?? "nothing"}`,
|
|
603
|
+
usedFallback,
|
|
604
|
+
};
|
|
605
|
+
}
|
|
606
|
+
/**
|
|
607
|
+
* Wait for the world model to reflect a state change (e.g. after navigation).
|
|
608
|
+
* Polls the world model (0ms reads) — NOT screenshots.
|
|
609
|
+
* Returns true if a change was detected, false if timed out.
|
|
610
|
+
*/
|
|
611
|
+
async waitForWorldModelChange(windowId, preState, maxWaitMs, pollMs, waitForTarget) {
|
|
612
|
+
if (windowId === null)
|
|
613
|
+
return true;
|
|
614
|
+
const preWin = preState.windows.get(windowId);
|
|
615
|
+
const preTitle = preWin?.title.value ?? "";
|
|
616
|
+
const preControlCount = preWin?.controls.size ?? 0;
|
|
617
|
+
const targetLower = waitForTarget?.toLowerCase();
|
|
618
|
+
const deadline = Date.now() + maxWaitMs;
|
|
619
|
+
let genericChangeDetected = false;
|
|
620
|
+
while (Date.now() < deadline) {
|
|
621
|
+
await sleep(pollMs);
|
|
622
|
+
const current = this.worldModel.getState();
|
|
623
|
+
const curWin = current.windows.get(windowId);
|
|
624
|
+
if (!curWin)
|
|
625
|
+
continue;
|
|
626
|
+
// If waiting for specific content, check ALL windows (page may load in different window)
|
|
627
|
+
if (targetLower) {
|
|
628
|
+
for (const [, win] of current.windows) {
|
|
629
|
+
const found = [...win.controls.values()].some((c) => c.label.value?.toLowerCase().includes(targetLower));
|
|
630
|
+
if (found)
|
|
631
|
+
return true; // Target content appeared — page is ready
|
|
632
|
+
}
|
|
633
|
+
}
|
|
634
|
+
// Generic change detection (title, control count, dialogs)
|
|
635
|
+
if (!genericChangeDetected) {
|
|
636
|
+
if (curWin.title.value !== preTitle && curWin.title.value)
|
|
637
|
+
genericChangeDetected = true;
|
|
638
|
+
const countDelta = Math.abs(curWin.controls.size - preControlCount);
|
|
639
|
+
if (countDelta > 10)
|
|
640
|
+
genericChangeDetected = true;
|
|
641
|
+
if (current.activeDialogs.length > preState.activeDialogs.length)
|
|
642
|
+
return true;
|
|
643
|
+
}
|
|
644
|
+
// If no specific target requested, return on generic change
|
|
645
|
+
if (!targetLower && genericChangeDetected)
|
|
646
|
+
return true;
|
|
647
|
+
}
|
|
648
|
+
// If we detected a generic change but target never appeared, still return true
|
|
649
|
+
return genericChangeDetected;
|
|
650
|
+
}
|
|
651
|
+
/**
|
|
652
|
+
* Auto-infer a soft postcondition from the current and next step.
|
|
653
|
+
* Returns null if no useful postcondition can be inferred.
|
|
654
|
+
*
|
|
655
|
+
* Inference rules:
|
|
656
|
+
* - Navigation → next step's target text should become visible
|
|
657
|
+
* - focus/launch → target app should be focused
|
|
658
|
+
* - click/press on element → if next step targets different text, that text should appear
|
|
659
|
+
*/
|
|
660
|
+
inferPostcondition(step, nextStep) {
|
|
661
|
+
// focus/launch → app should be focused
|
|
662
|
+
if (step.tool === "focus" || step.tool === "launch") {
|
|
663
|
+
const bundleId = (step.params.bundleId ?? step.params.appName);
|
|
664
|
+
if (bundleId) {
|
|
665
|
+
return { type: "app_focused", target: bundleId };
|
|
666
|
+
}
|
|
667
|
+
}
|
|
668
|
+
// Navigation or state-changing click → next step's target should be visible
|
|
669
|
+
if (nextStep && STATE_CHANGING_TOOLS.has(step.tool)) {
|
|
670
|
+
const nextTarget = (nextStep.params.text ?? nextStep.params.title ?? nextStep.params.name);
|
|
671
|
+
if (nextTarget && typeof nextTarget === "string" && nextTarget.length >= 3) {
|
|
672
|
+
// Don't infer if current step targets the same text (redundant)
|
|
673
|
+
const currentTarget = (step.params.text ?? step.params.title ?? step.params.name);
|
|
674
|
+
if (!currentTarget || currentTarget.toLowerCase() !== nextTarget.toLowerCase()) {
|
|
675
|
+
return { type: "text_visible", target: nextTarget };
|
|
676
|
+
}
|
|
677
|
+
}
|
|
678
|
+
}
|
|
679
|
+
return null;
|
|
680
|
+
}
|
|
681
|
+
/**
|
|
682
|
+
* Feed tool execution results into the world model to keep state fresh
|
|
683
|
+
* between perception cycles. Best-effort — parse failures are silently ignored.
|
|
684
|
+
*/
|
|
685
|
+
feedWorldModel(tool, params, result) {
|
|
686
|
+
if (!result.ok || !result.result)
|
|
687
|
+
return;
|
|
688
|
+
try {
|
|
689
|
+
if (FOCUS_TOOLS.has(tool)) {
|
|
690
|
+
const bundleId = params.bundleId ?? params.appName;
|
|
691
|
+
if (bundleId) {
|
|
692
|
+
this.worldModel.updateFocusedApp({
|
|
693
|
+
bundleId,
|
|
694
|
+
appName: params.appName ?? bundleId,
|
|
695
|
+
pid: 0,
|
|
696
|
+
windowTitle: "",
|
|
697
|
+
});
|
|
698
|
+
}
|
|
699
|
+
}
|
|
700
|
+
else if (BROWSER_TOOLS.has(tool)) {
|
|
701
|
+
// Extract URL and title from result for CDP snapshot
|
|
702
|
+
let parsed = null;
|
|
703
|
+
try {
|
|
704
|
+
parsed = JSON.parse(result.result);
|
|
705
|
+
}
|
|
706
|
+
catch { /* not JSON */ }
|
|
707
|
+
const url = parsed?.url ?? params.url ?? "";
|
|
708
|
+
const title = parsed?.title ?? "";
|
|
709
|
+
const bundleId = this.worldModel.getState().focusedApp?.bundleId;
|
|
710
|
+
if (bundleId && url) {
|
|
711
|
+
this.worldModel.ingestCDPSnapshot(bundleId, url, title);
|
|
712
|
+
}
|
|
713
|
+
}
|
|
714
|
+
else if (tool === "ocr") {
|
|
715
|
+
// OCR results may contain text regions
|
|
716
|
+
let parsed = null;
|
|
717
|
+
try {
|
|
718
|
+
parsed = JSON.parse(result.result);
|
|
719
|
+
}
|
|
720
|
+
catch { /* not JSON */ }
|
|
721
|
+
if (parsed?.regions && Array.isArray(parsed.regions)) {
|
|
722
|
+
const windowId = params.windowId ??
|
|
723
|
+
this.worldModel.getState().focusedWindowId ?? 0;
|
|
724
|
+
const regions = parsed.regions;
|
|
725
|
+
if (regions.length > 0 && windowId) {
|
|
726
|
+
this.worldModel.ingestOCRRegions(windowId, regions);
|
|
727
|
+
}
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
catch {
|
|
732
|
+
// Best-effort: don't let world model feeding break execution
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
/**
|
|
736
|
+
* Record tool timing and locator outcomes to the learning engine.
|
|
737
|
+
* Best-effort — errors are silently ignored.
|
|
738
|
+
*/
|
|
739
|
+
recordLearningOutcomes(tool, params, success, durationMs) {
|
|
740
|
+
if (!this.learningEngine)
|
|
741
|
+
return;
|
|
742
|
+
try {
|
|
743
|
+
const bundleId = this.worldModel.getState().focusedApp?.bundleId;
|
|
744
|
+
if (!bundleId)
|
|
745
|
+
return;
|
|
746
|
+
// Record tool timing for adaptive budget learning
|
|
747
|
+
this.learningEngine.recordToolTiming({
|
|
748
|
+
tool,
|
|
749
|
+
bundleId,
|
|
750
|
+
durationMs,
|
|
751
|
+
success,
|
|
752
|
+
});
|
|
753
|
+
// Record locator outcome when a target/selector was used
|
|
754
|
+
const target = (params.target ?? params.selector);
|
|
755
|
+
if (target && LOCATOR_TOOLS.has(tool)) {
|
|
756
|
+
const method = tool.startsWith("browser_") ? "cdp" :
|
|
757
|
+
tool === "ocr" ? "ocr" : "ax";
|
|
758
|
+
this.learningEngine.recordLocatorOutcome({
|
|
759
|
+
bundleId,
|
|
760
|
+
actionKey: tool,
|
|
761
|
+
locator: target,
|
|
762
|
+
method,
|
|
763
|
+
success,
|
|
764
|
+
});
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
catch {
|
|
768
|
+
// Best-effort
|
|
769
|
+
}
|
|
770
|
+
}
|
|
771
|
+
async tryToolWithTimeout(tool, params, timeoutMs) {
|
|
772
|
+
return Promise.race([
|
|
773
|
+
this.tryTool(tool, params),
|
|
774
|
+
new Promise((resolve) => setTimeout(() => resolve({ ok: false, error: `Step timeout after ${timeoutMs}ms` }), timeoutMs)),
|
|
775
|
+
]);
|
|
776
|
+
}
|
|
777
|
+
async tryTool(tool, params) {
|
|
778
|
+
try {
|
|
779
|
+
return await this.executeTool(tool, params);
|
|
780
|
+
}
|
|
781
|
+
catch (err) {
|
|
782
|
+
return {
|
|
783
|
+
ok: false,
|
|
784
|
+
error: err instanceof Error ? err.message : String(err),
|
|
785
|
+
};
|
|
786
|
+
}
|
|
787
|
+
}
|
|
788
|
+
diagnoseFailure(planResult, expectedBundleId) {
|
|
789
|
+
// Check if the focused app changed (app_switched)
|
|
790
|
+
if (expectedBundleId) {
|
|
791
|
+
const currentBundleId = this.worldModel.getState().focusedApp?.bundleId;
|
|
792
|
+
if (currentBundleId && currentBundleId !== expectedBundleId) {
|
|
793
|
+
return "app_switched";
|
|
794
|
+
}
|
|
795
|
+
}
|
|
796
|
+
const lastFailed = [...planResult.stepResults].reverse().find((r) => !r.success);
|
|
797
|
+
if (!lastFailed)
|
|
798
|
+
return "postcondition_mismatch";
|
|
799
|
+
const error = lastFailed.error ?? "";
|
|
800
|
+
if (error.includes("dialog") || error.includes("Dialog"))
|
|
801
|
+
return "unexpected_dialog";
|
|
802
|
+
if (error.includes("not found") || error.includes("LOCATE_FAILED"))
|
|
803
|
+
return "element_not_found";
|
|
804
|
+
if (error.includes("timeout") || error.includes("TIMEOUT"))
|
|
805
|
+
return "timeout";
|
|
806
|
+
if (error.includes("Postcondition"))
|
|
807
|
+
return "postcondition_mismatch";
|
|
808
|
+
return "postcondition_mismatch";
|
|
809
|
+
}
|
|
810
|
+
}
|
|
811
|
+
/*
 * Tool categories for world-model-aware execution.
 * Each set lists tool names; membership drives how a step is prepared and verified.
 */

/** Tools that change which app is focused. */
const FOCUS_TOOLS = new Set([
    "focus",
    "launch",
]);

/** Tools that cause page/state transitions and therefore need settle time. */
const STATE_CHANGING_TOOLS = new Set([
    "focus",
    "launch",
    "key",
    "click",
    "click_text",
    "click_with_fallback",
    "ui_press",
    "browser_click",
    "browser_navigate",
    "browser_open",
    "menu_click",
]);

/**
 * Tools that navigate (URL change, page load).
 * A `key` step pressing Enter is also treated as navigation, but callers detect
 * that through isEnterKey rather than through membership in this set.
 */
const NAVIGATION_TOOLS = new Set([
    "browser_navigate",
    "browser_open",
]);

/** Tools that interact with a specific UI element. */
const INTERACTION_TOOLS = new Set([
    "click",
    "click_text",
    "click_with_fallback",
    "ui_press",
    "ui_set_value",
    "ui_find",
    "browser_click",
    "browser_type",
    "type_with_fallback",
    "select_with_fallback",
    "read_with_fallback",
    "locate_with_fallback",
]);

/** Tools that call CGWindowListCreateImage — crash on GPU-heavy browser windows. */
const SCREENSHOT_TOOLS = new Set([
    "screenshot",
    "screenshot_file",
    "ocr",
]);

/** Tools that require a windowId param — injected from the world model when missing. */
const WINDOW_ID_TOOLS = new Set([
    "click_text",
    "screenshot_file",
    "ocr",
    "observer_ocr_roi",
]);

/** Browser tools whose results are fed back into the world model. */
const BROWSER_TOOLS = new Set([
    "browser_navigate",
    "browser_open",
    "browser_dom",
    "browser_page_info",
]);

/** Tools for which a locator outcome is recorded with the learning engine. */
const LOCATOR_TOOLS = new Set([
    "click",
    "click_text",
    "click_with_fallback",
    "type_text",
    "type_with_fallback",
    "ui_press",
    "ui_set_value",
    "ui_find",
    "browser_click",
    "browser_type",
    "select_with_fallback",
    "read_with_fallback",
    "locate_with_fallback",
]);
|
|
849
|
+
/**
 * Return the first step after `currentIndex` whose tool is not a screenshot
 * tool (screenshot/ocr steps carry no target params), or `null` when every
 * remaining step is one or there are no further steps.
 * Used to pick a content-based readiness target for navigation waits.
 */
function findNextMeaningfulStep(steps, currentIndex) {
    let cursor = currentIndex;
    while (++cursor < steps.length) {
        const candidate = steps[cursor];
        if (!SCREENSHOT_TOOLS.has(candidate.tool)) {
            return candidate;
        }
        // screenshot/ocr step — keep looking
    }
    return null;
}
|
|
863
|
+
/**
 * Report whether a `key` step presses Enter/Return — alone or as the last key
 * of a "+"-joined combo such as "cmd+enter" (likely navigation after typing a URL).
 *
 * Reads `params.key`, falling back to `params.combo` only when `key` is
 * null/undefined. Matching is case-insensitive and ignores whitespace, so
 * "Cmd + Enter" is recognised too.
 *
 * @param params Step parameters; may be missing.
 * @returns `true` for an Enter/Return press; `false` otherwise, including when
 *          `params` is missing or the key value is not a string.
 */
function isEnterKey(params) {
    const raw = params?.key ?? params?.combo ?? "";
    // Plans are not guaranteed to supply a string here; never throw on a number/array.
    if (typeof raw !== "string") {
        return false;
    }
    const key = raw.replace(/\s+/g, "").toLowerCase();
    return key === "enter" || key === "return" || key.endsWith("+enter") || key.endsWith("+return");
}
|
|
868
|
+
/** Return a promise that resolves (with no value) once `ms` milliseconds have elapsed. */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|