@vellumai/assistant 0.4.48 → 0.4.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252) hide show
  1. package/ARCHITECTURE.md +2 -2
  2. package/README.md +2 -23
  3. package/docs/architecture/integrations.md +45 -41
  4. package/docs/architecture/keychain-broker.md +3 -3
  5. package/docs/runbook-trusted-contacts.md +3 -8
  6. package/hook-templates/debug-prompt-logger/hook.json +1 -1
  7. package/hook-templates/debug-prompt-logger/run.sh +1 -3
  8. package/package.json +1 -1
  9. package/src/__tests__/actor-token-service.test.ts +0 -1
  10. package/src/__tests__/anthropic-provider.test.ts +156 -0
  11. package/src/__tests__/approval-cascade.test.ts +810 -0
  12. package/src/__tests__/approval-primitive.test.ts +0 -1
  13. package/src/__tests__/approval-routes-http.test.ts +2 -0
  14. package/src/__tests__/assistant-attachments.test.ts +12 -34
  15. package/src/__tests__/assistant-feature-flag-guardrails.test.ts +76 -0
  16. package/src/__tests__/assistant-feature-flags-integration.test.ts +0 -1
  17. package/src/__tests__/browser-skill-baseline-tool-payload.test.ts +2 -2
  18. package/src/__tests__/channel-guardian.test.ts +0 -2
  19. package/src/__tests__/channel-readiness-routes.test.ts +15 -6
  20. package/src/__tests__/channel-readiness-service.test.ts +10 -9
  21. package/src/__tests__/checker.test.ts +9 -29
  22. package/src/__tests__/computer-use-skill-manifest-regression.test.ts +1 -1
  23. package/src/__tests__/computer-use-tools.test.ts +2 -19
  24. package/src/__tests__/config-watcher.test.ts +0 -1
  25. package/src/__tests__/confirmation-request-guardian-bridge.test.ts +0 -1
  26. package/src/__tests__/context-image-dimensions.test.ts +332 -0
  27. package/src/__tests__/context-token-estimator.test.ts +196 -13
  28. package/src/__tests__/conversation-attention-store.test.ts +0 -1
  29. package/src/__tests__/conversation-attention-telegram.test.ts +0 -1
  30. package/src/__tests__/conversation-routes-guardian-reply.test.ts +144 -0
  31. package/src/__tests__/conversation-routes-slash-commands.test.ts +1 -0
  32. package/src/__tests__/credential-metadata-store.test.ts +64 -73
  33. package/src/__tests__/credential-security-invariants.test.ts +13 -7
  34. package/src/__tests__/credential-vault-unit.test.ts +280 -49
  35. package/src/__tests__/credential-vault.test.ts +138 -16
  36. package/src/__tests__/credentials-cli.test.ts +71 -0
  37. package/src/__tests__/dynamic-skill-workflow-prompt.test.ts +0 -1
  38. package/src/__tests__/ephemeral-permissions.test.ts +3 -3
  39. package/src/__tests__/gateway-only-guard.test.ts +0 -1
  40. package/src/__tests__/guardian-action-grant-mint-consume.test.ts +0 -1
  41. package/src/__tests__/guardian-decision-primitive-canonical.test.ts +0 -1
  42. package/src/__tests__/guardian-routing-invariants.test.ts +0 -1
  43. package/src/__tests__/guardian-verification-voice-binding.test.ts +0 -1
  44. package/src/__tests__/handlers-user-message-approval-consumption.test.ts +0 -39
  45. package/src/__tests__/heartbeat-service.test.ts +0 -1
  46. package/src/__tests__/host-cu-proxy.test.ts +629 -0
  47. package/src/__tests__/host-shell-tool.test.ts +27 -15
  48. package/src/__tests__/http-user-message-parity.test.ts +1 -0
  49. package/src/__tests__/ingress-url-consistency.test.ts +14 -21
  50. package/src/__tests__/integration-status.test.ts +32 -51
  51. package/src/__tests__/intent-routing.test.ts +0 -1
  52. package/src/__tests__/invite-routes-http.test.ts +10 -9
  53. package/src/__tests__/keychain-broker-client.test.ts +11 -43
  54. package/src/__tests__/notification-routing-intent.test.ts +0 -1
  55. package/src/__tests__/oauth-cli.test.ts +373 -14
  56. package/src/__tests__/oauth-provider-profiles.test.ts +9 -9
  57. package/src/__tests__/oauth-scope-policy.test.ts +4 -6
  58. package/src/__tests__/oauth-store.test.ts +756 -0
  59. package/src/__tests__/onboarding-starter-tasks.test.ts +0 -1
  60. package/src/__tests__/provider-error-scenarios.test.ts +0 -1
  61. package/src/__tests__/provider-streaming.benchmark.test.ts +0 -1
  62. package/src/__tests__/public-ingress-urls.test.ts +15 -21
  63. package/src/__tests__/recording-handler.test.ts +3 -4
  64. package/src/__tests__/registry.test.ts +2 -2
  65. package/src/__tests__/runtime-events-sse.test.ts +55 -7
  66. package/src/__tests__/schedule-store.test.ts +0 -1
  67. package/src/__tests__/scheduler-recurrence.test.ts +0 -1
  68. package/src/__tests__/scoped-approval-grants.test.ts +0 -1
  69. package/src/__tests__/scoped-grant-security-matrix.test.ts +0 -1
  70. package/src/__tests__/secret-ingress-handler.test.ts +0 -1
  71. package/src/__tests__/send-endpoint-busy.test.ts +21 -6
  72. package/src/__tests__/sequence-store.test.ts +0 -1
  73. package/src/__tests__/session-init.benchmark.test.ts +4 -5
  74. package/src/__tests__/skill-include-graph.test.ts +66 -0
  75. package/src/__tests__/skill-load-feature-flag.test.ts +0 -1
  76. package/src/__tests__/skill-load-tool.test.ts +149 -1
  77. package/src/__tests__/skill-projection-feature-flag.test.ts +0 -1
  78. package/src/__tests__/skills-uninstall.test.ts +1 -1
  79. package/src/__tests__/skills.test.ts +3 -3
  80. package/src/__tests__/slack-channel-config.test.ts +67 -3
  81. package/src/__tests__/slack-share-routes.test.ts +17 -19
  82. package/src/__tests__/system-prompt.test.ts +0 -1
  83. package/src/__tests__/telegram-invite-adapter.test.ts +18 -22
  84. package/src/__tests__/terminal-tools.test.ts +4 -3
  85. package/src/__tests__/test-support/computer-use-skill-harness.ts +3 -2
  86. package/src/__tests__/tool-approval-handler.test.ts +0 -1
  87. package/src/__tests__/tool-execution-pipeline.benchmark.test.ts +0 -1
  88. package/src/__tests__/tool-executor-lifecycle-events.test.ts +0 -1
  89. package/src/__tests__/tool-executor-shell-integration.test.ts +0 -1
  90. package/src/__tests__/tool-executor.test.ts +0 -1
  91. package/src/__tests__/tool-grant-request-escalation.test.ts +0 -1
  92. package/src/__tests__/trust-store-pattern-matches.test.ts +29 -0
  93. package/src/__tests__/trust-store.test.ts +1 -22
  94. package/src/__tests__/trusted-contact-approval-notifier.test.ts +0 -1
  95. package/src/__tests__/trusted-contact-inline-approval-integration.test.ts +0 -1
  96. package/src/__tests__/twilio-routes.test.ts +0 -16
  97. package/src/__tests__/verification-control-plane-policy.test.ts +0 -1
  98. package/src/__tests__/voice-scoped-grant-consumer.test.ts +0 -1
  99. package/src/agent/ax-tree-compaction.test.ts +235 -0
  100. package/src/agent/loop.ts +76 -130
  101. package/src/calls/call-domain.ts +1 -6
  102. package/src/calls/relay-server.ts +9 -13
  103. package/src/calls/twilio-config.ts +2 -7
  104. package/src/calls/twilio-routes.ts +1 -2
  105. package/src/calls/voice-ingress-preflight.ts +1 -1
  106. package/src/cli/commands/browser-relay.ts +18 -12
  107. package/src/cli/commands/completions.ts +0 -3
  108. package/src/cli/commands/credentials.ts +101 -15
  109. package/src/cli/commands/oauth/apps.ts +255 -0
  110. package/src/cli/commands/oauth/connections.ts +299 -0
  111. package/src/cli/commands/oauth/index.ts +52 -0
  112. package/src/cli/commands/oauth/providers.ts +242 -0
  113. package/src/cli/commands/skills.ts +4 -338
  114. package/src/cli/program.ts +1 -5
  115. package/src/cli/reference.ts +1 -3
  116. package/src/config/assistant-feature-flags.ts +0 -3
  117. package/src/config/bundled-skills/_shared/CLI_RETRIEVAL_PATTERN.md +1 -1
  118. package/src/config/bundled-skills/computer-use/SKILL.md +3 -6
  119. package/src/config/bundled-skills/computer-use/TOOLS.json +22 -4
  120. package/src/config/bundled-skills/google-calendar/calendar-client.ts +21 -16
  121. package/src/config/bundled-skills/messaging/tools/shared.ts +1 -4
  122. package/src/config/bundled-skills/settings/SKILL.md +1 -1
  123. package/src/config/bundled-skills/settings/TOOLS.json +2 -8
  124. package/src/config/bundled-skills/settings/tools/voice-config-update.ts +5 -33
  125. package/src/config/env-registry.ts +14 -83
  126. package/src/config/env.ts +11 -50
  127. package/src/config/feature-flag-registry.json +16 -16
  128. package/src/config/loader.ts +0 -6
  129. package/src/config/schema.ts +3 -1
  130. package/src/config/skills.ts +21 -2
  131. package/src/context/image-dimensions.ts +229 -0
  132. package/src/context/token-estimator.ts +75 -12
  133. package/src/context/window-manager.ts +49 -10
  134. package/src/daemon/assistant-attachments.ts +1 -13
  135. package/src/daemon/handlers/config-ingress.ts +8 -33
  136. package/src/daemon/handlers/config-slack-channel.ts +49 -46
  137. package/src/daemon/handlers/config-telegram.ts +32 -16
  138. package/src/daemon/handlers/sessions.ts +10 -24
  139. package/src/daemon/handlers/shared.ts +0 -130
  140. package/src/daemon/host-cu-proxy.ts +401 -0
  141. package/src/daemon/lifecycle.ts +36 -68
  142. package/src/daemon/message-protocol.ts +3 -0
  143. package/src/daemon/message-types/computer-use.ts +2 -119
  144. package/src/daemon/message-types/host-cu.ts +19 -0
  145. package/src/daemon/message-types/messages.ts +3 -0
  146. package/src/daemon/server.ts +14 -21
  147. package/src/daemon/session-agent-loop-handlers.ts +2 -0
  148. package/src/daemon/session-attachments.ts +1 -2
  149. package/src/daemon/session-slash.ts +1 -1
  150. package/src/daemon/session-surfaces.ts +40 -28
  151. package/src/daemon/session-tool-setup.ts +2 -9
  152. package/src/daemon/session.ts +138 -15
  153. package/src/daemon/tool-side-effects.ts +2 -8
  154. package/src/daemon/watch-handler.ts +2 -2
  155. package/src/events/tool-metrics-listener.ts +2 -2
  156. package/src/hooks/manager.ts +1 -4
  157. package/src/inbound/public-ingress-urls.ts +7 -7
  158. package/src/logfire.ts +16 -5
  159. package/src/memory/conversation-key-store.ts +21 -0
  160. package/src/memory/db-init.ts +4 -0
  161. package/src/memory/migrations/149-oauth-tables.ts +60 -0
  162. package/src/memory/migrations/index.ts +1 -0
  163. package/src/memory/schema/index.ts +1 -0
  164. package/src/memory/schema/oauth.ts +65 -0
  165. package/src/messaging/provider.ts +4 -4
  166. package/src/messaging/providers/gmail/client.ts +82 -2
  167. package/src/messaging/providers/gmail/people-client.ts +10 -10
  168. package/src/messaging/providers/telegram-bot/adapter.ts +17 -17
  169. package/src/messaging/providers/whatsapp/adapter.ts +11 -8
  170. package/src/messaging/registry.ts +2 -32
  171. package/src/notifications/copy-composer.ts +0 -5
  172. package/src/notifications/signal.ts +4 -5
  173. package/src/oauth/byo-connection.test.ts +126 -25
  174. package/src/oauth/byo-connection.ts +22 -6
  175. package/src/oauth/connect-orchestrator.ts +113 -57
  176. package/src/oauth/connect-types.ts +17 -23
  177. package/src/oauth/connection-resolver.ts +35 -11
  178. package/src/oauth/connection.ts +1 -1
  179. package/src/oauth/manual-token-connection.ts +104 -0
  180. package/src/oauth/oauth-store.ts +496 -0
  181. package/src/oauth/platform-connection.test.ts +29 -0
  182. package/src/oauth/platform-connection.ts +6 -5
  183. package/src/oauth/provider-behaviors.ts +124 -0
  184. package/src/oauth/scope-policy.ts +9 -2
  185. package/src/oauth/seed-providers.ts +161 -0
  186. package/src/oauth/token-persistence.ts +74 -78
  187. package/src/permissions/checker.ts +3 -3
  188. package/src/permissions/defaults.ts +0 -1
  189. package/src/permissions/prompter.ts +10 -1
  190. package/src/permissions/trust-store.ts +13 -0
  191. package/src/prompts/__tests__/build-cli-reference-section.test.ts +3 -1
  192. package/src/prompts/system-prompt.ts +28 -40
  193. package/src/providers/anthropic/client.ts +133 -24
  194. package/src/providers/retry.ts +1 -27
  195. package/src/runtime/auth/route-policy.ts +0 -3
  196. package/src/runtime/channel-reply-delivery.ts +0 -40
  197. package/src/runtime/gateway-client.ts +0 -7
  198. package/src/runtime/http-server.ts +8 -6
  199. package/src/runtime/http-types.ts +2 -2
  200. package/src/runtime/middleware/twilio-validation.ts +1 -11
  201. package/src/runtime/pending-interactions.ts +14 -12
  202. package/src/runtime/routes/channel-delivery-routes.ts +0 -1
  203. package/src/runtime/routes/conversation-routes.ts +73 -19
  204. package/src/runtime/routes/events-routes.ts +21 -11
  205. package/src/runtime/routes/host-cu-routes.ts +97 -0
  206. package/src/runtime/routes/inbound-stages/background-dispatch.ts +12 -111
  207. package/src/runtime/routes/integrations/slack/share.ts +6 -7
  208. package/src/runtime/routes/log-export-routes.ts +126 -8
  209. package/src/runtime/routes/settings-routes.ts +55 -48
  210. package/src/runtime/routes/surface-action-routes.ts +1 -1
  211. package/src/runtime/routes/watch-routes.ts +128 -0
  212. package/src/schedule/integration-status.ts +10 -9
  213. package/src/security/credential-key.ts +0 -156
  214. package/src/security/keychain-broker-client.ts +5 -6
  215. package/src/security/oauth2.ts +1 -1
  216. package/src/security/token-manager.ts +119 -46
  217. package/src/skills/catalog-install.ts +358 -0
  218. package/src/skills/include-graph.ts +32 -0
  219. package/src/telegram/bot-username.ts +2 -3
  220. package/src/tools/browser/network-recorder.ts +1 -1
  221. package/src/tools/browser/network-recording-types.ts +1 -1
  222. package/src/tools/computer-use/definitions.ts +46 -11
  223. package/src/tools/computer-use/registry.ts +4 -5
  224. package/src/tools/credentials/broker.ts +1 -2
  225. package/src/tools/credentials/metadata-store.ts +17 -121
  226. package/src/tools/credentials/vault.ts +94 -167
  227. package/src/tools/registry.ts +2 -7
  228. package/src/tools/skills/load.ts +62 -3
  229. package/src/tools/watch/watch-state.ts +0 -12
  230. package/src/util/logger.ts +7 -41
  231. package/src/util/platform.ts +9 -28
  232. package/src/watcher/providers/google-calendar.ts +2 -1
  233. package/src/__tests__/computer-use-session-compaction.test.ts +0 -143
  234. package/src/__tests__/computer-use-session-lifecycle.test.ts +0 -322
  235. package/src/__tests__/computer-use-session-working-dir.test.ts +0 -166
  236. package/src/__tests__/computer-use-skill-baseline.test.ts +0 -78
  237. package/src/__tests__/computer-use-skill-endstate.test.ts +0 -105
  238. package/src/__tests__/computer-use-skill-lifecycle-cleanup.test.ts +0 -249
  239. package/src/__tests__/ride-shotgun-handler.test.ts +0 -452
  240. package/src/cli/commands/dev.ts +0 -129
  241. package/src/cli/commands/map.ts +0 -391
  242. package/src/cli/commands/oauth.ts +0 -77
  243. package/src/config/bundled-skills/computer-use/tools/computer-use-request-control.ts +0 -16
  244. package/src/daemon/computer-use-session.ts +0 -1026
  245. package/src/daemon/ride-shotgun-handler.ts +0 -569
  246. package/src/oauth/provider-base-urls.ts +0 -21
  247. package/src/oauth/provider-profiles.ts +0 -192
  248. package/src/prompts/computer-use-prompt.ts +0 -98
  249. package/src/runtime/routes/computer-use-routes.ts +0 -641
  250. package/src/runtime/telegram-streaming-delivery.test.ts +0 -729
  251. package/src/runtime/telegram-streaming-delivery.ts +0 -393
  252. package/src/tools/computer-use/request-computer-control.ts +0 -56
@@ -1,1026 +0,0 @@
1
- /**
2
- * Computer-use session orchestrator.
3
- *
4
- * Manages the observation -> infer -> action loop for computer-use tasks,
5
- * bridging the macOS client (which captures screen state and executes actions)
6
- * with the AgentLoop (which runs inference via the Anthropic API with CU tools).
7
- */
8
-
9
- import { v4 as uuid } from "uuid";
10
-
11
- import { AgentLoop } from "../agent/loop.js";
12
- import { getConfig } from "../config/loader.js";
13
- import { PermissionPrompter } from "../permissions/prompter.js";
14
- import { SecretPrompter } from "../permissions/secret-prompter.js";
15
- import type { UserDecision } from "../permissions/types.js";
16
- import { buildComputerUseSystemPrompt } from "../prompts/computer-use-prompt.js";
17
- import type {
18
- ContentBlock,
19
- Message,
20
- Provider,
21
- ToolDefinition,
22
- } from "../providers/types.js";
23
- import { allComputerUseTools } from "../tools/computer-use/definitions.js";
24
- import { ToolExecutor } from "../tools/executor.js";
25
- import { getTool, registerSkillTools } from "../tools/registry.js";
26
- import {
27
- injectReasonField,
28
- REASON_SKIP_SET,
29
- } from "../tools/schema-transforms.js";
30
- import type { Tool, ToolExecutionResult } from "../tools/types.js";
31
- import { allUiSurfaceTools } from "../tools/ui-surface/definitions.js";
32
- import { getLogger } from "../util/logger.js";
33
- import { getSandboxWorkingDir } from "../util/platform.js";
34
- import type {
35
- CuObservation,
36
- ServerMessage,
37
- SurfaceData,
38
- SurfaceType,
39
- UiSurfaceShow,
40
- } from "./message-protocol.js";
41
- import { INTERACTIVE_SURFACE_TYPES } from "./message-protocol.js";
42
- import {
43
- projectSkillTools,
44
- resetSkillToolProjection,
45
- type SkillProjectionCache,
46
- } from "./session-skill-tools.js";
47
-
48
- const log = getLogger("computer-use-session");
49
-
50
- const MAX_STEPS = 50;
51
- const SESSION_TIMEOUT_MS = 30 * 60 * 1000; // 30 minutes
52
- const MAX_HISTORY_ENTRIES = 10;
53
- const LOOP_DETECTION_WINDOW = 3;
54
- const CONSECUTIVE_UNCHANGED_WARNING_THRESHOLD = 2;
55
-
56
- /** Number of most-recent AX tree snapshots to keep in conversation history. */
57
- const MAX_AX_TREES_IN_HISTORY = 2;
58
-
59
- /** Regex that matches the `<ax-tree>…</ax-tree>` markers injected by buildObservationResultContent. */
60
- const AX_TREE_PATTERN = /<ax-tree>[\s\S]*?<\/ax-tree>/g;
61
- const AX_TREE_PLACEHOLDER = "<ax_tree_omitted />";
62
-
63
- type SessionState =
64
- | "idle"
65
- | "awaiting_observation"
66
- | "inferring"
67
- | "complete"
68
- | "error";
69
-
70
- interface ActionRecord {
71
- step: number;
72
- toolName: string;
73
- input: Record<string, unknown>;
74
- reasoning?: string;
75
- result?: string;
76
- }
77
-
78
- export class ComputerUseSession {
79
- private readonly sessionId: string;
80
- private readonly task: string;
81
- private readonly screenWidth: number;
82
- private readonly screenHeight: number;
83
- private readonly provider: Provider;
84
- private sendToClient: (msg: ServerMessage) => void;
85
- private readonly interactionType: "computer_use" | "text_qa";
86
- private readonly onTerminal?: (sessionId: string) => void;
87
- private readonly preactivatedSkillIds: string[];
88
- private readonly skillProjectionState = new Map<string, string>();
89
- private readonly skillProjectionCache: SkillProjectionCache = {};
90
-
91
- private state: SessionState = "idle";
92
- private stepCount = 0;
93
- private actionHistory: ActionRecord[] = [];
94
- private previousAXTree: string | undefined;
95
- private consecutiveUnchangedSteps = 0;
96
- private abortController: AbortController | null = null;
97
- private sessionTimer: ReturnType<typeof setTimeout> | null = null;
98
-
99
- private pendingObservation: {
100
- resolve: (result: ToolExecutionResult) => void;
101
- } | null = null;
102
-
103
- private pendingSurfaceActions = new Map<
104
- string,
105
- {
106
- resolve: (result: ToolExecutionResult) => void;
107
- }
108
- >();
109
- /** @internal */ surfaceState = new Map<
110
- string,
111
- { surfaceType: SurfaceType; data: SurfaceData; title?: string }
112
- >();
113
- private terminalNotified = false;
114
- private prompter: PermissionPrompter | null = null;
115
-
116
- // Tracks the agent loop promise so callers can await session completion
117
- private loopPromise: Promise<void> | null = null;
118
-
119
- constructor(
120
- sessionId: string,
121
- task: string,
122
- screenWidth: number,
123
- screenHeight: number,
124
- provider: Provider,
125
- sendToClient: (msg: ServerMessage) => void,
126
- interactionType?: "computer_use" | "text_qa",
127
- onTerminal?: (sessionId: string) => void,
128
- preactivatedSkillIds?: string[],
129
- ) {
130
- this.sessionId = sessionId;
131
- this.task = task;
132
- this.screenWidth = screenWidth;
133
- this.screenHeight = screenHeight;
134
- this.provider = provider;
135
- this.sendToClient = sendToClient;
136
- this.interactionType = interactionType ?? "computer_use";
137
- this.onTerminal = onTerminal;
138
- this.preactivatedSkillIds = preactivatedSkillIds ?? ["computer-use"];
139
- }
140
-
141
- // ---------------------------------------------------------------------------
142
- // Public API
143
- // ---------------------------------------------------------------------------
144
-
145
- async handleObservation(obs: CuObservation): Promise<void> {
146
- if (this.state === "complete" || this.state === "error") {
147
- log.warn(
148
- { sessionId: this.sessionId, state: this.state },
149
- "Observation received after session ended",
150
- );
151
- return;
152
- }
153
-
154
- // Track consecutive unchanged steps
155
- const hadPreviousAXTree = this.previousAXTree != null;
156
- if (this.stepCount > 0) {
157
- if (obs.axDiff == null && hadPreviousAXTree && obs.axTree != null) {
158
- this.consecutiveUnchangedSteps++;
159
- } else if (obs.axDiff != null) {
160
- this.consecutiveUnchangedSteps = 0;
161
- }
162
- }
163
-
164
- // Capture previous AX tree for next turn
165
- if (obs.axTree != null) {
166
- this.previousAXTree = obs.axTree;
167
- }
168
-
169
- if (this.state === "awaiting_observation" && this.pendingObservation) {
170
- // Resolve the pending proxy tool result with updated screen context
171
- const content = this.buildObservationResultContent(
172
- obs,
173
- hadPreviousAXTree,
174
- );
175
- const result: ToolExecutionResult = obs.executionError
176
- ? {
177
- content: `Action failed: ${obs.executionError}\n\n${content}`,
178
- isError: true,
179
- }
180
- : { content, isError: false };
181
- this.state = "inferring";
182
- this.pendingObservation.resolve(result);
183
- this.pendingObservation = null;
184
- // The agent loop continues automatically after resolution
185
- return;
186
- }
187
-
188
- // First observation — start the agent loop
189
- this.state = "inferring";
190
- this.abortController = new AbortController();
191
-
192
- // Safety net: abort the session if it runs longer than SESSION_TIMEOUT_MS
193
- this.sessionTimer = setTimeout(() => {
194
- log.warn(
195
- { sessionId: this.sessionId, timeoutMs: SESSION_TIMEOUT_MS },
196
- "Session timeout reached, aborting",
197
- );
198
- this.abort();
199
- }, SESSION_TIMEOUT_MS);
200
-
201
- const messages = this.buildMessages(obs, hadPreviousAXTree);
202
- this.loopPromise = this.runAgentLoop(messages).catch((err) => {
203
- // Catches errors from setup code (e.g. skill projection failures) that
204
- // occur before runAgentLoop's internal try-catch takes over.
205
- const message = err instanceof Error ? err.message : String(err);
206
- log.error(
207
- { err, sessionId: this.sessionId },
208
- "Agent loop startup failed",
209
- );
210
- if (this.sessionTimer) {
211
- clearTimeout(this.sessionTimer);
212
- this.sessionTimer = null;
213
- }
214
- if (this.state !== "complete" && this.state !== "error") {
215
- this.state = "error";
216
- this.sendToClient({
217
- type: "cu_error",
218
- sessionId: this.sessionId,
219
- message,
220
- });
221
- this.notifyTerminal();
222
- }
223
- });
224
-
225
- await this.loopPromise;
226
- }
227
-
228
- abort(): void {
229
- if (this.state === "complete" || this.state === "error") return;
230
-
231
- log.info({ sessionId: this.sessionId }, "Aborting computer-use session");
232
- if (this.sessionTimer) {
233
- clearTimeout(this.sessionTimer);
234
- this.sessionTimer = null;
235
- }
236
- this.abortController?.abort();
237
-
238
- // If waiting for an observation, resolve it as cancelled
239
- if (this.pendingObservation) {
240
- this.pendingObservation.resolve({
241
- content: "Session aborted",
242
- isError: true,
243
- });
244
- this.pendingObservation = null;
245
- }
246
-
247
- // Dispose prompter to clear pending permission timers and reject promises
248
- this.prompter?.dispose();
249
-
250
- // Resolve any pending surface actions
251
- for (const [, pending] of this.pendingSurfaceActions) {
252
- pending.resolve({ content: "Session aborted", isError: true });
253
- }
254
- this.pendingSurfaceActions.clear();
255
- this.surfaceState.clear();
256
-
257
- this.state = "error";
258
- this.sendToClient({
259
- type: "cu_error",
260
- sessionId: this.sessionId,
261
- message: "Session aborted by user",
262
- });
263
- this.notifyTerminal();
264
- }
265
-
266
- isComplete(): boolean {
267
- return this.state === "complete";
268
- }
269
-
270
- getState(): string {
271
- return this.state;
272
- }
273
-
274
- /**
275
- * Compute CU tool definitions from the bundled computer-use skill via
276
- * skill projection. Returns null if projection fails so the caller can
277
- * fall back to legacy hardcoded tool definitions.
278
- */
279
- private getProjectedCuToolDefinitions(): ToolDefinition[] | null {
280
- if (this.preactivatedSkillIds.length === 0) {
281
- log.warn(
282
- "No preactivatedSkillIds configured, falling back to legacy CU tools",
283
- );
284
- return null;
285
- }
286
-
287
- try {
288
- const projection = projectSkillTools([], {
289
- preactivatedSkillIds: this.preactivatedSkillIds,
290
- previouslyActiveSkillIds: this.skillProjectionState,
291
- cache: this.skillProjectionCache,
292
- });
293
-
294
- if (projection.allowedToolNames.size === 0) {
295
- log.warn(
296
- { preactivatedSkillIds: this.preactivatedSkillIds },
297
- "Skill projection produced no tool definitions, falling back to legacy CU tools",
298
- );
299
- return null;
300
- }
301
-
302
- // Tool definitions are no longer returned from projectSkillTools
303
- // (dispatched via skill_execute). Build definitions from the registry.
304
- const defs: ToolDefinition[] = [];
305
- for (const name of projection.allowedToolNames) {
306
- const tool = getTool(name);
307
- if (tool) defs.push(tool.getDefinition());
308
- }
309
- return defs;
310
- } catch (err) {
311
- log.warn(
312
- { err },
313
- "Skill projection failed, falling back to legacy CU tools",
314
- );
315
- return null;
316
- }
317
- }
318
-
319
- handleSurfaceAction(
320
- surfaceId: string,
321
- actionId: string,
322
- data?: Record<string, unknown>,
323
- ): void {
324
- const pending = this.pendingSurfaceActions.get(surfaceId);
325
- if (!pending) {
326
- log.warn({ surfaceId, actionId }, "No pending surface action found");
327
- return;
328
- }
329
- // selection_changed is a non-terminal state update — don't consume the
330
- // pending entry. The selection state will be in the action button payload.
331
- if (actionId === "selection_changed") {
332
- return;
333
- }
334
- this.pendingSurfaceActions.delete(surfaceId);
335
- pending.resolve({
336
- content: JSON.stringify({ actionId, data: data ?? {} }),
337
- isError: false,
338
- });
339
- }
340
-
341
- // ---------------------------------------------------------------------------
342
- // Agent loop execution
343
- // ---------------------------------------------------------------------------
344
-
345
- private async runAgentLoop(messages: Message[]): Promise<void> {
346
- const systemPrompt = buildComputerUseSystemPrompt(
347
- this.screenWidth,
348
- this.screenHeight,
349
- );
350
-
351
- let cuToolDefs = this.getProjectedCuToolDefinitions();
352
- if (!cuToolDefs) {
353
- // Fallback: register the legacy CU tools as skill-origin tools so
354
- // ToolExecutor can resolve them via getTool(), but using the same
355
- // ownerSkillId as the bundled computer-use skill. This avoids
356
- // core-vs-skill collisions that would permanently block skill
357
- // projection recovery on subsequent sessions.
358
- const fallbackSkillId = this.preactivatedSkillIds[0] ?? "computer-use";
359
- const fallbackTools: Tool[] = allComputerUseTools.map((t) => ({
360
- ...t,
361
- origin: "skill" as const,
362
- ownerSkillId: fallbackSkillId,
363
- ownerSkillBundled: true,
364
- }));
365
- registerSkillTools(fallbackTools);
366
- // Track in the session map so resetSkillToolProjection cleans up
367
- this.skillProjectionState.set(fallbackSkillId, "fallback");
368
- cuToolDefs = allComputerUseTools.map((t) => t.getDefinition());
369
- }
370
-
371
- const toolDefs: ToolDefinition[] = [
372
- ...cuToolDefs,
373
- ...allUiSurfaceTools.map((t) => t.getDefinition()),
374
- ];
375
-
376
- this.prompter = new PermissionPrompter(this.sendToClient);
377
- const prompter = this.prompter;
378
- const secretPrompter = new SecretPrompter(this.sendToClient);
379
- const executor = new ToolExecutor(prompter);
380
-
381
- const proxyResolver = async (
382
- toolName: string,
383
- input: Record<string, unknown>,
384
- ): Promise<ToolExecutionResult> => {
385
- // ── Surface tool proxying ──────────────────────────────────────
386
- if (toolName === "ui_show") {
387
- const surfaceId = uuid();
388
- const surfaceType = input.surface_type as SurfaceType;
389
- const title = typeof input.title === "string" ? input.title : undefined;
390
- const data = input.data as SurfaceData;
391
- const actions = input.actions as
392
- | Array<{ id: string; label: string; style?: string }>
393
- | undefined;
394
- // Interactive surfaces default to awaiting user action.
395
- // Tables and lists only block when explicit action buttons are provided;
396
- // selectionMode alone should not gate blocking because selection_changed
397
- // fires on every click and would immediately resolve multi-select surfaces.
398
- const hasActions = Array.isArray(actions) && actions.length > 0;
399
- const isInteractive =
400
- surfaceType === "card"
401
- ? hasActions
402
- : surfaceType === "list"
403
- ? hasActions
404
- : surfaceType === "table"
405
- ? hasActions
406
- : INTERACTIVE_SURFACE_TYPES.includes(surfaceType);
407
- const awaitAction = (input.await_action as boolean) ?? isInteractive;
408
-
409
- // Track surface state for ui_update merging
410
- this.surfaceState.set(surfaceId, { surfaceType, data, title });
411
-
412
- this.sendToClient({
413
- type: "ui_surface_show",
414
- sessionId: this.sessionId,
415
- surfaceId,
416
- surfaceType,
417
- title,
418
- data,
419
- actions: actions?.map((a) => ({
420
- id: a.id,
421
- label: a.label,
422
- style: (a.style ?? "secondary") as
423
- | "primary"
424
- | "secondary"
425
- | "destructive",
426
- })),
427
- } as unknown as UiSurfaceShow);
428
-
429
- if (awaitAction) {
430
- return new Promise<ToolExecutionResult>((resolve) => {
431
- this.pendingSurfaceActions.set(surfaceId, { resolve });
432
- });
433
- }
434
- return { content: JSON.stringify({ surfaceId }), isError: false };
435
- }
436
-
437
- if (toolName === "ui_update") {
438
- const surfaceId = input.surface_id as string;
439
- const patch = input.data as Record<string, unknown>;
440
-
441
- // Merge the partial patch into the stored full surface data
442
- const stored = this.surfaceState.get(surfaceId);
443
- let mergedData: SurfaceData;
444
- if (stored) {
445
- mergedData = { ...stored.data, ...patch } as SurfaceData;
446
- stored.data = mergedData;
447
- } else {
448
- mergedData = patch as unknown as SurfaceData;
449
- }
450
-
451
- this.sendToClient({
452
- type: "ui_surface_update",
453
- sessionId: this.sessionId,
454
- surfaceId,
455
- data: mergedData,
456
- });
457
- return { content: "Surface updated", isError: false };
458
- }
459
-
460
- if (toolName === "ui_dismiss") {
461
- const surfaceId = input.surface_id as string;
462
- this.sendToClient({
463
- type: "ui_surface_dismiss",
464
- sessionId: this.sessionId,
465
- surfaceId,
466
- });
467
- this.pendingSurfaceActions.delete(surfaceId);
468
- this.surfaceState.delete(surfaceId);
469
- return { content: "Surface dismissed", isError: false };
470
- }
471
-
472
- // ── Computer-use tool proxying ─────────────────────────────────
473
- const reasoning =
474
- typeof input.reasoning === "string" ? input.reasoning : undefined;
475
-
476
- // Record action in history
477
- this.actionHistory.push({
478
- step: this.stepCount + 1,
479
- toolName,
480
- input,
481
- reasoning,
482
- });
483
-
484
- // Check for terminal tools
485
- if (
486
- toolName === "computer_use_done" ||
487
- toolName === "computer_use_respond"
488
- ) {
489
- const summary =
490
- toolName === "computer_use_done"
491
- ? typeof input.summary === "string"
492
- ? input.summary
493
- : "Task completed"
494
- : typeof input.answer === "string"
495
- ? input.answer
496
- : "No answer provided";
497
-
498
- this.sendToClient({
499
- type: "cu_complete",
500
- sessionId: this.sessionId,
501
- summary,
502
- stepCount: this.stepCount,
503
- isResponse: toolName === "computer_use_respond" ? true : undefined,
504
- });
505
- this.state = "complete";
506
- // Stop AgentLoop immediately so terminal tools cannot trigger extra provider calls.
507
- this.abortController?.abort();
508
- this.notifyTerminal();
509
- return { content: "Session complete", isError: false };
510
- }
511
-
512
- this.stepCount++;
513
-
514
- // Enforce step limit — abort the loop so toolChoice:'any' can't force another turn
515
- if (this.stepCount > MAX_STEPS) {
516
- this.state = "error";
517
- this.sendToClient({
518
- type: "cu_error",
519
- sessionId: this.sessionId,
520
- message: `Step limit (${MAX_STEPS}) exceeded`,
521
- });
522
- this.abortController?.abort();
523
- this.notifyTerminal();
524
- return { content: `Step limit (${MAX_STEPS}) exceeded`, isError: true };
525
- }
526
-
527
- // Send action to client for execution
528
- this.sendToClient({
529
- type: "cu_action",
530
- sessionId: this.sessionId,
531
- toolName,
532
- input,
533
- reasoning,
534
- stepNumber: this.stepCount,
535
- });
536
-
537
- // Wait for next observation from client
538
- this.state = "awaiting_observation";
539
- return new Promise<ToolExecutionResult>((resolve) => {
540
- this.pendingObservation = { resolve };
541
- });
542
- };
543
-
544
- // Build a set of tool names the CU session is allowed to execute.
545
- // This prevents tools registered globally (e.g. computer_use_request_control)
546
- // but not advertised to the CU model from executing during CU sessions.
547
- const allowedToolNames = new Set(toolDefs.map((td) => td.name));
548
-
549
- const toolExecutor = async (
550
- name: string,
551
- input: Record<string, unknown>,
552
- ): Promise<ToolExecutionResult> => {
553
- return executor.execute(name, input, {
554
- workingDir: getSandboxWorkingDir(),
555
- sessionId: this.sessionId,
556
- conversationId: this.sessionId,
557
- trustClass: "guardian",
558
- proxyToolResolver: proxyResolver,
559
- allowedToolNames,
560
- requestSecret: async (params) => {
561
- return secretPrompter.prompt(
562
- params.service,
563
- params.field,
564
- params.label,
565
- params.description,
566
- params.placeholder,
567
- this.sessionId,
568
- params.purpose,
569
- params.allowedTools,
570
- params.allowedDomains,
571
- );
572
- },
573
- });
574
- };
575
-
576
- // Wrap the provider so that old AX tree snapshots are stripped from
577
- // conversation history before each API call, keeping only the most recent
578
- // MAX_AX_TREES_IN_HISTORY entries. This prevents TTFT from growing
579
- // linearly with step count.
580
- const compactingProvider: Provider = {
581
- name: this.provider.name,
582
- sendMessage: (msgs, tools, sys, opts) => {
583
- const compacted = ComputerUseSession.compactHistory(msgs);
584
- return this.provider.sendMessage(compacted, tools, sys, opts);
585
- },
586
- };
587
-
588
- const toolDefsWithReason = injectReasonField(toolDefs, REASON_SKIP_SET);
589
-
590
- const cuConfig = getConfig();
591
- const agentLoop = new AgentLoop(
592
- compactingProvider,
593
- systemPrompt,
594
- {
595
- maxTokens: 4096,
596
- maxInputTokens: cuConfig.contextWindow.maxInputTokens,
597
- toolChoice: { type: "any" },
598
- },
599
- toolDefsWithReason,
600
- toolExecutor,
601
- );
602
-
603
- try {
604
- await agentLoop.run(
605
- messages,
606
- (event) => {
607
- switch (event.type) {
608
- case "error":
609
- log.error(
610
- { err: event.error, sessionId: this.sessionId },
611
- "Agent loop error",
612
- );
613
- if (this.state !== "complete") {
614
- this.state = "error";
615
- this.sendToClient({
616
- type: "cu_error",
617
- sessionId: this.sessionId,
618
- message: event.error.message,
619
- });
620
- this.notifyTerminal();
621
- }
622
- break;
623
- case "usage":
624
- log.info(
625
- {
626
- sessionId: this.sessionId,
627
- inputTokens: event.inputTokens,
628
- outputTokens: event.outputTokens,
629
- model: event.model,
630
- },
631
- "Usage",
632
- );
633
- break;
634
- // Other events (text_delta, thinking_delta, etc.) are not surfaced to the CU client
635
- }
636
- },
637
- this.abortController?.signal,
638
- );
639
-
640
- // If the loop exits without completing, treat as error
641
- if (this.state !== "complete" && this.state !== "error") {
642
- this.state = "error";
643
- this.sendToClient({
644
- type: "cu_error",
645
- sessionId: this.sessionId,
646
- message: "Agent loop ended unexpectedly",
647
- });
648
- this.notifyTerminal();
649
- }
650
- } catch (err) {
651
- if (this.abortController?.signal.aborted) {
652
- log.info({ sessionId: this.sessionId }, "Agent loop aborted");
653
- return;
654
- }
655
- const message = err instanceof Error ? err.message : String(err);
656
- log.error({ err, sessionId: this.sessionId }, "Agent loop failed");
657
- if (this.state !== "complete") {
658
- this.state = "error";
659
- this.sendToClient({
660
- type: "cu_error",
661
- sessionId: this.sessionId,
662
- message,
663
- });
664
- this.notifyTerminal();
665
- }
666
- } finally {
667
- // Always clean up skill projection state and session timer
668
- resetSkillToolProjection(this.skillProjectionState);
669
- if (this.sessionTimer) {
670
- clearTimeout(this.sessionTimer);
671
- this.sessionTimer = null;
672
- }
673
- }
674
- }
675
-
676
/**
 * Signals that the session has reached a terminal state.
 *
 * Only the first call has any effect: it sets the `terminalNotified` flag,
 * resets the skill tool projection state, and invokes the optional
 * `onTerminal` callback with this session's id. Later calls are no-ops.
 */
private notifyTerminal(): void {
  if (!this.terminalNotified) {
    this.terminalNotified = true;
    resetSkillToolProjection(this.skillProjectionState);
    this.onTerminal?.(this.sessionId);
  }
}
682
-
683
- // ---------------------------------------------------------------------------
684
- // History compaction — strip old AX tree snapshots from tool results
685
- // ---------------------------------------------------------------------------
686
-
687
/**
 * Replaces old `<ax-tree>` snapshots in the conversation history with a short
 * placeholder so the context does not grow with every step.
 *
 * A user message counts as "carrying an AX tree" when at least one of its
 * `tool_result` blocks has string content containing `<ax-tree>`. The most
 * recent `MAX_AX_TREES_IN_HISTORY` such messages are left untouched; in every
 * older one, each matching block has `AX_TREE_PATTERN` replaced by
 * `AX_TREE_PLACEHOLDER`.
 *
 * @param messages Conversation history; never mutated.
 * @returns `messages` itself when nothing needs stripping, otherwise a new
 *   array in which stripped messages (and their stripped blocks) are shallow
 *   copies and all other messages are the original objects.
 */
static compactHistory(messages: Message[]): Message[] {
  // Indices of user messages that contain at least one <ax-tree> tool result.
  const axTreeMessageIndices: number[] = [];
  messages.forEach((msg, idx) => {
    if (msg.role !== "user") return;
    const carriesAxTree = msg.content.some(
      (block) =>
        block.type === "tool_result" &&
        typeof block.content === "string" &&
        block.content.includes("<ax-tree>"),
    );
    if (carriesAxTree) axTreeMessageIndices.push(idx);
  });

  const excess = axTreeMessageIndices.length - MAX_AX_TREES_IN_HISTORY;
  if (excess <= 0) {
    return messages;
  }

  // Slice by an explicit count rather than a negative end index:
  // `slice(0, -0)` is `slice(0, 0)`, which would strip nothing when the
  // limit is 0 even though every snapshot should then be removed.
  const toStrip = new Set(axTreeMessageIndices.slice(0, excess));

  return messages.map((msg, idx) => {
    if (!toStrip.has(idx)) return msg;
    const strippedContent = msg.content.map((block) => {
      if (
        block.type === "tool_result" &&
        typeof block.content === "string" &&
        block.content.includes("<ax-tree>")
      ) {
        return {
          ...block,
          content: block.content.replace(AX_TREE_PATTERN, AX_TREE_PLACEHOLDER),
        };
      }
      return block;
    });
    return { ...msg, content: strippedContent };
  });
}
742
-
743
/**
 * Neutralises literal `</ax-tree>` closing tags (matched case-insensitively)
 * inside AX tree content by rewriting them as `&lt;/ax-tree&gt;`.
 *
 * Without this, content that itself contains the closing tag (for example
 * when the user is viewing XML/HTML source) would make the non-greedy
 * compaction regex (`AX_TREE_PATTERN`) stop early. The escaping is never
 * reversed because compaction replaces the whole block with a placeholder.
 *
 * @param content Raw AX tree text.
 * @returns The text with every closing tag occurrence escaped.
 */
static escapeAxTreeContent(content: string): string {
  const closingTag = /<\/ax-tree>/i;
  const escapedClosingTag = "&lt;/ax-tree&gt;";
  // Splitting on the tag and re-joining with the escaped form rewrites every
  // occurrence, exactly like a global replace.
  return content.split(closingTag).join(escapedClosingTag);
}
753
-
754
- // ---------------------------------------------------------------------------
755
- // Build rich tool-result content from an observation so the model sees
756
- // updated screen state on each turn (not just "Action executed").
757
- // ---------------------------------------------------------------------------
758
-
759
/**
 * Renders an observation as the text of a tool result, so the model sees the
 * updated screen state on each turn.
 *
 * Sections are emitted in this order, each only when present: user guidance,
 * execution result, AX diff (or a "no visible effect" notice when there is no
 * diff but a previous AX tree existed), the current AX tree wrapped in
 * `<ax-tree>` markers so `compactHistory` can strip old snapshots, and
 * screenshot metadata.
 *
 * @param obs The observation reported by the client.
 * @param hadPreviousAXTree Whether an AX tree was available before this
 *   observation; gates the "no visible effect" notice.
 * @returns The joined, trimmed text, or "Action executed" when it is empty.
 */
private buildObservationResultContent(
  obs: CuObservation,
  hadPreviousAXTree: boolean,
): string {
  const lines: string[] = [];

  // User guidance goes first so the model sees it before anything else.
  if (obs.userGuidance) {
    lines.push(`USER GUIDANCE: ${obs.userGuidance}`, "");
  }

  if (obs.executionResult) {
    lines.push(obs.executionResult, "");
  }

  if (obs.axDiff) {
    lines.push(obs.axDiff, "");
  } else if (hadPreviousAXTree && obs.axTree != null) {
    // No diff although a tree existed before: the last action changed nothing.
    const previousAction = this.actionHistory[this.actionHistory.length - 1];
    const stalled =
      this.consecutiveUnchangedSteps >= CONSECUTIVE_UNCHANGED_WARNING_THRESHOLD;
    if (stalled) {
      lines.push(
        `WARNING: ${this.consecutiveUnchangedSteps} consecutive actions had NO VISIBLE EFFECT on the UI. You MUST try a completely different approach.`,
      );
    } else if (previousAction?.toolName !== "computer_use_wait") {
      // An unchanged screen after a wait is expected, so it gets no notice.
      lines.push(
        "Your last action had NO VISIBLE EFFECT on the UI. Try something different.",
      );
    }
    lines.push("");
  }

  // Markers let compactHistory find and replace this snapshot later.
  if (obs.axTree) {
    lines.push(
      "<ax-tree>",
      "CURRENT SCREEN STATE:",
      ComputerUseSession.escapeAxTreeContent(obs.axTree),
      "</ax-tree>",
    );
  }

  const screenshotLines = this.formatScreenshotMetadata(obs);
  if (screenshotLines.length > 0) {
    lines.push("", ...screenshotLines);
  }

  const text = lines.join("\n").trim();
  return text === "" ? "Action executed" : text;
}
814
-
815
- // ---------------------------------------------------------------------------
816
- // Message building (replicates AnthropicProvider.buildMessages from Swift)
817
- // ---------------------------------------------------------------------------
818
-
819
/**
 * Builds the single user message that presents an observation to the model.
 *
 * The message holds an optional JPEG screenshot image block followed by one
 * text block containing, in order: the task, what changed since the last
 * action, the current screen state, the windowed action history, a loop
 * warning when the last `LOOP_DETECTION_WINDOW` actions are identical, any
 * user guidance, and a closing prompt for the next action.
 *
 * @param obs The observation reported by the client.
 * @param hadPreviousAXTree Whether an AX tree was available before this
 *   observation; gates the "no visible effect" notice.
 * @returns A one-element array with the user message.
 */
private buildMessages(
  obs: CuObservation,
  hadPreviousAXTree: boolean,
): Message[] {
  const blocks: ContentBlock[] = [];

  // Screenshot image block
  if (obs.screenshot) {
    blocks.push({
      type: "image",
      source: {
        type: "base64",
        media_type: "image/jpeg",
        data: obs.screenshot,
      },
    });
  }

  const lines: string[] = [];
  const hasHistory = this.actionHistory.length > 0;

  // Task statement
  const task = this.task.trim();
  lines.push(task ? `TASK: ${task}` : "TASK: No explicit task provided.", "");

  // What changed since the last action
  if (obs.axDiff && hasHistory) {
    lines.push(obs.axDiff, "");
  } else if (hadPreviousAXTree && obs.axTree != null && hasHistory) {
    // The AX tree is unchanged, so the last action had no effect.
    const previousAction = this.actionHistory[this.actionHistory.length - 1];
    lines.push("CHANGES SINCE LAST ACTION:");
    if (
      this.consecutiveUnchangedSteps >= CONSECUTIVE_UNCHANGED_WARNING_THRESHOLD
    ) {
      lines.push(
        `WARNING: ${this.consecutiveUnchangedSteps} consecutive actions had NO VISIBLE EFFECT on the UI. You MUST try a completely different approach — do not repeat any of your recent actions.`,
      );
    } else if (previousAction?.toolName === "computer_use_wait") {
      lines.push(
        "No visible changes detected — the UI is identical to the previous step.",
      );
    } else {
      lines.push(
        `Your last action (${previousAction?.toolName ?? "unknown"}) had NO VISIBLE EFFECT on the UI. The screen is identical to the previous step. Do NOT repeat the same action — try something different.`,
      );
    }
    lines.push("");
  }

  // Current screen state
  if (obs.axTree) {
    lines.push(
      "CURRENT SCREEN STATE (accessibility tree of the focused window):",
      obs.axTree,
      "",
      "Use element_id with the [ID] numbers shown above to target elements.",
    );

    // Secondary windows for cross-app awareness
    if (obs.secondaryWindows) {
      lines.push(
        "",
        obs.secondaryWindows,
        "",
        "Note: The element [ID]s above are from other windows — you can reference them for context but can only interact with the focused window's elements.",
      );
    }

    if (obs.screenshot) {
      lines.push(
        "",
        "A screenshot of the FULL SCREEN is also attached above. Use it to see content outside the focused window (e.g., reference documents, PDFs, other apps visible behind the current window).",
        ...this.formatScreenshotMetadata(obs),
      );
    }
  } else if (obs.screenshot) {
    lines.push(
      "CURRENT SCREEN STATE:",
      "See the screenshot above. No accessibility tree available — estimate coordinates from the image.",
      ...this.formatScreenshotMetadata(obs),
    );
  } else {
    lines.push("CURRENT SCREEN STATE:", "No screen data available.");
  }

  // Action history, limited to the most recent MAX_HISTORY_ENTRIES records
  if (hasHistory) {
    lines.push("", "ACTIONS TAKEN SO FAR:");
    const omittedCount = this.actionHistory.length - MAX_HISTORY_ENTRIES;
    let shownHistory: ActionRecord[] = this.actionHistory;
    if (omittedCount > 0) {
      lines.push(` [... ${omittedCount} earlier actions omitted]`);
      shownHistory = this.actionHistory.slice(-MAX_HISTORY_ENTRIES);
    }
    for (const entry of shownHistory) {
      lines.push(
        ` ${entry.step}. ${entry.toolName} → ${entry.result ?? "executed"}`,
      );
    }
  }

  // Loop detection: warn when the last N actions share tool name and input
  if (this.actionHistory.length >= LOOP_DETECTION_WINDOW) {
    const recent = this.actionHistory.slice(-LOOP_DETECTION_WINDOW);
    const reference = recent[0];
    const referenceInput = JSON.stringify(reference.input);
    const isLooping = recent.every(
      (entry) =>
        entry.toolName === reference.toolName &&
        JSON.stringify(entry.input) === referenceInput,
    );
    if (isLooping) {
      lines.push(
        "",
        `WARNING: You have repeated the exact same action (${reference.toolName}) ${LOOP_DETECTION_WINDOW} times in a row. You MUST try a completely different approach or call computer_use_done with an explanation of why you are stuck.`,
      );
    }
  }

  // Surface user guidance prominently
  if (obs.userGuidance) {
    lines.push("", `USER GUIDANCE: ${obs.userGuidance}`);
  }

  // Prompt for next action
  lines.push(
    "",
    hasHistory
      ? "Decide the next action to take."
      : "This is the first action. Examine the screen state and decide what to do first.",
  );

  blocks.push({ type: "text", text: lines.join("\n") });

  return [{ role: "user", content: blocks }];
}
983
-
984
/**
 * Describes the screenshot attached to an observation as human-readable
 * lines.
 *
 * @param obs The observation to describe.
 * @returns An empty array when the observation has no screenshot. Otherwise
 *   up to four lines, in this order and each only when its fields are set:
 *   screenshot pixel size, screen point size, coordinate origin, and capture
 *   display id.
 */
private formatScreenshotMetadata(obs: CuObservation): string[] {
  const metadata: string[] = [];

  if (obs.screenshot) {
    const {
      screenshotWidthPx,
      screenshotHeightPx,
      screenWidthPt,
      screenHeightPt,
      coordinateOrigin,
      captureDisplayId,
    } = obs;

    // Each size is reported only when both of its dimensions are known.
    if (screenshotWidthPx != null && screenshotHeightPx != null) {
      metadata.push(
        `Screenshot metadata: ${screenshotWidthPx}x${screenshotHeightPx} px`,
      );
    }
    if (screenWidthPt != null && screenHeightPt != null) {
      metadata.push(`Screen metadata: ${screenWidthPt}x${screenHeightPt} pt`);
    }
    if (coordinateOrigin) {
      metadata.push(`Coordinate origin: ${coordinateOrigin}`);
    }
    if (captureDisplayId != null) {
      metadata.push(`Capture display ID: ${captureDisplayId}`);
    }
  }

  return metadata;
}
1006
-
1007
/**
 * Reports whether the session's prompter has a pending request with the
 * given id.
 *
 * @param requestId Identifier of the confirmation request.
 * @returns The prompter's answer, or `false` when no prompter is set.
 */
hasPendingConfirmation(requestId: string): boolean {
  const pending = this.prompter?.hasPendingRequest(requestId);
  return pending ?? false;
}
1010
-
1011
/**
 * Forwards a confirmation decision to the session's prompter.
 *
 * Does nothing when no prompter is set.
 *
 * @param requestId Identifier of the confirmation request being answered.
 * @param decision The decision to record.
 * @param selectedPattern Optional pattern passed through to the prompter.
 * @param selectedScope Optional scope passed through to the prompter.
 * @param decisionContext Optional context passed through to the prompter.
 */
handleConfirmationResponse(
  requestId: string,
  decision: UserDecision,
  selectedPattern?: string,
  selectedScope?: string,
  decisionContext?: string,
): void {
  const prompter = this.prompter;
  if (prompter == null) return;

  prompter.resolveConfirmation(
    requestId,
    decision,
    selectedPattern,
    selectedScope,
    decisionContext,
  );
}
1026
- }