@loadmill/droid-cua 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ import path from "path";
2
+ import { access, readFile } from "fs/promises";
3
+ import { constants as fsConstants } from "fs";
4
+ import { compactAppContext } from "../device/openai.js";
5
// File name that getDefaultProjectContextPath() appends to a project directory.
+ export const APP_CONTEXT_FILENAME = "context.md";
6
// Budget used when the caller supplies none or a value that is not a finite number.
// It is later forwarded as `tokenBudget`, so it is presumably measured in tokens - confirm against compactAppContext.
+ export const DEFAULT_APP_CONTEXT_BUDGET = 300;
7
// Lower clamp applied by normalizeAppContextBudget().
+ export const MIN_APP_CONTEXT_BUDGET = 100;
8
// Upper clamp applied by normalizeAppContextBudget().
+ export const MAX_APP_CONTEXT_BUDGET = 2000;
9
/**
 * Coerce a requested app-context budget into the supported range.
 *
 * @param {unknown} value - Requested budget.
 * @returns {number} DEFAULT_APP_CONTEXT_BUDGET when `value` is not a finite
 *   number; otherwise `value` rounded to the nearest integer and clamped to
 *   [MIN_APP_CONTEXT_BUDGET, MAX_APP_CONTEXT_BUDGET].
 */
export function normalizeAppContextBudget(value) {
  const isUsableNumber = typeof value === "number" && Number.isFinite(value);
  if (!isUsableNumber) {
    return DEFAULT_APP_CONTEXT_BUDGET;
  }

  const rounded = Math.round(value);
  // Clamp from below first, then from above.
  const atLeastMinimum = Math.max(MIN_APP_CONTEXT_BUDGET, rounded);
  return Math.min(MAX_APP_CONTEXT_BUDGET, atLeastMinimum);
}
22
/**
 * Compute where a project's app context document is expected to live.
 *
 * @param {string} projectPath - Project directory.
 * @returns {string} `projectPath` joined with APP_CONTEXT_FILENAME.
 */
export function getDefaultProjectContextPath(projectPath) {
  const contextFilePath = path.join(projectPath, APP_CONTEXT_FILENAME);
  return contextFilePath;
}
25
/**
 * Read the app context document as UTF-8 text.
 *
 * The file is read directly instead of being probed with `access()` first:
 * a separate readability check can go stale before the read happens, and
 * `readFile()` already rejects with the same error codes when the file is
 * missing or unreadable.
 *
 * @param {string} filePath - Path of the context document.
 * @returns {Promise<string>} The file contents decoded as UTF-8.
 * @throws Rejects with the underlying file-system error (for example
 *   `code === "ENOENT"`) when the file cannot be read.
 */
export async function readAppContextFile(filePath) {
  return await readFile(filePath, "utf-8");
}
29
/**
 * Report whether the app context document can be read.
 *
 * @param {string} filePath - Path of the context document.
 * @returns {Promise<boolean>} `true` when the readability check succeeds,
 *   `false` when it fails for any reason. Never rejects.
 */
export async function appContextFileExists(filePath) {
  let readable = true;
  try {
    await access(filePath, fsConstants.R_OK);
  } catch {
    // Any failure (missing file, no permission, bad path) counts as "not there".
    readable = false;
  }
  return readable;
}
38
/**
 * Build a task-focused briefing from the app context document.
 *
 * @param {Object} options
 * @param {string|null|undefined} options.contextPath - Path of the context
 *   document. A falsy value means no context is configured.
 * @param {string} options.taskText - Task description, forwarded to
 *   compactAppContext as `taskDescription`.
 * @param {number} [options.budget=DEFAULT_APP_CONTEXT_BUDGET] - Requested
 *   budget. A budget of exactly 0 disables the briefing; any other value is
 *   normalized with normalizeAppContextBudget and forwarded as `tokenBudget`.
 * @returns {Promise<{briefing: string, contextPath: string|null, outputTokens?: *}>}
 *   An empty briefing when no context path is given or the budget is 0;
 *   otherwise the `briefing` and `outputTokens` reported by compactAppContext.
 * @throws Rejects when the context file cannot be read or compactAppContext
 *   rejects.
 */
export async function buildAppContextBriefing({ contextPath, taskText, budget = DEFAULT_APP_CONTEXT_BUDGET, }) {
  if (!contextPath) {
    return { briefing: "", contextPath: null };
  }

  // Test the raw budget: normalization clamps to MIN_APP_CONTEXT_BUDGET or
  // more, so a zero check performed after normalizing could never be true.
  if (budget === 0) {
    return { briefing: "", contextPath };
  }

  const tokenBudget = normalizeAppContextBudget(budget);
  const contextDocument = await readAppContextFile(contextPath);
  const { briefing, outputTokens } = await compactAppContext({
    contextDocument,
    taskDescription: taskText,
    tokenBudget,
  });

  return { briefing, outputTokens, contextPath };
}
@@ -5,6 +5,7 @@ import { handleModelAction } from "../device/actions.js";
5
5
  import { sendCUARequest } from "../device/openai.js";
6
6
  import { emitDesktopDebug } from "../utils/desktop-debug.js";
7
7
  import { getConfiguredStepDelayMs } from "../utils/step-delay.js";
8
+ import { printCliOutput } from "../utils/console-output.js";
8
9
  function extractComputerCalls(items) {
9
10
  const entries = [];
10
11
  for (const item of items) {
@@ -35,7 +36,46 @@ export class ExecutionEngine {
35
36
  this.session = session;
36
37
  this.recordScreenshots = options.recordScreenshots || false;
37
38
  this.screenshotDir = options.screenshotDir || null;
39
+ this.screenshotRecorder = options.screenshotRecorder || null;
38
40
  this.stepDelayMs = getConfiguredStepDelayMs();
41
+ this.reportedScreenshotWriteError = false;
42
+ }
43
+ async recordScreenshot(screenshotBase64, metadata = {}) {
44
+ if (typeof screenshotBase64 !== "string" || !screenshotBase64) {
45
+ return null;
46
+ }
47
+ try {
48
+ if (this.screenshotRecorder?.saveScreenshot) {
49
+ return await this.screenshotRecorder.saveScreenshot(screenshotBase64, metadata);
50
+ }
51
+ if (this.recordScreenshots && this.screenshotDir) {
52
+ const framePath = path.join(this.screenshotDir, `frame_${String(Date.now())}.png`);
53
+ await writeFile(framePath, Buffer.from(screenshotBase64, "base64"));
54
+ return framePath;
55
+ }
56
+ }
57
+ catch (error) {
58
+ if (!this.reportedScreenshotWriteError) {
59
+ this.reportedScreenshotWriteError = true;
60
+ const scope = typeof metadata.sessionId === "string" ? "design" : "execution";
61
+ const ids = scope === "design"
62
+ ? {
63
+ sessionId: metadata.sessionId,
64
+ stepId: metadata.stepId,
65
+ instructionIndex: metadata.instructionIndex
66
+ }
67
+ : {
68
+ runId: metadata.runId,
69
+ stepId: metadata.stepId,
70
+ instructionIndex: metadata.instructionIndex
71
+ };
72
+ emitDesktopDebug("run.screenshot_write.error", scope, ids, {
73
+ captureSource: metadata.captureSource ?? null,
74
+ message: error instanceof Error ? error.message : "Failed to persist screenshot"
75
+ });
76
+ }
77
+ }
78
+ return null;
39
79
  }
40
80
  /**
41
81
  * Run a full turn with the CUA model
@@ -46,8 +86,9 @@ export class ExecutionEngine {
46
86
  * @param {Object} context - Optional Ink context for output
47
87
  */
48
88
  async runFullTurn(response, trackAction = null, context = null, stepContext = null) {
49
- const addOutput = context?.addOutput || ((item) => console.log(item.text || item));
89
+ const addOutput = context?.addOutput || printCliOutput;
50
90
  let newResponseId = response.id;
91
+ const shouldStop = () => Boolean(trackAction?.());
51
92
  const eventMeta = (extra = {}) => ({
52
93
  runId: context?.runId,
53
94
  stepId: stepContext?.stepId,
@@ -56,11 +97,8 @@ export class ExecutionEngine {
56
97
  });
57
98
  while (true) {
58
99
  // Check for interruption before processing next batch of actions
59
- if (trackAction) {
60
- const shouldStop = trackAction(null); // null action = pre-batch check
61
- if (shouldStop) {
62
- return newResponseId;
63
- }
100
+ if (shouldStop()) {
101
+ return newResponseId;
64
102
  }
65
103
  const items = response.output || [];
66
104
  const computerCalls = extractComputerCalls(items);
@@ -120,6 +158,9 @@ export class ExecutionEngine {
120
158
  continue;
121
159
  let sawExplicitScreenshotAction = false;
122
160
  for (const action of actions) {
161
+ if (shouldStop()) {
162
+ return newResponseId;
163
+ }
123
164
  if (action.type === "screenshot") {
124
165
  sawExplicitScreenshotAction = true;
125
166
  addOutput({
@@ -138,16 +179,17 @@ export class ExecutionEngine {
138
179
  else {
139
180
  await handleModelAction(this.session.deviceId, action, this.session.deviceInfo.scale, {
140
181
  ...context,
182
+ shouldStop,
141
183
  stepId: stepContext?.stepId,
142
184
  instructionIndex: stepContext?.instructionIndex
143
185
  });
144
186
  // Track action and check for interruption
145
187
  if (trackAction) {
146
- const shouldStop = trackAction(action);
147
- if (shouldStop) {
148
- // User interrupted - stop execution immediately
149
- return newResponseId;
150
- }
188
+ trackAction(action);
189
+ }
190
+ if (shouldStop()) {
191
+ // User interrupted - stop execution immediately
192
+ return newResponseId;
151
193
  }
152
194
  // Add delay after UI-changing actions to let the interface update
153
195
  // before taking the screenshot (except for explicit wait actions which have their own delay)
@@ -156,6 +198,9 @@ export class ExecutionEngine {
156
198
  }
157
199
  }
158
200
  }
201
+ if (shouldStop()) {
202
+ return newResponseId;
203
+ }
159
204
  const screenshotBase64 = await getScreenshotAsBase64(this.session.deviceId, this.session.deviceInfo);
160
205
  emitDesktopDebug("device.screenshot", "device", {
161
206
  runId: context?.runId,
@@ -168,10 +213,14 @@ export class ExecutionEngine {
168
213
  height: this.session.deviceInfo?.scaled_height,
169
214
  base64Length: screenshotBase64.length
170
215
  });
171
- if (this.recordScreenshots && this.screenshotDir) {
172
- const framePath = path.join(this.screenshotDir, `frame_${String(Date.now())}.png`);
173
- await writeFile(framePath, Buffer.from(screenshotBase64, "base64"));
174
- }
216
+ await this.recordScreenshot(screenshotBase64, {
217
+ runId: context?.runId,
218
+ sessionId: context?.sessionId,
219
+ stepId: stepContext?.stepId,
220
+ instructionIndex: stepContext?.instructionIndex,
221
+ callId: call_id,
222
+ captureSource: sawExplicitScreenshotAction ? "call-output-explicit-action" : "call-output-post-action"
223
+ });
175
224
  // Build next input: screenshot + any carryover reasoning
176
225
  const selectedCuaModel = process.env.OPENAI_CUA_MODEL === "computer-use-preview" ? "computer-use-preview" : "gpt-5.4";
177
226
  const input = [{
@@ -186,6 +235,9 @@ export class ExecutionEngine {
186
235
  : {}),
187
236
  ...(pendingSafetyChecks.length > 0 ? { acknowledged_safety_checks: pendingSafetyChecks } : {})
188
237
  }];
238
+ if (shouldStop()) {
239
+ return newResponseId;
240
+ }
189
241
  response = await sendCUARequest({
190
242
  messages: input,
191
243
  previousResponseId: newResponseId,
@@ -41,6 +41,17 @@ function describeControlledDevice(deviceInfo = {}) {
41
41
  }
42
42
  return "a mobile device";
43
43
  }
44
/**
 * Render the app-context part of a system prompt.
 *
 * @param {unknown} briefing - Condensed app description.
 * @returns {string} An empty string when `briefing` is not a string or is
 *   blank after trimming; otherwise a fixed header followed by a blank line
 *   and the trimmed briefing.
 */
function buildAppContextSection(briefing) {
  if (typeof briefing !== "string") {
    return "";
  }
  const trimmed = briefing.trim();
  if (trimmed === "") {
    return "";
  }

  const header = [
    "APP CONTEXT BRIEFING:",
    "The following is a condensed description of the app you are testing, relevant to the current task.",
    "Use this to understand screen layouts, terminology, navigation, and expected behavior.",
  ].join("\n");
  return `${header}\n\n${trimmed}`;
}
44
55
  export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
45
56
  const controlledDevice = describeControlledDevice(deviceInfo);
46
57
  const prompt = `
@@ -65,12 +76,25 @@ export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
65
76
  - Use 'keypress' only for a single mobile-safe key when absolutely necessary.
66
77
  - To replace text, tap into the field and type the desired value. If correction is needed, use mobile-safe deletion only.
67
78
  - Prefer tapping visible controls over hardware key events.
79
+ - Prefer on-screen navigation controls such as menus, tabs, drawer items, back arrows, close buttons, and explicit logout buttons over keypress actions.
80
+ - Do NOT use Back or ESC for normal app navigation when a reliable on-screen control is visible.
81
+ - Avoid using Back or ESC from a main or root screen, because it may leave the app.
82
+ - Exception: if the software keyboard is open and blocking the next needed control, Back or ESC may be used to dismiss the keyboard before continuing.
83
+ - Treat keypress actions as a fallback for limited cases only, such as a clearly needed single mobile-safe key or dismissing transient UI when no better visible control exists.
68
84
 
69
85
  CRITICAL - Automatic Timing:
70
86
  - After EVERY action (click, type, keypress, scroll), there is an automatic 500ms delay
71
87
  - This 500ms is sufficient for normal UI updates and animations
72
88
  - DO NOT add 'wait' actions unnecessarily - trust the automatic delay
73
89
 
90
+ CRITICAL - Mutating Actions:
91
+ - Mutating actions change app state. Examples: submit, create, save, confirm, approve, reject, login, logout, send, place order, initiate transfer
92
+ - Before tapping a mutating action button, dismiss the software keyboard first when it is open and not required for the tap
93
+ - After performing a mutating action once, do NOT repeat the same mutating action unless the UI clearly shows the first attempt failed or had no effect
94
+ - Treat visible state change as success. Examples: form fields clear, submit button returns to normal, status changes, list refreshes, new row appears, success message appears, screen changes
95
+ - For form submissions specifically, if the relevant fields clear and the action button returns to its normal idle state, treat that as success even if the new row or confirmation is not obvious yet
96
+ - If the UI shows signs that the mutating action succeeded, stop acting for that instruction
97
+
74
98
  Use explicit 'wait' action ONLY in these specific cases:
75
99
  1. After launching apps from home screen or app drawer
76
100
  2. After pressing ENTER that triggers navigation (search, URL, form submit)
@@ -109,8 +133,9 @@ export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
109
133
  Only complete the current instruction. Do not proceed beyond the current step unless asked.
110
134
 
111
135
  Mobile-Specific Notes:
112
- - ESC key maps to the Home button (return to home screen)
113
- - Use Home button (ESC) to escape from stuck situations and restart
136
+ - HOME key returns to the home screen
137
+ - On Android, ESC key maps to Back
138
+ - On iOS, ESC has no effect; use visible on-screen controls instead
114
139
  - Never use CTRL, CMD, ALT, OPTION, or SHIFT in a keypress action
115
140
  `;
116
141
  return prompt;
@@ -213,9 +238,10 @@ Remember: You are autonomous. Explore confidently. Generate simple, executable t
213
238
  { title: "Design Mode Instructions", text: designCustomText }
214
239
  ]);
215
240
  }
216
- export function buildExecutionModePrompt(deviceInfo, customInstructions = {}) {
241
+ export function buildExecutionModePrompt(deviceInfo, customInstructions = {}, appContextBriefing = "") {
217
242
  const executionCustomText = typeof customInstructions.executionModeInstructions === "string" ? customInstructions.executionModeInstructions.trim() : "";
218
243
  const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
244
+ const appContextSection = buildAppContextSection(appContextBriefing);
219
245
  const prompt = `${basePrompt}
220
246
 
221
247
  EXECUTION MODE - Critical Behavior:
@@ -229,6 +255,9 @@ CRITICAL RULES:
229
255
  - Just execute the action silently and stop immediately
230
256
  - Only generate text if the action FAILED or cannot be completed
231
257
  - Never emit desktop keyboard shortcuts or modifier combos; mobile execution only supports mobile-safe single-key presses
258
+ - Never repeat the same mutating action with the same apparent intent unless the UI clearly shows failure or no state change
259
+ - If a submit/create/approve/reject/login action appears to succeed, stop instead of trying to reconfirm by doing it again
260
+ - For form submissions, cleared fields plus a reset action button are strong success signals; stop even if the created item is not yet obvious in the visible list
232
261
  - If target is not visible, perform bounded off-screen discovery first:
233
262
  1. Scroll the screen in the likely direction to reveal hidden controls
234
263
  2. If still missing, do one minimal fallback (e.g., close overlay or go back once), then retry
@@ -236,10 +265,13 @@ CRITICAL RULES:
236
265
  Your process:
237
266
  1. Read the instruction
238
267
  2. Execute the required actions
239
- 3. Stop immediately - no commentary, no questions
268
+ 3. Before tapping a mutating action, dismiss the keyboard if it is open and not needed
269
+ 4. After a mutating action, inspect the resulting screen for success cues such as cleared fields, reset buttons, changed status, refreshed content, or navigation
270
+ 5. Stop as soon as success is visible
271
+ 6. Stop immediately - no commentary, no questions
240
272
 
241
273
  Each instruction is independent. Do not reference previous instructions or ask about next steps.
242
- `;
274
+ ${appContextSection ? `\n\n${appContextSection}` : ""}`;
243
275
  return appendCustomSections(prompt, [
244
276
  { title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
245
277
  { title: "Execution Mode Instructions", text: executionCustomText }
@@ -14,8 +14,8 @@ function normalizeMobileKeypress(keys = []) {
14
14
  }
15
15
  const key = String(keys[0]).trim().toUpperCase();
16
16
  const mobileKeyMap = {
17
- ESC: "KEYCODE_HOME",
18
- ESCAPE: "KEYCODE_HOME",
17
+ ESC: "KEYCODE_BACK",
18
+ ESCAPE: "KEYCODE_BACK",
19
19
  HOME: "KEYCODE_HOME",
20
20
  BACK: "KEYCODE_BACK",
21
21
  ENTER: "KEYCODE_ENTER",
@@ -1,6 +1,7 @@
1
1
  /**
2
2
  * Assertion handling for script validation
3
3
  */
4
+ import { printCliOutput } from "../utils/console-output.js";
4
5
  export function isAssertion(userInput) {
5
6
  const trimmed = userInput.trim();
6
7
  const lower = trimmed.toLowerCase();
@@ -56,7 +57,7 @@ export function extractFailureDetails(transcript) {
56
57
  }
57
58
  export function handleAssertionFailure(assertionPrompt, transcript, isHeadlessMode, context, stepContext = null) {
58
59
  const details = extractFailureDetails(transcript);
59
- const addOutput = context?.addOutput || ((item) => console.log(item.text || item));
60
+ const addOutput = context?.addOutput || printCliOutput;
60
61
  const meta = {
61
62
  eventType: 'assertion_result',
62
63
  runId: context?.runId,
@@ -81,7 +82,7 @@ export function handleAssertionFailure(assertionPrompt, transcript, isHeadlessMo
81
82
  // Interactive mode: caller should clear remaining instructions
82
83
  }
83
84
  export function handleAssertionSuccess(assertionPrompt, context = null, stepContext = null) {
84
- const addOutput = context?.addOutput || ((item) => console.log(item.text || item));
85
+ const addOutput = context?.addOutput || printCliOutput;
85
86
  addOutput({
86
87
  type: 'success',
87
88
  text: `✓ Assertion passed: ${assertionPrompt}`,
@@ -195,6 +195,7 @@ function readAppStatusEntry(payload) {
195
195
  }
196
196
  return entries;
197
197
  }
198
+ /** @type {import("../adapter").CloudProviderAdapter} */
198
199
  export const browserStackAdapter = {
199
200
  id: "browserstack",
200
201
  displayName: "BrowserStack",