@loadmill/droid-cua 2.2.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +56 -0
- package/build/index.js +169 -24
- package/build/src/cli/headless-debug.js +55 -0
- package/build/src/cli/headless-execution-config.js +171 -0
- package/build/src/cli/ink-shell.js +8 -2
- package/build/src/commands/help.js +9 -1
- package/build/src/commands/run.js +30 -1
- package/build/src/core/app-context.js +57 -0
- package/build/src/core/execution-engine.js +67 -15
- package/build/src/core/prompts.js +37 -5
- package/build/src/device/android/actions.js +2 -2
- package/build/src/device/assertions.js +3 -2
- package/build/src/device/cloud/browserstack/adapter.js +1 -0
- package/build/src/device/cloud/lambdatest/adapter.js +402 -0
- package/build/src/device/cloud/registry.js +2 -1
- package/build/src/device/interface.js +1 -1
- package/build/src/device/ios/actions.js +8 -2
- package/build/src/device/loadmill.js +4 -3
- package/build/src/device/openai.js +118 -1
- package/build/src/modes/execution-mode.js +13 -18
- package/build/src/utils/console-output.js +35 -0
- package/build/src/utils/run-screenshot-recorder.js +98 -0
- package/build/src/utils/structured-debug-log-manager.js +325 -0
- package/package.json +2 -1
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import path from "path";
|
|
2
|
+
import { access, readFile } from "fs/promises";
|
|
3
|
+
import { constants as fsConstants } from "fs";
|
|
4
|
+
import { compactAppContext } from "../device/openai.js";
|
|
5
|
+
// File name joined onto a project directory by getDefaultProjectContextPath().
export const APP_CONTEXT_FILENAME = "context.md";
|
|
6
|
+
// Budget used when the caller supplies no value, or a value that is not a finite number.
// buildAppContextBriefing() forwards the normalized budget as `tokenBudget`.
export const DEFAULT_APP_CONTEXT_BUDGET = 300;
|
|
7
|
+
// Lower clamp applied by normalizeAppContextBudget().
export const MIN_APP_CONTEXT_BUDGET = 100;
|
|
8
|
+
// Upper clamp applied by normalizeAppContextBudget().
export const MAX_APP_CONTEXT_BUDGET = 2000;
|
|
9
|
+
/**
 * Coerce a requested app-context budget into the supported range.
 *
 * Anything that is not a finite number yields DEFAULT_APP_CONTEXT_BUDGET.
 * Finite numbers are rounded to the nearest integer and then clamped to
 * [MIN_APP_CONTEXT_BUDGET, MAX_APP_CONTEXT_BUDGET].
 *
 * @param {*} value - Requested budget.
 * @returns {number} An integer budget inside the supported range.
 */
export function normalizeAppContextBudget(value) {
    const isUsableNumber = typeof value === "number" && Number.isFinite(value);
    if (!isUsableNumber) {
        return DEFAULT_APP_CONTEXT_BUDGET;
    }
    const rounded = Math.round(value);
    // Clamp from below first, then from above.
    const atLeastMin = Math.max(MIN_APP_CONTEXT_BUDGET, rounded);
    return Math.min(MAX_APP_CONTEXT_BUDGET, atLeastMin);
}
|
|
22
|
+
/**
 * Resolve where a project's app context document lives by default.
 *
 * @param {string} projectPath - Project directory.
 * @returns {string} `projectPath` joined with APP_CONTEXT_FILENAME.
 */
export function getDefaultProjectContextPath(projectPath) {
    const segments = [projectPath, APP_CONTEXT_FILENAME];
    return path.join(...segments);
}
|
|
25
|
+
/**
 * Read an app context document as UTF-8 text.
 *
 * Readability is checked first, so a missing or unreadable file rejects
 * from the access check before any read is attempted.
 *
 * @param {string} filePath - Path of the context document.
 * @returns {Promise<string>} The file contents.
 */
export async function readAppContextFile(filePath) {
    await access(filePath, fsConstants.R_OK);
    const contents = await readFile(filePath, "utf-8");
    return contents;
}
|
|
29
|
+
/**
 * Report whether an app context document can be read.
 *
 * Any failure of the readability check (missing file, no permission, ...)
 * is reported as `false` rather than thrown.
 *
 * @param {string} filePath - Path of the context document.
 * @returns {Promise<boolean>} `true` when the readability check succeeds.
 */
export async function appContextFileExists(filePath) {
    let readable = true;
    try {
        await access(filePath, fsConstants.R_OK);
    }
    catch {
        readable = false;
    }
    return readable;
}
|
|
38
|
+
/**
 * Build a task-specific briefing from an app context document.
 *
 * The document at `contextPath` is read and handed to compactAppContext()
 * together with the task text and a normalized token budget.
 *
 * @param {Object} options
 * @param {string} [options.contextPath] - Path of the context document. When
 *   falsy, nothing is read and an empty briefing is returned.
 * @param {string} [options.taskText] - Forwarded as `taskDescription`.
 * @param {number} [options.budget=DEFAULT_APP_CONTEXT_BUDGET] - Requested
 *   budget. Exactly `0` disables the briefing; any other value is passed
 *   through normalizeAppContextBudget() and forwarded as `tokenBudget`.
 * @returns {Promise<{briefing: string, outputTokens?: number, contextPath: (string|null)}>}
 *   `outputTokens` is only present when compactAppContext() was called.
 * @throws Rejects when the context file cannot be read or compaction fails.
 */
export async function buildAppContextBriefing({ contextPath, taskText, budget = DEFAULT_APP_CONTEXT_BUDGET, }) {
    if (!contextPath) {
        return { briefing: "", contextPath: null };
    }
    // The "disabled" check must look at the raw budget: normalizeAppContextBudget()
    // clamps every value up to the minimum, so a normalized budget is never 0 and
    // a check performed after normalization could never fire.
    if (budget === 0) {
        return { briefing: "", contextPath };
    }
    const tokenBudget = normalizeAppContextBudget(budget);
    const contextDocument = await readAppContextFile(contextPath);
    const compacted = await compactAppContext({
        contextDocument,
        taskDescription: taskText,
        tokenBudget,
    });
    return {
        briefing: compacted.briefing,
        outputTokens: compacted.outputTokens,
        contextPath,
    };
}
|
|
@@ -5,6 +5,7 @@ import { handleModelAction } from "../device/actions.js";
|
|
|
5
5
|
import { sendCUARequest } from "../device/openai.js";
|
|
6
6
|
import { emitDesktopDebug } from "../utils/desktop-debug.js";
|
|
7
7
|
import { getConfiguredStepDelayMs } from "../utils/step-delay.js";
|
|
8
|
+
import { printCliOutput } from "../utils/console-output.js";
|
|
8
9
|
function extractComputerCalls(items) {
|
|
9
10
|
const entries = [];
|
|
10
11
|
for (const item of items) {
|
|
@@ -35,7 +36,46 @@ export class ExecutionEngine {
|
|
|
35
36
|
this.session = session;
|
|
36
37
|
this.recordScreenshots = options.recordScreenshots || false;
|
|
37
38
|
this.screenshotDir = options.screenshotDir || null;
|
|
39
|
+
this.screenshotRecorder = options.screenshotRecorder || null;
|
|
38
40
|
this.stepDelayMs = getConfiguredStepDelayMs();
|
|
41
|
+
this.reportedScreenshotWriteError = false;
|
|
42
|
+
}
|
|
43
|
+
async recordScreenshot(screenshotBase64, metadata = {}) {
|
|
44
|
+
if (typeof screenshotBase64 !== "string" || !screenshotBase64) {
|
|
45
|
+
return null;
|
|
46
|
+
}
|
|
47
|
+
try {
|
|
48
|
+
if (this.screenshotRecorder?.saveScreenshot) {
|
|
49
|
+
return await this.screenshotRecorder.saveScreenshot(screenshotBase64, metadata);
|
|
50
|
+
}
|
|
51
|
+
if (this.recordScreenshots && this.screenshotDir) {
|
|
52
|
+
const framePath = path.join(this.screenshotDir, `frame_${String(Date.now())}.png`);
|
|
53
|
+
await writeFile(framePath, Buffer.from(screenshotBase64, "base64"));
|
|
54
|
+
return framePath;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
catch (error) {
|
|
58
|
+
if (!this.reportedScreenshotWriteError) {
|
|
59
|
+
this.reportedScreenshotWriteError = true;
|
|
60
|
+
const scope = typeof metadata.sessionId === "string" ? "design" : "execution";
|
|
61
|
+
const ids = scope === "design"
|
|
62
|
+
? {
|
|
63
|
+
sessionId: metadata.sessionId,
|
|
64
|
+
stepId: metadata.stepId,
|
|
65
|
+
instructionIndex: metadata.instructionIndex
|
|
66
|
+
}
|
|
67
|
+
: {
|
|
68
|
+
runId: metadata.runId,
|
|
69
|
+
stepId: metadata.stepId,
|
|
70
|
+
instructionIndex: metadata.instructionIndex
|
|
71
|
+
};
|
|
72
|
+
emitDesktopDebug("run.screenshot_write.error", scope, ids, {
|
|
73
|
+
captureSource: metadata.captureSource ?? null,
|
|
74
|
+
message: error instanceof Error ? error.message : "Failed to persist screenshot"
|
|
75
|
+
});
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
return null;
|
|
39
79
|
}
|
|
40
80
|
/**
|
|
41
81
|
* Run a full turn with the CUA model
|
|
@@ -46,8 +86,9 @@ export class ExecutionEngine {
|
|
|
46
86
|
* @param {Object} context - Optional Ink context for output
|
|
47
87
|
*/
|
|
48
88
|
async runFullTurn(response, trackAction = null, context = null, stepContext = null) {
|
|
49
|
-
const addOutput = context?.addOutput ||
|
|
89
|
+
const addOutput = context?.addOutput || printCliOutput;
|
|
50
90
|
let newResponseId = response.id;
|
|
91
|
+
const shouldStop = () => Boolean(trackAction?.());
|
|
51
92
|
const eventMeta = (extra = {}) => ({
|
|
52
93
|
runId: context?.runId,
|
|
53
94
|
stepId: stepContext?.stepId,
|
|
@@ -56,11 +97,8 @@ export class ExecutionEngine {
|
|
|
56
97
|
});
|
|
57
98
|
while (true) {
|
|
58
99
|
// Check for interruption before processing next batch of actions
|
|
59
|
-
if (
|
|
60
|
-
|
|
61
|
-
if (shouldStop) {
|
|
62
|
-
return newResponseId;
|
|
63
|
-
}
|
|
100
|
+
if (shouldStop()) {
|
|
101
|
+
return newResponseId;
|
|
64
102
|
}
|
|
65
103
|
const items = response.output || [];
|
|
66
104
|
const computerCalls = extractComputerCalls(items);
|
|
@@ -120,6 +158,9 @@ export class ExecutionEngine {
|
|
|
120
158
|
continue;
|
|
121
159
|
let sawExplicitScreenshotAction = false;
|
|
122
160
|
for (const action of actions) {
|
|
161
|
+
if (shouldStop()) {
|
|
162
|
+
return newResponseId;
|
|
163
|
+
}
|
|
123
164
|
if (action.type === "screenshot") {
|
|
124
165
|
sawExplicitScreenshotAction = true;
|
|
125
166
|
addOutput({
|
|
@@ -138,16 +179,17 @@ export class ExecutionEngine {
|
|
|
138
179
|
else {
|
|
139
180
|
await handleModelAction(this.session.deviceId, action, this.session.deviceInfo.scale, {
|
|
140
181
|
...context,
|
|
182
|
+
shouldStop,
|
|
141
183
|
stepId: stepContext?.stepId,
|
|
142
184
|
instructionIndex: stepContext?.instructionIndex
|
|
143
185
|
});
|
|
144
186
|
// Track action and check for interruption
|
|
145
187
|
if (trackAction) {
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
188
|
+
trackAction(action);
|
|
189
|
+
}
|
|
190
|
+
if (shouldStop()) {
|
|
191
|
+
// User interrupted - stop execution immediately
|
|
192
|
+
return newResponseId;
|
|
151
193
|
}
|
|
152
194
|
// Add delay after UI-changing actions to let the interface update
|
|
153
195
|
// before taking the screenshot (except for explicit wait actions which have their own delay)
|
|
@@ -156,6 +198,9 @@ export class ExecutionEngine {
|
|
|
156
198
|
}
|
|
157
199
|
}
|
|
158
200
|
}
|
|
201
|
+
if (shouldStop()) {
|
|
202
|
+
return newResponseId;
|
|
203
|
+
}
|
|
159
204
|
const screenshotBase64 = await getScreenshotAsBase64(this.session.deviceId, this.session.deviceInfo);
|
|
160
205
|
emitDesktopDebug("device.screenshot", "device", {
|
|
161
206
|
runId: context?.runId,
|
|
@@ -168,10 +213,14 @@ export class ExecutionEngine {
|
|
|
168
213
|
height: this.session.deviceInfo?.scaled_height,
|
|
169
214
|
base64Length: screenshotBase64.length
|
|
170
215
|
});
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
216
|
+
await this.recordScreenshot(screenshotBase64, {
|
|
217
|
+
runId: context?.runId,
|
|
218
|
+
sessionId: context?.sessionId,
|
|
219
|
+
stepId: stepContext?.stepId,
|
|
220
|
+
instructionIndex: stepContext?.instructionIndex,
|
|
221
|
+
callId: call_id,
|
|
222
|
+
captureSource: sawExplicitScreenshotAction ? "call-output-explicit-action" : "call-output-post-action"
|
|
223
|
+
});
|
|
175
224
|
// Build next input: screenshot + any carryover reasoning
|
|
176
225
|
const selectedCuaModel = process.env.OPENAI_CUA_MODEL === "computer-use-preview" ? "computer-use-preview" : "gpt-5.4";
|
|
177
226
|
const input = [{
|
|
@@ -186,6 +235,9 @@ export class ExecutionEngine {
|
|
|
186
235
|
: {}),
|
|
187
236
|
...(pendingSafetyChecks.length > 0 ? { acknowledged_safety_checks: pendingSafetyChecks } : {})
|
|
188
237
|
}];
|
|
238
|
+
if (shouldStop()) {
|
|
239
|
+
return newResponseId;
|
|
240
|
+
}
|
|
189
241
|
response = await sendCUARequest({
|
|
190
242
|
messages: input,
|
|
191
243
|
previousResponseId: newResponseId,
|
|
@@ -41,6 +41,17 @@ function describeControlledDevice(deviceInfo = {}) {
|
|
|
41
41
|
}
|
|
42
42
|
return "a mobile device";
|
|
43
43
|
}
|
|
44
|
+
/**
 * Wrap an app-context briefing in the prompt section that introduces it.
 *
 * @param {*} briefing - Briefing text; non-strings count as empty.
 * @returns {string} The section text, or "" when the trimmed briefing is empty.
 */
function buildAppContextSection(briefing) {
    if (typeof briefing !== "string") {
        return "";
    }
    const body = briefing.trim();
    if (body.length === 0) {
        return "";
    }
    const sectionLines = [
        "APP CONTEXT BRIEFING:",
        "The following is a condensed description of the app you are testing, relevant to the current task.",
        "Use this to understand screen layouts, terminology, navigation, and expected behavior.",
        "",
        body,
    ];
    return sectionLines.join("\n");
}
|
|
44
55
|
export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
|
|
45
56
|
const controlledDevice = describeControlledDevice(deviceInfo);
|
|
46
57
|
const prompt = `
|
|
@@ -65,12 +76,25 @@ export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
|
|
|
65
76
|
- Use 'keypress' only for a single mobile-safe key when absolutely necessary.
|
|
66
77
|
- To replace text, tap into the field and type the desired value. If correction is needed, use mobile-safe deletion only.
|
|
67
78
|
- Prefer tapping visible controls over hardware key events.
|
|
79
|
+
- Prefer on-screen navigation controls such as menus, tabs, drawer items, back arrows, close buttons, and explicit logout buttons over keypress actions.
|
|
80
|
+
- Do NOT use Back or ESC for normal app navigation when a reliable on-screen control is visible.
|
|
81
|
+
- Avoid using Back or ESC from a main or root screen, because it may leave the app.
|
|
82
|
+
- Exception: if the software keyboard is open and blocking the next needed control, Back or ESC may be used to dismiss the keyboard before continuing.
|
|
83
|
+
- Treat keypress actions as a fallback for limited cases only, such as a clearly needed single mobile-safe key or dismissing transient UI when no better visible control exists.
|
|
68
84
|
|
|
69
85
|
CRITICAL - Automatic Timing:
|
|
70
86
|
- After EVERY action (click, type, keypress, scroll), there is an automatic 500ms delay
|
|
71
87
|
- This 500ms is sufficient for normal UI updates and animations
|
|
72
88
|
- DO NOT add 'wait' actions unnecessarily - trust the automatic delay
|
|
73
89
|
|
|
90
|
+
CRITICAL - Mutating Actions:
|
|
91
|
+
- Mutating actions change app state. Examples: submit, create, save, confirm, approve, reject, login, logout, send, place order, initiate transfer
|
|
92
|
+
- Before tapping a mutating action button, dismiss the software keyboard first when it is open and not required for the tap
|
|
93
|
+
- After performing a mutating action once, do NOT repeat the same mutating action unless the UI clearly shows the first attempt failed or had no effect
|
|
94
|
+
- Treat visible state change as success. Examples: form fields clear, submit button returns to normal, status changes, list refreshes, new row appears, success message appears, screen changes
|
|
95
|
+
- For form submissions specifically, if the relevant fields clear and the action button returns to its normal idle state, treat that as success even if the new row or confirmation is not obvious yet
|
|
96
|
+
- If the UI shows signs that the mutating action succeeded, stop acting for that instruction
|
|
97
|
+
|
|
74
98
|
Use explicit 'wait' action ONLY in these specific cases:
|
|
75
99
|
1. After launching apps from home screen or app drawer
|
|
76
100
|
2. After pressing ENTER that triggers navigation (search, URL, form submit)
|
|
@@ -109,8 +133,9 @@ export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
|
|
|
109
133
|
Only complete the current instruction. Do not proceed beyond the current step unless asked.
|
|
110
134
|
|
|
111
135
|
Mobile-Specific Notes:
|
|
112
|
-
-
|
|
113
|
-
-
|
|
136
|
+
- HOME key returns to the home screen
|
|
137
|
+
- On Android, ESC key maps to Back
|
|
138
|
+
- On iOS, ESC has no effect; use visible on-screen controls instead
|
|
114
139
|
- Never use CTRL, CMD, ALT, OPTION, or SHIFT in a keypress action
|
|
115
140
|
`;
|
|
116
141
|
return prompt;
|
|
@@ -213,9 +238,10 @@ Remember: You are autonomous. Explore confidently. Generate simple, executable t
|
|
|
213
238
|
{ title: "Design Mode Instructions", text: designCustomText }
|
|
214
239
|
]);
|
|
215
240
|
}
|
|
216
|
-
export function buildExecutionModePrompt(deviceInfo, customInstructions = {}) {
|
|
241
|
+
export function buildExecutionModePrompt(deviceInfo, customInstructions = {}, appContextBriefing = "") {
|
|
217
242
|
const executionCustomText = typeof customInstructions.executionModeInstructions === "string" ? customInstructions.executionModeInstructions.trim() : "";
|
|
218
243
|
const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
|
|
244
|
+
const appContextSection = buildAppContextSection(appContextBriefing);
|
|
219
245
|
const prompt = `${basePrompt}
|
|
220
246
|
|
|
221
247
|
EXECUTION MODE - Critical Behavior:
|
|
@@ -229,6 +255,9 @@ CRITICAL RULES:
|
|
|
229
255
|
- Just execute the action silently and stop immediately
|
|
230
256
|
- Only generate text if the action FAILED or cannot be completed
|
|
231
257
|
- Never emit desktop keyboard shortcuts or modifier combos; mobile execution only supports mobile-safe single-key presses
|
|
258
|
+
- Never repeat the same mutating action with the same apparent intent unless the UI clearly shows failure or no state change
|
|
259
|
+
- If a submit/create/approve/reject/login action appears to succeed, stop instead of trying to reconfirm by doing it again
|
|
260
|
+
- For form submissions, cleared fields plus a reset action button are strong success signals; stop even if the created item is not yet obvious in the visible list
|
|
232
261
|
- If target is not visible, perform bounded off-screen discovery first:
|
|
233
262
|
1. Scroll the screen in the likely direction to reveal hidden controls
|
|
234
263
|
2. If still missing, do one minimal fallback (e.g., close overlay or go back once), then retry
|
|
@@ -236,10 +265,13 @@ CRITICAL RULES:
|
|
|
236
265
|
Your process:
|
|
237
266
|
1. Read the instruction
|
|
238
267
|
2. Execute the required actions
|
|
239
|
-
3.
|
|
268
|
+
3. Before tapping a mutating action, dismiss the keyboard if it is open and not needed
|
|
269
|
+
4. After a mutating action, inspect the resulting screen for success cues such as cleared fields, reset buttons, changed status, refreshed content, or navigation
|
|
270
|
+
5. Stop as soon as success is visible
|
|
271
|
+
6. Stop immediately - no commentary, no questions
|
|
240
272
|
|
|
241
273
|
Each instruction is independent. Do not reference previous instructions or ask about next steps.
|
|
242
|
-
`;
|
|
274
|
+
${appContextSection ? `\n\n${appContextSection}` : ""}`;
|
|
243
275
|
return appendCustomSections(prompt, [
|
|
244
276
|
{ title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
|
|
245
277
|
{ title: "Execution Mode Instructions", text: executionCustomText }
|
|
@@ -14,8 +14,8 @@ function normalizeMobileKeypress(keys = []) {
|
|
|
14
14
|
}
|
|
15
15
|
const key = String(keys[0]).trim().toUpperCase();
|
|
16
16
|
const mobileKeyMap = {
|
|
17
|
-
ESC: "
|
|
18
|
-
ESCAPE: "
|
|
17
|
+
ESC: "KEYCODE_BACK",
|
|
18
|
+
ESCAPE: "KEYCODE_BACK",
|
|
19
19
|
HOME: "KEYCODE_HOME",
|
|
20
20
|
BACK: "KEYCODE_BACK",
|
|
21
21
|
ENTER: "KEYCODE_ENTER",
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Assertion handling for script validation
|
|
3
3
|
*/
|
|
4
|
+
import { printCliOutput } from "../utils/console-output.js";
|
|
4
5
|
export function isAssertion(userInput) {
|
|
5
6
|
const trimmed = userInput.trim();
|
|
6
7
|
const lower = trimmed.toLowerCase();
|
|
@@ -56,7 +57,7 @@ export function extractFailureDetails(transcript) {
|
|
|
56
57
|
}
|
|
57
58
|
export function handleAssertionFailure(assertionPrompt, transcript, isHeadlessMode, context, stepContext = null) {
|
|
58
59
|
const details = extractFailureDetails(transcript);
|
|
59
|
-
const addOutput = context?.addOutput ||
|
|
60
|
+
const addOutput = context?.addOutput || printCliOutput;
|
|
60
61
|
const meta = {
|
|
61
62
|
eventType: 'assertion_result',
|
|
62
63
|
runId: context?.runId,
|
|
@@ -81,7 +82,7 @@ export function handleAssertionFailure(assertionPrompt, transcript, isHeadlessMo
|
|
|
81
82
|
// Interactive mode: caller should clear remaining instructions
|
|
82
83
|
}
|
|
83
84
|
export function handleAssertionSuccess(assertionPrompt, context = null, stepContext = null) {
|
|
84
|
-
const addOutput = context?.addOutput ||
|
|
85
|
+
const addOutput = context?.addOutput || printCliOutput;
|
|
85
86
|
addOutput({
|
|
86
87
|
type: 'success',
|
|
87
88
|
text: `✓ Assertion passed: ${assertionPrompt}`,
|