@loadmill/droid-cua 2.2.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/src/core/execution-engine.js +3 -1
- package/build/src/core/prompts.js +34 -18
- package/build/src/device/android/actions.js +2 -1
- package/build/src/device/cloud/actions.js +2 -1
- package/build/src/device/ios/actions.js +2 -1
- package/build/src/device/openai.js +58 -11
- package/build/src/modes/execution-mode.js +13 -1
- package/build/src/utils/step-delay.js +20 -0
- package/package.json +1 -1
|
@@ -4,6 +4,7 @@ import { getScreenshotAsBase64, getCurrentPlatform } from "../device/connection.
|
|
|
4
4
|
import { handleModelAction } from "../device/actions.js";
|
|
5
5
|
import { sendCUARequest } from "../device/openai.js";
|
|
6
6
|
import { emitDesktopDebug } from "../utils/desktop-debug.js";
|
|
7
|
+
import { getConfiguredStepDelayMs } from "../utils/step-delay.js";
|
|
7
8
|
function extractComputerCalls(items) {
|
|
8
9
|
const entries = [];
|
|
9
10
|
for (const item of items) {
|
|
@@ -34,6 +35,7 @@ export class ExecutionEngine {
|
|
|
34
35
|
this.session = session;
|
|
35
36
|
this.recordScreenshots = options.recordScreenshots || false;
|
|
36
37
|
this.screenshotDir = options.screenshotDir || null;
|
|
38
|
+
this.stepDelayMs = getConfiguredStepDelayMs();
|
|
37
39
|
}
|
|
38
40
|
/**
|
|
39
41
|
* Run a full turn with the CUA model
|
|
@@ -150,7 +152,7 @@ export class ExecutionEngine {
|
|
|
150
152
|
// Add delay after UI-changing actions to let the interface update
|
|
151
153
|
// before taking the screenshot (except for explicit wait actions which have their own delay)
|
|
152
154
|
if (action.type !== "wait") {
|
|
153
|
-
await new Promise(resolve => setTimeout(resolve,
|
|
155
|
+
await new Promise(resolve => setTimeout(resolve, this.stepDelayMs));
|
|
154
156
|
}
|
|
155
157
|
}
|
|
156
158
|
}
|
|
@@ -1,15 +1,33 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* System prompt templates for different modes
|
|
3
3
|
*/
|
|
4
|
-
function
|
|
5
|
-
const
|
|
6
|
-
|
|
4
|
+
/**
 * Render user-configured instruction sections into one prompt block.
 *
 * Each entry is expected to look like `{ title, text }`. An entry is dropped
 * when its title or text is missing, is not a string, or is blank after
 * trimming, so the result never contains an empty or malformed heading.
 *
 * @param {Array<{title?: string, text?: string}>} [sections] - Candidate
 *   sections, rendered in the order given.
 * @returns {string} The "USER CUSTOM INSTRUCTIONS" block, or "" when no
 *   usable section remains (including when `sections` is not an array).
 */
function buildCustomInstructionsSection(sections = []) {
  // The default parameter only covers `undefined`; null and other non-array
  // values would otherwise throw on `.map`.
  if (!Array.isArray(sections)) {
    return "";
  }
  const toTrimmedString = (value) => (typeof value === "string" ? value.trim() : "");
  const renderedSections = sections
    .map((section) => ({
      title: toTrimmedString(section?.title),
      text: toTrimmedString(section?.text),
    }))
    .filter(({ title, text }) => title !== "" && text !== "")
    .map(({ title, text }) => `${title}:\n${text}`)
    .join("\n\n");
  if (renderedSections === "") {
    return "";
  }
  // Wording is position-neutral: this block is appended after the default
  // prompt, so it must not claim the defaults come "below" it.
  return `USER CUSTOM INSTRUCTIONS:
Follow these user-configured instructions in addition to the default behavior described in the rest of this prompt.
Prefer these custom instructions when deciding how to behave.

${renderedSections}`;
}
|
|
23
|
+
/**
 * Append the rendered custom-instruction block to a prompt.
 *
 * @param {string} prompt - The prompt text to extend.
 * @param {Array<{title?: string, text?: string}>} [sections] - Sections handed
 *   to buildCustomInstructionsSection for rendering.
 * @returns {string} `prompt` unchanged when nothing was rendered; otherwise the
 *   prompt, a blank line, the rendered block, and a trailing newline.
 */
function appendCustomSections(prompt, sections = []) {
  const rendered = buildCustomInstructionsSection(sections);
  return rendered ? `${prompt}\n\n${rendered}\n` : prompt;
}
|
|
15
33
|
function describeControlledDevice(deviceInfo = {}) {
|
|
@@ -95,15 +113,11 @@ export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
|
|
|
95
113
|
- Use Home button (ESC) to escape from stuck situations and restart
|
|
96
114
|
- Never use CTRL, CMD, ALT, OPTION, or SHIFT in a keypress action
|
|
97
115
|
`;
|
|
98
|
-
return
|
|
116
|
+
return prompt;
|
|
99
117
|
}
|
|
100
118
|
export function buildDesignModePrompt(deviceInfo, customInstructions = {}) {
|
|
101
119
|
const designCustomText = typeof customInstructions.designModeInstructions === "string" ? customInstructions.designModeInstructions.trim() : "";
|
|
102
|
-
const
|
|
103
|
-
const basePrompt = buildBaseSystemPrompt(deviceInfo, {
|
|
104
|
-
...customInstructions,
|
|
105
|
-
basePromptInstructions: mergedBaseInstructions
|
|
106
|
-
});
|
|
120
|
+
const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
|
|
107
121
|
const prompt = `${basePrompt}
|
|
108
122
|
|
|
109
123
|
DESIGN MODE:
|
|
@@ -194,15 +208,14 @@ WRONG Example (DON'T DO THIS):
|
|
|
194
208
|
|
|
195
209
|
Remember: You are autonomous. Explore confidently. Generate simple, executable test scripts.
|
|
196
210
|
`;
|
|
197
|
-
return prompt
|
|
211
|
+
return appendCustomSections(prompt, [
|
|
212
|
+
{ title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
|
|
213
|
+
{ title: "Design Mode Instructions", text: designCustomText }
|
|
214
|
+
]);
|
|
198
215
|
}
|
|
199
216
|
export function buildExecutionModePrompt(deviceInfo, customInstructions = {}) {
|
|
200
217
|
const executionCustomText = typeof customInstructions.executionModeInstructions === "string" ? customInstructions.executionModeInstructions.trim() : "";
|
|
201
|
-
const
|
|
202
|
-
const basePrompt = buildBaseSystemPrompt(deviceInfo, {
|
|
203
|
-
...customInstructions,
|
|
204
|
-
basePromptInstructions: mergedBaseInstructions
|
|
205
|
-
});
|
|
218
|
+
const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
|
|
206
219
|
const prompt = `${basePrompt}
|
|
207
220
|
|
|
208
221
|
EXECUTION MODE - Critical Behavior:
|
|
@@ -227,5 +240,8 @@ Your process:
|
|
|
227
240
|
|
|
228
241
|
Each instruction is independent. Do not reference previous instructions or ask about next steps.
|
|
229
242
|
`;
|
|
230
|
-
return prompt
|
|
243
|
+
return appendCustomSections(prompt, [
|
|
244
|
+
{ title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
|
|
245
|
+
{ title: "Execution Mode Instructions", text: executionCustomText }
|
|
246
|
+
]);
|
|
231
247
|
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { logger } from "../../utils/logger.js";
|
|
2
2
|
import { emitDesktopDebug, truncateForDebug } from "../../utils/desktop-debug.js";
|
|
3
|
+
import { getConfiguredStepDelayMs } from "../../utils/step-delay.js";
|
|
3
4
|
import { execAdb } from "./tools.js";
|
|
4
5
|
function adbShell(deviceId, command) {
|
|
5
6
|
return execAdb(["-s", deviceId, "shell", command]);
|
|
@@ -115,7 +116,7 @@ export async function handleModelAction(deviceId, action, scale = 1.0, context =
|
|
|
115
116
|
break;
|
|
116
117
|
case "wait":
|
|
117
118
|
addOutput({ type: 'action', text: 'Waiting...', ...meta({}) });
|
|
118
|
-
await new Promise(res => setTimeout(res,
|
|
119
|
+
await new Promise(res => setTimeout(res, getConfiguredStepDelayMs()));
|
|
119
120
|
break;
|
|
120
121
|
default:
|
|
121
122
|
addOutput({ type: 'info', text: `Unknown action: ${JSON.stringify(action)}` });
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { logger } from "../../utils/logger.js";
|
|
2
2
|
import { emitDesktopDebug, truncateForDebug } from "../../utils/desktop-debug.js";
|
|
3
|
+
import { getConfiguredStepDelayMs } from "../../utils/step-delay.js";
|
|
3
4
|
import { getActiveSession, getDevicePixelRatio } from "./connection.js";
|
|
4
5
|
function normalizeMobileKeypress(platform, keys = []) {
|
|
5
6
|
if (!Array.isArray(keys) || keys.length === 0) {
|
|
@@ -140,7 +141,7 @@ export async function handleModelAction(deviceId, action, scale = 1.0, context =
|
|
|
140
141
|
}
|
|
141
142
|
case "wait":
|
|
142
143
|
addOutput({ type: "action", text: "Waiting...", ...meta({}) });
|
|
143
|
-
await new Promise((resolve) => setTimeout(resolve,
|
|
144
|
+
await new Promise((resolve) => setTimeout(resolve, getConfiguredStepDelayMs()));
|
|
144
145
|
break;
|
|
145
146
|
default:
|
|
146
147
|
addOutput({ type: "info", text: `Unknown action: ${JSON.stringify(action)}` });
|
|
@@ -7,6 +7,7 @@ import * as appium from "./appium-client.js";
|
|
|
7
7
|
import { getActiveSession, getDevicePixelRatio } from "./connection.js";
|
|
8
8
|
import { logger } from "../../utils/logger.js";
|
|
9
9
|
import { emitDesktopDebug, truncateForDebug } from "../../utils/desktop-debug.js";
|
|
10
|
+
import { getConfiguredStepDelayMs } from "../../utils/step-delay.js";
|
|
10
11
|
function normalizeMobileKeypress(keys = []) {
|
|
11
12
|
if (!Array.isArray(keys) || keys.length === 0) {
|
|
12
13
|
throw new Error("Keypress action is missing keys");
|
|
@@ -136,7 +137,7 @@ export async function handleModelAction(simulatorId, action, scale = 1.0, contex
|
|
|
136
137
|
}
|
|
137
138
|
case "wait": {
|
|
138
139
|
addOutput({ type: "action", text: "Waiting...", ...meta({}) });
|
|
139
|
-
await new Promise((resolve) => setTimeout(resolve,
|
|
140
|
+
await new Promise((resolve) => setTimeout(resolve, getConfiguredStepDelayMs()));
|
|
140
141
|
break;
|
|
141
142
|
}
|
|
142
143
|
default:
|
|
@@ -6,28 +6,69 @@ const cuaDebugTracer = new CuaDebugTracer(logger);
|
|
|
6
6
|
/**
 * Choose the CUA model from the OPENAI_CUA_MODEL environment variable.
 *
 * Only the exact value "computer-use-preview" selects the preview model; any
 * other value, or an unset variable, yields "gpt-5.4".
 *
 * @returns {string} The model identifier to send with CUA requests.
 */
function getSelectedCuaModel() {
  const previewModel = "computer-use-preview";
  if (process.env.OPENAI_CUA_MODEL === previewModel) {
    return previewModel;
  }
  return "gpt-5.4";
}
|
|
9
|
+
/**
 * Convert chat-style messages into typed content parts for the
 * computer-use-preview request, optionally attaching the initial screenshot.
 *
 * Items without a `role` are returned as-is (same object reference). For items
 * with a role, string content (or string entries inside a content array) is
 * wrapped in a text part, and non-string array entries are kept unchanged. The
 * input array and its message objects are never mutated: every role-bearing
 * message gets a fresh object with a fresh content array.
 *
 * @param {Array<object>} messages - Conversation items to normalize.
 * @param {string} [screenshotBase64] - Base64 PNG data for the screenshot.
 * @param {boolean} includeInitialScreenshot - When truthy and a screenshot is
 *   present, the screenshot is added to the last user message, or sent as a
 *   new user message if there is none.
 * @returns {Array<object>} A new array of normalized items.
 */
function normalizePreviewMessages(messages, screenshotBase64, includeInitialScreenshot) {
  const normalized = messages.map((message) => {
    if (!message?.role) {
      return message;
    }
    // The Responses API accepts only output-typed text parts on assistant
    // messages; tagging them "input_text" gets the whole request rejected.
    const textPartType = message.role === "assistant" ? "output_text" : "input_text";
    const toTextPart = (text) => ({ type: textPartType, text });
    const content = Array.isArray(message.content)
      ? message.content.map((part) => (typeof part === "string" ? toTextPart(part) : part))
      : [toTextPart(String(message.content ?? ""))];
    return { role: message.role, content };
  });
  if (!includeInitialScreenshot || !screenshotBase64) {
    return normalized;
  }
  const screenshotPart = {
    type: "input_image",
    image_url: `data:image/png;base64,${screenshotBase64}`,
  };
  // Walk backwards so the screenshot lands on the most recent user message.
  for (let index = normalized.length - 1; index >= 0; index -= 1) {
    const candidate = normalized[index];
    if (candidate?.role === "user" && Array.isArray(candidate.content)) {
      candidate.content.push(screenshotPart);
      return normalized;
    }
  }
  // No user message to attach to: send the screenshot as its own user message.
  normalized.push({ role: "user", content: [screenshotPart] });
  return normalized;
}
|
|
9
49
|
function buildCuaRequestParams({ cuaModel, previousResponseId, deviceInfo, input }) {
|
|
10
|
-
const common = {
|
|
11
|
-
model: cuaModel,
|
|
12
|
-
previous_response_id: previousResponseId || undefined,
|
|
13
|
-
input,
|
|
14
|
-
store: true,
|
|
15
|
-
truncation: "auto",
|
|
16
|
-
};
|
|
17
50
|
if (cuaModel === "computer-use-preview") {
|
|
18
51
|
return {
|
|
19
|
-
|
|
52
|
+
model: cuaModel,
|
|
53
|
+
previous_response_id: previousResponseId || undefined,
|
|
54
|
+
input,
|
|
55
|
+
store: true,
|
|
56
|
+
truncation: "auto",
|
|
20
57
|
tools: [{
|
|
21
58
|
type: "computer_use_preview",
|
|
22
59
|
display_width: deviceInfo.scaled_width,
|
|
23
60
|
display_height: deviceInfo.scaled_height,
|
|
24
61
|
environment: "browser",
|
|
25
62
|
}],
|
|
26
|
-
reasoning: {
|
|
63
|
+
reasoning: { summary: "concise" },
|
|
27
64
|
};
|
|
28
65
|
}
|
|
29
66
|
return {
|
|
30
|
-
|
|
67
|
+
model: cuaModel,
|
|
68
|
+
previous_response_id: previousResponseId || undefined,
|
|
69
|
+
input,
|
|
70
|
+
store: true,
|
|
71
|
+
truncation: "auto",
|
|
31
72
|
tools: [{
|
|
32
73
|
type: "computer",
|
|
33
74
|
}],
|
|
@@ -55,6 +96,9 @@ function mapCuaError(err, cuaModel) {
|
|
|
55
96
|
}
|
|
56
97
|
return err;
|
|
57
98
|
}
|
|
99
|
+
/**
 * Report whether a CUA request error should not be retried.
 *
 * An error counts as non-retryable only when its `status` is exactly 400 and
 * its `type` is exactly "invalid_request_error".
 *
 * @param {unknown} err - The value caught from the request; may be nullish.
 * @returns {boolean} True when both conditions hold, false otherwise.
 */
export function isNonRetryableCuaError(err) {
  if (err?.status !== 400) {
    return false;
  }
  return err?.type === "invalid_request_error";
}
|
|
58
102
|
function getOpenAI() {
|
|
59
103
|
if (!openai) {
|
|
60
104
|
openai = new OpenAI({
|
|
@@ -97,7 +141,10 @@ Output only the revised test script, nothing else.`
|
|
|
97
141
|
}
|
|
98
142
|
export async function sendCUARequest({ messages, screenshotBase64, previousResponseId, callId, deviceInfo, debugContext, }) {
|
|
99
143
|
const cuaModel = getSelectedCuaModel();
|
|
100
|
-
const
|
|
144
|
+
const includeInitialScreenshot = cuaModel === "computer-use-preview" && !previousResponseId && !callId;
|
|
145
|
+
const input = cuaModel === "computer-use-preview"
|
|
146
|
+
? normalizePreviewMessages(messages, screenshotBase64, includeInitialScreenshot)
|
|
147
|
+
: [...messages];
|
|
101
148
|
if (callId && screenshotBase64) {
|
|
102
149
|
input.push({
|
|
103
150
|
type: "computer_call_output",
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { getScreenshotAsBase64, connectToDevice, getDeviceInfo, getCurrentPlatform } from "../device/connection.js";
|
|
2
|
-
import { sendCUARequest } from "../device/openai.js";
|
|
2
|
+
import { isNonRetryableCuaError, sendCUARequest } from "../device/openai.js";
|
|
3
3
|
import { isAssertion, extractAssertionPrompt, buildAssertionSystemPrompt, checkAssertionResult, handleAssertionFailure, handleAssertionSuccess, } from "../device/assertions.js";
|
|
4
4
|
import { isLoadmillInstruction, extractLoadmillCommand, executeLoadmillInstruction, } from "../device/loadmill.js";
|
|
5
5
|
import { logger } from "../utils/logger.js";
|
|
@@ -308,6 +308,18 @@ export class ExecutionMode {
|
|
|
308
308
|
stack: err.stack
|
|
309
309
|
});
|
|
310
310
|
const addOutput = context.addOutput || ((item) => console.log(item.text || item));
|
|
311
|
+
if (isNonRetryableCuaError(err)) {
|
|
312
|
+
const message = `CUA request was rejected by the API: ${err.message}`;
|
|
313
|
+
this.emit(addOutput, 'error', message, context, stepContext, {
|
|
314
|
+
eventType: 'error',
|
|
315
|
+
payload: {
|
|
316
|
+
message: err.message,
|
|
317
|
+
status: err.status,
|
|
318
|
+
type: err.type
|
|
319
|
+
}
|
|
320
|
+
});
|
|
321
|
+
return { success: false, error: message };
|
|
322
|
+
}
|
|
311
323
|
// Check if we've exceeded max retries
|
|
312
324
|
if (retryCount >= MAX_RETRIES) {
|
|
313
325
|
emitDesktopDebug("reconnect.attempt", "device", {
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/** Delay used when no usable value is supplied. */
const DEFAULT_STEP_DELAY_MS = 1000;
/** Largest delay a caller can end up with; bigger values are capped to this. */
const MAX_STEP_DELAY_MS = 10000;

/**
 * Turn a candidate step delay into a whole number of milliseconds in range.
 *
 * @param {unknown} value - Candidate delay in milliseconds.
 * @returns {number} DEFAULT_STEP_DELAY_MS when `value` is not a finite number;
 *   otherwise `value` rounded to the nearest integer, with negative results
 *   replaced by 0 and results above MAX_STEP_DELAY_MS capped to that maximum.
 */
export function normalizeStepDelayMs(value) {
  const isUsableNumber = typeof value === "number" && Number.isFinite(value);
  if (!isUsableNumber) {
    return DEFAULT_STEP_DELAY_MS;
  }
  const rounded = Math.round(value);
  if (rounded < 0) {
    return 0;
  }
  return rounded > MAX_STEP_DELAY_MS ? MAX_STEP_DELAY_MS : rounded;
}

/**
 * Read the step delay from the DROID_CUA_STEP_DELAY_MS environment variable.
 *
 * The variable is parsed as a base-10 integer on every call; an unset or
 * unparseable value falls back to the default via normalizeStepDelayMs.
 *
 * @returns {number} The normalized delay in milliseconds.
 */
export function getConfiguredStepDelayMs() {
  const configured = process.env.DROID_CUA_STEP_DELAY_MS ?? "";
  return normalizeStepDelayMs(Number.parseInt(configured, 10));
}

export { DEFAULT_STEP_DELAY_MS, MAX_STEP_DELAY_MS };
|