@loadmill/droid-cua 2.2.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@ import { getScreenshotAsBase64, getCurrentPlatform } from "../device/connection.
4
4
  import { handleModelAction } from "../device/actions.js";
5
5
  import { sendCUARequest } from "../device/openai.js";
6
6
  import { emitDesktopDebug } from "../utils/desktop-debug.js";
7
+ import { getConfiguredStepDelayMs } from "../utils/step-delay.js";
7
8
  function extractComputerCalls(items) {
8
9
  const entries = [];
9
10
  for (const item of items) {
@@ -34,6 +35,7 @@ export class ExecutionEngine {
34
35
  this.session = session;
35
36
  this.recordScreenshots = options.recordScreenshots || false;
36
37
  this.screenshotDir = options.screenshotDir || null;
38
+ this.stepDelayMs = getConfiguredStepDelayMs();
37
39
  }
38
40
  /**
39
41
  * Run a full turn with the CUA model
@@ -150,7 +152,7 @@ export class ExecutionEngine {
150
152
  // Add delay after UI-changing actions to let the interface update
151
153
  // before taking the screenshot (except for explicit wait actions which have their own delay)
152
154
  if (action.type !== "wait") {
153
- await new Promise(resolve => setTimeout(resolve, 500));
155
+ await new Promise(resolve => setTimeout(resolve, this.stepDelayMs));
154
156
  }
155
157
  }
156
158
  }
@@ -1,15 +1,33 @@
1
1
  /**
2
2
  * System prompt templates for different modes
3
3
  */
4
- function appendCustomSection(prompt, customText) {
5
- const trimmed = typeof customText === "string" ? customText.trim() : "";
6
- if (!trimmed) {
4
+ function buildCustomInstructionsSection(sections = []) {
5
+ const nonEmptySections = sections
6
+ .map((section) => ({
7
+ title: section?.title,
8
+ text: typeof section?.text === "string" ? section.text.trim() : ""
9
+ }))
10
+ .filter((section) => section.title && section.text);
11
+ if (nonEmptySections.length === 0) {
12
+ return "";
13
+ }
14
+ const renderedSections = nonEmptySections
15
+ .map((section) => `${section.title}:\n${section.text}`)
16
+ .join("\n\n");
17
+ return `USER CUSTOM INSTRUCTIONS:
18
+ Follow these user-configured instructions in addition to the default behavior below.
19
+ Prefer these custom instructions when deciding how to behave.
20
+
21
+ ${renderedSections}`;
22
+ }
23
+ function appendCustomSections(prompt, sections = []) {
24
+ const customSection = buildCustomInstructionsSection(sections);
25
+ if (!customSection) {
7
26
  return prompt;
8
27
  }
9
28
  return `${prompt}
10
29
 
11
- CUSTOM INSTRUCTIONS:
12
- ${trimmed}
30
+ ${customSection}
13
31
  `;
14
32
  }
15
33
  function describeControlledDevice(deviceInfo = {}) {
@@ -95,15 +113,11 @@ export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
95
113
  - Use Home button (ESC) to escape from stuck situations and restart
96
114
  - Never use CTRL, CMD, ALT, OPTION, or SHIFT in a keypress action
97
115
  `;
98
- return appendCustomSection(prompt, customInstructions.basePromptInstructions);
116
+ return prompt;
99
117
  }
100
118
  export function buildDesignModePrompt(deviceInfo, customInstructions = {}) {
101
119
  const designCustomText = typeof customInstructions.designModeInstructions === "string" ? customInstructions.designModeInstructions.trim() : "";
102
- const mergedBaseInstructions = [customInstructions.basePromptInstructions, designCustomText].filter(Boolean).join("\n\n");
103
- const basePrompt = buildBaseSystemPrompt(deviceInfo, {
104
- ...customInstructions,
105
- basePromptInstructions: mergedBaseInstructions
106
- });
120
+ const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
107
121
  const prompt = `${basePrompt}
108
122
 
109
123
  DESIGN MODE:
@@ -194,15 +208,14 @@ WRONG Example (DON'T DO THIS):
194
208
 
195
209
  Remember: You are autonomous. Explore confidently. Generate simple, executable test scripts.
196
210
  `;
197
- return prompt;
211
+ return appendCustomSections(prompt, [
212
+ { title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
213
+ { title: "Design Mode Instructions", text: designCustomText }
214
+ ]);
198
215
  }
199
216
  export function buildExecutionModePrompt(deviceInfo, customInstructions = {}) {
200
217
  const executionCustomText = typeof customInstructions.executionModeInstructions === "string" ? customInstructions.executionModeInstructions.trim() : "";
201
- const mergedBaseInstructions = [customInstructions.basePromptInstructions, executionCustomText].filter(Boolean).join("\n\n");
202
- const basePrompt = buildBaseSystemPrompt(deviceInfo, {
203
- ...customInstructions,
204
- basePromptInstructions: mergedBaseInstructions
205
- });
218
+ const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
206
219
  const prompt = `${basePrompt}
207
220
 
208
221
  EXECUTION MODE - Critical Behavior:
@@ -227,5 +240,8 @@ Your process:
227
240
 
228
241
  Each instruction is independent. Do not reference previous instructions or ask about next steps.
229
242
  `;
230
- return prompt;
243
+ return appendCustomSections(prompt, [
244
+ { title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
245
+ { title: "Execution Mode Instructions", text: executionCustomText }
246
+ ]);
231
247
  }
@@ -1,5 +1,6 @@
1
1
  import { logger } from "../../utils/logger.js";
2
2
  import { emitDesktopDebug, truncateForDebug } from "../../utils/desktop-debug.js";
3
+ import { getConfiguredStepDelayMs } from "../../utils/step-delay.js";
3
4
  import { execAdb } from "./tools.js";
4
5
  function adbShell(deviceId, command) {
5
6
  return execAdb(["-s", deviceId, "shell", command]);
@@ -115,7 +116,7 @@ export async function handleModelAction(deviceId, action, scale = 1.0, context =
115
116
  break;
116
117
  case "wait":
117
118
  addOutput({ type: 'action', text: 'Waiting...', ...meta({}) });
118
- await new Promise(res => setTimeout(res, 1000));
119
+ await new Promise(res => setTimeout(res, getConfiguredStepDelayMs()));
119
120
  break;
120
121
  default:
121
122
  addOutput({ type: 'info', text: `Unknown action: ${JSON.stringify(action)}` });
@@ -1,5 +1,6 @@
1
1
  import { logger } from "../../utils/logger.js";
2
2
  import { emitDesktopDebug, truncateForDebug } from "../../utils/desktop-debug.js";
3
+ import { getConfiguredStepDelayMs } from "../../utils/step-delay.js";
3
4
  import { getActiveSession, getDevicePixelRatio } from "./connection.js";
4
5
  function normalizeMobileKeypress(platform, keys = []) {
5
6
  if (!Array.isArray(keys) || keys.length === 0) {
@@ -140,7 +141,7 @@ export async function handleModelAction(deviceId, action, scale = 1.0, context =
140
141
  }
141
142
  case "wait":
142
143
  addOutput({ type: "action", text: "Waiting...", ...meta({}) });
143
- await new Promise((resolve) => setTimeout(resolve, 1000));
144
+ await new Promise((resolve) => setTimeout(resolve, getConfiguredStepDelayMs()));
144
145
  break;
145
146
  default:
146
147
  addOutput({ type: "info", text: `Unknown action: ${JSON.stringify(action)}` });
@@ -7,6 +7,7 @@ import * as appium from "./appium-client.js";
7
7
  import { getActiveSession, getDevicePixelRatio } from "./connection.js";
8
8
  import { logger } from "../../utils/logger.js";
9
9
  import { emitDesktopDebug, truncateForDebug } from "../../utils/desktop-debug.js";
10
+ import { getConfiguredStepDelayMs } from "../../utils/step-delay.js";
10
11
  function normalizeMobileKeypress(keys = []) {
11
12
  if (!Array.isArray(keys) || keys.length === 0) {
12
13
  throw new Error("Keypress action is missing keys");
@@ -136,7 +137,7 @@ export async function handleModelAction(simulatorId, action, scale = 1.0, contex
136
137
  }
137
138
  case "wait": {
138
139
  addOutput({ type: "action", text: "Waiting...", ...meta({}) });
139
- await new Promise((resolve) => setTimeout(resolve, 1000));
140
+ await new Promise((resolve) => setTimeout(resolve, getConfiguredStepDelayMs()));
140
141
  break;
141
142
  }
142
143
  default:
@@ -6,28 +6,69 @@ const cuaDebugTracer = new CuaDebugTracer(logger);
6
6
  function getSelectedCuaModel() {
7
7
  return process.env.OPENAI_CUA_MODEL === "computer-use-preview" ? "computer-use-preview" : "gpt-5.4";
8
8
  }
9
+ function normalizePreviewMessages(messages, screenshotBase64, includeInitialScreenshot) {
10
+ const normalized = messages.map((message) => {
11
+ if (!message?.role) {
12
+ return message;
13
+ }
14
+ const contentItems = Array.isArray(message.content)
15
+ ? message.content.map((item) => {
16
+ if (typeof item === "string") {
17
+ return { type: "input_text", text: item };
18
+ }
19
+ return item;
20
+ })
21
+ : [{ type: "input_text", text: String(message.content ?? "") }];
22
+ return {
23
+ role: message.role,
24
+ content: contentItems,
25
+ };
26
+ });
27
+ if (!includeInitialScreenshot || !screenshotBase64) {
28
+ return normalized;
29
+ }
30
+ for (let index = normalized.length - 1; index >= 0; index -= 1) {
31
+ const item = normalized[index];
32
+ if (item?.role !== "user" || !Array.isArray(item.content))
33
+ continue;
34
+ item.content.push({
35
+ type: "input_image",
36
+ image_url: `data:image/png;base64,${screenshotBase64}`,
37
+ });
38
+ return normalized;
39
+ }
40
+ normalized.push({
41
+ role: "user",
42
+ content: [{
43
+ type: "input_image",
44
+ image_url: `data:image/png;base64,${screenshotBase64}`,
45
+ }],
46
+ });
47
+ return normalized;
48
+ }
9
49
  function buildCuaRequestParams({ cuaModel, previousResponseId, deviceInfo, input }) {
10
- const common = {
11
- model: cuaModel,
12
- previous_response_id: previousResponseId || undefined,
13
- input,
14
- store: true,
15
- truncation: "auto",
16
- };
17
50
  if (cuaModel === "computer-use-preview") {
18
51
  return {
19
- ...common,
52
+ model: cuaModel,
53
+ previous_response_id: previousResponseId || undefined,
54
+ input,
55
+ store: true,
56
+ truncation: "auto",
20
57
  tools: [{
21
58
  type: "computer_use_preview",
22
59
  display_width: deviceInfo.scaled_width,
23
60
  display_height: deviceInfo.scaled_height,
24
61
  environment: "browser",
25
62
  }],
26
- reasoning: { generate_summary: "concise" },
63
+ reasoning: { summary: "concise" },
27
64
  };
28
65
  }
29
66
  return {
30
- ...common,
67
+ model: cuaModel,
68
+ previous_response_id: previousResponseId || undefined,
69
+ input,
70
+ store: true,
71
+ truncation: "auto",
31
72
  tools: [{
32
73
  type: "computer",
33
74
  }],
@@ -55,6 +96,9 @@ function mapCuaError(err, cuaModel) {
55
96
  }
56
97
  return err;
57
98
  }
99
+ export function isNonRetryableCuaError(err) {
100
+ return err?.status === 400 && err?.type === "invalid_request_error";
101
+ }
58
102
  function getOpenAI() {
59
103
  if (!openai) {
60
104
  openai = new OpenAI({
@@ -97,7 +141,10 @@ Output only the revised test script, nothing else.`
97
141
  }
98
142
  export async function sendCUARequest({ messages, screenshotBase64, previousResponseId, callId, deviceInfo, debugContext, }) {
99
143
  const cuaModel = getSelectedCuaModel();
100
- const input = [...messages];
144
+ const includeInitialScreenshot = cuaModel === "computer-use-preview" && !previousResponseId && !callId;
145
+ const input = cuaModel === "computer-use-preview"
146
+ ? normalizePreviewMessages(messages, screenshotBase64, includeInitialScreenshot)
147
+ : [...messages];
101
148
  if (callId && screenshotBase64) {
102
149
  input.push({
103
150
  type: "computer_call_output",
@@ -1,5 +1,5 @@
1
1
  import { getScreenshotAsBase64, connectToDevice, getDeviceInfo, getCurrentPlatform } from "../device/connection.js";
2
- import { sendCUARequest } from "../device/openai.js";
2
+ import { isNonRetryableCuaError, sendCUARequest } from "../device/openai.js";
3
3
  import { isAssertion, extractAssertionPrompt, buildAssertionSystemPrompt, checkAssertionResult, handleAssertionFailure, handleAssertionSuccess, } from "../device/assertions.js";
4
4
  import { isLoadmillInstruction, extractLoadmillCommand, executeLoadmillInstruction, } from "../device/loadmill.js";
5
5
  import { logger } from "../utils/logger.js";
@@ -308,6 +308,18 @@ export class ExecutionMode {
308
308
  stack: err.stack
309
309
  });
310
310
  const addOutput = context.addOutput || ((item) => console.log(item.text || item));
311
+ if (isNonRetryableCuaError(err)) {
312
+ const message = `CUA request was rejected by the API: ${err.message}`;
313
+ this.emit(addOutput, 'error', message, context, stepContext, {
314
+ eventType: 'error',
315
+ payload: {
316
+ message: err.message,
317
+ status: err.status,
318
+ type: err.type
319
+ }
320
+ });
321
+ return { success: false, error: message };
322
+ }
311
323
  // Check if we've exceeded max retries
312
324
  if (retryCount >= MAX_RETRIES) {
313
325
  emitDesktopDebug("reconnect.attempt", "device", {
@@ -0,0 +1,20 @@
1
+ const DEFAULT_STEP_DELAY_MS = 1000;
2
+ const MAX_STEP_DELAY_MS = 10000;
3
+ export function normalizeStepDelayMs(value) {
4
+ if (typeof value !== "number" || !Number.isFinite(value)) {
5
+ return DEFAULT_STEP_DELAY_MS;
6
+ }
7
+ const normalized = Math.round(value);
8
+ if (normalized < 0) {
9
+ return 0;
10
+ }
11
+ if (normalized > MAX_STEP_DELAY_MS) {
12
+ return MAX_STEP_DELAY_MS;
13
+ }
14
+ return normalized;
15
+ }
16
+ export function getConfiguredStepDelayMs() {
17
+ const raw = Number.parseInt(process.env.DROID_CUA_STEP_DELAY_MS ?? "", 10);
18
+ return normalizeStepDelayMs(raw);
19
+ }
20
+ export { DEFAULT_STEP_DELAY_MS, MAX_STEP_DELAY_MS };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@loadmill/droid-cua",
3
- "version": "2.2.0",
3
+ "version": "2.2.1",
4
4
  "description": "AI-powered Android testing agent using OpenAI's computer-use model and ADB",
5
5
  "main": "build/index.js",
6
6
  "type": "module",