@loadmill/droid-cua 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/LICENSE +1 -0
  2. package/README.md +227 -0
  3. package/bin/droid-cua +6 -0
  4. package/build/index.js +58 -0
  5. package/build/src/cli/app.js +115 -0
  6. package/build/src/cli/command-parser.js +57 -0
  7. package/build/src/cli/components/AgentStatus.js +21 -0
  8. package/build/src/cli/components/CommandSuggestions.js +33 -0
  9. package/build/src/cli/components/InputPanel.js +21 -0
  10. package/build/src/cli/components/OutputPanel.js +58 -0
  11. package/build/src/cli/components/StatusBar.js +22 -0
  12. package/build/src/cli/ink-shell.js +56 -0
  13. package/build/src/commands/create.js +42 -0
  14. package/build/src/commands/edit.js +61 -0
  15. package/build/src/commands/exit.js +20 -0
  16. package/build/src/commands/help.js +34 -0
  17. package/build/src/commands/index.js +49 -0
  18. package/build/src/commands/list.js +55 -0
  19. package/build/src/commands/run.js +112 -0
  20. package/build/src/commands/stop.js +32 -0
  21. package/build/src/commands/view.js +43 -0
  22. package/build/src/core/execution-engine.js +114 -0
  23. package/build/src/core/prompts.js +158 -0
  24. package/build/src/core/session.js +57 -0
  25. package/build/src/device/actions.js +81 -0
  26. package/build/src/device/assertions.js +75 -0
  27. package/build/src/device/connection.js +123 -0
  28. package/build/src/device/openai.js +124 -0
  29. package/build/src/modes/design-mode-ink.js +396 -0
  30. package/build/src/modes/design-mode.js +366 -0
  31. package/build/src/modes/execution-mode.js +165 -0
  32. package/build/src/test-store/test-manager.js +92 -0
  33. package/build/src/utils/logger.js +86 -0
  34. package/package.json +68 -0
@@ -0,0 +1,158 @@
1
+ /**
2
+ * System prompt templates for different modes
3
+ */
4
/**
 * Build the base system prompt that every mode starts from.
 *
 * The prompt tells the model it is driving an Android phone, states the
 * size of the (scaled) screenshot it will see, and lays out the timing,
 * autonomy and mobile-specific rules.
 *
 * @param {{ scaled_width: number, scaled_height: number }} deviceInfo
 *   Only the scaled screenshot dimensions are read from this object.
 * @returns {string} The prompt text.
 */
export function buildBaseSystemPrompt(deviceInfo) {
  const { scaled_width: screenWidth, scaled_height: screenHeight } = deviceInfo;
  return `
You are controlling an Android phone in a sandboxed testing environment.
Follow the user's instructions to interact with the device.

The device screen has been scaled down for display.
You can interact with any part of the visible phone screen, including system UI, browser UI, and app content.

The screen you see is ${screenWidth} x ${screenHeight} pixels.
Pixel (0,0) is at the top-left corner.

When aiming for visual targets:
- Reason carefully about the approximate pixel position.
- Click precisely based on your visual estimate.

Available actions: click, scroll, type, keypress, wait, screenshot.

CRITICAL - Automatic Timing:
- After EVERY action (click, type, keypress, scroll), there is an automatic 500ms delay
- This 500ms is sufficient for normal UI updates and animations
- DO NOT add 'wait' actions unnecessarily - trust the automatic delay

Use explicit 'wait' action ONLY in these specific cases:
1. After launching apps from home screen or app drawer
2. After pressing ENTER that triggers navigation (search, URL, form submit)
3. After clicking links that open new apps or pages
4. After actions that trigger heavy loading (camera, maps, etc.)

When you MUST wait:
- Click app icon from home → wait → Continue
- Type in search box → Press ENTER → wait → Continue
- Click link that opens new page/app → wait → Continue
- Open camera/maps/heavy feature → wait → Continue

When you should NOT wait (automatic 500ms handles it):
- Clicking UI buttons within a running app (click button - no wait needed)
- Typing in text fields (type text - no wait needed)
- Scrolling (scroll - no wait needed)
- Clicking tabs or menu items within an app (click - no wait needed)

Rule of thumb: Wait for app launches and navigation. Everything else has automatic timing.

Perform the user's requested actions within the current view.

If unsure about visual elements, take a screenshot to improve your reasoning.
If unsure about the user's intent, make the best decision you can based on context and continue automatically.

CRITICAL - Never Ask Questions:
- NEVER ask the user for confirmation, clarification, or next steps
- NEVER ask questions like "Should I...", "Would you like...", "Do you want me to..."
- NEVER wait for user guidance - make autonomous decisions
- If stuck, try alternative approaches (go back, try different UI element, restart app)
- ONLY stop when the task is complete or you've exhausted reasonable approaches

Act decisively to complete the task.

Stop acting once the task appears complete.
Only complete the current instruction. Do not proceed beyond the current step unless asked.

Mobile-Specific Notes:
- ESC key maps to the Home button (return to home screen)
- Use Home button (ESC) to escape from stuck situations and restart
- Back button navigates within apps
`;
}
69
/**
 * Build the system prompt for design mode.
 *
 * The result is the base system prompt for the device followed by the
 * design-mode instructions: explore the app, recognise when stuck, and
 * emit a test script in the one-instruction-per-line format.
 *
 * @param {object} deviceInfo - Passed through to buildBaseSystemPrompt.
 * @returns {string} The combined prompt text.
 */
export function buildDesignModePrompt(deviceInfo) {
  return `${buildBaseSystemPrompt(deviceInfo)}

DESIGN MODE:
You are helping design a test script for an Android app.

Your task:
1. Understand what the user wants to test from their initial instruction
2. Explore the app autonomously to understand the flows
3. Take screenshots and interact as needed to discover the UI and behavior
4. Once you've successfully completed the user's requested flow, immediately generate the test script

CRITICAL - After Completing the Task:
- DO NOT navigate back or away from the final screen
- The final screen state is what matters for verification
- Generate the test script immediately showing the current state
- Use assertions to verify state, not navigation
- "Check that it changed" means verify the current visual state, not navigate elsewhere

CRITICAL - Recognizing When You Are Stuck:
If you find yourself:
- Repeating similar actions multiple times (e.g., opening/closing the same app repeatedly)
- Not reaching a new screen or state after several attempts
- Unsure about a higher-level decision (which tab to use, which mode to enter, where to start)
- Unable to find the UI element or feature the user mentioned

THEN STOP ACTING IMMEDIATELY and ask the user for guidance:
1. Briefly describe what you see on screen now
2. Explain what you were trying to do and why you're stuck
3. Ask a single, concrete question to unblock the next step

Example:
"Chrome is open but I don't see a search bar or new tab button. Should I open a new tab, or is there a specific way you'd like me to navigate?"

DO NOT continue brute-forcing the UI when stuck. The user prefers being asked over watching repeated failed attempts.
DO NOT ask if the user wants a script after successfully completing the flow - just generate it automatically.

CRITICAL - Test Script Format Rules:
- One simple instruction per line (NO numbers, NO bullets)
- Use imperative commands: "Open X", "Click Y", "Type Z"
- Include "assert: <condition>" lines to validate expected behavior
- End with "exit"
- Keep it simple and executable

CORRECT Example:
\`\`\`
Open Calculator app
assert: Calculator app is visible
Type "2"
Click the plus button
Type "3"
Click the equals button
assert: result shows 5
exit
\`\`\`

WRONG Example (DON'T DO THIS):
\`\`\`
1. Open Calculator app
2. Verify the app opened
3. etc...
\`\`\`

Remember: You are autonomous. Explore confidently. Generate simple, executable test scripts.
`;
}
136
/**
 * Build the system prompt for execution mode.
 *
 * The result is the base system prompt for the device followed by rules
 * that tell the model to run one script instruction at a time without
 * conversational output.
 *
 * @param {object} deviceInfo - Passed through to buildBaseSystemPrompt.
 * @returns {string} The combined prompt text.
 */
export function buildExecutionModePrompt(deviceInfo) {
  return `${buildBaseSystemPrompt(deviceInfo)}

EXECUTION MODE - Critical Behavior:
You are executing test script commands one at a time. This is NOT a conversation.

CRITICAL RULES:
- DO NOT generate conversational text or narration
- DO NOT ask questions like "What should I do next?", "Would you like...", "Can I assist...?"
- DO NOT describe what you see on screen
- DO NOT say "Let me know if you need help" or similar phrases
- Just execute the action silently and stop immediately
- Only generate text if the action FAILED or cannot be completed

Your process:
1. Read the instruction
2. Execute the required actions
3. Stop immediately - no commentary, no questions

Each instruction is independent. Do not reference previous instructions or ask about next steps.
`;
}
@@ -0,0 +1,57 @@
1
/**
 * State container for a single CLI session.
 *
 * Tracks the device id and info, the conversation messages, the id of the
 * previous response (for chaining), a transcript of output lines, and the
 * system prompt currently in effect.
 */
export class Session {
  /**
   * @param {string} deviceId - Identifier of the connected device.
   * @param {object} deviceInfo - Device information stored as-is.
   */
  constructor(deviceId, deviceInfo) {
    Object.assign(this, {
      deviceId,
      deviceInfo,
      messages: [],
      previousResponseId: null,
      transcript: [],
      systemPrompt: null,
    });
  }

  /**
   * Append one message to the conversation history.
   * @param {string} role
   * @param {*} content
   */
  addMessage(role, content) {
    const message = { role, content };
    this.messages.push(message);
  }

  /**
   * Reset the conversation history. When a system prompt has been set,
   * the history restarts with that prompt as its only message.
   */
  clearMessages() {
    this.messages = this.systemPrompt
      ? [{ role: "system", content: this.systemPrompt }]
      : [];
  }

  /**
   * Store the system prompt and restart the history with it as the only
   * message.
   * @param {string} prompt
   */
  setSystemPrompt(prompt) {
    this.systemPrompt = prompt;
    this.messages = [{ role: "system", content: prompt }];
  }

  /**
   * Append one line to the transcript.
   * @param {string} line
   */
  addToTranscript(line) {
    this.transcript.push(line);
  }

  /**
   * @returns {string} All transcript lines joined with newlines.
   */
  getTranscriptText() {
    return this.transcript.join("\n");
  }

  /**
   * Remember the id of the latest response so the next request can chain
   * onto it.
   * @param {string} responseId
   */
  updateResponseId(responseId) {
    this.previousResponseId = responseId;
  }
}
@@ -0,0 +1,81 @@
1
import { exec, execFile } from "child_process";
import { promisify } from "util";
import { logger } from "../utils/logger.js";
4
// Promise-returning execFile: runs adb directly, with no host shell in between.
const execFileAsync = promisify(execFile);
/**
 * Run a command in the device shell via `adb -s <deviceId> shell`.
 *
 * The command is handed to adb as a single argument instead of being
 * interpolated into a host shell string. Previously the inner quotes of
 * commands such as `input text "..."` closed the outer quotes on the host,
 * so characters like `;`, `&` or `|` in model-supplied text were executed
 * by the host shell. Now only the device shell parses the command.
 *
 * @param {string} deviceId - adb serial of the target device.
 * @param {string} command - Command line for the device shell.
 * @returns {Promise<{ stdout: string, stderr: string }>} Output of adb;
 *   rejects when adb cannot be started or exits with a non-zero code.
 */
function adbShell(deviceId, command) {
  return execFileAsync("adb", ["-s", deviceId, "shell", command]);
}
8
/**
 * Execute one computer-use action on the device through adb.
 *
 * Coordinates and scroll distances are divided by `scale` to convert them
 * from screenshot space to device pixels. Errors are not thrown to the
 * caller: they are logged with `logger.error` and reported through the
 * output callback.
 *
 * @param {string} deviceId - adb serial of the target device.
 * @param {object} action - Action to run; `action.type` selects the
 *   behaviour (click, scroll, drag, type, keypress, wait, screenshot).
 * @param {number} [scale=1.0] - Screenshot-to-device scale factor.
 * @param {{ addOutput?: Function } | null} [context=null] - Optional sink
 *   for progress items; falls back to console.log.
 * @returns {Promise<void>}
 */
export async function handleModelAction(deviceId, action, scale = 1.0, context = null) {
  const addOutput = context?.addOutput || ((item) => console.log(item.text || item));
  // Convert a screenshot-space value to device pixels. Anything that does
  // not yield a finite number is rejected here so that a command such as
  // "input tap NaN NaN" is never sent to the device.
  const toDevicePixels = (value, label) => {
    const numeric = typeof value === "string" && value.trim() !== "" ? Number(value) : value;
    const pixels = typeof numeric === "number" ? Math.round(numeric / scale) : Number.NaN;
    if (!Number.isFinite(pixels)) {
      throw new Error(`${action.type} action has an invalid ${label}: ${JSON.stringify(value)}`);
    }
    return pixels;
  };
  try {
    const { x, y, text, keys, path } = action;
    switch (action.type) {
      case "click": {
        const realX = toDevicePixels(x, "x");
        const realY = toDevicePixels(y, "y");
        addOutput({ type: 'action', text: `Clicking at (${realX}, ${realY})` });
        await adbShell(deviceId, `input tap ${realX} ${realY}`);
        break;
      }
      case "scroll": {
        // A missing axis means "no movement on that axis".
        const scrollX = toDevicePixels(action.scroll_x ?? 0, "scroll_x");
        const scrollY = toDevicePixels(action.scroll_y ?? 0, "scroll_y");
        addOutput({ type: 'action', text: `Scrolling by (${scrollX}, ${scrollY})` });
        // NOTE(review): the swipe always starts at (500, 500) in device
        // pixels; the action's own x/y are not used as the origin.
        const startX = 500;
        const startY = 500;
        // Content follows the finger, so scrolling by a positive amount
        // means swiping in the opposite direction on both axes.
        const endX = startX - scrollX;
        const endY = startY - scrollY;
        await adbShell(deviceId, `input swipe ${startX} ${startY} ${endX} ${endY} 500`);
        break;
      }
      case "drag": {
        if (path && path.length >= 2) {
          const start = path[0];
          const end = path[path.length - 1];
          const realStartX = toDevicePixels(start.x, "path start x");
          const realStartY = toDevicePixels(start.y, "path start y");
          const realEndX = toDevicePixels(end.x, "path end x");
          const realEndY = toDevicePixels(end.y, "path end y");
          addOutput({ type: 'action', text: `Dragging from (${realStartX}, ${realStartY}) to (${realEndX}, ${realEndY})` });
          await adbShell(deviceId, `input swipe ${realStartX} ${realStartY} ${realEndX} ${realEndY} 500`);
        }
        else {
          addOutput({ type: 'info', text: `Drag action missing valid path: ${JSON.stringify(action)}` });
        }
        break;
      }
      case "type": {
        if (typeof text !== "string") {
          throw new Error(`type action requires a text string, got: ${JSON.stringify(text)}`);
        }
        addOutput({ type: 'action', text: `Typing text: ${text}` });
        // Escape characters that are special inside double quotes in the
        // device shell; `input text` takes %s as a space.
        const escapedText = text.replace(/(["\\$`])/g, "\\$1").replace(/ /g, "%s");
        await adbShell(deviceId, `input text "${escapedText}"`);
        break;
      }
      case "keypress": {
        if (!Array.isArray(keys)) {
          throw new Error(`keypress action requires a keys array, got: ${JSON.stringify(keys)}`);
        }
        const mappedKeys = keys.map((key) => {
          // Key names are placed unquoted into a shell command, so accept
          // only plain identifiers (letters, digits, underscore).
          if (typeof key !== "string" || !/^\w+$/.test(key)) {
            throw new Error(`keypress action has an invalid key: ${JSON.stringify(key)}`);
          }
          // Map ESC to Android Home button (since ESC doesn't exist on mobile)
          const upper = key.toUpperCase();
          return upper === 'ESC' || upper === 'ESCAPE' ? 'KEYCODE_HOME' : key;
        });
        addOutput({ type: 'action', text: `Pressing key: ${mappedKeys.join(', ')}` });
        for (const key of mappedKeys) {
          await adbShell(deviceId, `input keyevent ${key}`);
        }
        break;
      }
      case "wait": {
        addOutput({ type: 'action', text: 'Waiting...' });
        await new Promise((res) => setTimeout(res, 1000));
        break;
      }
      case "screenshot": {
        // The prompt advertises a "screenshot" action; it needs no device
        // input, so it must not be reported as unknown.
        // NOTE(review): assumes the caller captures the screen after each
        // action - confirm against the execution engine.
        addOutput({ type: 'action', text: 'Taking screenshot' });
        break;
      }
      default:
        addOutput({ type: 'info', text: `Unknown action: ${JSON.stringify(action)}` });
    }
  }
  catch (error) {
    // Log full error details to file
    logger.error('Action execution error', {
      action,
      message: error.message,
      stack: error.stack
    });
    // Show user-friendly error message
    addOutput({ type: 'error', text: `Error executing action: ${error.message}` });
    addOutput({ type: 'info', text: 'Full error details have been logged to the debug log.' });
  }
}
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Assertion handling for script validation
3
+ */
4
/**
 * Tell whether a script line is an assertion, i.e. starts (ignoring
 * surrounding whitespace and letter case) with "assert:" or "assert ".
 *
 * @param {string} userInput - Raw instruction line.
 * @returns {boolean}
 */
export function isAssertion(userInput) {
  const normalized = userInput.trim().toLowerCase();
  return ["assert:", "assert "].some((prefix) => normalized.startsWith(prefix));
}
9
/**
 * Strip the leading "assert:" / "assert " keyword (any letter case) from
 * an instruction and return the remaining condition, trimmed. Input that
 * does not start with the keyword is returned trimmed but otherwise
 * unchanged.
 *
 * @param {string} userInput - Raw instruction line.
 * @returns {string}
 */
export function extractAssertionPrompt(userInput) {
  const keyword = "assert";
  const statement = userInput.trim();
  // The keyword plus the one separator character that follows it.
  const head = statement.slice(0, keyword.length + 1).toLowerCase();
  if (head === `${keyword}:` || head === `${keyword} `) {
    return statement.slice(keyword.length + 1).trim();
  }
  return statement;
}
22
/**
 * Build the system prompt used while validating an assertion: the given
 * base prompt followed by instructions that require a verdict line of
 * "ASSERTION RESULT: PASS" or "ASSERTION RESULT: FAIL".
 *
 * @param {string} baseSystemPrompt - Prompt to extend.
 * @param {string} assertionPrompt - The condition to validate.
 * @returns {string}
 */
export function buildAssertionSystemPrompt(baseSystemPrompt, assertionPrompt) {
  const lines = [
    `${baseSystemPrompt}`,
    "",
    "ASSERTION MODE:",
    "You are now validating an assertion. The user has provided an assertion statement that you must verify.",
    "",
    "Your task:",
    "1. Take screenshots and perform LIMITED actions if needed to validate the assertion.",
    "2. Determine if the assertion is TRUE or FALSE based on the current state.",
    "3. You MUST respond with a clear verdict in this exact format:",
    '- If the assertion is true, include the text: "ASSERTION RESULT: PASS"',
    '- If the assertion is false or cannot be confidently validated, include: "ASSERTION RESULT: FAIL"',
    "4. After the verdict, provide a brief explanation (1-2 sentences) of why it passed or failed.",
    "",
    `The assertion to validate is: "${assertionPrompt}"`,
    "",
    "Remember:",
    "- If you cannot confidently validate the assertion, treat it as FAIL.",
    '- You must include either "ASSERTION RESULT: PASS" or "ASSERTION RESULT: FAIL" in your response.',
    "- Be thorough but efficient. Only take the actions necessary to validate the assertion.",
  ];
  return lines.join("\n");
}
43
/**
 * Scan the transcript for the verdict markers. The assertion counts as
 * passed only when the PASS marker is present and the FAIL marker is not;
 * every other combination counts as failed.
 *
 * @param {string[]} transcript - Transcript lines.
 * @returns {{ passed: boolean, failed: boolean }}
 */
export function checkAssertionResult(transcript) {
  const fullText = transcript.join("\n");
  const sawPass = fullText.includes("ASSERTION RESULT: PASS");
  const sawFail = fullText.includes("ASSERTION RESULT: FAIL");
  const passed = sawPass && !sawFail;
  // failed is the exact negation of passed: (fail seen) OR (no pass seen).
  return { passed, failed: !passed };
}
52
/**
 * Pull the explanation that follows the first "ASSERTION RESULT: FAIL"
 * marker out of the last five transcript entries. Falls back to a generic
 * message when there is no marker or nothing follows it.
 *
 * @param {string[]} transcript - Transcript lines.
 * @returns {string}
 */
export function extractFailureDetails(transcript) {
  const marker = "ASSERTION RESULT: FAIL";
  const tail = transcript.slice(-5).join("\n");
  const [, afterMarker] = tail.split(marker);
  return afterMarker?.trim() || "Could not confidently validate the assertion.";
}
57
/**
 * Report a failed assertion and, in headless mode, terminate the process.
 *
 * Three error items are emitted (banner, assertion text, details taken
 * from the transcript). In headless mode `context.exit()` is called when
 * available and the process then exits with code 1. In interactive mode
 * the function simply returns; the caller decides what to do next.
 *
 * @param {string} assertionPrompt - The assertion that failed.
 * @param {string[]} transcript - Transcript lines used for the details.
 * @param {boolean} isHeadlessMode - Whether to exit the process.
 * @param {{ addOutput?: Function, exit?: Function }} [context] - Output
 *   sink and optional shutdown hook; output falls back to console.log.
 */
export function handleAssertionFailure(assertionPrompt, transcript, isHeadlessMode, context) {
  const details = extractFailureDetails(transcript);
  const emit = context?.addOutput || ((item) => console.log(item.text || item));
  const report = [
    '❌ ASSERTION FAILED',
    `Assertion: ${assertionPrompt}`,
    `Details: ${details}`,
  ];
  for (const text of report) {
    emit({ type: 'error', text });
  }
  if (!isHeadlessMode) {
    // Interactive mode: caller should clear remaining instructions
    return;
  }
  // Headless mode: exit with error code
  if (context?.exit) {
    context.exit();
  }
  process.exit(1);
}
72
/**
 * Report a passed assertion as a single success item.
 *
 * @param {string} assertionPrompt - The assertion that passed.
 * @param {{ addOutput?: Function } | null} [context=null] - Output sink;
 *   falls back to console.log.
 */
export function handleAssertionSuccess(assertionPrompt, context = null) {
  const emit = context?.addOutput || ((item) => console.log(item.text || item));
  const text = `✓ Assertion passed: ${assertionPrompt}`;
  emit({ type: 'success', text });
}
@@ -0,0 +1,123 @@
1
+ import { exec, spawn } from "child_process";
2
+ import { once } from "events";
3
+ import { promisify } from "util";
4
+ import sharp from "sharp";
5
+ import { logger } from "../utils/logger.js";
6
// Promise-returning wrapper around child_process.exec; resolves with
// { stdout, stderr }.
const execAsync = promisify(exec);
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function wait(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
10
/**
 * List the serials reported by `adb devices`.
 *
 * Only tab-separated "<serial>\t<state>" rows are treated as devices.
 * The previous version assumed the header was always the first line and
 * split on "\n" only, so adb daemon start-up banners printed before the
 * header (or CRLF output) leaked non-device lines into the result.
 * Devices are returned regardless of their state column.
 *
 * @returns {Promise<string[]>} Device serials in the order adb lists them.
 */
async function listConnectedDevices() {
  const { stdout } = await execAsync("adb devices");
  return stdout
    .split(/\r?\n/)
    .filter((line) => line.includes("\t"))
    .map((line) => line.split("\t")[0].trim())
    .filter((id) => id.length > 0);
}
19
/**
 * Poll the connected-device list until a serial containing the given
 * substring shows up, checking every two seconds.
 *
 * @param {string} avdName - Substring to look for in device serials.
 * @param {number} [timeoutMs=120000] - How long to keep polling.
 * @returns {Promise<string|null>} The matching serial, or null on timeout.
 */
async function waitForDeviceConnection(avdName, timeoutMs = 120000) {
  const giveUpAt = Date.now() + timeoutMs;
  for (;;) {
    if (Date.now() >= giveUpAt) {
      return null;
    }
    const connected = await listConnectedDevices();
    const found = connected.find((id) => id.includes(avdName));
    if (found) {
      return found;
    }
    await wait(2000);
  }
}
30
/**
 * Poll `getprop sys.boot_completed` on the device until it reports "1",
 * checking every two seconds. adb errors during polling are ignored and
 * simply lead to another attempt.
 *
 * @param {string} deviceId - adb serial of the device.
 * @param {number} [timeoutMs=60000] - How long to keep polling.
 * @returns {Promise<boolean>} true once booted, false on timeout.
 */
async function waitForDeviceBoot(deviceId, timeoutMs = 60000) {
  const giveUpAt = Date.now() + timeoutMs;
  const isBooted = async () => {
    try {
      const { stdout } = await execAsync(`adb -s ${deviceId} shell getprop sys.boot_completed`);
      return stdout.trim() === "1";
    }
    catch {
      // Best effort: a failing adb call counts as "not booted yet".
      return false;
    }
  };
  while (Date.now() < giveUpAt) {
    if (await isBooted()) {
      return true;
    }
    await wait(2000);
  }
  return false;
}
43
/**
 * Return the adb serial of an emulator running the given AVD, launching
 * the emulator first when none is running.
 *
 * Exits the process with code 1 when the emulator cannot be started, does
 * not show up in `adb devices` in time, or does not finish booting.
 *
 * @param {string} avdName - Name of the Android Virtual Device.
 * @returns {Promise<string>} adb serial of the emulator.
 */
export async function connectToDevice(avdName) {
  const devices = await listConnectedDevices();
  for (const id of devices) {
    if (id.startsWith("emulator-")) {
      try {
        const { stdout } = await execAsync(`adb -s ${id} emu avd name`);
        // The emulator console answers with the AVD name followed by an
        // "OK" status line, so only the first line is the name. Comparing
        // the whole output never matched and relaunched a running AVD.
        const [reportedName = ""] = stdout.trim().split(/\r?\n/);
        if (reportedName.trim() === avdName) {
          console.log(`Emulator ${avdName} is already running as ${id}`);
          return id;
        }
      }
      catch {
        // Best effort: an emulator that does not answer is skipped.
      }
    }
  }
  console.log(`No emulator with AVD "${avdName}" is running. Launching...`);
  const emulatorProcess = spawn("emulator", ["-avd", avdName], { detached: true, stdio: "ignore" });
  // Without a listener, a failed spawn (e.g. "emulator" not on PATH) is an
  // unhandled 'error' event that crashes with a raw stack trace.
  emulatorProcess.once("error", (err) => {
    console.error(`Failed to launch emulator ${avdName}: ${err.message}`);
    process.exit(1);
  });
  emulatorProcess.unref();
  // NOTE(review): this accepts the first "emulator-" serial that appears,
  // which may belong to a different AVD if another emulator is running.
  const deviceId = await waitForDeviceConnection("emulator-", 120000);
  if (!deviceId) {
    console.error(`Emulator ${avdName} did not appear in time.`);
    process.exit(1);
  }
  console.log(`Device ${deviceId} detected. Waiting for boot...`);
  const booted = await waitForDeviceBoot(deviceId);
  if (!booted) {
    console.error(`Emulator ${avdName} did not finish booting.`);
    process.exit(1);
  }
  console.log(`Emulator ${avdName} is fully booted.`);
  return deviceId;
}
74
/**
 * Read the device's physical screen size from `wm size` and work out the
 * scaled dimensions used for screenshots.
 *
 * Screens wider than 400 px are scaled down so the width becomes 400;
 * narrower screens keep a scale of 1. Exits the process with code 1 when
 * the size cannot be parsed.
 *
 * @param {string} deviceId - adb serial of the device.
 * @returns {Promise<{ device_width: number, device_height: number,
 *   scaled_width: number, scaled_height: number, scale: number }>}
 */
export async function getDeviceInfo(deviceId) {
  const { stdout } = await execAsync(`adb -s ${deviceId} shell wm size`);
  const sizeMatch = /Physical size:\s*(\d+)x(\d+)/.exec(stdout);
  if (sizeMatch === null) {
    console.error("Could not get device screen size.");
    process.exit(1);
  }
  const width = Number(sizeMatch[1]);
  const height = Number(sizeMatch[2]);
  const targetWidth = 400;
  let scale = 1.0;
  if (width > targetWidth) {
    scale = targetWidth / width;
  }
  return {
    device_width: width,
    device_height: height,
    scaled_width: Math.round(width * scale),
    scaled_height: Math.round(height * scale),
    scale,
  };
}
94
/**
 * Capture the device screen with `adb exec-out screencap -p` and return
 * it base64-encoded. When `deviceInfo.scale` is below 1 the PNG is first
 * resized to the scaled dimensions.
 *
 * @param {string} deviceId - adb serial of the device.
 * @param {{ scale: number, scaled_width: number, scaled_height: number }} deviceInfo
 * @returns {Promise<string>} Base64-encoded PNG data.
 * @throws {Error} When adb exits with a non-zero code or produces no data.
 */
export async function getScreenshotAsBase64(deviceId, deviceInfo) {
  const screencap = spawn("adb", ["-s", deviceId, "exec-out", "screencap", "-p"]);
  const stdoutParts = [];
  const stderrParts = [];
  screencap.stdout.on("data", (part) => {
    stdoutParts.push(part);
  });
  screencap.stderr.on("data", (part) => {
    stderrParts.push(part);
    console.error("ADB stderr:", part.toString());
  });
  const [exitCode] = await once(screencap, "close");
  if (exitCode !== 0) {
    const stderrOutput = Buffer.concat(stderrParts).toString();
    logger.error(`ADB screencap failed with code ${exitCode}`, { stderr: stderrOutput });
    throw new Error(`adb screencap exited with code ${exitCode}`);
  }
  const rawPng = Buffer.concat(stdoutParts);
  logger.debug(`Screenshot captured: ${rawPng.length} bytes before scaling`);
  if (rawPng.length === 0) {
    logger.error('Screenshot buffer is empty!', { deviceId, chunks: stdoutParts.length });
    throw new Error('Screenshot capture returned empty buffer');
  }
  // Only a scale strictly below 1 triggers resizing.
  if (!(deviceInfo.scale < 1.0)) {
    return rawPng.toString("base64");
  }
  const scaledPng = await sharp(rawPng)
    .resize({ width: deviceInfo.scaled_width, height: deviceInfo.scaled_height })
    .png()
    .toBuffer();
  logger.debug(`Screenshot scaled: ${scaledPng.length} bytes after scaling`);
  return scaledPng.toString("base64");
}
@@ -0,0 +1,124 @@
1
+ import OpenAI from "openai";
2
+ import dotenv from "dotenv";
3
+ import { logger } from "../utils/logger.js";
4
// Load variables from a .env file into process.env before the key is read.
dotenv.config();
// Shared OpenAI client for this module, authenticated with OPENAI_API_KEY.
const openai = new OpenAI({ apiKey: process.env["OPENAI_API_KEY"] });
8
+ /**
9
+ * Revise a test script based on user feedback using simple chat completion
10
+ * @param {string} originalScript - The original test script
11
+ * @param {string} revisionRequest - User's requested changes
12
+ * @returns {Promise<string>} - The revised test script
13
+ */
14
+ export async function reviseTestScript(originalScript, revisionRequest) {
15
+ const response = await openai.chat.completions.create({
16
+ model: "gpt-4o",
17
+ messages: [{
18
+ role: "system",
19
+ content: `You are editing a test script based on user feedback.
20
+
21
+ Current test script:
22
+ ${originalScript}
23
+
24
+ User's revision request:
25
+ ${revisionRequest}
26
+
27
+ Apply the user's changes and output the revised test script.
28
+
29
+ FORMAT RULES:
30
+ - One simple instruction per line (NO numbers, NO bullets)
31
+ - Use imperative commands: "Open X", "Click Y", "Type Z"
32
+ - Include "assert: <condition>" lines to validate expected behavior
33
+ - End with "exit"
34
+
35
+ Output only the revised test script, nothing else.`
36
+ }]
37
+ });
38
+ return response.choices[0].message.content.trim();
39
+ }
40
/**
 * Send one request to the computer-use model and return the raw response.
 *
 * The input is the given messages, followed by a `computer_call_output`
 * screenshot item when both `callId` and `screenshotBase64` are provided.
 * The request and a summary of the response are written to the debug log,
 * with screenshot data replaced by its length. A failed request is logged
 * and the error is rethrown.
 *
 * @param {object} params
 * @param {Array<object>} params.messages - Input items to send.
 * @param {string} [params.screenshotBase64] - Base64 PNG of the screen.
 * @param {string|null} [params.previousResponseId] - Response to chain onto.
 * @param {string} [params.callId] - Id of the computer call being answered.
 * @param {{ scaled_width: number, scaled_height: number }} params.deviceInfo
 * @returns {Promise<object>} The response returned by the API client.
 */
export async function sendCUARequest({ messages, screenshotBase64, previousResponseId, callId, deviceInfo, }) {
  const screenshotItems = callId && screenshotBase64
    ? [{
        type: "computer_call_output",
        call_id: callId,
        output: {
          type: "computer_screenshot",
          image_url: `data:image/png;base64,${screenshotBase64}`,
        },
      }]
    : [];
  const input = [...messages, ...screenshotItems];
  const requestParams = {
    model: "computer-use-preview",
    previous_response_id: previousResponseId || undefined,
    tools: [{
      type: "computer_use_preview",
      display_width: deviceInfo.scaled_width,
      display_height: deviceInfo.scaled_height,
      environment: "browser",
    }],
    input,
    store: true,
    reasoning: { generate_summary: "concise" },
    truncation: "auto",
  };
  // Swap the screenshot payload for its length so the log stays readable.
  const redactForLog = (item) => {
    if (!(item.type === "computer_call_output" && item.output?.image_url)) {
      return item;
    }
    const payloadLength = item.output.image_url.replace('data:image/png;base64,', '').length;
    return {
      ...item,
      output: {
        ...item.output,
        image_url: `data:image/png;base64,[${payloadLength} chars]`
      },
      current_url: item.current_url,
      acknowledged_safety_checks: item.acknowledged_safety_checks
    };
  };
  const requestLog = { ...requestParams, input: input.map(redactForLog) };
  logger.debug('CUA Request:', requestLog);
  let response;
  try {
    response = await openai.responses.create(requestParams);
    const outputItems = response.output || [];
    const outputTypes = outputItems.map((item) => item.type);
    const toolCalls = outputItems
      .filter((item) => item.type === 'computer_call')
      .map(({ call_id, action }) => ({ call_id, action_type: action?.type }));
    const safetyChecks = outputItems
      .filter((item) => item.type === 'pending_safety_check')
      .map(({ id, code }) => ({ id, code }));
    const totalItems = response.output?.length || 0;
    logger.debug('CUA Response:', {
      id: response.id,
      output_length: totalItems,
      output_types: outputTypes,
      tool_calls: toolCalls.length > 0 ? toolCalls : 'none',
      pending_safety_checks: safetyChecks.length > 0 ? safetyChecks : 'none'
    });
    // Items that are neither tool calls nor safety checks are dumped in
    // full so they can be investigated.
    if (toolCalls.length + safetyChecks.length < totalItems) {
      logger.debug('UNACCOUNTED OUTPUT ITEMS - Full output array:', response.output);
    }
  }
  catch (err) {
    logger.error('CUA Request failed', { request: requestLog, error: err });
    throw err;
  }
  return response;
}