@loadmill/droid-cua 1.1.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,49 +51,81 @@ function renderSelection(title, items) {
51
51
  });
52
52
  }
53
53
  /**
54
- * Get list of available Android AVDs
54
+ * Get a unified Android list: connected ADB devices + launchable AVDs
55
55
  */
56
56
  async function getAndroidDevices() {
57
- const devices = [];
58
- // Get running emulators
57
+ const normalizeAvdName = (value) => value
58
+ .split(/\r?\n/)
59
+ .map((line) => line.trim())
60
+ .filter(Boolean)
61
+ .filter((line) => line.toUpperCase() !== "OK")[0] ?? "";
62
+ const connectedDevices = [];
63
+ const availableAvds = [];
64
+ const avdInventory = new Set();
65
+ const connectedAvdNames = new Set();
59
66
  try {
60
67
  const { stdout: adbOutput } = await execAsync("adb devices");
61
- const runningIds = adbOutput
62
- .trim()
68
+ const connectedIds = adbOutput
63
69
  .split("\n")
64
70
  .slice(1)
65
- .map((line) => line.split("\t")[0])
66
- .filter((id) => id.startsWith("emulator-"));
67
- for (const id of runningIds) {
71
+ .map((line) => line.trim())
72
+ .filter(Boolean)
73
+ .map((line) => {
74
+ const [id, state] = line.split("\t");
75
+ return { id: id?.trim() || "", state: state?.trim() || "" };
76
+ })
77
+ .filter((entry) => entry.id.length > 0 && entry.state === "device");
78
+ for (const entry of connectedIds) {
79
+ const id = entry.id;
80
+ let model = "";
81
+ let avdName = "";
68
82
  try {
69
- const { stdout } = await execAsync(`adb -s ${id} emu avd name`);
70
- const name = stdout.trim();
71
- devices.push({
72
- label: `${name} (running)`,
73
- value: name,
74
- running: true,
75
- });
83
+ const { stdout } = await execAsync(`adb -s "${id}" shell getprop ro.product.model`);
84
+ model = stdout.trim();
76
85
  }
77
86
  catch { }
87
+ if (id.startsWith("emulator-")) {
88
+ try {
89
+ const { stdout } = await execAsync(`adb -s "${id}" emu avd name`);
90
+ avdName = normalizeAvdName(stdout);
91
+ }
92
+ catch { }
93
+ }
94
+ if (avdName) {
95
+ connectedAvdNames.add(avdName);
96
+ }
97
+ connectedDevices.push({
98
+ label: model ? `${model} (${id})` : id,
99
+ value: `adb:${id}`,
100
+ running: true,
101
+ });
78
102
  }
79
103
  }
80
104
  catch { }
81
- // Get available AVDs
82
105
  try {
83
106
  const { stdout } = await execAsync("emulator -list-avds");
84
- const avds = stdout.trim().split("\n").filter((name) => name.length > 0);
107
+ const avds = stdout
108
+ .split("\n")
109
+ .map((line) => normalizeAvdName(line))
110
+ .filter(Boolean);
85
111
  for (const avd of avds) {
86
- if (!devices.some((d) => d.value === avd)) {
87
- devices.push({
88
- label: avd,
89
- value: avd,
90
- running: false,
91
- });
92
- }
112
+ avdInventory.add(avd);
93
113
  }
94
114
  }
95
115
  catch { }
96
- return devices;
116
+ for (const connectedAvdName of connectedAvdNames) {
117
+ avdInventory.delete(connectedAvdName);
118
+ }
119
+ for (const avdName of avdInventory) {
120
+ availableAvds.push({
121
+ label: avdName,
122
+ value: `avd:${avdName}`,
123
+ running: false,
124
+ });
125
+ }
126
+ connectedDevices.sort((a, b) => a.label.localeCompare(b.label));
127
+ availableAvds.sort((a, b) => a.label.localeCompare(b.label));
128
+ return [...connectedDevices, ...availableAvds];
97
129
  }
98
130
  /**
99
131
  * Get list of available iOS Simulators (iPhones only)
@@ -147,7 +179,7 @@ export async function selectDevice() {
147
179
  const hasIOS = iosDevices.length > 0;
148
180
  if (!hasAndroid && !hasIOS) {
149
181
  console.error("\nNo devices found!");
150
- console.error(" Android: Create an AVD with Android Studio");
182
+ console.error(" Android: Connect a device or create an AVD, then run droid-cua again");
151
183
  console.error(" iOS: Xcode Simulator must be available");
152
184
  process.exit(1);
153
185
  }
@@ -156,7 +188,7 @@ export async function selectDevice() {
156
188
  if (hasAndroid) {
157
189
  const runningCount = androidDevices.filter((d) => d.running).length;
158
190
  platformOptions.push({
159
- label: `Android${runningCount > 0 ? ` (${runningCount} running)` : ""} - ${androidDevices.length} emulator(s)`,
191
+ label: `Android${runningCount > 0 ? ` (${runningCount} connected)` : ""} - ${androidDevices.length} target(s)`,
160
192
  value: "android",
161
193
  });
162
194
  }
@@ -179,7 +211,7 @@ export async function selectDevice() {
179
211
  }
180
212
  // Select device
181
213
  const deviceList = platform === "ios" ? iosDevices : androidDevices;
182
- const deviceType = platform === "ios" ? "Simulator" : "Emulator";
214
+ const deviceType = platform === "ios" ? "Simulator" : "Android Target";
183
215
  let deviceName;
184
216
  if (deviceList.length === 1) {
185
217
  deviceName = deviceList[0].value;
@@ -16,7 +16,7 @@ export async function handleHelp(args, session, context) {
16
16
  addOutput({ type: 'info', text: ' droid-cua --avd <device-name> [options]' });
17
17
  addOutput({ type: 'info', text: '' });
18
18
  addOutput({ type: 'info', text: 'Options:' });
19
- addOutput({ type: 'info', text: ' --avd <name> Device name (Android AVD or iOS Simulator)' });
19
+ addOutput({ type: 'info', text: ' --avd <name> Device name (Android device ID/serial or iOS Simulator)' });
20
20
  addOutput({ type: 'info', text: ' --platform <platform> Force platform: android or ios' });
21
21
  addOutput({ type: 'info', text: ' --instructions <file> Run test file in headless mode' });
22
22
  addOutput({ type: 'info', text: ' --record Record screenshots during execution' });
@@ -37,7 +37,7 @@ export async function handleHelp(args, session, context) {
37
37
  addOutput({ type: 'info', text: ' /loadmill <command> Run Loadmill test flows using natural language' });
38
38
  addOutput({ type: 'info', text: '' });
39
39
  addOutput({ type: 'info', text: 'Platform Support:' });
40
- addOutput({ type: 'info', text: ' Android: Uses ADB to communicate with Android emulators' });
40
+ addOutput({ type: 'info', text: ' Android: Uses ADB to communicate with Android devices (physical or emulator)' });
41
41
  addOutput({ type: 'info', text: ' iOS: Uses Appium + XCUITest for iOS Simulator automation' });
42
42
  addOutput({ type: 'info', text: '' });
43
43
  addOutput({ type: 'info', text: 'Platform Detection:' });
@@ -53,7 +53,8 @@ export async function handleHelp(args, session, context) {
53
53
  addOutput({ type: 'info', text: ' Note: Appium server is auto-started when iOS platform is detected' });
54
54
  addOutput({ type: 'info', text: '' });
55
55
  addOutput({ type: 'info', text: 'Examples:' });
56
- addOutput({ type: 'info', text: ' droid-cua --avd Pixel_8_API_35 (Android emulator)' });
56
+ addOutput({ type: 'info', text: ' droid-cua --avd adb:emulator-5554 (Connected Android target by adb serial)' });
57
+ addOutput({ type: 'info', text: ' droid-cua --avd avd:Pixel_8_API_35 (Launch Android AVD then connect)' });
57
58
  addOutput({ type: 'info', text: ' droid-cua --avd "iPhone 16" (iOS Simulator, auto-detected)' });
58
59
  addOutput({ type: 'info', text: ' droid-cua --platform ios --avd MySim (Force iOS platform)' });
59
60
  addOutput({ type: 'info', text: ' /create login-test (design a new test)' });
@@ -3,6 +3,32 @@ import { writeFile } from "fs/promises";
3
3
  import { getScreenshotAsBase64, getCurrentPlatform } from "../device/connection.js";
4
4
  import { handleModelAction } from "../device/actions.js";
5
5
  import { sendCUARequest } from "../device/openai.js";
6
+ import { emitDesktopDebug } from "../utils/desktop-debug.js";
7
+ function extractComputerCalls(items) {
8
+ const entries = [];
9
+ for (const item of items) {
10
+ if (item?.type !== "computer_call")
11
+ continue;
12
+ const baseCallId = item.call_id || item.id;
13
+ if (!baseCallId)
14
+ continue;
15
+ const actions = [];
16
+ if (Array.isArray(item.actions)) {
17
+ for (const actionEntry of item.actions) {
18
+ if (!actionEntry)
19
+ continue;
20
+ const action = actionEntry.action || actionEntry;
21
+ if (action?.type)
22
+ actions.push(action);
23
+ }
24
+ }
25
+ else if (item.action?.type) {
26
+ actions.push(item.action);
27
+ }
28
+ entries.push({ call_id: baseCallId, actions });
29
+ }
30
+ return entries;
31
+ }
6
32
  export class ExecutionEngine {
7
33
  constructor(session, options = {}) {
8
34
  this.session = session;
@@ -17,9 +43,15 @@ export class ExecutionEngine {
17
43
  * @param {Function} trackAction - Optional callback to track actions for stuck detection
18
44
  * @param {Object} context - Optional Ink context for output
19
45
  */
20
- async runFullTurn(response, trackAction = null, context = null) {
46
+ async runFullTurn(response, trackAction = null, context = null, stepContext = null) {
21
47
  const addOutput = context?.addOutput || ((item) => console.log(item.text || item));
22
48
  let newResponseId = response.id;
49
+ const eventMeta = (extra = {}) => ({
50
+ runId: context?.runId,
51
+ stepId: stepContext?.stepId,
52
+ instructionIndex: stepContext?.instructionIndex,
53
+ ...extra
54
+ });
23
55
  while (true) {
24
56
  // Check for interruption before processing next batch of actions
25
57
  if (trackAction) {
@@ -29,7 +61,7 @@ export class ExecutionEngine {
29
61
  }
30
62
  }
31
63
  const items = response.output || [];
32
- const actions = items.filter(item => item.type === "computer_call");
64
+ const computerCalls = extractComputerCalls(items);
33
65
  // ── Collect pending safety checks ──
34
66
  const pendingSafetyChecks = items
35
67
  .filter(item => item.type === "pending_safety_check")
@@ -39,7 +71,12 @@ export class ExecutionEngine {
39
71
  if (item.type === "reasoning") {
40
72
  for (const entry of item.summary) {
41
73
  if (entry.type === "summary_text") {
42
- addOutput({ type: 'reasoning', text: entry.text });
74
+ addOutput({
75
+ type: 'reasoning',
76
+ text: entry.text,
77
+ eventType: 'reasoning',
78
+ ...eventMeta()
79
+ });
43
80
  this.session.addToTranscript(`[Reasoning] ${entry.text}`);
44
81
  }
45
82
  }
@@ -47,45 +84,94 @@ export class ExecutionEngine {
47
84
  else if (item.type === "message") {
48
85
  const textPart = item.content.find(c => c.type === "output_text");
49
86
  if (textPart) {
50
- addOutput({ type: 'assistant', text: textPart.text });
87
+ addOutput({
88
+ type: 'assistant',
89
+ text: textPart.text,
90
+ eventType: 'assistant_message',
91
+ ...eventMeta()
92
+ });
51
93
  this.session.addToTranscript(`[Assistant] ${textPart.text}`);
52
94
  }
53
95
  }
54
96
  else if (item.type === "pending_safety_check") {
55
- addOutput({ type: 'warning', text: `⚠️ Safety check: ${item.code} - ${item.message}` });
97
+ addOutput({
98
+ type: 'warning',
99
+ text: `⚠️ Safety check: ${item.code} - ${item.message}`,
100
+ eventType: 'system_message',
101
+ ...eventMeta({
102
+ payload: {
103
+ id: item.id,
104
+ code: item.code,
105
+ message: item.message
106
+ }
107
+ })
108
+ });
56
109
  }
57
110
  }
58
- if (actions.length === 0) {
111
+ if (computerCalls.length === 0) {
59
112
  // No actions = turn complete
60
113
  break;
61
114
  }
62
115
  // ── Process model actions ──
63
- for (const { action, call_id } of actions) {
64
- if (action.type === "screenshot") {
65
- addOutput({ type: 'info', text: '📸 Capturing screen' });
66
- }
67
- else {
68
- await handleModelAction(this.session.deviceId, action, this.session.deviceInfo.scale, context);
69
- // Track action and check for interruption
70
- if (trackAction) {
71
- const shouldStop = trackAction(action);
72
- if (shouldStop) {
73
- // User interrupted - stop execution immediately
74
- return newResponseId;
75
- }
116
+ for (const { call_id, actions } of computerCalls) {
117
+ if (!call_id)
118
+ continue;
119
+ let sawExplicitScreenshotAction = false;
120
+ for (const action of actions) {
121
+ if (action.type === "screenshot") {
122
+ sawExplicitScreenshotAction = true;
123
+ addOutput({
124
+ type: 'info',
125
+ text: 'Capturing screen',
126
+ eventType: 'screenshot_captured',
127
+ actionType: 'screenshot',
128
+ ...eventMeta({
129
+ payload: {
130
+ callId: call_id,
131
+ source: 'explicit_action'
132
+ }
133
+ })
134
+ });
76
135
  }
77
- // Add delay after UI-changing actions to let the interface update
78
- // before taking the screenshot (except for explicit wait actions which have their own delay)
79
- if (action.type !== "wait") {
80
- await new Promise(resolve => setTimeout(resolve, 500));
136
+ else {
137
+ await handleModelAction(this.session.deviceId, action, this.session.deviceInfo.scale, {
138
+ ...context,
139
+ stepId: stepContext?.stepId,
140
+ instructionIndex: stepContext?.instructionIndex
141
+ });
142
+ // Track action and check for interruption
143
+ if (trackAction) {
144
+ const shouldStop = trackAction(action);
145
+ if (shouldStop) {
146
+ // User interrupted - stop execution immediately
147
+ return newResponseId;
148
+ }
149
+ }
150
+ // Add delay after UI-changing actions to let the interface update
151
+ // before taking the screenshot (except for explicit wait actions which have their own delay)
152
+ if (action.type !== "wait") {
153
+ await new Promise(resolve => setTimeout(resolve, 500));
154
+ }
81
155
  }
82
156
  }
83
157
  const screenshotBase64 = await getScreenshotAsBase64(this.session.deviceId, this.session.deviceInfo);
158
+ emitDesktopDebug("device.screenshot", "device", {
159
+ runId: context?.runId,
160
+ stepId: stepContext?.stepId,
161
+ instructionIndex: stepContext?.instructionIndex
162
+ }, {
163
+ source: sawExplicitScreenshotAction ? "explicit_action" : "post_action",
164
+ callId: call_id,
165
+ width: this.session.deviceInfo?.scaled_width,
166
+ height: this.session.deviceInfo?.scaled_height,
167
+ base64Length: screenshotBase64.length
168
+ });
84
169
  if (this.recordScreenshots && this.screenshotDir) {
85
170
  const framePath = path.join(this.screenshotDir, `frame_${String(Date.now())}.png`);
86
171
  await writeFile(framePath, Buffer.from(screenshotBase64, "base64"));
87
172
  }
88
173
  // Build next input: screenshot + any carryover reasoning
174
+ const selectedCuaModel = process.env.OPENAI_CUA_MODEL === "computer-use-preview" ? "computer-use-preview" : "gpt-5.4";
89
175
  const input = [{
90
176
  type: "computer_call_output",
91
177
  call_id,
@@ -93,21 +179,37 @@ export class ExecutionEngine {
93
179
  type: "computer_screenshot",
94
180
  image_url: `data:image/png;base64,${screenshotBase64}`,
95
181
  },
96
- current_url: getCurrentPlatform() === "ios" ? "ios://simulator" : "android://emulator",
182
+ ...(selectedCuaModel === "computer-use-preview"
183
+ ? { current_url: getCurrentPlatform() === "ios" ? "ios://simulator" : "android://device" }
184
+ : {}),
97
185
  ...(pendingSafetyChecks.length > 0 ? { acknowledged_safety_checks: pendingSafetyChecks } : {})
98
186
  }];
99
187
  response = await sendCUARequest({
100
188
  messages: input,
101
189
  previousResponseId: newResponseId,
102
190
  deviceInfo: this.session.deviceInfo,
191
+ debugContext: {
192
+ scope: context?.sessionId ? "design" : "execution",
193
+ runId: context?.runId,
194
+ sessionId: context?.sessionId,
195
+ stepId: stepContext?.stepId,
196
+ instructionIndex: stepContext?.instructionIndex
197
+ }
103
198
  });
104
199
  newResponseId = response.id;
200
+ // Each tool output advances the response chain; process newly returned calls next.
201
+ break;
105
202
  }
106
203
  }
107
204
  // ── At end, if last output was only reasoning ──
108
205
  const finalItems = response.output || [];
109
206
  if (finalItems.length > 0 && finalItems.at(-1).type === "reasoning") {
110
- addOutput({ type: 'info', text: 'Warning: last item was reasoning without follow-up. Dropping to avoid 400 error.' });
207
+ addOutput({
208
+ type: 'info',
209
+ text: 'Warning: last item was reasoning without follow-up. Dropping to avoid 400 error.',
210
+ eventType: 'system_message',
211
+ ...eventMeta()
212
+ });
111
213
  }
112
214
  return newResponseId;
113
215
  }
@@ -1,9 +1,32 @@
1
1
  /**
2
2
  * System prompt templates for different modes
3
3
  */
4
- export function buildBaseSystemPrompt(deviceInfo) {
5
- return `
6
- You are controlling an Android phone in a sandboxed testing environment.
4
+ function appendCustomSection(prompt, customText) {
5
+ const trimmed = typeof customText === "string" ? customText.trim() : "";
6
+ if (!trimmed) {
7
+ return prompt;
8
+ }
9
+ return `${prompt}
10
+
11
+ CUSTOM INSTRUCTIONS:
12
+ ${trimmed}
13
+ `;
14
+ }
15
+ function describeControlledDevice(deviceInfo = {}) {
16
+ const platform = typeof deviceInfo.platform === "string" ? deviceInfo.platform.trim().toLowerCase() : "";
17
+ const deviceName = typeof deviceInfo.device_name === "string" ? deviceInfo.device_name.trim() : "";
18
+ if (platform === "ios") {
19
+ return deviceName ? `an iOS simulator (${deviceName})` : "an iOS device";
20
+ }
21
+ if (platform === "android") {
22
+ return deviceName ? `an Android device (${deviceName})` : "an Android device";
23
+ }
24
+ return "a mobile device";
25
+ }
26
+ export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
27
+ const controlledDevice = describeControlledDevice(deviceInfo);
28
+ const prompt = `
29
+ You are controlling ${controlledDevice} in a sandboxed testing environment.
7
30
  Follow the user's instructions to interact with the device.
8
31
 
9
32
  The device screen has been scaled down for display.
@@ -18,6 +41,13 @@ export function buildBaseSystemPrompt(deviceInfo) {
18
41
 
19
42
  Available actions: click, scroll, type, keypress, wait, screenshot.
20
43
 
44
+ CRITICAL - Mobile Input Constraints:
45
+ - This is a mobile device, not a desktop. Do NOT use desktop keyboard shortcuts or modifier chords.
46
+ - NEVER emit key combinations such as CTRL+A, CMD+A, CTRL+C, CTRL+V, ALT+TAB, SHIFT+ENTER, or similar shortcuts.
47
+ - Use 'keypress' only for a single mobile-safe key when absolutely necessary.
48
+ - To replace text, tap into the field and type the desired value. If correction is needed, use mobile-safe deletion only.
49
+ - Prefer tapping visible controls over hardware key events.
50
+
21
51
  CRITICAL - Automatic Timing:
22
52
  - After EVERY action (click, type, keypress, scroll), there is an automatic 500ms delay
23
53
  - This 500ms is sufficient for normal UI updates and animations
@@ -63,15 +93,22 @@ export function buildBaseSystemPrompt(deviceInfo) {
63
93
  Mobile-Specific Notes:
64
94
  - ESC key maps to the Home button (return to home screen)
65
95
  - Use Home button (ESC) to escape from stuck situations and restart
66
- - Back button navigates within apps
96
+ - Never use CTRL, CMD, ALT, OPTION, or SHIFT in a keypress action
67
97
  `;
98
+ return appendCustomSection(prompt, customInstructions.basePromptInstructions);
68
99
  }
69
- export function buildDesignModePrompt(deviceInfo) {
70
- const basePrompt = buildBaseSystemPrompt(deviceInfo);
71
- return `${basePrompt}
100
+ export function buildDesignModePrompt(deviceInfo, customInstructions = {}) {
101
+ const designCustomText = typeof customInstructions.designModeInstructions === "string" ? customInstructions.designModeInstructions.trim() : "";
102
+ const mergedBaseInstructions = [customInstructions.basePromptInstructions, designCustomText].filter(Boolean).join("\n\n");
103
+ const basePrompt = buildBaseSystemPrompt(deviceInfo, {
104
+ ...customInstructions,
105
+ basePromptInstructions: mergedBaseInstructions
106
+ });
107
+ const prompt = `${basePrompt}
72
108
 
73
109
  DESIGN MODE:
74
110
  You are helping design a test script for an Android app.
111
+ Some tests intentionally validate negative outcomes (errors, failures, rejected inputs). These are expected and should be treated as successful progress when they match the test goal.
75
112
 
76
113
  Your task:
77
114
  1. Understand what the user wants to test from their initial instruction
@@ -85,6 +122,7 @@ CRITICAL - After Completing the Task:
85
122
  - Generate the test script immediately showing the current state
86
123
  - Use assertions to verify state, not navigation
87
124
  - "Check that it changed" means verify the current visual state, not navigate elsewhere
125
+ - If the target validation state is visible (including expected error states), STOP actions and immediately output the final test script
88
126
 
89
127
  CRITICAL - Recognizing When You Are Stuck:
90
128
  If you find yourself:
@@ -104,10 +142,22 @@ Example:
104
142
  DO NOT continue brute-forcing the UI when stuck. The user prefers being asked over watching repeated failed attempts.
105
143
  DO NOT ask if the user wants a script after successfully completing the flow - just generate it automatically.
106
144
 
145
+ CRITICAL - Off-Screen Element Discovery:
146
+ - If a required element is not visible, assume it may be off-screen before changing strategy
147
+ - Humans naturally scroll when UI appears cropped; do the same
148
+ - Use this discovery sequence before retries or fallback navigation:
149
+ 1. Scroll the screen in the likely direction to reveal hidden content
150
+ 2. If still missing, do one minimal fallback (e.g., close overlay or go back once), then retry discovery
151
+ - Do not repeat already-successful actions while searching for an off-screen target
152
+
107
153
  CRITICAL - Test Script Format Rules:
108
154
  - One simple instruction per line (NO numbers, NO bullets)
109
155
  - Use imperative commands: "Open X", "Click Y", "Type Z"
110
156
  - Include "assert: <condition>" lines to validate expected behavior
157
+ - Normalize validation wording into assertions:
158
+ - Convert "check", "verify", "ensure", "fetch", and "compare" intent into explicit "assert: ..." lines
159
+ - Do not leave standalone "Check ..." or "Verify ..." lines in the final script
160
+ - Merge duplicate or near-duplicate validation lines into one clear assertion
111
161
  - End with "exit"
112
162
  - Keep it simple and executable
113
163
 
@@ -132,10 +182,16 @@ WRONG Example (DON'T DO THIS):
132
182
 
133
183
  Remember: You are autonomous. Explore confidently. Generate simple, executable test scripts.
134
184
  `;
185
+ return prompt;
135
186
  }
136
- export function buildExecutionModePrompt(deviceInfo) {
137
- const basePrompt = buildBaseSystemPrompt(deviceInfo);
138
- return `${basePrompt}
187
+ export function buildExecutionModePrompt(deviceInfo, customInstructions = {}) {
188
+ const executionCustomText = typeof customInstructions.executionModeInstructions === "string" ? customInstructions.executionModeInstructions.trim() : "";
189
+ const mergedBaseInstructions = [customInstructions.basePromptInstructions, executionCustomText].filter(Boolean).join("\n\n");
190
+ const basePrompt = buildBaseSystemPrompt(deviceInfo, {
191
+ ...customInstructions,
192
+ basePromptInstructions: mergedBaseInstructions
193
+ });
194
+ const prompt = `${basePrompt}
139
195
 
140
196
  EXECUTION MODE - Critical Behavior:
141
197
  You are executing test script commands one at a time. This is NOT a conversation.
@@ -147,6 +203,10 @@ CRITICAL RULES:
147
203
  - DO NOT say "Let me know if you need help" or similar phrases
148
204
  - Just execute the action silently and stop immediately
149
205
  - Only generate text if the action FAILED or cannot be completed
206
+ - Never emit desktop keyboard shortcuts or modifier combos; mobile execution only supports mobile-safe single-key presses
207
+ - If target is not visible, perform bounded off-screen discovery first:
208
+ 1. Scroll the screen in the likely direction to reveal hidden controls
209
+ 2. If still missing, do one minimal fallback (e.g., close overlay or go back once), then retry
150
210
 
151
211
  Your process:
152
212
  1. Read the instruction
@@ -155,4 +215,5 @@ Your process:
155
215
 
156
216
  Each instruction is independent. Do not reference previous instructions or ask about next steps.
157
217
  `;
218
+ return prompt;
158
219
  }
@@ -7,7 +7,7 @@
7
7
  import { getDeviceBackend, getCurrentPlatform } from "./factory.js";
8
8
  /**
9
9
  * Handle an action from the CUA model
10
- * @param {string} deviceId - The device/emulator/simulator ID
10
+ * @param {string} deviceId - The connected device/simulator ID
11
11
  * @param {object} action - The action to execute
12
12
  * @param {number} scale - Scale factor for coordinates
13
13
  * @param {object} context - Context with addOutput function