@loadmill/droid-cua 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -177,6 +177,8 @@ Supported CLI options include:
177
177
  - `--context`
178
178
  - `--app-context-budget`
179
179
  - `--no-context`
180
+ - `--base-prompt`
181
+ - `--execution-prompt`
180
182
  - `--base-prompt-file`
181
183
  - `--execution-prompt-file`
182
184
  - `--record`
@@ -187,7 +189,18 @@ Config and precedence rules:
187
189
  - CLI flags override config file values.
188
190
  - `--context` overrides the config app-context path.
189
191
  - `--no-context` disables app context entirely.
192
+ - `--base-prompt` and `--execution-prompt` let you pass prompt customizations inline on the command line.
190
193
  - `--base-prompt-file` and `--execution-prompt-file` override the corresponding prompt customizations from config.
194
+ - If both inline and file-based prompt overrides are provided, the inline prompt flags win.
195
+
196
+ Example without a config file:
197
+ ```sh
198
+ droid-cua \
199
+ --avd adb:emulator-5554 \
200
+ --instructions tests/login.dcua \
201
+ --context tests/context.md \
202
+ --base-prompt "stop and look at the screen after every action you take."
203
+ ```
191
204
 
192
205
  Headless debug artifacts:
193
206
  - `--debug` writes desktop-style structured JSONL artifacts under `logs/`.
package/build/index.js CHANGED
@@ -23,6 +23,11 @@ const recordScreenshots = args["record"] || false;
23
23
  const instructionsFile = args.instructions || args.i || null;
24
24
  const appContextPath = typeof args.context === "string" ? args.context : null;
25
25
  const debugMode = args["debug"] || false;
26
+ const strictFlag = args.strict === true;
27
+ const noStrictFlag = args["no-strict"] === true || args.strict === false;
28
+ if (strictFlag && noStrictFlag) {
29
+ throw new Error("--strict and --no-strict cannot be used together.");
30
+ }
26
31
  const screenshotDir = path.join("droid-cua-recording-" + Date.now());
27
32
  if (recordScreenshots)
28
33
  await mkdir(screenshotDir, { recursive: true });
@@ -132,7 +137,7 @@ async function main() {
132
137
  status: "missing"
133
138
  });
134
139
  }
135
- const executionPrompt = buildExecutionModePrompt(deviceInfo, headlessConfig.promptCustomizations, appContextBriefing);
140
+ const executionPrompt = buildExecutionModePrompt(deviceInfo, headlessConfig.promptCustomizations, appContextBriefing, { strictMode: headlessConfig.strictMode });
136
141
  session.setSystemPrompt(executionPrompt);
137
142
  const screenshotRecorder = headlessDebug.createExecutionScreenshotRecorder({
138
143
  runId,
@@ -142,11 +147,13 @@ async function main() {
142
147
  const engine = screenshotRecorder
143
148
  ? new ExecutionEngine(session, {
144
149
  recordScreenshots: true,
145
- screenshotRecorder
150
+ screenshotRecorder,
151
+ strictMode: headlessConfig.strictMode
146
152
  })
147
153
  : new ExecutionEngine(session, {
148
154
  recordScreenshots,
149
155
  screenshotDir,
156
+ strictMode: headlessConfig.strictMode
150
157
  });
151
158
  executionMode = new ExecutionMode(session, engine, instructions, true);
152
159
  const result = await executionMode.execute({
@@ -199,6 +206,7 @@ async function main() {
199
206
  const engine = new ExecutionEngine(session, {
200
207
  recordScreenshots,
201
208
  screenshotDir,
209
+ strictMode: strictFlag,
202
210
  });
203
211
  // Otherwise, start interactive Ink shell
204
212
  await startInkShell(session, engine, {
@@ -23,6 +23,12 @@ function validateCuaModel(value, label) {
23
23
  }
24
24
  return value;
25
25
  }
26
/**
 * Reads the on/off state of a negatable boolean CLI flag from parsed args.
 *
 * Only strict booleans count: string values such as "true" are ignored.
 *
 * @param {object} args - Parsed command-line arguments keyed by flag name.
 * @param {string} key - Flag name without leading dashes (e.g. "strict").
 * @returns {{enabled: boolean, disabled: boolean}} `enabled` is true when
 *   `args[key] === true`; `disabled` is true when `args["no-<key>"] === true`
 *   or `args[key] === false`. Both can be true at once; callers decide
 *   whether that is an error.
 */
function parseBooleanFlagState(args, key) {
    const flagValue = args[key];
    const negatedFlagValue = args[`no-${key}`];
    // `args[key] === false` is presumably how the argument parser reports
    // `--no-<key>`; the literal "no-<key>" entry is checked as well.
    const isDisabled = negatedFlagValue === true || flagValue === false;
    return {
        enabled: flagValue === true,
        disabled: isDisabled,
    };
}
26
32
  function parseBudgetValue(rawValue, label) {
27
33
  const numericValue = (() => {
28
34
  if (typeof rawValue === "number") {
@@ -102,6 +108,7 @@ async function loadConfigFromFile(configPath) {
102
108
  appContextEnabled: undefined,
103
109
  appContextBudget: undefined,
104
110
  appContextPath: undefined,
111
+ strictMode: undefined,
105
112
  };
106
113
  if ("cuaModel" in rawConfig) {
107
114
  normalized.cuaModel = validateCuaModel(rawConfig.cuaModel, "config.cuaModel");
@@ -122,18 +129,30 @@ async function loadConfigFromFile(configPath) {
122
129
  assertNonEmptyString(rawConfig.appContextPath, "config.appContextPath");
123
130
  normalized.appContextPath = path.resolve(configDir, rawConfig.appContextPath);
124
131
  }
132
+ if ("strictMode" in rawConfig) {
133
+ if (typeof rawConfig.strictMode !== "boolean") {
134
+ throw new Error("config.strictMode must be a boolean.");
135
+ }
136
+ normalized.strictMode = rawConfig.strictMode;
137
+ }
125
138
  return normalized;
126
139
  }
127
140
  export async function resolveHeadlessExecutionConfig(args, options = {}) {
128
141
  const cwd = typeof options.cwd === "string" ? options.cwd : process.cwd();
129
142
  const configPath = typeof args.config === "string" ? args.config : null;
130
143
  const explicitContextPath = typeof args.context === "string" ? path.resolve(cwd, args.context) : null;
131
- const noContext = args["no-context"] === true;
144
+ const noContext = args["no-context"] === true || args.context === false;
145
+ const { enabled: strictEnabledByFlag, disabled: strictDisabledByFlag } = parseBooleanFlagState(args, "strict");
146
+ const basePromptText = typeof args["base-prompt"] === "string" ? args["base-prompt"] : null;
147
+ const executionPromptText = typeof args["execution-prompt"] === "string" ? args["execution-prompt"] : null;
132
148
  const basePromptFilePath = typeof args["base-prompt-file"] === "string" ? path.resolve(cwd, args["base-prompt-file"]) : null;
133
149
  const executionPromptFilePath = typeof args["execution-prompt-file"] === "string" ? path.resolve(cwd, args["execution-prompt-file"]) : null;
134
150
  if (explicitContextPath && noContext) {
135
151
  throw new Error("--context and --no-context cannot be used together.");
136
152
  }
153
+ if (strictEnabledByFlag && strictDisabledByFlag) {
154
+ throw new Error("--strict and --no-strict cannot be used together.");
155
+ }
137
156
  const fileConfig = configPath ? await loadConfigFromFile(configPath) : null;
138
157
  const promptCustomizations = {
139
158
  ...createEmptyPromptCustomizations(),
@@ -146,6 +165,7 @@ export async function resolveHeadlessExecutionConfig(args, options = {}) {
146
165
  appContextEnabled: fileConfig?.appContextEnabled ?? true,
147
166
  appContextBudget: fileConfig?.appContextBudget ?? DEFAULT_APP_CONTEXT_BUDGET,
148
167
  appContextPath: fileConfig?.appContextPath || null,
168
+ strictMode: fileConfig?.strictMode ?? false,
149
169
  };
150
170
  if (typeof args["cua-model"] === "string") {
151
171
  resolved.cuaModel = validateCuaModel(args["cua-model"], "--cua-model");
@@ -159,6 +179,12 @@ export async function resolveHeadlessExecutionConfig(args, options = {}) {
159
179
  if (executionPromptFilePath) {
160
180
  resolved.promptCustomizations.executionModeInstructions = await readTextFile(executionPromptFilePath, "--execution-prompt-file");
161
181
  }
182
+ if (basePromptText !== null) {
183
+ resolved.promptCustomizations.basePromptInstructions = basePromptText;
184
+ }
185
+ if (executionPromptText !== null) {
186
+ resolved.promptCustomizations.executionModeInstructions = executionPromptText;
187
+ }
162
188
  if (explicitContextPath) {
163
189
  resolved.appContextEnabled = true;
164
190
  resolved.appContextPath = explicitContextPath;
@@ -167,5 +193,11 @@ export async function resolveHeadlessExecutionConfig(args, options = {}) {
167
193
  resolved.appContextEnabled = false;
168
194
  resolved.appContextPath = null;
169
195
  }
196
+ if (strictEnabledByFlag) {
197
+ resolved.strictMode = true;
198
+ }
199
+ else if (strictDisabledByFlag) {
200
+ resolved.strictMode = false;
201
+ }
170
202
  return resolved;
171
203
  }
@@ -24,8 +24,12 @@ export async function handleHelp(args, session, context) {
24
24
  addOutput({ type: 'info', text: ' --context <file> Optional app context file used to brief execution runs' });
25
25
  addOutput({ type: 'info', text: ' --app-context-budget Headless app context token budget override' });
26
26
  addOutput({ type: 'info', text: ' --no-context Disable app context for headless execution' });
27
+ addOutput({ type: 'info', text: ' --base-prompt <text> Headless base prompt customization text' });
28
+ addOutput({ type: 'info', text: ' --execution-prompt <text> Headless execution prompt customization text' });
27
29
  addOutput({ type: 'info', text: ' --base-prompt-file Headless base prompt customization file' });
28
30
  addOutput({ type: 'info', text: ' --execution-prompt-file Headless execution prompt customization file' });
31
+ addOutput({ type: 'info', text: ' --strict Strict Mode: re-observe after the first action in a chain' });
32
+ addOutput({ type: 'info', text: ' --no-strict Disable Strict Mode for headless config-driven runs' });
29
33
  addOutput({ type: 'info', text: ' --record Record screenshots during execution' });
30
34
  addOutput({ type: 'info', text: ' --debug Enable structured JSONL debug artifacts' });
31
35
  addOutput({ type: 'info', text: '' });
@@ -99,7 +99,7 @@ export async function handleRun(args, session, context) {
99
99
  }
100
100
  }
101
101
  // Set execution mode system prompt (replaces any design mode prompt)
102
- const executionPrompt = buildExecutionModePrompt(session.deviceInfo, {}, appContextBriefing);
102
+ const executionPrompt = buildExecutionModePrompt(session.deviceInfo, {}, appContextBriefing, { strictMode: Boolean(context.engine?.strictMode) });
103
103
  session.setSystemPrompt(executionPrompt);
104
104
  // Create execution mode
105
105
  const executionMode = new ExecutionMode(session, context.engine, instructions);
@@ -31,14 +31,68 @@ function extractComputerCalls(items) {
31
31
  }
32
32
  return entries;
33
33
  }
34
/**
 * Derives the debug scope and the identifier bundle for an engine run.
 *
 * A truthy `context.sessionId` selects the "design" scope and keys the ids by
 * `sessionId`; otherwise the scope is "execution" and the ids are keyed by
 * `runId`. Step identifiers are copied from `stepContext` in both cases.
 *
 * @param {object|null} [context=null] - May carry `sessionId` and/or `runId`.
 * @param {object|null} [stepContext=null] - May carry `stepId` and `instructionIndex`.
 * @returns {{scope: string, ids: object}} The scope name and its ids; missing
 *   values are present as `undefined` properties.
 */
function getScopeAndIds(context = null, stepContext = null) {
    const stepIds = {
        stepId: stepContext?.stepId,
        instructionIndex: stepContext?.instructionIndex
    };
    if (context?.sessionId) {
        return {
            scope: "design",
            ids: { sessionId: context?.sessionId, ...stepIds }
        };
    }
    return {
        scope: "execution",
        ids: { runId: context?.runId, ...stepIds }
    };
}
49
/**
 * Decides which actions of a model-emitted chain to run under Strict Mode.
 *
 * With Strict Mode off, or with zero or one action, nothing is truncated.
 * With Strict Mode on and several actions, only the first is kept and the
 * rest are reported as dropped, together with a reason code and a note that
 * tells the model why the remainder was skipped.
 *
 * @param {Array<object>|null} [actions=[]] - Actions from one computer call.
 *   `null`/`undefined` is treated as an empty chain (a default parameter
 *   alone does not cover an explicit `null`).
 * @param {boolean} [strictMode=false] - Whether Strict Mode is active.
 * @returns {{actionsToExecute: Array<object>, droppedActions: Array<object>,
 *   truncationReason: (string|null), runtimeNote: (string|null)}}
 *   `truncationReason` is "leading_screenshot_reobserve" when the kept action
 *   is a screenshot, "post_first_action_reobserve" otherwise, and `null` when
 *   nothing was dropped. `runtimeNote` is `null` when nothing was dropped.
 */
function buildStrictModePlan(actions = [], strictMode = false) {
    const untruncated = (actionsToExecute) => ({
        actionsToExecute,
        droppedActions: [],
        truncationReason: null,
        runtimeNote: null
    });
    // Guard against an explicit null, which bypasses the `= []` default.
    const actionList = actions ?? [];
    if (!strictMode || actionList.length === 0) {
        return untruncated(actionList);
    }
    const actionsToExecute = [actionList[0]];
    const droppedActions = actionList.slice(1);
    if (droppedActions.length === 0) {
        return untruncated(actionsToExecute);
    }
    const leadingActionType = actionsToExecute[0]?.type;
    const leadIsScreenshot = leadingActionType === "screenshot";
    const truncationReason = leadIsScreenshot
        ? "leading_screenshot_reobserve"
        : "post_first_action_reobserve";
    const droppedActionTypes = droppedActions
        .map((action) => action?.type)
        .filter(Boolean);
    // Omit the parenthetical entirely when no dropped action carries a type,
    // so the note never contains an empty "()".
    const droppedTypesSuffix = droppedActionTypes.length > 0
        ? ` (${droppedActionTypes.join(", ")})`
        : "";
    const executedLabel = leadIsScreenshot ? "screenshot request" : "action";
    const pluralSuffix = droppedActions.length === 1 ? "" : "s";
    const runtimeNote = `Strict Mode: I executed only the first ${executedLabel} from your previous chain and intentionally skipped the remaining ${droppedActions.length} action${pluralSuffix}${droppedTypesSuffix} so I could re-observe the device before continuing. Base your next step only on what is visible now.`;
    return {
        actionsToExecute,
        droppedActions,
        truncationReason,
        runtimeNote
    };
}
34
83
  export class ExecutionEngine {
35
84
  constructor(session, options = {}) {
36
85
  this.session = session;
37
86
  this.recordScreenshots = options.recordScreenshots || false;
38
87
  this.screenshotDir = options.screenshotDir || null;
39
88
  this.screenshotRecorder = options.screenshotRecorder || null;
89
+ this.strictMode = options.strictMode === true;
40
90
  this.stepDelayMs = getConfiguredStepDelayMs();
41
91
  this.reportedScreenshotWriteError = false;
92
+ this.getScreenshotAsBase64 = options.getScreenshotAsBase64 || getScreenshotAsBase64;
93
+ this.handleModelAction = options.handleModelAction || handleModelAction;
94
+ this.sendCUARequest = options.sendCUARequest || sendCUARequest;
95
+ this.getCurrentPlatform = options.getCurrentPlatform || getCurrentPlatform;
42
96
  }
43
97
  async recordScreenshot(screenshotBase64, metadata = {}) {
44
98
  if (typeof screenshotBase64 !== "string" || !screenshotBase64) {
@@ -89,6 +143,7 @@ export class ExecutionEngine {
89
143
  const addOutput = context?.addOutput || printCliOutput;
90
144
  let newResponseId = response.id;
91
145
  const shouldStop = () => Boolean(trackAction?.());
146
+ const { scope, ids } = getScopeAndIds(context, stepContext);
92
147
  const eventMeta = (extra = {}) => ({
93
148
  runId: context?.runId,
94
149
  stepId: stepContext?.stepId,
@@ -156,8 +211,9 @@ export class ExecutionEngine {
156
211
  for (const { call_id, actions } of computerCalls) {
157
212
  if (!call_id)
158
213
  continue;
214
+ const { actionsToExecute, droppedActions, truncationReason, runtimeNote } = buildStrictModePlan(actions, this.strictMode);
159
215
  let sawExplicitScreenshotAction = false;
160
- for (const action of actions) {
216
+ for (const action of actionsToExecute) {
161
217
  if (shouldStop()) {
162
218
  return newResponseId;
163
219
  }
@@ -177,7 +233,7 @@ export class ExecutionEngine {
177
233
  });
178
234
  }
179
235
  else {
180
- await handleModelAction(this.session.deviceId, action, this.session.deviceInfo.scale, {
236
+ await this.handleModelAction(this.session.deviceId, action, this.session.deviceInfo.scale, {
181
237
  ...context,
182
238
  shouldStop,
183
239
  stepId: stepContext?.stepId,
@@ -201,7 +257,7 @@ export class ExecutionEngine {
201
257
  if (shouldStop()) {
202
258
  return newResponseId;
203
259
  }
204
- const screenshotBase64 = await getScreenshotAsBase64(this.session.deviceId, this.session.deviceInfo);
260
+ const screenshotBase64 = await this.getScreenshotAsBase64(this.session.deviceId, this.session.deviceInfo);
205
261
  emitDesktopDebug("device.screenshot", "device", {
206
262
  runId: context?.runId,
207
263
  stepId: stepContext?.stepId,
@@ -221,6 +277,23 @@ export class ExecutionEngine {
221
277
  callId: call_id,
222
278
  captureSource: sawExplicitScreenshotAction ? "call-output-explicit-action" : "call-output-post-action"
223
279
  });
280
+ if (runtimeNote) {
281
+ const executedActionTypes = actionsToExecute
282
+ .map((action) => action?.type)
283
+ .filter(Boolean);
284
+ const droppedActionTypes = droppedActions
285
+ .map((action) => action?.type)
286
+ .filter(Boolean);
287
+ emitDesktopDebug("cua.strict_mode.truncation", scope, ids, {
288
+ callId: call_id,
289
+ executedCount: actionsToExecute.length,
290
+ droppedCount: droppedActions.length,
291
+ executedActionTypes,
292
+ droppedActionTypes,
293
+ reason: truncationReason,
294
+ note: runtimeNote
295
+ });
296
+ }
224
297
  // Build next input: screenshot + any carryover reasoning
225
298
  const selectedCuaModel = process.env.OPENAI_CUA_MODEL === "computer-use-preview" ? "computer-use-preview" : "gpt-5.4";
226
299
  const input = [{
@@ -231,19 +304,25 @@ export class ExecutionEngine {
231
304
  image_url: `data:image/png;base64,${screenshotBase64}`,
232
305
  },
233
306
  ...(selectedCuaModel === "computer-use-preview"
234
- ? { current_url: getCurrentPlatform() === "ios" ? "ios://simulator" : "android://device" }
307
+ ? { current_url: this.getCurrentPlatform() === "ios" ? "ios://simulator" : "android://device" }
235
308
  : {}),
236
309
  ...(pendingSafetyChecks.length > 0 ? { acknowledged_safety_checks: pendingSafetyChecks } : {})
237
310
  }];
311
+ if (runtimeNote) {
312
+ input.push({
313
+ role: "user",
314
+ content: runtimeNote
315
+ });
316
+ }
238
317
  if (shouldStop()) {
239
318
  return newResponseId;
240
319
  }
241
- response = await sendCUARequest({
320
+ response = await this.sendCUARequest({
242
321
  messages: input,
243
322
  previousResponseId: newResponseId,
244
323
  deviceInfo: this.session.deviceInfo,
245
324
  debugContext: {
246
- scope: context?.sessionId ? "design" : "execution",
325
+ scope,
247
326
  runId: context?.runId,
248
327
  sessionId: context?.sessionId,
249
328
  stepId: stepContext?.stepId,
@@ -1,279 +1,3 @@
1
- /**
2
- * System prompt templates for different modes
3
- */
4
- function buildCustomInstructionsSection(sections = []) {
5
- const nonEmptySections = sections
6
- .map((section) => ({
7
- title: section?.title,
8
- text: typeof section?.text === "string" ? section.text.trim() : ""
9
- }))
10
- .filter((section) => section.title && section.text);
11
- if (nonEmptySections.length === 0) {
12
- return "";
13
- }
14
- const renderedSections = nonEmptySections
15
- .map((section) => `${section.title}:\n${section.text}`)
16
- .join("\n\n");
17
- return `USER CUSTOM INSTRUCTIONS:
18
- Follow these user-configured instructions in addition to the default behavior below.
19
- Prefer these custom instructions when deciding how to behave.
20
-
21
- ${renderedSections}`;
22
- }
23
- function appendCustomSections(prompt, sections = []) {
24
- const customSection = buildCustomInstructionsSection(sections);
25
- if (!customSection) {
26
- return prompt;
27
- }
28
- return `${prompt}
29
-
30
- ${customSection}
31
- `;
32
- }
33
- function describeControlledDevice(deviceInfo = {}) {
34
- const platform = typeof deviceInfo.platform === "string" ? deviceInfo.platform.trim().toLowerCase() : "";
35
- const deviceName = typeof deviceInfo.device_name === "string" ? deviceInfo.device_name.trim() : "";
36
- if (platform === "ios") {
37
- return deviceName ? `an iOS simulator (${deviceName})` : "an iOS device";
38
- }
39
- if (platform === "android") {
40
- return deviceName ? `an Android device (${deviceName})` : "an Android device";
41
- }
42
- return "a mobile device";
43
- }
44
- function buildAppContextSection(briefing) {
45
- const text = typeof briefing === "string" ? briefing.trim() : "";
46
- if (!text) {
47
- return "";
48
- }
49
- return `APP CONTEXT BRIEFING:
50
- The following is a condensed description of the app you are testing, relevant to the current task.
51
- Use this to understand screen layouts, terminology, navigation, and expected behavior.
52
-
53
- ${text}`;
54
- }
55
- export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
56
- const controlledDevice = describeControlledDevice(deviceInfo);
57
- const prompt = `
58
- You are controlling ${controlledDevice} in a sandboxed testing environment.
59
- Follow the user's instructions to interact with the device.
60
-
61
- The device screen has been scaled down for display.
62
- You can interact with any part of the visible phone screen, including system UI, browser UI, and app content.
63
-
64
- The screen you see is ${deviceInfo.scaled_width} x ${deviceInfo.scaled_height} pixels.
65
- Pixel (0,0) is at the top-left corner.
66
-
67
- When aiming for visual targets:
68
- - Reason carefully about the approximate pixel position.
69
- - Click precisely based on your visual estimate.
70
-
71
- Available actions: click, scroll, type, keypress, wait, screenshot.
72
-
73
- CRITICAL - Mobile Input Constraints:
74
- - This is a mobile device, not a desktop. Do NOT use desktop keyboard shortcuts or modifier chords.
75
- - NEVER emit key combinations such as CTRL+A, CMD+A, CTRL+C, CTRL+V, ALT+TAB, SHIFT+ENTER, or similar shortcuts.
76
- - Use 'keypress' only for a single mobile-safe key when absolutely necessary.
77
- - To replace text, tap into the field and type the desired value. If correction is needed, use mobile-safe deletion only.
78
- - Prefer tapping visible controls over hardware key events.
79
- - Prefer on-screen navigation controls such as menus, tabs, drawer items, back arrows, close buttons, and explicit logout buttons over keypress actions.
80
- - Do NOT use Back or ESC for normal app navigation when a reliable on-screen control is visible.
81
- - Avoid using Back or ESC from a main or root screen, because it may leave the app.
82
- - Exception: if the software keyboard is open and blocking the next needed control, Back or ESC may be used to dismiss the keyboard before continuing.
83
- - Treat keypress actions as a fallback for limited cases only, such as a clearly needed single mobile-safe key or dismissing transient UI when no better visible control exists.
84
-
85
- CRITICAL - Automatic Timing:
86
- - After EVERY action (click, type, keypress, scroll), there is an automatic 500ms delay
87
- - This 500ms is sufficient for normal UI updates and animations
88
- - DO NOT add 'wait' actions unnecessarily - trust the automatic delay
89
-
90
- CRITICAL - Mutating Actions:
91
- - Mutating actions change app state. Examples: submit, create, save, confirm, approve, reject, login, logout, send, place order, initiate transfer
92
- - Before tapping a mutating action button, dismiss the software keyboard first when it is open and not required for the tap
93
- - After performing a mutating action once, do NOT repeat the same mutating action unless the UI clearly shows the first attempt failed or had no effect
94
- - Treat visible state change as success. Examples: form fields clear, submit button returns to normal, status changes, list refreshes, new row appears, success message appears, screen changes
95
- - For form submissions specifically, if the relevant fields clear and the action button returns to its normal idle state, treat that as success even if the new row or confirmation is not obvious yet
96
- - If the UI shows signs that the mutating action succeeded, stop acting for that instruction
97
-
98
- Use explicit 'wait' action ONLY in these specific cases:
99
- 1. After launching apps from home screen or app drawer
100
- 2. After pressing ENTER that triggers navigation (search, URL, form submit)
101
- 3. After clicking links that open new apps or pages
102
- 4. After actions that trigger heavy loading (camera, maps, etc.)
103
-
104
- When you MUST wait:
105
- - Click app icon from home → wait → Continue
106
- - Type in search box → Press ENTER → wait → Continue
107
- - Click link that opens new page/app → wait → Continue
108
- - Open camera/maps/heavy feature → wait → Continue
109
-
110
- When you should NOT wait (automatic 500ms handles it):
111
- - Clicking UI buttons within a running app (click button - no wait needed)
112
- - Typing in text fields (type text - no wait needed)
113
- - Scrolling (scroll - no wait needed)
114
- - Clicking tabs or menu items within an app (click - no wait needed)
115
-
116
- Rule of thumb: Wait for app launches and navigation. Everything else has automatic timing.
117
-
118
- Perform the user's requested actions within the current view.
119
-
120
- If unsure about visual elements, take a screenshot to improve your reasoning.
121
- If unsure about the user's intent, make the best decision you can based on context and continue automatically.
122
-
123
- CRITICAL - Never Ask Questions:
124
- - NEVER ask the user for confirmation, clarification, or next steps
125
- - NEVER ask questions like "Should I...", "Would you like...", "Do you want me to..."
126
- - NEVER wait for user guidance - make autonomous decisions
127
- - If stuck, try alternative approaches (go back, try different UI element, restart app)
128
- - ONLY stop when the task is complete or you've exhausted reasonable approaches
129
-
130
- Act decisively to complete the task.
131
-
132
- Stop acting once the task appears complete.
133
- Only complete the current instruction. Do not proceed beyond the current step unless asked.
134
-
135
- Mobile-Specific Notes:
136
- - HOME key returns to the home screen
137
- - On Android, ESC key maps to Back
138
- - On iOS, ESC has no effect; use visible on-screen controls instead
139
- - Never use CTRL, CMD, ALT, OPTION, or SHIFT in a keypress action
140
- `;
141
- return prompt;
142
- }
143
- export function buildDesignModePrompt(deviceInfo, customInstructions = {}) {
144
- const designCustomText = typeof customInstructions.designModeInstructions === "string" ? customInstructions.designModeInstructions.trim() : "";
145
- const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
146
- const prompt = `${basePrompt}
147
-
148
- DESIGN MODE:
149
- You are helping design a test script for an Android app.
150
- Some tests intentionally validate negative outcomes (errors, failures, rejected inputs). These are expected and should be treated as successful progress when they match the test goal.
151
-
152
- Your task:
153
- 1. Understand what the user wants to test from their initial instruction
154
- 2. Explore the app autonomously to understand the flows
155
- 3. Take screenshots and interact as needed to discover the UI and behavior
156
- 4. Once you've successfully completed the user's requested flow, immediately generate the test script
157
-
158
- CRITICAL - After Completing the Task:
159
- - DO NOT navigate back or away from the final screen
160
- - The final screen state is what matters for verification
161
- - Generate the test script immediately showing the current state
162
- - Use assertions to verify state, not navigation
163
- - "Check that it changed" means verify the current visual state, not navigate elsewhere
164
- - If the target validation state is visible (including expected error states), STOP actions and immediately output the final test script
165
-
166
- CRITICAL - Recognizing When You Are Stuck:
167
- If you find yourself:
168
- - Repeating similar actions multiple times (e.g., opening/closing the same app repeatedly)
169
- - Not reaching a new screen or state after several attempts
170
- - Unsure about a higher-level decision (which tab to use, which mode to enter, where to start)
171
- - Unable to find the UI element or feature the user mentioned
172
-
173
- THEN STOP ACTING IMMEDIATELY and ask the user for guidance:
174
- 1. Briefly describe what you see on screen now
175
- 2. Explain what you were trying to do and why you're stuck
176
- 3. Ask a single, concrete question to unblock the next step
177
-
178
- Example:
179
- "Chrome is open but I don't see a search bar or new tab button. Should I open a new tab, or is there a specific way you'd like me to navigate?"
180
-
181
- DO NOT continue brute-forcing the UI when stuck. The user prefers being asked over watching repeated failed attempts.
182
- DO NOT ask if the user wants a script after successfully completing the flow - just generate it automatically.
183
-
184
- CRITICAL - Off-Screen Element Discovery:
185
- - If a required element is not visible, assume it may be off-screen before changing strategy
186
- - Humans naturally scroll when UI appears cropped; do the same
187
- - Use this discovery sequence before retries or fallback navigation:
188
- 1. Scroll the screen in the likely direction to reveal hidden content
189
- 2. If still missing, do one minimal fallback (e.g., close overlay or go back once), then retry discovery
190
- - Do not repeat already-successful actions while searching for an off-screen target
191
-
192
- CRITICAL - Test Script Format Rules:
193
- - One simple instruction per line (NO numbers, NO bullets)
194
- - Use imperative commands: "Open X", "Click Y", "Type Z"
195
- - Include "assert: <condition>" lines to validate expected behavior
196
- - Normalize validation wording into assertions:
197
- - Convert "check", "verify", "ensure", "fetch", and "compare" intent into explicit "assert: ..." lines
198
- - Do not leave standalone "Check ..." or "Verify ..." lines in the final script
199
- - Merge duplicate or near-duplicate validation lines into one clear assertion
200
- - End with "exit"
201
- - Keep it simple and executable
202
- - When you generate the final result, include a suggested test name before the script
203
- - The suggested test name must be very short: prefer 2 to 4 words
204
- - Focus on the main user goal, not every assertion or detail
205
- - The suggested test name must be lowercase, kebab-case, and filename-safe
206
- - Use this exact final format:
207
- Suggested test name: short-kebab-case-name
208
-
209
- \`\`\`
210
- <test script here>
211
- \`\`\`
212
-
213
- CORRECT Example:
214
- Suggested test name: calculator-addition
215
-
216
- \`\`\`
217
- Open Calculator app
218
- assert: Calculator app is visible
219
- Type "2"
220
- Click the plus button
221
- Type "3"
222
- Click the equals button
223
- assert: result shows 5
224
- exit
225
- \`\`\`
226
-
227
- WRONG Example (DON'T DO THIS):
228
- \`\`\`
229
- 1. Open Calculator app
230
- 2. Verify the app opened
231
- 3. etc...
232
- \`\`\`
233
-
234
- Remember: You are autonomous. Explore confidently. Generate simple, executable test scripts.
235
- `;
236
- return appendCustomSections(prompt, [
237
- { title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
238
- { title: "Design Mode Instructions", text: designCustomText }
239
- ]);
240
- }
241
- export function buildExecutionModePrompt(deviceInfo, customInstructions = {}, appContextBriefing = "") {
242
- const executionCustomText = typeof customInstructions.executionModeInstructions === "string" ? customInstructions.executionModeInstructions.trim() : "";
243
- const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
244
- const appContextSection = buildAppContextSection(appContextBriefing);
245
- const prompt = `${basePrompt}
246
-
247
- EXECUTION MODE - Critical Behavior:
248
- You are executing test script commands one at a time. This is NOT a conversation.
249
-
250
- CRITICAL RULES:
251
- - DO NOT generate conversational text or narration
252
- - DO NOT ask questions like "What should I do next?", "Would you like...", "Can I assist...?"
253
- - DO NOT describe what you see on screen
254
- - DO NOT say "Let me know if you need help" or similar phrases
255
- - Just execute the action silently and stop immediately
256
- - Only generate text if the action FAILED or cannot be completed
257
- - Never emit desktop keyboard shortcuts or modifier combos; mobile execution only supports mobile-safe single-key presses
258
- - Never repeat the same mutating action with the same apparent intent unless the UI clearly shows failure or no state change
259
- - If a submit/create/approve/reject/login action appears to succeed, stop instead of trying to reconfirm by doing it again
260
- - For form submissions, cleared fields plus a reset action button are strong success signals; stop even if the created item is not yet obvious in the visible list
261
- - If target is not visible, perform bounded off-screen discovery first:
262
- 1. Scroll the screen in the likely direction to reveal hidden controls
263
- 2. If still missing, do one minimal fallback (e.g., close overlay or go back once), then retry
264
-
265
- Your process:
266
- 1. Read the instruction
267
- 2. Execute the required actions
268
- 3. Before tapping a mutating action, dismiss the keyboard if it is open and not needed
269
- 4. After a mutating action, inspect the resulting screen for success cues such as cleared fields, reset buttons, changed status, refreshed content, or navigation
270
- 5. Stop as soon as success is visible
271
- 6. Stop immediately - no commentary, no questions
272
-
273
- Each instruction is independent. Do not reference previous instructions or ask about next steps.
274
- ${appContextSection ? `\n\n${appContextSection}` : ""}`;
275
- return appendCustomSections(prompt, [
276
- { title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
277
- { title: "Execution Mode Instructions", text: executionCustomText }
278
- ]);
279
- }
1
+ export { buildBaseSystemPrompt } from "../prompts/base.js";
2
+ export { buildDesignModePrompt, buildDesignRecoveryPrompt } from "../prompts/design.js";
3
+ export { buildAppContextSection, buildAssertionSystemPrompt, buildExecutionModePrompt, buildExecutionRecoveryPrompt, } from "../prompts/execution.js";
@@ -2,6 +2,7 @@
2
2
  * Assertion handling for script validation
3
3
  */
4
4
  import { printCliOutput } from "../utils/console-output.js";
5
+ export { buildAssertionSystemPrompt } from "../prompts/execution.js";
5
6
  export function isAssertion(userInput) {
6
7
  const trimmed = userInput.trim();
7
8
  const lower = trimmed.toLowerCase();
@@ -20,27 +21,6 @@ export function extractAssertionPrompt(userInput) {
20
21
  }
21
22
  return trimmed;
22
23
  }
23
- export function buildAssertionSystemPrompt(baseSystemPrompt, assertionPrompt) {
24
- return `${baseSystemPrompt}
25
-
26
- ASSERTION MODE:
27
- You are now validating an assertion. The user has provided an assertion statement that you must verify.
28
-
29
- Your task:
30
- 1. Take screenshots and perform LIMITED actions if needed to validate the assertion.
31
- 2. Determine if the assertion is TRUE or FALSE based on the current state.
32
- 3. You MUST respond with a clear verdict in this exact format:
33
- - If the assertion is true, include the text: "ASSERTION RESULT: PASS"
34
- - If the assertion is false or cannot be confidently validated, include: "ASSERTION RESULT: FAIL"
35
- 4. After the verdict, provide a brief explanation (1-2 sentences) of why it passed or failed.
36
-
37
- The assertion to validate is: "${assertionPrompt}"
38
-
39
- Remember:
40
- - If you cannot confidently validate the assertion, treat it as FAIL.
41
- - You must include either "ASSERTION RESULT: PASS" or "ASSERTION RESULT: FAIL" in your response.
42
- - Be thorough but efficient. Only take the actions necessary to validate the assertion.`;
43
- }
44
24
  export function checkAssertionResult(transcript) {
45
25
  const transcriptText = transcript.join("\n");
46
26
  const hasPassed = transcriptText.includes("ASSERTION RESULT: PASS");