npm - @loadmill/droid-cua - Versions diffs - 2.2.1 → 2.3.0 - Mend

@loadmill/droid-cua 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/README.md +56 -0
package/build/index.js +169 -24
package/build/src/cli/headless-debug.js +55 -0
package/build/src/cli/headless-execution-config.js +171 -0
package/build/src/cli/ink-shell.js +8 -2
package/build/src/commands/help.js +9 -1
package/build/src/commands/run.js +30 -1
package/build/src/core/app-context.js +57 -0
package/build/src/core/execution-engine.js +67 -15
package/build/src/core/prompts.js +37 -5
package/build/src/device/android/actions.js +2 -2
package/build/src/device/assertions.js +3 -2
package/build/src/device/cloud/browserstack/adapter.js +1 -0
package/build/src/device/cloud/lambdatest/adapter.js +402 -0
package/build/src/device/cloud/registry.js +2 -1
package/build/src/device/interface.js +1 -1
package/build/src/device/ios/actions.js +8 -2
package/build/src/device/loadmill.js +4 -3
package/build/src/device/openai.js +118 -1
package/build/src/modes/execution-mode.js +13 -18
package/build/src/utils/console-output.js +35 -0
package/build/src/utils/run-screenshot-recorder.js +98 -0
package/build/src/utils/structured-debug-log-manager.js +325 -0
package/package.json +2 -1

package/build/src/core/app-context.js ADDED Viewed

@@ -0,0 +1,57 @@
+import path from "path";
+import { access, readFile } from "fs/promises";
+import { constants as fsConstants } from "fs";
+import { compactAppContext } from "../device/openai.js";
+export const APP_CONTEXT_FILENAME = "context.md";
+export const DEFAULT_APP_CONTEXT_BUDGET = 300;
+export const MIN_APP_CONTEXT_BUDGET = 100;
+export const MAX_APP_CONTEXT_BUDGET = 2000;
+export function normalizeAppContextBudget(value) {
+    if (typeof value !== "number" || !Number.isFinite(value)) {
+        return DEFAULT_APP_CONTEXT_BUDGET;
+    }
+    const normalized = Math.round(value);
+    if (normalized < MIN_APP_CONTEXT_BUDGET) {
+        return MIN_APP_CONTEXT_BUDGET;
+    }
+    if (normalized > MAX_APP_CONTEXT_BUDGET) {
+        return MAX_APP_CONTEXT_BUDGET;
+    }
+    return normalized;
+}
+export function getDefaultProjectContextPath(projectPath) {
+    return path.join(projectPath, APP_CONTEXT_FILENAME);
+}
+export async function readAppContextFile(filePath) {
+    await access(filePath, fsConstants.R_OK);
+    return await readFile(filePath, "utf-8");
+}
+export async function appContextFileExists(filePath) {
+    try {
+        await access(filePath, fsConstants.R_OK);
+        return true;
+    }
+    catch {
+        return false;
+    }
+}
+export async function buildAppContextBriefing({ contextPath, taskText, budget = DEFAULT_APP_CONTEXT_BUDGET, }) {
+    if (!contextPath) {
+        return { briefing: "", contextPath: null };
+    }
+    const normalizedBudget = normalizeAppContextBudget(budget);
+    if (normalizedBudget === 0) {
+        return { briefing: "", contextPath };
+    }
+    const rawContext = await readAppContextFile(contextPath);
+    const result = await compactAppContext({
+        contextDocument: rawContext,
+        taskDescription: taskText,
+        tokenBudget: normalizedBudget,
+    });
+    return {
+        briefing: result.briefing,
+        outputTokens: result.outputTokens,
+        contextPath,
+    };
+}

package/build/src/core/execution-engine.js CHANGED Viewed

@@ -5,6 +5,7 @@ import { handleModelAction } from "../device/actions.js";
 import { sendCUARequest } from "../device/openai.js";
 import { emitDesktopDebug } from "../utils/desktop-debug.js";
 import { getConfiguredStepDelayMs } from "../utils/step-delay.js";
+import { printCliOutput } from "../utils/console-output.js";
 function extractComputerCalls(items) {
     const entries = [];
     for (const item of items) {
@@ -35,7 +36,46 @@ export class ExecutionEngine {
         this.session = session;
         this.recordScreenshots = options.recordScreenshots || false;
         this.screenshotDir = options.screenshotDir || null;
+        this.screenshotRecorder = options.screenshotRecorder || null;
         this.stepDelayMs = getConfiguredStepDelayMs();
+        this.reportedScreenshotWriteError = false;
+    }
+    async recordScreenshot(screenshotBase64, metadata = {}) {
+        if (typeof screenshotBase64 !== "string" || !screenshotBase64) {
+            return null;
+        }
+        try {
+            if (this.screenshotRecorder?.saveScreenshot) {
+                return await this.screenshotRecorder.saveScreenshot(screenshotBase64, metadata);
+            }
+            if (this.recordScreenshots && this.screenshotDir) {
+                const framePath = path.join(this.screenshotDir, `frame_${String(Date.now())}.png`);
+                await writeFile(framePath, Buffer.from(screenshotBase64, "base64"));
+                return framePath;
+            }
+        }
+        catch (error) {
+            if (!this.reportedScreenshotWriteError) {
+                this.reportedScreenshotWriteError = true;
+                const scope = typeof metadata.sessionId === "string" ? "design" : "execution";
+                const ids = scope === "design"
+                    ? {
+                        sessionId: metadata.sessionId,
+                        stepId: metadata.stepId,
+                        instructionIndex: metadata.instructionIndex
+                    }
+                    : {
+                        runId: metadata.runId,
+                        stepId: metadata.stepId,
+                        instructionIndex: metadata.instructionIndex
+                    };
+                emitDesktopDebug("run.screenshot_write.error", scope, ids, {
+                    captureSource: metadata.captureSource ?? null,
+                    message: error instanceof Error ? error.message : "Failed to persist screenshot"
+                });
+            }
+        }
+        return null;
     }
     /**
      * Run a full turn with the CUA model
@@ -46,8 +86,9 @@ export class ExecutionEngine {
      * @param {Object} context - Optional Ink context for output
      */
     async runFullTurn(response, trackAction = null, context = null, stepContext = null) {
-        const addOutput = context?.addOutput || ((item) => console.log(item.text || item));
+        const addOutput = context?.addOutput || printCliOutput;
         let newResponseId = response.id;
+        const shouldStop = () => Boolean(trackAction?.());
         const eventMeta = (extra = {}) => ({
             runId: context?.runId,
             stepId: stepContext?.stepId,
@@ -56,11 +97,8 @@ export class ExecutionEngine {
         });
         while (true) {
             // Check for interruption before processing next batch of actions
-            if (trackAction) {
-                const shouldStop = trackAction(null); // null action = pre-batch check
-                if (shouldStop) {
-                    return newResponseId;
-                }
+            if (shouldStop()) {
+                return newResponseId;
             }
             const items = response.output || [];
             const computerCalls = extractComputerCalls(items);
@@ -120,6 +158,9 @@ export class ExecutionEngine {
                     continue;
                 let sawExplicitScreenshotAction = false;
                 for (const action of actions) {
+                    if (shouldStop()) {
+                        return newResponseId;
+                    }
                     if (action.type === "screenshot") {
                         sawExplicitScreenshotAction = true;
                         addOutput({
@@ -138,16 +179,17 @@ export class ExecutionEngine {
                     else {
                         await handleModelAction(this.session.deviceId, action, this.session.deviceInfo.scale, {
                             ...context,
+                            shouldStop,
                             stepId: stepContext?.stepId,
                             instructionIndex: stepContext?.instructionIndex
                         });
                         // Track action and check for interruption
                         if (trackAction) {
-                            const shouldStop = trackAction(action);
-                            if (shouldStop) {
-                                // User interrupted - stop execution immediately
-                                return newResponseId;
-                            }
+                            trackAction(action);
+                        }
+                        if (shouldStop()) {
+                            // User interrupted - stop execution immediately
+                            return newResponseId;
                         }
                         // Add delay after UI-changing actions to let the interface update
                         // before taking the screenshot (except for explicit wait actions which have their own delay)
@@ -156,6 +198,9 @@ export class ExecutionEngine {
                         }
                     }
                 }
+                if (shouldStop()) {
+                    return newResponseId;
+                }
                 const screenshotBase64 = await getScreenshotAsBase64(this.session.deviceId, this.session.deviceInfo);
                 emitDesktopDebug("device.screenshot", "device", {
                     runId: context?.runId,
@@ -168,10 +213,14 @@ export class ExecutionEngine {
                     height: this.session.deviceInfo?.scaled_height,
                     base64Length: screenshotBase64.length
                 });
-                if (this.recordScreenshots && this.screenshotDir) {
-                    const framePath = path.join(this.screenshotDir, `frame_${String(Date.now())}.png`);
-                    await writeFile(framePath, Buffer.from(screenshotBase64, "base64"));
-                }
+                await this.recordScreenshot(screenshotBase64, {
+                    runId: context?.runId,
+                    sessionId: context?.sessionId,
+                    stepId: stepContext?.stepId,
+                    instructionIndex: stepContext?.instructionIndex,
+                    callId: call_id,
+                    captureSource: sawExplicitScreenshotAction ? "call-output-explicit-action" : "call-output-post-action"
+                });
                 // Build next input: screenshot + any carryover reasoning
                 const selectedCuaModel = process.env.OPENAI_CUA_MODEL === "computer-use-preview" ? "computer-use-preview" : "gpt-5.4";
                 const input = [{
@@ -186,6 +235,9 @@ export class ExecutionEngine {
                             : {}),
                         ...(pendingSafetyChecks.length > 0 ? { acknowledged_safety_checks: pendingSafetyChecks } : {})
                     }];
+                if (shouldStop()) {
+                    return newResponseId;
+                }
                 response = await sendCUARequest({
                     messages: input,
                     previousResponseId: newResponseId,

package/build/src/core/prompts.js CHANGED Viewed

@@ -41,6 +41,17 @@ function describeControlledDevice(deviceInfo = {}) {
     }
     return "a mobile device";
 }
+function buildAppContextSection(briefing) {
+    const text = typeof briefing === "string" ? briefing.trim() : "";
+    if (!text) {
+        return "";
+    }
+    return `APP CONTEXT BRIEFING:
+The following is a condensed description of the app you are testing, relevant to the current task.
+Use this to understand screen layouts, terminology, navigation, and expected behavior.
+${text}`;
+}
 export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
     const controlledDevice = describeControlledDevice(deviceInfo);
     const prompt = `
@@ -65,12 +76,25 @@ export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
   - Use 'keypress' only for a single mobile-safe key when absolutely necessary.
   - To replace text, tap into the field and type the desired value. If correction is needed, use mobile-safe deletion only.
   - Prefer tapping visible controls over hardware key events.
+  - Prefer on-screen navigation controls such as menus, tabs, drawer items, back arrows, close buttons, and explicit logout buttons over keypress actions.
+  - Do NOT use Back or ESC for normal app navigation when a reliable on-screen control is visible.
+  - Avoid using Back or ESC from a main or root screen, because it may leave the app.
+  - Exception: if the software keyboard is open and blocking the next needed control, Back or ESC may be used to dismiss the keyboard before continuing.
+  - Treat keypress actions as a fallback for limited cases only, such as a clearly needed single mobile-safe key or dismissing transient UI when no better visible control exists.
   CRITICAL - Automatic Timing:
   - After EVERY action (click, type, keypress, scroll), there is an automatic 500ms delay
   - This 500ms is sufficient for normal UI updates and animations
   - DO NOT add 'wait' actions unnecessarily - trust the automatic delay
+  CRITICAL - Mutating Actions:
+  - Mutating actions change app state. Examples: submit, create, save, confirm, approve, reject, login, logout, send, place order, initiate transfer
+  - Before tapping a mutating action button, dismiss the software keyboard first when it is open and not required for the tap
+  - After performing a mutating action once, do NOT repeat the same mutating action unless the UI clearly shows the first attempt failed or had no effect
+  - Treat visible state change as success. Examples: form fields clear, submit button returns to normal, status changes, list refreshes, new row appears, success message appears, screen changes
+  - For form submissions specifically, if the relevant fields clear and the action button returns to its normal idle state, treat that as success even if the new row or confirmation is not obvious yet
+  - If the UI shows signs that the mutating action succeeded, stop acting for that instruction
   Use explicit 'wait' action ONLY in these specific cases:
   1. After launching apps from home screen or app drawer
   2. After pressing ENTER that triggers navigation (search, URL, form submit)
@@ -109,8 +133,9 @@ export function buildBaseSystemPrompt(deviceInfo, customInstructions = {}) {
   Only complete the current instruction. Do not proceed beyond the current step unless asked.
   Mobile-Specific Notes:
-  - ESC key maps to the Home button (return to home screen)
-  - Use Home button (ESC) to escape from stuck situations and restart
+  - HOME key returns to the home screen
+  - On Android, ESC key maps to Back
+  - On iOS, ESC has no effect; use visible on-screen controls instead
   - Never use CTRL, CMD, ALT, OPTION, or SHIFT in a keypress action
   `;
     return prompt;
@@ -213,9 +238,10 @@ Remember: You are autonomous. Explore confidently. Generate simple, executable t
         { title: "Design Mode Instructions", text: designCustomText }
     ]);
 }
-export function buildExecutionModePrompt(deviceInfo, customInstructions = {}) {
+export function buildExecutionModePrompt(deviceInfo, customInstructions = {}, appContextBriefing = "") {
     const executionCustomText = typeof customInstructions.executionModeInstructions === "string" ? customInstructions.executionModeInstructions.trim() : "";
     const basePrompt = buildBaseSystemPrompt(deviceInfo, customInstructions);
+    const appContextSection = buildAppContextSection(appContextBriefing);
     const prompt = `${basePrompt}
 EXECUTION MODE - Critical Behavior:
@@ -229,6 +255,9 @@ CRITICAL RULES:
 - Just execute the action silently and stop immediately
 - Only generate text if the action FAILED or cannot be completed
 - Never emit desktop keyboard shortcuts or modifier combos; mobile execution only supports mobile-safe single-key presses
+- Never repeat the same mutating action with the same apparent intent unless the UI clearly shows failure or no state change
+- If a submit/create/approve/reject/login action appears to succeed, stop instead of trying to reconfirm by doing it again
+- For form submissions, cleared fields plus a reset action button are strong success signals; stop even if the created item is not yet obvious in the visible list
 - If target is not visible, perform bounded off-screen discovery first:
   1. Scroll the screen in the likely direction to reveal hidden controls
   2. If still missing, do one minimal fallback (e.g., close overlay or go back once), then retry
@@ -236,10 +265,13 @@ CRITICAL RULES:
 Your process:
 1. Read the instruction
 2. Execute the required actions
-3. Stop immediately - no commentary, no questions
+3. Before tapping a mutating action, dismiss the keyboard if it is open and not needed
+4. After a mutating action, inspect the resulting screen for success cues such as cleared fields, reset buttons, changed status, refreshed content, or navigation
+5. Stop as soon as success is visible
+6. Stop immediately - no commentary, no questions
 Each instruction is independent. Do not reference previous instructions or ask about next steps.
-`;
+${appContextSection ? `\n\n${appContextSection}` : ""}`;
     return appendCustomSections(prompt, [
         { title: "Base Prompt Instructions", text: customInstructions.basePromptInstructions },
         { title: "Execution Mode Instructions", text: executionCustomText }

package/build/src/device/android/actions.js CHANGED Viewed

@@ -14,8 +14,8 @@ function normalizeMobileKeypress(keys = []) {
     }
     const key = String(keys[0]).trim().toUpperCase();
     const mobileKeyMap = {
-        ESC: "KEYCODE_HOME",
-        ESCAPE: "KEYCODE_HOME",
+        ESC: "KEYCODE_BACK",
+        ESCAPE: "KEYCODE_BACK",
         HOME: "KEYCODE_HOME",
         BACK: "KEYCODE_BACK",
         ENTER: "KEYCODE_ENTER",

package/build/src/device/assertions.js CHANGED Viewed

@@ -1,6 +1,7 @@
 /**
  * Assertion handling for script validation
  */
+import { printCliOutput } from "../utils/console-output.js";
 export function isAssertion(userInput) {
     const trimmed = userInput.trim();
     const lower = trimmed.toLowerCase();
@@ -56,7 +57,7 @@ export function extractFailureDetails(transcript) {
 }
 export function handleAssertionFailure(assertionPrompt, transcript, isHeadlessMode, context, stepContext = null) {
     const details = extractFailureDetails(transcript);
-    const addOutput = context?.addOutput || ((item) => console.log(item.text || item));
+    const addOutput = context?.addOutput || printCliOutput;
     const meta = {
         eventType: 'assertion_result',
         runId: context?.runId,
@@ -81,7 +82,7 @@ export function handleAssertionFailure(assertionPrompt, transcript, isHeadlessMo
     // Interactive mode: caller should clear remaining instructions
 }
 export function handleAssertionSuccess(assertionPrompt, context = null, stepContext = null) {
-    const addOutput = context?.addOutput || ((item) => console.log(item.text || item));
+    const addOutput = context?.addOutput || printCliOutput;
     addOutput({
         type: 'success',
         text: `✓ Assertion passed: ${assertionPrompt}`,

package/build/src/device/cloud/browserstack/adapter.js CHANGED Viewed

@@ -195,6 +195,7 @@ function readAppStatusEntry(payload) {
     }
     return entries;
 }
+/** @type {import("../adapter").CloudProviderAdapter} */
 export const browserStackAdapter = {
     id: "browserstack",
     displayName: "BrowserStack",