npm - @ishlabs/cli - Versions diffs - 0.24.1 → 0.26.0 - Mend

@ishlabs/cli 0.24.1 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

package/dist/commands/ask.js +3 -3
package/dist/commands/doctor.d.ts +26 -0
package/dist/commands/doctor.js +334 -0
package/dist/commands/iteration.js +1 -1
package/dist/commands/study-analyze.js +1 -1
package/dist/commands/study-run.js +80 -12
package/dist/commands/study.js +11 -7
package/dist/index.js +2 -0
package/dist/lib/alias-store.js +1 -1
package/dist/lib/api-client.d.ts +2 -0
package/dist/lib/docs.js +57 -42
package/dist/lib/local-sim/actions.d.ts +10 -2
package/dist/lib/local-sim/actions.js +18 -11
package/dist/lib/local-sim/adb.d.ts +113 -0
package/dist/lib/local-sim/adb.js +366 -0
package/dist/lib/local-sim/android.d.ts +111 -0
package/dist/lib/local-sim/android.js +504 -0
package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
package/dist/lib/local-sim/apk-manifest.js +210 -0
package/dist/lib/local-sim/browser.d.ts +22 -0
package/dist/lib/local-sim/browser.js +65 -0
package/dist/lib/local-sim/coordinates.d.ts +69 -0
package/dist/lib/local-sim/coordinates.js +59 -0
package/dist/lib/local-sim/device.d.ts +143 -0
package/dist/lib/local-sim/device.js +152 -0
package/dist/lib/local-sim/ios.d.ts +185 -0
package/dist/lib/local-sim/ios.js +599 -0
package/dist/lib/local-sim/loop.d.ts +14 -2
package/dist/lib/local-sim/loop.js +168 -73
package/dist/lib/local-sim/native-a11y.d.ts +111 -0
package/dist/lib/local-sim/native-a11y.js +419 -0
package/dist/lib/local-sim/simctl.d.ts +55 -0
package/dist/lib/local-sim/simctl.js +144 -0
package/dist/lib/local-sim/types.d.ts +39 -2
package/dist/lib/local-sim/upload.d.ts +1 -1
package/dist/lib/local-sim/upload.js +9 -6
package/dist/lib/local-sim/xcuitest.d.ts +60 -0
package/dist/lib/local-sim/xcuitest.js +303 -0
package/dist/lib/output.js +58 -12
package/dist/lib/paths.d.ts +8 -0
package/dist/lib/paths.js +12 -0
package/dist/lib/skill-content.js +10 -9
package/package.json +2 -1

package/dist/lib/local-sim/loop.js CHANGED

@@ -1,19 +1,31 @@
 /**
  * Local simulation loop orchestrator.
  *
- * Runs the observe → reason (remote) → act (local) loop for each
- * participant against a local Playwright browser.
+ * Runs the observe → reason (remote) → act (local) loop for each participant
+ * against a SimulationDevice (a Playwright browser today; a native Android
+ * emulator next). The loop is device-agnostic — see device.ts.
  */
-import { launchBrowser, launchSharedBrowser, createTab, captureObservation, takeScreenshot, takeScreenshotJpeg, navigateWithRetry, closeBrowser } from "./browser.js";
+import { launchSharedBrowser, FULL_PAGE_HEIGHT_CAP_PX_MOBILE, FULL_PAGE_HEIGHT_CAP_PX_DESKTOP, } from "./browser.js";
 import { uploadScreenshot } from "./upload.js";
-import { executeAction, detectNoVisibleChange, describeAction } from "./actions.js";
-import { TabManager } from "./tabs.js";
-import { enableDebug, isDebugEnabled, debugObservation, debugRawResponse, debugNormalizedActions, debugActionExecution, debugForwards, debugStepSummary, debugRecord, } from "./debug.js";
+import { detectNoVisibleChange, describeAction } from "./actions.js";
+import { createDevice } from "./device.js";
+import { enableDebug, isDebugEnabled, debugRawResponse, debugNormalizedActions, debugActionExecution, debugForwards, debugStepSummary, debugRecord, } from "./debug.js";
+/**
+ * Native (mobile) platforms drive a single physical device via screenshot →
+ * normalized-coordinate taps, with no accessibility tree or URL. Browser/web is
+ * everything else.
+ */
+function isNativePlatform(platform) {
+    return platform === "android" || platform === "ios";
+}
 /**
  * Convert a raw action (from either resolved_actions or output.action.actions)
- * into the flat LocalStepAction shape used by the executor.
+ * into the flat LocalStepAction shape used by the executor. Exported for unit
+ * tests of the native drag coordinate-shape split (the nested action's
+ * `coordinates` is a {x,y} tap point for most actions but a
+ * {startX,...,endY} path for a drag).
  */
-function flattenAction(raw, nodeId = null, nodeDescription = null) {
+export function flattenAction(raw, nodeId = null, nodeDescription = null) {
     // resolved_actions nest the action inside an "action" key
     const a = (raw.action ?? raw);
     const element = a.element;
@@ -36,8 +48,36 @@ function flattenAction(raw, nodeId = null, nodeDescription = null) {
         modifiers: Array.isArray(a.modifiers) ? a.modifiers : null,
         key: a.key ?? null,
         tab_id: a.tab_id ?? null,
+        orientation: a.orientation ?? null,
+        panel: a.panel ?? null,
+        scale: a.scale ?? null,
+        // Native path: ResolvedAction.coordinates (top level of the resolved_actions
+        // entry) is the single {x,y} execution point. Fall back to the nested action
+        // for raw output — but only a point-shaped {x,y}; a drag's nested
+        // coordinates is the {startX,...,endY} path (extracted into `drag` below),
+        // not a tap point, so guard against mis-assigning it here.
+        coordinates: pickPoint(raw.coordinates) ?? pickPoint(a.coordinates) ?? null,
+        // drag: the gesture path lives on the nested action's coordinates as
+        // {startX, startY, endX, endY} (DragCoordinates serialized by_alias).
+        drag: pickDrag(a.coordinates) ?? null,
     };
 }
+/** A nested action's coordinates only when it's the {x,y} tap-point shape. */
+function pickPoint(c) {
+    if (c && typeof c === "object" && "x" in c && "y" in c) {
+        const p = c;
+        return { x: p.x, y: p.y };
+    }
+    return null;
+}
+/** A nested action's coordinates only when it's the DragCoordinates shape. */
+function pickDrag(c) {
+    if (c && typeof c === "object" && "startX" in c && "endX" in c) {
+        const d = c;
+        return { startX: d.startX, startY: d.startY, endX: d.endX, endY: d.endY };
+    }
+    return null;
+}
 /**
  * Normalize the raw backend step response into the flat structure used by the loop.
  * Backend returns { output: { ... }, resolved_actions: [...], loop_detected }.
@@ -61,7 +101,10 @@ function normalizeStepResponse(raw) {
         sentiment_intensity: out.sentiment_intensity ?? 0,
         current_location: out.current_location,
         effort_seconds: out.effort_seconds,
-        assignment_completed: out.assignment_completed,
+        assignment_status: out.assignment_status,
+        // Terminate on completed OR abandoned — a stuck agent that gives up
+        // should stop the loop just like a finished one.
+        assignment_completed: out.assignment_status === "completed" || out.assignment_status === "abandoned",
         actions,
         loop_detected: raw.loop_detected,
     };
@@ -89,7 +132,13 @@ export async function runLocalSimulations(client, opts) {
         log("\nCancelling after current step...");
     };
     process.on("SIGINT", onSigint);
-    const concurrency = opts.parallel ?? opts.participantIds.length;
+    // Native runs share ONE physical device (emulator / simulator), so they
+    // can't run in parallel — force sequential regardless of --parallel.
+    const isNativeRun = isNativePlatform(opts.platform);
+    if (isNativeRun && (opts.parallel ?? 1) > 1) {
+        log("Native (android/ios) runs drive a single device — running sequentially.");
+    }
+    const concurrency = isNativeRun ? 1 : (opts.parallel ?? opts.participantIds.length);
     try {
         if (concurrency <= 1 || opts.participantIds.length <= 1) {
             // Sequential execution — each participant owns its own browser
@@ -162,12 +211,20 @@ async function runSingleSimulation(client, participantId, participantName, opts,
         product_id: opts.workspaceId,
         iteration_id: opts.iterationId,
     });
-    // Resolve URL and browser config from iteration details (with CLI fallback)
+    // Resolve target + config from iteration details (with CLI fallback).
+    // Platform precedence: --platform flag > iteration's stored platform > web.
     const iterDetails = initResponse.iteration_details;
+    const platform = opts.platform ?? iterDetails?.platform ?? "web";
+    const isNative = isNativePlatform(platform);
+    // Browser needs a URL to navigate; native uses the app package (from --app or
+    // the iteration target) and has no URL requirement.
     const navigationUrl = iterDetails?.url ?? opts.url;
-    if (!navigationUrl) {
+    if (!isNative && !navigationUrl) {
         throw new Error("No URL available: backend did not return iteration_details and no --url flag was provided.");
     }
+    // For native, launchOrReset() receives the app package (iteration target);
+    // the AndroidDevice prefers --app over this.
+    const launchTarget = isNative ? (navigationUrl ?? "") : navigationUrl;
     const screenFormat = opts.screenFormat ?? iterDetails?.screen_format ?? "desktop";
     const locale = opts.locale ?? iterDetails?.locale;
     // Cache session state for per-step requests
@@ -189,7 +246,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
     const stepContextValues = session.context_values.map(cv => cv.type === "secret" ? { ...cv, value: null } : cv);
     const maxSteps = opts.maxInteractions ?? session.max_interactions;
     const viewport = { width: 1440, height: 900 }; // TODO: extract from config
-    // Step 2: Launch browser
+    // Step 2: Build the target device (per-platform dispatch).
+    // Browser today; AndroidDevice (adb) slots in via createDevice() later.
     const browserOpts = {
         headed: opts.headed,
         slowMo: opts.slowMo,
@@ -198,15 +256,13 @@ async function runSingleSimulation(client, participantId, participantName, opts,
         locale,
         screenFormat,
     };
-    // Use shared browser if available (parallel mode), otherwise launch standalone
-    const ownsTheBrowser = !sharedBrowser;
-    const browserSession = sharedBrowser
-        ? await createTab(sharedBrowser, browserOpts)
-        : await launchBrowser(browserOpts);
-    // Active page can swap when a popup auto-focuses or the LLM issues
-    // switch_tab/close_tab. TabManager wires the context popup listener.
-    const tabs = new TabManager(browserSession.context, browserSession.page);
-    let page = tabs.activePage();
+    const device = await createDevice(platform, {
+        browserOpts,
+        contextValues: session.context_values,
+        sharedBrowser,
+        appPath: opts.appPath,
+        log,
+    });
     const history = [];
     const interactions = [];
     const debugSteps = [];
@@ -216,24 +272,51 @@ async function runSingleSimulation(client, participantId, participantName, opts,
     let accumulatedEffortMs = 0;
     let finalStatus = "completed";
     try {
-        // Step 3: Navigate to URL
-        await navigateWithRetry(page, navigationUrl);
+        // Step 3: Launch / navigate the target to its starting point.
+        await device.launchOrReset(launchTarget);
         // Step 4: Run assignment loop
         for (let assignmentIdx = 0; assignmentIdx < session.assignments.length; assignmentIdx++) {
             const assignment = session.assignments[assignmentIdx];
             log(`  Assignment ${assignmentIdx + 1}/${session.assignments.length}: ${assignment.name}`);
             let step = 0;
             let assignmentCompleted = false;
+            // The agent's last per-turn status, used to pick the terminal run-level
+            // status when the loop ends because the agent terminated (completed vs
+            // abandoned). Stays "in_progress" if the loop hits max_steps.
+            let lastAssignmentStatus = "in_progress";
             while (step < maxSteps && !assignmentCompleted && !isCancelled()) {
-                // OBSERVE — refresh active page in case a popup or switch_tab changed it
-                page = tabs.activePage();
-                const obs = await captureObservation(page);
-                const lastTreeData = obs.treeData;
+                // OBSERVE — the device refreshes its own active surface (popup /
+                // switch_tab for browser) before capturing. (The browser device emits
+                // its own richer debugObservation with tree/scroll detail.)
+                // TODO(perf): backend can downscale before the vision LLM; full-res sent for now.
+                const obs = await device.observe();
                 const currentScreenshot = obs.screenshot;
-                debugObservation(obs);
                 // Capture JPEG of observation for upload and recording (pre-action)
-                const obsJpeg = await takeScreenshotJpeg(page);
+                const obsJpeg = await device.captureScreenshotJpeg();
                 const obsBase64 = obsJpeg.toString("base64");
+                // Capture a height-capped full-page JPEG (pre-action, so it reflects
+                // the same screen the LLM reasons over). Sent to the backend as the
+                // PDQ basis + Frame representative_screenshot, matching the hosted
+                // run's full-page behavior. The per-interaction screenshot_url /
+                // recording stays the VIEWPORT (obsBase64) — unchanged.
+                // Degrade silently to omitting the field if capture fails: a frame is
+                // still created from the viewport.
+                const fullPageCap = screenFormat === "mobile_portrait"
+                    ? FULL_PAGE_HEIGHT_CAP_PX_MOBILE
+                    : FULL_PAGE_HEIGHT_CAP_PX_DESKTOP;
+                let fullPageBase64;
+                try {
+                    // Browser-only: native devices omit captureFullPageJpeg, so the
+                    // field is dropped and the frame is created from the viewport.
+                    fullPageBase64 = await device.captureFullPageJpeg?.({
+                        documentHeight: obs.documentHeight,
+                        cap: fullPageCap,
+                    });
+                }
+                catch (err) {
+                    const msg = err instanceof Error ? err.message : String(err);
+                    log(`    Warning: full-page screenshot capture failed — ${msg}`);
+                }
                 // Detect no-visible-change: compare this step's observation with the
                 // PREVIOUS step's observation (not the post-action screenshot).
                 // This tells us whether the previous step's action changed the page.
@@ -243,10 +326,9 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                 previousObsScreenshot = currentScreenshot;
                 if (forwards.length > 0)
                     debugForwards(forwards);
-                const viewportSize = page.viewportSize() ?? viewport;
                 // Snapshot open tabs so the backend can prompt the LLM with tab ids
                 // (used by switch_tab/close_tab and to disambiguate cmd+click results).
-                const tabsSnapshot = await tabs.list();
+                const tabsSnapshot = obs.tabs;
                 // REASON (remote)
                 let stepResponse;
                 try {
@@ -256,10 +338,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                         assignment_name: assignment.name,
                         assignment_instructions: assignment.instructions,
                         screenshot: obs.screenshot,
-                        accessibility_tree: obs.treeData.simplified,
+                        accessibility_tree: obs.accessibilityTree,
                         current_url: obs.url,
-                        screen_width: viewportSize.width,
-                        screen_height: viewportSize.height,
+                        screen_width: obs.width,
+                        screen_height: obs.height,
                         interaction_count: step,
                         history,
                         forwards,
@@ -276,7 +358,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                 catch (err) {
                     const msg = err instanceof Error ? err.message : String(err);
                     log(`    Step ${step + 1}: API error — ${msg}`);
-                    await page.waitForTimeout(2000);
+                    await new Promise((r) => setTimeout(r, 2000));
                     try {
                         const stepReqBody = {
                             participant_id: session.participant_id,
@@ -284,10 +366,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                             assignment_name: assignment.name,
                             assignment_instructions: assignment.instructions,
                             screenshot: obs.screenshot,
-                            accessibility_tree: obs.treeData.simplified,
+                            accessibility_tree: obs.accessibilityTree,
                             current_url: obs.url,
-                            screen_width: viewportSize.width,
-                            screen_height: viewportSize.height,
+                            screen_width: obs.width,
+                            screen_height: obs.height,
                             interaction_count: step,
                             history,
                             forwards,
@@ -313,27 +395,21 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                 const actionDescs = [];
                 const elementNames = [];
                 const actionDebugEntries = [];
-                const preActionScreenshot = await takeScreenshot(page);
+                const preActionScreenshot = await device.captureScreenshot();
                 for (let i = 0; i < stepResponse.actions.length; i++) {
                     if (isCancelled())
                         break;
-                    // Pick up popup auto-switch / explicit tab switch from prior actions.
-                    page = tabs.activePage();
                     const action = stepResponse.actions[i];
-                    const tabsBefore = (await tabs.list()).length;
-                    const result = await executeAction(page, action, lastTreeData, session.context_values, tabs);
+                    const result = await device.executeAction(action);
                     const desc = describeAction(action);
                     debugActionExecution(i, action, result, action.node_id ? "cdp" : "playwright");
-                    // The action may have flipped the active tab — re-read.
-                    page = tabs.activePage();
-                    const tabsAfter = (await tabs.list()).length;
-                    const openedNewTab = action.type === "tap" && tabsAfter > tabsBefore;
+                    const openedNewTab = result.openedNewTab;
                     let normalizedCoords = null;
                     if (result.coordinates) {
-                        const vp = page.viewportSize() ?? viewport;
+                        const dims = device.dimensions();
                         normalizedCoords = {
-                            x: Math.round((result.coordinates.x / vp.width) * 1000),
-                            y: Math.round((result.coordinates.y / vp.height) * 1000),
+                            x: Math.round((result.coordinates.x / dims.width) * 1000),
+                            y: Math.round((result.coordinates.y / dims.height) * 1000),
                         };
                     }
                     const actionType = action.type || "unknown";
@@ -355,6 +431,11 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                                 ...(action.modifiers?.length && { modifiers: action.modifiers }),
                                 ...(action.key && { key: action.key }),
                                 ...(action.tab_id && { tab_id: action.tab_id }),
+                                ...(action.orientation && { orientation: action.orientation }),
+                                ...(action.panel && { panel: action.panel }),
+                                // The recorded `coordinates` is the drag START; persist the END
+                                // (normalized 0-1000) too so the journey captures the full path.
+                                ...(action.drag && { drag_end: { x: action.drag.endX, y: action.drag.endY } }),
                                 ...(openedNewTab && { opened_new_tab: true }),
                             },
                             order: i,
@@ -376,7 +457,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                     }
                     // Check if UI changed significantly (skip for last action in batch)
                     if (i < stepResponse.actions.length - 1) {
-                        const midScreenshot = await takeScreenshot(page);
+                        const midScreenshot = await device.captureScreenshot();
                         if (!detectNoVisibleChange(preActionScreenshot, midScreenshot)) {
                             const blockedCount = stepResponse.actions.length - 1 - i;
                             forwards.push({
@@ -387,10 +468,12 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                         }
                     }
                 }
-                // Upload observation JPEG (pre-action — matches coordinates and LLM context)
+                // Upload observation screenshot (pre-action — matches coordinates and
+                // LLM context). Browser captures JPEG; native screencap is PNG.
+                const obsContentType = isNative ? "image/png" : "image/jpeg";
                 let screenshotUrl;
                 try {
-                    const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg);
+                    const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg, obsContentType);
                     screenshotUrl = uploadResult.screenshotUrl;
                 }
                 catch (err) {
@@ -407,6 +490,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                         screenshot_url: screenshotUrl,
                         location_name: stepResponse.current_location,
                         screen_format: screenFormat,
+                        ...(fullPageBase64 ? { full_page_screenshot_base64: fullPageBase64 } : {}),
+                        // Native: drive FrameSourceType.ANDROID/IOS directly; browser falls
+                        // back to screen_format server-side.
+                        platform,
                     });
                     frameVersionId = matchResult.frame_version_id;
                 }
@@ -417,7 +504,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                 // Debug-only: capture post-action screenshot to show result
                 let postActionBase64;
                 if (isDebugEnabled()) {
-                    const postJpeg = await takeScreenshotJpeg(page);
+                    const postJpeg = await device.captureScreenshotJpeg();
                     postActionBase64 = postJpeg.toString("base64");
                 }
                 // Accumulate effort (cumulative, not wall-clock)
@@ -437,10 +524,15 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                     step: step + 1,
                     assignment_id: assignment.id,
                     ...(screenshotUrl ? { screenshot_url: screenshotUrl } : { screenshot_base64: obsBase64 }),
+                    // Dimensions of THIS step's screenshot (from observe()) so the backend
+                    // can populate the screenshot ref even when only screenshot_url is
+                    // sent (native) and it can't read the bytes for dims.
+                    screen_width: obs.width,
+                    screen_height: obs.height,
                     frame_version_id: frameVersionId,
                     timestamp_ms: accumulatedEffortMs,
                     comment: stepResponse.comment,
-                    url: page.url(),
+                    url: device.currentUrl(),
                     sentiment: {
                         label: stepResponse.sentiment,
                         valence: stepResponse.sentiment_valence,
@@ -448,7 +540,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                     },
                     actions: actionDatas,
                     current_location: stepResponse.current_location,
-                    assignment_completed: stepResponse.assignment_completed,
+                    assignment_status: stepResponse.assignment_status,
                     // Server reduces this to Interaction.tab when N >= 2; omit on
                     // single-tab steps to keep the payload (and DB column) null.
                     ...(tabsSnapshot.length >= 2 ? { tabs: tabsSnapshot } : {}),
@@ -467,7 +559,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                     assignmentName: assignment.name,
                     screenshotBase64: obsBase64,
                     postActionScreenshotBase64: postActionBase64,
-                    url: page.url(),
+                    url: device.currentUrl(),
                     actions: actionDebugEntries,
                     comment: stepResponse.comment,
                     sentiment: {
@@ -480,6 +572,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                     effortSeconds: stepResponse.effort_seconds,
                 });
                 assignmentCompleted = stepResponse.assignment_completed;
+                lastAssignmentStatus = stepResponse.assignment_status;
                 step++;
             }
             if (isCancelled()) {
@@ -491,16 +584,25 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                 });
                 break;
             }
+            // When the agent terminated, persist its ACTUAL terminal status
+            // (completed vs abandoned) rather than always "completed". When it
+            // didn't terminate, the loop hit max_steps.
+            const terminalStatus = assignmentCompleted
+                ? lastAssignmentStatus
+                : "max_steps_reached";
             assignmentStatuses.push({
                 assignment_id: assignment.id,
-                status: assignmentCompleted ? "completed" : "max_steps_reached",
+                status: terminalStatus,
                 step_count: step,
             });
-            if (assignmentCompleted) {
-                log(`    Assignment completed in ${step} steps`);
+            if (!assignmentCompleted) {
+                log(`    Assignment reached max steps (${maxSteps})`);
+            }
+            else if (lastAssignmentStatus === "abandoned") {
+                log(`    Assignment abandoned by agent after ${step} steps`);
             }
             else {
-                log(`    Assignment reached max steps (${maxSteps})`);
+                log(`    Assignment completed in ${step} steps`);
             }
         }
     }
@@ -518,7 +620,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
                 generateDebugReport(debugSteps, {
                     participantId: session.participant_id,
                     participantName,
-                    url: navigationUrl,
+                    url: launchTarget,
                     screenFormat,
                     finalStatus,
                     assignmentStatuses,
@@ -542,15 +644,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
             const msg = err instanceof Error ? err.message : String(err);
             log(`  Warning: failed to record results — ${msg}`);
         }
-        if (ownsTheBrowser) {
-            await closeBrowser(browserSession);
-        }
-        else {
-            // Shared mode: close just the tab, not the context or browser
-            try {
-                await browserSession.page.close();
-            }
-            catch { }
-        }
+        // Device owns its own teardown (full browser vs. just-the-tab for shared
+        // mode, app/emulator cleanup for native).
+        await device.close();
     }
 }

package/dist/lib/local-sim/native-a11y.d.ts ADDED

@@ -0,0 +1,111 @@
+/**
+ * Pure parser/serializer for native (Android/iOS) accessibility trees — the
+ * native counterpart of the browser's DOM-locator tree. It turns a raw device
+ * a11y dump into the SAME `[id] role "name"` string the backend's DOMLocator
+ * reasons over, plus a local `shortId → bounds` map the device taps the CENTER
+ * of. No bounds ship to the backend; like the browser path, the CLI keeps the
+ * map and resolves the LLM's returned short id locally.
+ *
+ * FCIS: this module is pure (string in, structs out) — no `adb`/`idb` I/O — so
+ * it's unit-testable without a device, exactly like `coordinates.ts`. The I/O
+ * lives in `adb.ts`/`simctl.ts`; the parse/serialize math lives here.
+ *
+ * COORDINATE SPACE — carried, not converted, by this module:
+ *   - Android `uiautomator dump` bounds are screencap PIXELS (`space: "px"`).
+ *   - iOS WebDriverAgent /source frames are POINTS (`space: "points"`).
+ * The device de-normalizes/taps in its own space (AndroidDevice taps pixels;
+ * IOSDevice taps points), so the `space` tag tells the caller which dimension a
+ * node's bounds-center belongs to. This module never mixes the two.
+ *
+ * ANCESTOR-VS-LEAF (the hard part): on Android the visible label
+ * ("Network & internet") sits on a `clickable=false` TextView nested inside the
+ * clickable PARENT row. Tapping the leaf's center misses the row's hit logic and
+ * lands "slightly off"; the click target is the row. So the serializer walks to
+ * the nearest clickable ANCESTOR, aggregates its descendants' text/content-desc
+ * into ONE label, and emits the CLICKABLE node WITH THE ROW'S BOUNDS — never the
+ * leaf. iOS Buttons are already labeled + actionable, so they emit directly.
+ */
+export type CoordinateSpace = "px" | "points";
+export interface Bounds {
+    x: number;
+    y: number;
+    width: number;
+    height: number;
+}
+/**
+ * One parsed native a11y node. `bounds` are in `space` (Android px, iOS points).
+ * `clickable` marks an actionable hit target. `resourceId` is the Android
+ * resource-id / iOS AXUniqueId when present (diagnostic; not used for tapping).
+ */
+export interface NativeNode {
+    role: string;
+    label: string;
+    bounds: Bounds;
+    clickable: boolean;
+    /** True for nodes whose own text/desc is a label (used to aggregate onto rows). */
+    hasOwnLabel: boolean;
+    resourceId?: string;
+    space: CoordinateSpace;
+}
+export interface NativeTree {
+    /** `[id] role "label"` lines, one per emitted actionable node. */
+    simplified: string;
+    /** shortId → bounds (in the platform's space). The device taps the center. */
+    nodeMap: Map<string, Bounds>;
+}
+/**
+ * Parse a uiautomator XML dump into a flat list of leaf-significant nodes in
+ * document order. The dump is a single line of nested `<node ...>` tags; we
+ * rebuild the parent/child nesting from the open/close-tag stream (mirroring the
+ * "break after `>`" split the oracle scripts use, but tracking depth so the
+ * ancestor-aggregation in `serializeNativeTree` has the real tree).
+ *
+ * Returns the FLATTENED set of nodes (depth-first, document order) with their
+ * raw fields; the serializer decides which to emit and how to aggregate.
+ */
+export declare function parseUiautomatorXml(xml: string): NativeNode[];
+/**
+ * Parse WDA's `GET /source?format=json` — a NESTED accessibility tree — into the
+ * FLAT, depth-first `NativeNode[]` (POINTS) that `parseXcuiHierarchy` produces,
+ * so `serializeNativeTree` consumes it unchanged. WDA's `type` matches idb's iOS
+ * types (Button/StaticText/SearchField/Cell/Image/Application…), so
+ * `normalizeRole`/`IOS_ACTIONABLE_TYPES`/`frameToBounds` all apply as-is.
+ *
+ * KEY: WDA's `/source` is the FULL XCUIElement tree — every container and leaf —
+ * NOT idb's clean accessibility-elements list. iOS settings rows surface as an
+ * accessible `Button` ("General", isAccessible=1) that ALSO contains a duplicate
+ * inner `StaticText` ("General", isAccessible=0) and is wrapped in a `Cell`
+ * (isAccessible=0). Emitting all three yields "General General" + empty
+ * listitems. So we emit ONLY `isAccessible && isVisible` nodes — exactly the
+ * VoiceOver-exposed set idb returned: the labeled Button is both the label and
+ * the tap target; the duplicate StaticText and the wrapping Cell are pruned. A
+ * sparse a11y tree degrades to the loop's vision fallback, so strict filtering
+ * never strands the run.
+ *
+ * Accepts either the raw tree or the W3C `{ value: <tree> }` envelope WDA returns.
+ */
+export declare function parseXcuiHierarchy(json: string): NativeNode[];
+/**
+ * Serialize a flat NativeNode list (from `parseUiautomatorXml` /
+ * `parseXcuiHierarchy`) into the `[id] role "label"` string the DOMLocator
+ * reasons over, plus a `shortId → bounds` map for local tap resolution.
+ *
+ * Emission rules (kept tight, like the DOM serializer):
+ *  - ANCESTOR-VS-LEAF: a CLICKABLE node absorbs its descendants' labels and is
+ *    emitted with ITS OWN bounds (the tappable row). The descendant
+ *    label-bearing leaves are then NOT emitted on their own — their text lives
+ *    on the row. A label-bearing leaf with NO clickable ancestor (e.g. a
+ *    standalone heading) is emitted directly so on-screen text isn't lost.
+ *  - Skip pure decoration: a node that is neither clickable nor label-bearing,
+ *    and a generic/application container that didn't aggregate a label.
+ *
+ * The input list is depth-first / document order, which is the order the raw
+ * parsers produce; we recover ancestry from that order using bounds containment
+ * (Android leaves nest inside their clickable row's rect; iOS is already flat).
+ */
+export declare function serializeNativeTree(nodes: NativeNode[]): NativeTree;
+/** Center of a node's bounds — the point the device taps. */
+export declare function boundsCenter(b: Bounds): {
+    x: number;
+    y: number;
+};