@ishlabs/cli 0.26.1 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,6 +53,7 @@ export declare class ApiClient {
53
53
  platform?: string;
54
54
  previous_frame_version_id?: string;
55
55
  same_screen_continuation?: boolean;
56
+ native_screen_signature?: string;
56
57
  }): Promise<{
57
58
  frame_version_id: string;
58
59
  }>;
@@ -40,6 +40,22 @@ export declare function appBuildFromDevice(pkg: string): Promise<{
40
40
  version: string | null;
41
41
  build: string | null;
42
42
  } | null>;
43
+ /**
44
+ * Pull `"pkg/activity"` out of `dumpsys activity activities`. The foreground
45
+ * activity surfaces as `topResumedActivity=ActivityRecord{... u0 pkg/activity
46
+ * t123}` (older builds: `mResumedActivity=...`); we take the `pkg/activity`
47
+ * token from whichever line is present. The activity may be a short `.Name`
48
+ * (relative to the package) — kept as-is, exactly what dumpsys reports. Returns
49
+ * "" when neither line is present.
50
+ */
51
+ export declare function parseTopActivity(out: string): string;
52
+ /**
53
+ * The foreground `"pkg/activity"` from `dumpsys activity activities`, a coarse
54
+ * input for the screen signature. Best-effort: returns "" on any failure (the
55
+ * signature degrades to its package-only coarse token, and the run never
56
+ * depends on this read).
57
+ */
58
+ export declare function currentActivity(): Promise<string>;
43
59
  /**
44
60
  * Capture the current screen as raw PNG bytes via `adb exec-out screencap -p`.
45
61
  * `exec-out` (not `shell`) avoids the CRLF translation that corrupts binary
@@ -152,6 +152,38 @@ export async function appBuildFromDevice(pkg) {
152
152
  return null;
153
153
  }
154
154
  }
155
/**
 * Pull the foreground `"pkg/activity"` component out of the text printed by
 * `dumpsys activity activities`.
 *
 * Three line shapes are recognised and tried in this order; the first one that
 * matches wins:
 *   1. `topResumedActivity=ActivityRecord{... u0 pkg/activity t123}`
 *   2. `mResumedActivity: ActivityRecord{... u0 pkg/activity t123}` (older builds)
 *   3. `mResumedActivity=ActivityRecord{... u0 pkg/activity t123}`  (older builds)
 *
 * The activity half may be a short `.Name` (relative to the package); it is
 * returned exactly as dumpsys printed it, with no expansion.
 *
 * @param out Raw `dumpsys activity activities` output.
 * @returns The `pkg/activity` token, or `""` when none of the lines is present.
 */
export function parseTopActivity(out) {
    // Inside the braces, skip ahead to a whitespace-delimited token that
    // contains a `/`. The token class excludes `}` as well as whitespace: with a
    // plain `\S+` a record that has no trailing task id (`... pkg/.Act}`) would
    // hand back `pkg/.Act}` with the closing brace glued on.
    const m = /topResumedActivity=ActivityRecord\{[^}]*\s([^\s}]+\/[^\s}]+)/.exec(out) ??
        /mResumedActivity:\s*ActivityRecord\{[^}]*\s([^\s}]+\/[^\s}]+)/.exec(out) ??
        /mResumedActivity=ActivityRecord\{[^}]*\s([^\s}]+\/[^\s}]+)/.exec(out);
    if (!m)
        return "";
    // The capture stops at the first whitespace or `}`, so a trailing task id
    // (` t123`) is never part of it — it is exactly `pkg/activity`.
    return m[1];
}
173
/**
 * Read the foreground `"pkg/activity"` by running
 * `dumpsys activity activities` through `adbShell` and handing the output to
 * `parseTopActivity`. It serves as a coarse input for the screen signature.
 *
 * Best-effort by design: any failure of the shell call or of the parse is
 * swallowed and reported as `""`, so a caller never has to guard this read.
 *
 * @returns The parsed `pkg/activity` token, or `""` on any failure.
 */
export async function currentActivity() {
    const dumpsysArgs = ["dumpsys", "activity", "activities"];
    const timeoutMs = 15_000;
    try {
        const dump = await adbShell(dumpsysArgs, timeoutMs);
        return parseTopActivity(dump);
    }
    catch {
        // Deliberately silent: an unreadable activity is signalled by "".
        return "";
    }
}
155
187
  /**
156
188
  * Capture the current screen as raw PNG bytes via `adb exec-out screencap -p`.
157
189
  * `exec-out` (not `shell`) avoids the CRLF translation that corrupts binary
@@ -191,8 +223,21 @@ export async function requireOneDevice() {
191
223
  if (online.length === 0) {
192
224
  throw new AdbError("No Android device/emulator online. Run `ish check android` to check your setup and how to boot one.");
193
225
  }
226
+ // Honor ANDROID_SERIAL (the standard adb convention): when it names an online
227
+ // device, pin to it instead of failing on "more than one device". The adb
228
+ // wrapper inherits process.env, so every subsequent `adb` call already targets
229
+ // that serial — this lets multiple emulators run in parallel, each driven by a
230
+ // CLI invocation with its own ANDROID_SERIAL.
231
+ const pinned = process.env.ANDROID_SERIAL?.trim();
232
+ if (pinned) {
233
+ if (online.some((l) => l.startsWith(`${pinned}\t`)))
234
+ return;
235
+ throw new AdbError(`ANDROID_SERIAL=${pinned} is set but that device is not online. ` +
236
+ `Online: ${online.map((l) => l.split("\t")[0]).join(", ") || "none"}.`);
237
+ }
194
238
  if (online.length > 1) {
195
- throw new AdbError(`Expected exactly one Android device, found ${online.length}. Stop the extras (the sim drives a single device).`);
239
+ throw new AdbError(`Expected exactly one Android device, found ${online.length}. ` +
240
+ `Stop the extras, or set ANDROID_SERIAL=<serial> to pin one (parallel runs).`);
196
241
  }
197
242
  }
198
243
  // --- Input gestures (all in screencap pixel space) ---
@@ -66,9 +66,11 @@ export declare class AndroidDevice implements SimulationDevice {
66
66
  private refreshDimensions;
67
67
  observe(): Promise<DeviceObservation>;
68
68
  /**
69
- * Dump + serialize the uiautomator a11y tree. Any failure (dump retries
70
- * exhausted, parse error) degrades to an empty tree so the backend falls back
71
- * to the vision path a missing tree must never abort the observation.
69
+ * Dump + serialize the uiautomator a11y tree. Returns the serialized tree, the
70
+ * node map, the FLAT parsed nodes (for the screen signature) and the
71
+ * foreground package read off the dump. Any failure (dump retries exhausted,
72
+ * parse error) degrades to an empty tree so the backend falls back to the
73
+ * vision path — a missing tree must never abort the observation.
72
74
  */
73
75
  private dumpTree;
74
76
  captureScreenshot(): Promise<string>;
@@ -19,10 +19,11 @@
19
19
  * - Vision path: px = round(x / 1000 * screencapWidth); same for y.
20
20
  */
21
21
  import { resolveTextValue } from "./actions.js";
22
- import { requireOneDevice, screencapPng, pngDimensions, dumpUiautomatorXml, inputTap, inputSwipe, inputDrag, inputLongPress, setUserRotation, forceStop, launchApp, installApk, isPackageInstalled, listPackages, isAdbKeyboardInstalled, enableAdbKeyboard, setIme, resetIme, currentIme, adbKeyboardType, adbKeyboardClear, pressKeyEvent, statusbarExpand, appBuildFromDevice, ADB_KEYBOARD_PKG, } from "./adb.js";
22
+ import { requireOneDevice, screencapPng, pngDimensions, dumpUiautomatorXml, inputTap, inputSwipe, inputDrag, inputLongPress, setUserRotation, forceStop, launchApp, installApk, isPackageInstalled, listPackages, isAdbKeyboardInstalled, enableAdbKeyboard, setIme, resetIme, currentIme, adbKeyboardType, adbKeyboardClear, pressKeyEvent, statusbarExpand, appBuildFromDevice, currentActivity, ADB_KEYBOARD_PKG, } from "./adb.js";
23
23
  import { isLocalPath } from "../upload.js";
24
24
  import { deNormalizePoint, deNormalizeDrag } from "./coordinates.js";
25
- import { parseUiautomatorXml, serializeNativeTree, boundsCenter } from "./native-a11y.js";
25
+ import { parseUiautomatorXml, serializeNativeTree, boundsCenter, androidPackage, } from "./native-a11y.js";
26
+ import { computeScreenSignature } from "./screen-signature.js";
26
27
  import { packageNameFromApk } from "./apk-manifest.js";
27
28
  // Let animations/IME transitions settle before the next observation so the
28
29
  // screenshot the LLM reasons over reflects the action's result.
@@ -175,14 +176,24 @@ export class AndroidDevice {
175
176
  return png;
176
177
  }
177
178
  async observe() {
178
- // Screencap and the a11y dump are independent reads run them in parallel.
179
- // The dump is wrapped so a failure degrades to the vision path (empty tree)
180
- // rather than aborting the observation.
181
- const [png, tree] = await Promise.all([
179
+ // Screencap, the a11y dump, and the foreground-activity read are independent
180
+ // — run them in parallel. The dump is wrapped so a failure degrades to the
181
+ // vision path (empty tree) rather than aborting the observation; the
182
+ // activity read is best-effort ("" on failure → package-only coarse token).
183
+ const [png, tree, activity] = await Promise.all([
182
184
  this.refreshDimensions(),
183
185
  this.dumpTree(),
186
+ currentActivity(),
184
187
  ]);
185
188
  this.lastNodeMap = tree.nodeMap;
189
+ // Scroll-invariant screen signature from this dump's parsed nodes + coarse
190
+ // inputs (foreground package/activity). Sent only when usable (see loop.ts).
191
+ const coarseInputs = {
192
+ platform: "android",
193
+ package: tree.package,
194
+ activity,
195
+ };
196
+ const screenSignature = computeScreenSignature(tree.nodes, coarseInputs);
186
197
  return {
187
198
  screenshot: png.toString("base64"),
188
199
  // Element path when the dump produced a tree; "" → backend vision branch.
@@ -193,12 +204,19 @@ export class AndroidDevice {
193
204
  // Native has no scrollable document; the screen IS the page.
194
205
  documentHeight: this.screenHeight,
195
206
  tabs: [],
207
+ screenSignature,
208
+ // Corpus-dump only (ISH_DUMP_CORPUS): the exact parsed nodes + coarse
209
+ // inputs the signature consumed, so any algorithm can be replayed offline.
210
+ nativeNodes: tree.nodes,
211
+ coarseInputs,
196
212
  };
197
213
  }
198
214
  /**
199
- * Dump + serialize the uiautomator a11y tree. Any failure (dump retries
200
- * exhausted, parse error) degrades to an empty tree so the backend falls back
201
- * to the vision path a missing tree must never abort the observation.
215
+ * Dump + serialize the uiautomator a11y tree. Returns the serialized tree, the
216
+ * node map, the FLAT parsed nodes (for the screen signature) and the
217
+ * foreground package read off the dump. Any failure (dump retries exhausted,
218
+ * parse error) degrades to an empty tree so the backend falls back to the
219
+ * vision path — a missing tree must never abort the observation.
202
220
  */
203
221
  async dumpTree() {
204
222
  try {
@@ -206,12 +224,12 @@ export class AndroidDevice {
206
224
  const nodes = parseUiautomatorXml(xml);
207
225
  const tree = serializeNativeTree(nodes);
208
226
  this.log(`a11y tree: ${tree.nodeMap.size} node(s)`);
209
- return tree;
227
+ return { ...tree, nodes, package: androidPackage(xml) };
210
228
  }
211
229
  catch (err) {
212
230
  const msg = err instanceof Error ? err.message : String(err);
213
231
  this.log(`a11y dump failed, falling back to vision: ${msg}`);
214
- return { simplified: "", nodeMap: new Map() };
232
+ return { simplified: "", nodeMap: new Map(), nodes: [], package: "" };
215
233
  }
216
234
  }
217
235
  async captureScreenshot() {
@@ -14,6 +14,8 @@
14
14
  import type { Browser } from "playwright-core";
15
15
  import type { LocalStepAction, LocalSimBrowserOptions, LocalTabInfo, ContextValue } from "./types.js";
16
16
  import type { BrowserSession } from "./browser.js";
17
+ import type { ScreenSignature, CoarseInputs } from "./screen-signature.js";
18
+ import type { NativeNode } from "./native-a11y.js";
17
19
  /**
18
20
  * One observation of the target's current state.
19
21
  *
@@ -39,6 +41,29 @@ export interface DeviceObservation {
39
41
  documentHeight: number;
40
42
  /** Open-tab snapshot (browser-only; empty for native). */
41
43
  tabs: LocalTabInfo[];
44
+ /**
45
+ * Native only: the scroll-invariant structural "screen signature" computed
46
+ * from this observation's a11y tree (see screen-signature.ts). The loop sends
47
+ * `value` as the match-frame anchor ONLY when `usable` is true; browser
48
+ * targets omit it. Undefined when the platform doesn't compute one.
49
+ */
50
+ screenSignature?: ScreenSignature;
51
+ /**
52
+ * Native only, corpus-dump only: the PARSED a11y nodes that
53
+ * `computeScreenSignature` consumed for this observation (the exact array, so
54
+ * any signature algorithm can be replayed offline against it). Populated by the
55
+ * android/ios `observe()`; the browser leaves it undefined. Only surfaced for
56
+ * the `ISH_DUMP_CORPUS` instrumentation in loop.ts — nothing in the live path
57
+ * reads it.
58
+ */
59
+ nativeNodes?: NativeNode[];
60
+ /**
61
+ * Native only, corpus-dump only: the `CoarseInputs` (platform / package /
62
+ * activity / bundleId) fed into `computeScreenSignature` for this observation.
63
+ * Populated by the android/ios `observe()`; the browser leaves it undefined.
64
+ * Same instrumentation-only purpose as `nativeNodes`.
65
+ */
66
+ coarseInputs?: CoarseInputs;
42
67
  }
43
68
  /**
44
69
  * Result of executing one action against the target.
@@ -86,10 +86,11 @@ export declare class IOSDevice implements SimulationDevice {
86
86
  private refreshScreen;
87
87
  observe(): Promise<DeviceObservation>;
88
88
  /**
89
- * Read + serialize WDA's /source a11y tree (bounds in POINTS). Any
90
- * failure (retries exhausted on a trivial tree, parse error) degrades to an
91
- * empty tree so the backend falls back to vision — a missing tree must never
92
- * abort the observation.
89
+ * Read + serialize WDA's /source a11y tree (bounds in POINTS). Returns the
90
+ * serialized tree, the node map and the FLAT parsed nodes (for the screen
91
+ * signature). Any failure (retries exhausted on a trivial tree, parse error)
92
+ * degrades to an empty tree so the backend falls back to vision — a missing
93
+ * tree must never abort the observation.
93
94
  */
94
95
  private dumpTree;
95
96
  captureScreenshot(): Promise<string>;
@@ -33,10 +33,11 @@
33
33
  import { resolveTextValue } from "./actions.js";
34
34
  import { requireOneBootedSimulator, screenshotPng, terminateApp, launchApp, installApp, isAppInstalled, bundleIdFromApp, appBuildFromSimulator, } from "./simctl.js";
35
35
  // iOS UI interaction + a11y run through WebDriverAgent (XCUITest), not idb.
36
- import { ensureWda, closeWda, describeScreen, describeAll, uiTap, uiLongPress, uiSwipe, uiText, uiKey, HID_KEY_RETURN, } from "./xcuitest.js";
36
+ import { ensureWda, closeWda, describeScreen, describeAll, activeBundleId, uiTap, uiLongPress, uiSwipe, uiText, uiKey, HID_KEY_RETURN, } from "./xcuitest.js";
37
37
  import { isLocalPath } from "../upload.js";
38
38
  import { deNormalizePoint, deNormalizeDrag, pointToPixel } from "./coordinates.js";
39
39
  import { parseXcuiHierarchy, serializeNativeTree, boundsCenter } from "./native-a11y.js";
40
+ import { computeScreenSignature } from "./screen-signature.js";
40
41
  // Let animations/transitions settle before the next observation so the
41
42
  // screenshot the LLM reasons over reflects the action's result.
42
43
  const POST_GESTURE_SETTLE_MS = 500;
@@ -176,14 +177,26 @@ export class IOSDevice {
176
177
  }
177
178
  async observe() {
178
179
  // Refresh geometry each step (orientation can change), then capture the
179
- // pixel screenshot and the a11y tree in parallel (independent reads). The
180
- // dump is wrapped so a failure degrades to the vision path (empty tree).
180
+ // pixel screenshot, the a11y tree, and the active bundle id in parallel
181
+ // (independent reads). The dump is wrapped so a failure degrades to the
182
+ // vision path (empty tree); the bundle-id read is best-effort ("" on
183
+ // failure → the navTitle-only coarse token).
181
184
  await this.refreshScreen();
182
- const [png, tree] = await Promise.all([
185
+ const [png, tree, bundleId] = await Promise.all([
183
186
  screenshotPng(),
184
187
  this.dumpTree(),
188
+ activeBundleId(this.udid),
185
189
  ]);
186
190
  this.lastNodeMap = tree.nodeMap;
191
+ // Scroll-invariant screen signature from this dump's parsed nodes + coarse
192
+ // inputs (active bundle id; navTitle is derived from the nodes). iOS is
193
+ // best-effort — sparse SwiftUI trees are usually unusable and fall back to
194
+ // Phase-1 continuity (sent only when usable; see loop.ts).
195
+ const coarseInputs = {
196
+ platform: "ios",
197
+ bundleId,
198
+ };
199
+ const screenSignature = computeScreenSignature(tree.nodes, coarseInputs);
187
200
  return {
188
201
  screenshot: png.toString("base64"),
189
202
  // Element path when describe-all produced a tree; "" → backend vision.
@@ -196,13 +209,19 @@ export class IOSDevice {
196
209
  // Native has no scrollable document; the screen IS the page.
197
210
  documentHeight: this.pixelHeight,
198
211
  tabs: [],
212
+ screenSignature,
213
+ // Corpus-dump only (ISH_DUMP_CORPUS): the exact parsed nodes + coarse
214
+ // inputs the signature consumed, so any algorithm can be replayed offline.
215
+ nativeNodes: tree.nodes,
216
+ coarseInputs,
199
217
  };
200
218
  }
201
219
  /**
202
- * Read + serialize WDA's /source a11y tree (bounds in POINTS). Any
203
- * failure (retries exhausted on a trivial tree, parse error) degrades to an
204
- * empty tree so the backend falls back to vision — a missing tree must never
205
- * abort the observation.
220
+ * Read + serialize WDA's /source a11y tree (bounds in POINTS). Returns the
221
+ * serialized tree, the node map and the FLAT parsed nodes (for the screen
222
+ * signature). Any failure (retries exhausted on a trivial tree, parse error)
223
+ * degrades to an empty tree so the backend falls back to vision — a missing
224
+ * tree must never abort the observation.
206
225
  */
207
226
  async dumpTree() {
208
227
  try {
@@ -210,12 +229,12 @@ export class IOSDevice {
210
229
  const nodes = parseXcuiHierarchy(json);
211
230
  const tree = serializeNativeTree(nodes);
212
231
  this.log(`a11y tree: ${tree.nodeMap.size} node(s)`);
213
- return tree;
232
+ return { ...tree, nodes };
214
233
  }
215
234
  catch (err) {
216
235
  const msg = err instanceof Error ? err.message : String(err);
217
236
  this.log(`a11y describe-all failed, falling back to vision: ${msg}`);
218
- return { simplified: "", nodeMap: new Map() };
237
+ return { simplified: "", nodeMap: new Map(), nodes: [] };
219
238
  }
220
239
  }
221
240
  async captureScreenshot() {
@@ -5,6 +5,7 @@
5
5
  * against a SimulationDevice (a Playwright browser today; a native Android
6
6
  * emulator next). The loop is device-agnostic — see device.ts.
7
7
  */
8
+ import { appendFileSync } from "node:fs";
8
9
  import { launchSharedBrowser, FULL_PAGE_HEIGHT_CAP_PX_MOBILE, FULL_PAGE_HEIGHT_CAP_PX_DESKTOP, } from "./browser.js";
9
10
  import { uploadScreenshot } from "./upload.js";
10
11
  import { detectNoVisibleChange, describeAction, classifyStepKind } from "./actions.js";
@@ -19,6 +20,58 @@ import { enableDebug, isDebugEnabled, debugRawResponse, debugNormalizedActions,
19
20
  function isNativePlatform(platform) {
20
21
  return platform === "android" || platform === "ios";
21
22
  }
23
/**
 * Serialize one observation into a single newline-terminated JSON line for
 * the corpus dump, so a screen-signature algorithm can be replayed offline.
 *
 * Pure (input → string): no I/O and no env checks happen here; the caller owns
 * both. Missing optional values are written as `null` (or `""` for `app`) so
 * every line carries the same keys.
 *
 * @param input Observation record with `ts`, `location`, `coarse`
 *   (platform / package / activity / bundleId), `nodes`, optional `signature`,
 *   optional `frameVersionId`, and `actionKind`.
 * @returns The JSON document followed by `"\n"`.
 */
function buildCorpusDumpLine(input) {
    const { coarse, signature } = input;
    // `app` is the package on android and the bundle id otherwise.
    const appId = coarse.platform === "android" ? coarse.package : coarse.bundleId;
    // Each node is projected down to these five fields only.
    const projectNode = (node) => ({
        role: node.role,
        label: node.label,
        resourceId: node.resourceId ?? null,
        scrollable: node.scrollable,
        insideScrollable: node.insideScrollable,
    });
    let signatureSummary = null;
    if (signature) {
        signatureSummary = {
            value: signature.value,
            usable: signature.usable,
            tokenCount: signature.tokenCount,
        };
    }
    // Key order below is the order the keys appear in the emitted line.
    const record = {
        ts: input.ts,
        app: appId ?? "",
        platform: coarse.platform,
        location: input.location,
        coarse: {
            platform: coarse.platform,
            package: coarse.package ?? null,
            activity: coarse.activity ?? null,
            bundleId: coarse.bundleId ?? null,
        },
        nodes: input.nodes.map(projectNode),
        signature: signatureSummary,
        frame_version_id: input.frameVersionId ?? null,
        action_kind: input.actionKind,
    };
    return JSON.stringify(record) + "\n";
}
61
/**
 * Append one corpus-dump line (built by `buildCorpusDumpLine`) to the file at
 * `path`, synchronously.
 *
 * Best-effort: anything thrown while building or writing the line is caught
 * and reported through `log` instead of propagating, so this instrumentation
 * cannot abort the caller. Gating (env var, native-only) is the caller's job.
 *
 * @param path Destination file; the line is appended to it.
 * @param input Observation record forwarded to `buildCorpusDumpLine`.
 * @param log Sink for the warning message emitted on failure.
 */
function appendCorpusDumpLine(path, input, log) {
    try {
        const line = buildCorpusDumpLine(input);
        appendFileSync(path, line);
    }
    catch (err) {
        let msg;
        if (err instanceof Error) {
            msg = err.message;
        }
        else {
            msg = String(err);
        }
        log(` Warning: corpus dump append failed — ${msg}`);
    }
}
22
75
  /**
23
76
  * Convert a raw action (from either resolved_actions or output.action.actions)
24
77
  * into the flat LocalStepAction shape used by the executor. Exported for unit
@@ -349,6 +402,11 @@ async function runSingleSimulation(client, participantId, participantName, opts,
349
402
  // TODO(perf): backend can downscale before the vision LLM; full-res sent for now.
350
403
  const obs = await device.observe();
351
404
  const currentScreenshot = obs.screenshot;
405
+ // Corpus dump (ISH_DUMP_CORPUS): the action_kind of the step that LED to
406
+ // THIS observation is the inbound lastStepKind (carried from the prior
407
+ // step; reassigned below AFTER the match-frame call). At step 0 nothing
408
+ // preceded this screen, so report it as "initial".
409
+ const inboundActionKind = step === 0 ? "initial" : lastStepKind;
352
410
  // Capture JPEG of observation for upload and recording (pre-action)
353
411
  const obsJpeg = await device.captureScreenshotJpeg();
354
412
  const obsBase64 = obsJpeg.toString("base64");
@@ -580,6 +638,15 @@ async function runSingleSimulation(client, participantId, participantName, opts,
580
638
  // updated AFTER this call for the next iteration.
581
639
  ...(isNative && lastFrameVersionId ? { previous_frame_version_id: lastFrameVersionId } : {}),
582
640
  same_screen_continuation: isNative && (lastStepKind === "scroll" || lastStepKind === "keyboard"),
641
+ // Phase 2: scroll-invariant structural screen signature as an
642
+ // entry/cross-run anchor. Sent ONLY when usable (>= 2 stable chrome
643
+ // ids) — a sparse/empty id-set hashes to a colliding value that
644
+ // would silently over-merge distinct screens, so we omit it and let
645
+ // the backend fall back to Phase-1 continuity. Computed in the
646
+ // device's observe() from this step's parsed a11y tree.
647
+ ...(isNative && obs.screenSignature?.usable
648
+ ? { native_screen_signature: obs.screenSignature.value }
649
+ : {}),
583
650
  });
584
651
  frameVersionId = matchResult.frame_version_id;
585
652
  }
@@ -587,6 +654,26 @@ async function runSingleSimulation(client, participantId, participantName, opts,
587
654
  const msg = err instanceof Error ? err.message : String(err);
588
655
  log(` Warning: frame matching failed — ${msg}`);
589
656
  }
657
+ // Corpus dump (ISH_DUMP_CORPUS, native only): one JSON line per
658
+ // observation with everything needed to replay any screen-signature
659
+ // algorithm offline — the LLM screen label (ground truth), the coarse
660
+ // inputs, the exact parsed NativeNode[], the current algorithm's
661
+ // signature, the backend frame id, and the inbound action_kind. Fully
662
+ // gated and best-effort: zero overhead/behavior change when unset, and a
663
+ // dump failure never aborts the sim. Requires the native observe()'s
664
+ // optional nativeNodes/coarseInputs (browser leaves them undefined).
665
+ const corpusDumpPath = process.env.ISH_DUMP_CORPUS;
666
+ if (corpusDumpPath && isNative && obs.nativeNodes && obs.coarseInputs) {
667
+ appendCorpusDumpLine(corpusDumpPath, {
668
+ ts: step,
669
+ location: stepResponse.current_location,
670
+ coarse: obs.coarseInputs,
671
+ nodes: obs.nativeNodes,
672
+ signature: obs.screenSignature,
673
+ frameVersionId,
674
+ actionKind: inboundActionKind,
675
+ }, log);
676
+ }
590
677
  // Carry THIS step's logical-screen classification + matched frame
591
678
  // forward for the NEXT iteration's match-frame call (consumed above as
592
679
  // last*). Classify after the call so ordering is consume-then-update.
@@ -45,6 +45,23 @@ export interface NativeNode {
45
45
  /** True for nodes whose own text/desc is a label (used to aggregate onto rows). */
46
46
  hasOwnLabel: boolean;
47
47
  resourceId?: string;
48
+ /**
49
+ * True for a scroll container (Android `scrollable="true"`; iOS
50
+ * ScrollView/Table/CollectionView). The screen-signature uses it to keep the
51
+ * container's OWN id as durable chrome — see screen-signature.ts.
52
+ */
53
+ scrollable: boolean;
54
+ /**
55
+ * True iff this node has a scrollable ANCESTOR — i.e. it is scroll CONTENT that
56
+ * shifts under a scroll. Computed STRUCTURALLY during parsing (tree ancestry),
57
+ * not geometrically: an overlay/FAB that merely sits inside a list's rect is
58
+ * NOT marked (it isn't a tree descendant), and on iOS the descendants of a
59
+ * pruned (isAccessible=0) scroll container still inherit the flag. The
60
+ * screen-signature excludes these from the stable token set so a scroll never
61
+ * changes the signature — see screen-signature.ts. A scroll container itself
62
+ * has `scrollable=true` but `insideScrollable=false` (unless nested).
63
+ */
64
+ insideScrollable: boolean;
48
65
  space: CoordinateSpace;
49
66
  }
50
67
  export interface NativeTree {
@@ -64,6 +81,13 @@ export interface NativeTree {
64
81
  * raw fields; the serializer decides which to emit and how to aggregate.
65
82
  */
66
83
  export declare function parseUiautomatorXml(xml: string): NativeNode[];
84
+ /**
85
+ * The foreground app's package name from a uiautomator dump's `package="..."`
86
+ * attribute. uiautomator stamps every node with the owning package; the first
87
+ * one is the foreground app. Used as a coarse-token input for the screen
88
+ * signature (see screen-signature.ts). Returns "" when absent (best-effort).
89
+ */
90
+ export declare function androidPackage(xml: string): string;
67
91
  /**
68
92
  * Parse WDA's `GET /source?format=json` — a NESTED accessibility tree — into the
69
93
  * FLAT, depth-first `NativeNode[]` (POINTS) that `parseXcuiHierarchy` produces,
@@ -122,7 +122,11 @@ function unescapeXml(s) {
122
122
  export function parseUiautomatorXml(xml) {
123
123
  const root = buildAndroidTree(xml);
124
124
  const out = [];
125
- const visit = (n) => {
125
+ // `parentScrollable` is true iff any ANCESTOR (not this node) had
126
+ // scrollable=true — i.e. this node is scroll CONTENT. Threaded down the
127
+ // descent so the screen-signature can exclude content structurally (a scroll
128
+ // moves these; chrome outside any scrollable keeps the signature stable).
129
+ const visit = (n, parentScrollable) => {
126
130
  // Drop nodes with no usable bounds (malformed/zero-area) — they have no
127
131
  // tappable center and would corrupt the nodeMap.
128
132
  if (n.bounds) {
@@ -134,14 +138,20 @@ export function parseUiautomatorXml(xml) {
134
138
  clickable: n.clickable,
135
139
  hasOwnLabel: label.length > 0,
136
140
  resourceId: n.resourceId || undefined,
141
+ scrollable: n.scrollable,
142
+ insideScrollable: parentScrollable,
137
143
  space: "px",
138
144
  });
139
145
  }
146
+ // A node inside a scrollable makes ALL its descendants scroll content; the
147
+ // container's own flag stays false (it's durable chrome) but its children
148
+ // inherit true.
149
+ const childScrollable = parentScrollable || n.scrollable;
140
150
  for (const c of n.children)
141
- visit(c);
151
+ visit(c, childScrollable);
142
152
  };
143
153
  for (const c of root.children)
144
- visit(c);
154
+ visit(c, false);
145
155
  return out;
146
156
  }
147
157
  /**
@@ -151,7 +161,7 @@ export function parseUiautomatorXml(xml) {
151
161
  * are its true descendants — required for ancestor-vs-leaf aggregation.
152
162
  */
153
163
  function buildAndroidTree(xml) {
154
- const root = makeRawAndroidNode("", "", "", "", false, null);
164
+ const root = makeRawAndroidNode("", "", "", "", false, false, null);
155
165
  const stack = [root];
156
166
  // Match every <node ...> / <node .../> open tag and standalone </node> close.
157
167
  // Attribute values are consumed as atomic quoted runs (`"[^"]*"`) so a literal
@@ -171,19 +181,34 @@ function buildAndroidTree(xml) {
171
181
  // the greedy run above swallows the trailing slash, so a `(\/?)` capture
172
182
  // can't see it.
173
183
  const selfClosing = tag.endsWith("/>");
174
- const node = makeRawAndroidNode(attr(tag, "class"), attr(tag, "text"), attr(tag, "content-desc"), attr(tag, "resource-id"), attr(tag, "clickable") === "true", parseAndroidBounds(attr(tag, "bounds")));
184
+ const node = makeRawAndroidNode(attr(tag, "class"), attr(tag, "text"), attr(tag, "content-desc"), attr(tag, "resource-id"), attr(tag, "clickable") === "true", attr(tag, "scrollable") === "true", parseAndroidBounds(attr(tag, "bounds")));
175
185
  stack[stack.length - 1].children.push(node);
176
186
  if (!selfClosing)
177
187
  stack.push(node);
178
188
  }
179
189
  return root;
180
190
  }
181
- function makeRawAndroidNode(role, text, contentDesc, resourceId, clickable, bounds) {
182
- return { role, text, contentDesc, resourceId, clickable, bounds, children: [] };
191
/**
 * Build one raw node of the intermediate Android tree from already-extracted
 * attribute values. Every call gets its own empty `children` array.
 *
 * @returns `{ role, text, contentDesc, resourceId, clickable, scrollable,
 *   bounds, children }` with `children` initially empty.
 */
function makeRawAndroidNode(role, text, contentDesc, resourceId, clickable, scrollable, bounds) {
    const node = {
        role: role,
        text: text,
        contentDesc: contentDesc,
        resourceId: resourceId,
        clickable: clickable,
        scrollable: scrollable,
        bounds: bounds,
        children: [],
    };
    return node;
}
194
/**
 * Extract a package name from a uiautomator dump: the value of the first
 * `package="..."` attribute found on a `<node ...>` tag, XML-unescaped via
 * `unescapeXml`. The first stamped package is taken to be the foreground app,
 * and the value feeds the screen signature as a coarse-token input.
 *
 * @param xml Raw uiautomator dump.
 * @returns The unescaped package name, or `""` when no `<node>` carries a
 *   `package` attribute (best-effort).
 */
export function androidPackage(xml) {
    // Lazy `[^>]*?` keeps the search inside a single opening tag.
    const firstPackageAttr = /<node\b[^>]*?\spackage="([^"]*)"/;
    const hit = firstPackageAttr.exec(xml);
    if (hit === null) {
        return "";
    }
    return unescapeXml(hit[1]);
}
184
204
  // ---------------------------------------------------------------------------
185
205
  // iOS — shared helpers for the WebDriverAgent (XCUITest) /source parser below
186
206
  // ---------------------------------------------------------------------------
207
+ /** iOS container types whose CONTENT scrolls. A node of one of these types (or
208
+ * any descendant of one) is marked `insideScrollable` so the screen signature
209
+ * excludes scroll content structurally while keeping the container's own id
210
+ * (see screen-signature.ts). */
211
+ const IOS_SCROLLABLE_TYPES = new Set(["ScrollView", "Table", "CollectionView"]);
187
212
  /** iOS roles/types that are directly actionable (the device taps their center). */
188
213
  const IOS_ACTIONABLE_TYPES = new Set([
189
214
  "Button",
@@ -258,15 +283,47 @@ export function parseXcuiHierarchy(json) {
258
283
  if (!root || typeof root !== "object")
259
284
  return [];
260
285
  const out = [];
261
- const visit = (n) => {
286
+ // `parentScrollable` is true iff some ANCESTOR of this node is a scroll
287
+ // container. CRITICAL (the M1 fix): WDA's scroll CONTAINER is isAccessible=0
288
+ // and therefore NOT emitted, but its descendants are scroll content all the
289
+ // same — so the flag is threaded down the recursion regardless of whether the
290
+ // container node itself is emitted. The screen-signature excludes these
291
+ // structurally, so a scroll never changes the iOS signature.
292
+ const visit = (n, parentScrollable) => {
293
+ const rawType = n.type ?? "";
294
+ const typeKey = stripAxPrefix(rawType);
295
+ const isScroll = IOS_SCROLLABLE_TYPES.has(typeKey);
262
296
  const bounds = frameToBounds(n.rect ?? undefined);
297
+ // iOS NAVIGATION-BAR TITLE recovery. The bar carries the screen title in its
298
+ // `name`, but WDA marks the bar isAccessible=0 (so it's pruned) AND the large
299
+ // title StaticText scrolls WITH the content (insideScrollable). The title is
300
+ // then lost from the signature, silently OVER-MERGING distinct pushed screens
301
+ // (proven live: iOS Settings General/Accessibility/Privacy all reduced to the
302
+ // back button's parent label {tx:settings} → one frame). Emit the bar's name
303
+ // as a stable chrome node — it sits ABOVE the scroll (insideScrollable=false)
304
+ // and is scroll-invariant (constant as the large title collapses). Emitted
305
+ // first so `iosNavTitle` (find role==="navigationbar") sees the titled bar.
306
+ if (bounds && typeKey === "NavigationBar" && wdaTruthy(n.isVisible)) {
307
+ const navName = (n.name ?? "").trim();
308
+ if (navName) {
309
+ out.push({
310
+ role: normalizeRole(rawType),
311
+ label: navName,
312
+ bounds,
313
+ clickable: false,
314
+ hasOwnLabel: true,
315
+ resourceId: undefined,
316
+ scrollable: false,
317
+ insideScrollable: false,
318
+ space: "points",
319
+ });
320
+ }
321
+ }
263
322
  if (bounds && wdaTruthy(n.isAccessible) && wdaTruthy(n.isVisible)) {
264
323
  // Prefer the spoken label; fall back to a STRING value (search fields
265
324
  // expose their placeholder as `value`). Non-string values (a Switch's 1/0)
266
325
  // are ignored for the label, exactly like the idb path.
267
326
  const label = (n.label ?? (typeof n.value === "string" ? n.value : "")).trim();
268
- const rawType = n.type ?? "";
269
- const typeKey = stripAxPrefix(rawType);
270
327
  // `isEnabled` absent ⇒ assume enabled (WDA omits it on always-enabled types).
271
328
  const enabled = n.isEnabled == null ? true : wdaTruthy(n.isEnabled);
272
329
  const actionable = IOS_ACTIONABLE_TYPES.has(typeKey) && enabled;
@@ -277,16 +334,21 @@ export function parseXcuiHierarchy(json) {
277
334
  clickable: actionable,
278
335
  hasOwnLabel: label.length > 0,
279
336
  resourceId: (n.name || n.rawIdentifier) ?? undefined,
337
+ scrollable: isScroll,
338
+ insideScrollable: parentScrollable,
280
339
  space: "points",
281
340
  });
282
341
  }
283
342
  // Recurse into ALL children — an accessible element can nest inside a
284
- // non-accessible container (the Cell wrapping the Button), so we must not
285
- // prune the walk by accessibility, only the emission.
343
+ // non-accessible container (the Cell wrapping the Button, or the pruned
344
+ // scroll container), so we must not prune the walk by accessibility, only
345
+ // the emission. The scroll flag propagates onto descendants even though the
346
+ // container itself wasn't emitted.
347
+ const childScrollable = parentScrollable || isScroll;
286
348
  for (const c of n.children ?? [])
287
- visit(c);
349
+ visit(c, childScrollable);
288
350
  };
289
- visit(root);
351
+ visit(root, false);
290
352
  return out;
291
353
  }
292
354
  // ---------------------------------------------------------------------------
@@ -0,0 +1,77 @@
1
+ /**
2
+ * Native "screen signature" v2 — a SCROLL-INVARIANT structural identity for a
3
+ * logical native screen, derived from the accessibility tree, sent to the
4
+ * backend as an entry/cross-run frame anchor (Phase 2 of native frame
5
+ * continuity; Phase 1 reuses the prior frame on pure scroll/keyboard steps).
6
+ *
7
+ * FCIS: this module is PURE (NativeNode[] + coarse inputs in, signature out) —
8
+ * no device access. The device gathers the coarse inputs (foreground activity /
9
+ * bundle id) and the parsed tree; this turns them into `{value, usable}`.
10
+ *
11
+ * The signature has two parts:
12
+ * coarse — a cheap, almost-always-available anchor (android `package|activity`,
13
+ * ios `bundleId|navTitle`).
14
+ * tokens — the persistent CHROME tokens that are NOT scroll content. Each
15
+ * chrome node contributes its resource-id (`id:…`) AND its label
16
+ * (`tx:…`) when present. This is what makes the signature
17
+ * scroll-invariant AND lets two same-activity screens be told apart.
18
+ *
19
+ * WHY v2 (two verified gaps in the id-only v1):
20
+ * 1. LABELS close the shared-chrome OVER-MERGE. A single-Activity app — Jetpack
21
+ * Compose (exposes NO resource-ids beyond the framework `android:id/content`)
22
+ * or a View app with a fixed toolbar+container shared across fragments —
23
+ * gives two DISTINCT screens the SAME id-set → identical signature → SILENT
24
+ * over-merge (the cardinal failure). But those screens DO differ in chrome
25
+ * LABELS (a home screen vs a settings sub-screen show different toolbar /
26
+ * button text). Including labels makes distinct screens produce distinct
27
+ * signatures, and makes Compose usable at all (label-only tokens).
28
+ * 2. STRUCTURAL scroll-exclusion replaces v1's geometric `contains()`. v1
29
+ * excluded scroll content by bounds-containment, which (a) mis-flagged an
30
+ * overlay/FAB sitting inside a list's rect as content (→ could over-merge),
31
+ * and (b) on iOS the scroll CONTAINER is isAccessible=0 and pruned from the
32
+ * NativeNode[], so geometric exclusion never fired (scroll changed the
33
+ * signature → over-split, feature inert). v2 excludes by TREE STRUCTURE: a
34
+ * node is content iff `insideScrollable` (it has a scrollable ANCESTOR),
35
+ * computed during parsing — see native-a11y.ts. The scroll container's OWN
36
+ * tokens are kept (it's durable chrome; `insideScrollable` is about
37
+ * descendants).
38
+ *
39
+ * The remaining failure mode after v2 is SAFE: dynamic chrome labels (a live
40
+ * clock, an unread badge) cause OVER-SPLIT (a new frame), never over-merge — the
41
+ * backend just mints a fresh frame, which is the conservative direction.
42
+ *
43
+ * USABLE GUARD (load-bearing, unchanged in spirit): `usable` is true only with
44
+ * >= MIN_STABLE_TOKENS tokens. A signature derived from an empty/sparse token
45
+ * set must NEVER be sent — sha1("") (and any near-empty set) collides across
46
+ * distinct screens and would silently over-merge them. When unusable the caller
47
+ * omits the field entirely and the backend falls back to Phase-1 continuity.
48
+ * This is the SAFE default: Flutter (no a11y tree) and the sparsest screens
49
+ * degrade here; id-rich Android and label-rich Compose are the validated wins.
50
+ */
51
+ import type { NativeNode } from "./native-a11y.js";
52
+ /** Minimum stable-chrome tokens for a signature to be usable (sent to the backend). */
53
+ export declare const MIN_STABLE_TOKENS = 2;
54
+ /** Coarse-token inputs gathered from the device (cheap, almost-always-available). */
55
+ export interface CoarseInputs {
56
+ platform: "android" | "ios";
57
+ /** Android: foreground app package (uiautomator `package` attr). */
58
+ package?: string;
59
+ /** Android: foreground activity (`pkg/activity` from dumpsys). */
60
+ activity?: string;
61
+ /** iOS: active app bundle id (WDA /wda/activeAppInfo). navTitle is derived here. */
62
+ bundleId?: string;
63
+ }
64
+ export interface ScreenSignature {
65
+ /** `platform|coarse|sha1(tokens)` — the value sent as native_screen_signature. */
66
+ value: string;
67
+ /** True only with >= MIN_STABLE_TOKENS tokens; the caller omits the field when false. */
68
+ usable: boolean;
69
+ /** Number of stable chrome tokens — the guard's basis. */
70
+ tokenCount: number;
71
+ }
72
+ /**
73
+ * Compute the screen signature from this step's parsed tree + coarse inputs.
74
+ * `value` is `platform|coarse|sha1(tokens)`; `usable` gates whether it's safe to
75
+ * send (>= MIN_STABLE_TOKENS distinct stable chrome tokens).
76
+ */
77
+ export declare function computeScreenSignature(nodes: NativeNode[], coarse: CoarseInputs): ScreenSignature;
@@ -0,0 +1,166 @@
1
+ /**
2
+ * Native "screen signature" v2 — a SCROLL-INVARIANT structural identity for a
3
+ * logical native screen, derived from the accessibility tree, sent to the
4
+ * backend as an entry/cross-run frame anchor (Phase 2 of native frame
5
+ * continuity; Phase 1 reuses the prior frame on pure scroll/keyboard steps).
6
+ *
7
+ * FCIS: this module is PURE (NativeNode[] + coarse inputs in, signature out) —
8
+ * no device access. The device gathers the coarse inputs (foreground activity /
9
+ * bundle id) and the parsed tree; this turns them into `{value, usable}`.
10
+ *
11
+ * The signature has two parts:
12
+ * coarse — a cheap, almost-always-available anchor (android `package|activity`,
13
+ * ios `bundleId|navTitle`).
14
+ * tokens — the persistent CHROME tokens that are NOT scroll content. Each
15
+ * chrome node contributes its resource-id (`id:…`) AND its label
16
+ * (`tx:…`) when present. This is what makes the signature
17
+ * scroll-invariant AND lets two same-activity screens be told apart.
18
+ *
19
+ * WHY v2 (two verified gaps in the id-only v1):
20
+ * 1. LABELS close the shared-chrome OVER-MERGE. A single-Activity app — Jetpack
21
+ * Compose (exposes NO resource-ids beyond the framework `android:id/content`)
22
+ * or a View app with a fixed toolbar+container shared across fragments —
23
+ * gives two DISTINCT screens the SAME id-set → identical signature → SILENT
24
+ * over-merge (the cardinal failure). But those screens DO differ in chrome
25
+ * LABELS (a home screen vs a settings sub-screen show different toolbar /
26
+ * button text). Including labels makes distinct screens produce distinct
27
+ * signatures, and makes Compose usable at all (label-only tokens).
28
+ * 2. STRUCTURAL scroll-exclusion replaces v1's geometric `contains()`. v1
29
+ * excluded scroll content by bounds-containment, which (a) mis-flagged an
30
+ * overlay/FAB sitting inside a list's rect as content (→ could over-merge),
31
+ * and (b) on iOS the scroll CONTAINER is isAccessible=0 and pruned from the
32
+ * NativeNode[], so geometric exclusion never fired (scroll changed the
33
+ * signature → over-split, feature inert). v2 excludes by TREE STRUCTURE: a
34
+ * node is content iff `insideScrollable` (it has a scrollable ANCESTOR),
35
+ * computed during parsing — see native-a11y.ts. The scroll container's OWN
36
+ * tokens are kept (it's durable chrome; `insideScrollable` is about
37
+ * descendants).
38
+ *
39
+ * The remaining failure mode after v2 is SAFE: dynamic chrome labels (a live
40
+ * clock, an unread badge) cause OVER-SPLIT (a new frame), never over-merge — the
41
+ * backend just mints a fresh frame, which is the conservative direction.
42
+ *
43
+ * USABLE GUARD (load-bearing, unchanged in spirit): `usable` is true only with
44
+ * >= MIN_STABLE_TOKENS tokens. A signature derived from an empty/sparse token
45
+ * set must NEVER be sent — sha1("") (and any near-empty set) collides across
46
+ * distinct screens and would silently over-merge them. When unusable the caller
47
+ * omits the field entirely and the backend falls back to Phase-1 continuity.
48
+ * This is the SAFE default: Flutter (no a11y tree) and the sparsest screens
49
+ * degrade here; id-rich Android and label-rich Compose are the validated wins.
50
+ */
51
+ import { createHash } from "node:crypto";
52
+ /** Minimum stable-chrome tokens for a signature to be usable (sent to the backend). */
53
+ export const MIN_STABLE_TOKENS = 2;
54
+ /**
55
+ * App-bar / title chrome resource-ids. A node whose resource-id matches carries
56
+ * the SCREEN TITLE (e.g. a CollapsingToolbar's title, an ActionBar/Toolbar title).
57
+ * On modern Android the title sits INSIDE the scrollable app-bar (CoordinatorLayout
58
+ * / NestedScrollView), so structural scroll-exclusion drops it — collapsing every
59
+ * sub-screen of a single host activity (e.g. Android Settings' `.SubSettings`) to
60
+ * the SAME generic outer-container ids and silently OVER-MERGING distinct screens
61
+ * (proven live: Display/Apps/Network all → one frame). We rescue the title LABEL
62
+ * even when `insideScrollable`: the title text is the screen's most reliable
63
+ * discriminator and is scroll-INVARIANT (it stays constant as the bar collapses).
64
+ *
65
+ * WHY no `_title$` here (removed): a trailing-`_title$` rescue was originally added
66
+ * to also catch Android `homepage_title`, but (a) `homepage_title` is the VOLATILE
67
+ * home big-title we deliberately do NOT want to rescue, and (b) `_title$`
68
+ * OVER-MATCHES iOS list-row section ids that are pure SCROLL CONTENT —
69
+ * `MOTION_TITLE`, `SPEECH_TITLE`, `LIVE_SPEECH_TITLE`, `VOCAL_SHORTCUTS_TITLE`
70
+ * (role=text, insideScrollable=true). Rescuing those re-includes scroll-content row
71
+ * labels in the signature, so scrolling reveals different rows and CHURNS the
72
+ * signature → over-fragment (iOS Settings/Accessibility split into two frames). The
73
+ * SubSettings over-merge discriminator rides on the explicitly-matched
74
+ * `collapsing_toolbar` / `action_bar` / `toolbar`, NOT on an arbitrary `_title$`.
75
+ */
76
+ const TITLE_CHROME_ID = /(collapsing_toolbar|action_bar|toolbar)$/;
77
+ function isTitleChrome(resourceId) {
78
+ return resourceId !== "" && TITLE_CHROME_ID.test(resourceId.toLowerCase());
79
+ }
80
+ /** Max length of a label token's value — bounds the hashed string on chatty labels. */
81
+ const LABEL_TOKEN_MAX_LENGTH = 64;
82
+ /**
83
+ * Normalized role of the iOS navigation bar after `normalizeRole` in
84
+ * native-a11y.ts (`AXNavigationBar`/`NavigationBar` → "navigationbar"). The
85
+ * spike matched the raw "NavigationBar" role; the CLI's NativeNode carries the
86
+ * normalized role, so we match the normalized form here.
87
+ */
88
+ const IOS_NAV_BAR_ROLE = "navigationbar";
89
+ function sha1(s) {
90
+ return createHash("sha1").update(s).digest("hex");
91
+ }
92
+ /** First nav-bar node's label (iOS), else "" — best-effort coarse signal. */
93
+ function iosNavTitle(nodes) {
94
+ const nav = nodes.find((n) => n.role === IOS_NAV_BAR_ROLE);
95
+ return nav ? nav.label.trim() : "";
96
+ }
97
+ /** The coarse, almost-always-available anchor token. */
98
+ function coarseToken(nodes, coarse) {
99
+ if (coarse.platform === "android") {
100
+ return `${coarse.package ?? ""}|${coarse.activity ?? ""}`;
101
+ }
102
+ return `${coarse.bundleId ?? ""}|${iosNavTitle(nodes)}`;
103
+ }
104
+ /** Collapse whitespace, lowercase, and cap length so a label token is stable. */
105
+ function normalizeLabelToken(label) {
106
+ const collapsed = label.replace(/\s+/g, " ").trim().toLowerCase();
107
+ return collapsed.length <= LABEL_TOKEN_MAX_LENGTH
108
+ ? collapsed
109
+ : collapsed.slice(0, LABEL_TOKEN_MAX_LENGTH);
110
+ }
111
+ /**
112
+ * Sorted, de-duped token set of the persistent CHROME — every node that is NOT
113
+ * scroll content (`!insideScrollable`). Each such node contributes:
114
+ * - `id:<resourceId>` when its resource-id is non-empty, and
115
+ * - `tx:<label>` when its label is non-empty (normalized).
116
+ * A scroll CONTAINER's own tokens ARE kept (the container is durable chrome;
117
+ * `insideScrollable` flags its descendants, not the container). Labels are the
118
+ * v2 win: they discriminate single-Activity screens that share a chrome id-set
119
+ * (and give Compose, which exposes no ids, a usable signature at all).
120
+ */
121
+ function stableTokenSet(nodes) {
122
+ const out = new Set();
123
+ for (const n of nodes) {
124
+ const id = (n.resourceId ?? "").trim();
125
+ const label = n.label.trim();
126
+ // App-bar / title chrome carries the screen's name but sits inside the
127
+ // scrollable app-bar on modern Android — rescue its scroll-invariant LABEL
128
+ // even when insideScrollable (its generic container id is NOT added; the
129
+ // label is the discriminator). See TITLE_CHROME_ID.
130
+ if (label && isTitleChrome(id))
131
+ out.add(`tx:${normalizeLabelToken(label)}`);
132
+ if (n.insideScrollable)
133
+ continue; // scroll content — shifts under a scroll
134
+ // Suppress the `id:` token for a self-named / generic nav control. On iOS,
135
+ // parseXcuiHierarchy promotes the WDA `name` to resourceId, but WDA reports a
136
+ // NON-DETERMINISTIC name for the nav back button — "BackButton" when fresh,
137
+ // the parent screen's title (e.g. "Settings") once scrolled — while its
138
+ // visible label stays constant. That churn flips the `id:` token between
139
+ // scroll states and fragments the same screen. Drop the id when it's the
140
+ // literal "BackButton" OR is redundant with the node's own label (normalized
140
+ // id === normalized label): in both cases the `id:` carries no identity beyond the
142
+ // already-emitted `tx:` label. We KEEP the `tx:` token below.
143
+ const redundantNavId = id !== "" &&
144
+ (id === "BackButton" ||
145
+ (label !== "" && normalizeLabelToken(id) === normalizeLabelToken(label)));
146
+ if (id && !redundantNavId)
147
+ out.add(`id:${id}`);
148
+ if (label)
149
+ out.add(`tx:${normalizeLabelToken(label)}`);
150
+ }
151
+ return [...out].sort();
152
+ }
153
+ /**
154
+ * Compute the screen signature from this step's parsed tree + coarse inputs.
155
+ * `value` is `platform|coarse|sha1(tokens)`; `usable` gates whether it's safe to
156
+ * send (>= MIN_STABLE_TOKENS distinct stable chrome tokens).
157
+ */
158
+ export function computeScreenSignature(nodes, coarse) {
159
+ const tokens = stableTokenSet(nodes);
160
+ const value = `${coarse.platform}|${coarseToken(nodes, coarse)}|${sha1(tokens.join(","))}`;
161
+ return {
162
+ value,
163
+ usable: tokens.length >= MIN_STABLE_TOKENS,
164
+ tokenCount: tokens.length,
165
+ };
166
+ }
@@ -45,6 +45,13 @@ export declare function closeWda(udid: string): Promise<void>;
45
45
  export declare function describeScreen(udid: string): Promise<IosScreen>;
46
46
  /** Raw WDA `/source?format=json` string — feed to `parseXcuiHierarchy`. */
47
47
  export declare function describeAll(udid: string): Promise<string>;
48
+ /**
49
+ * The foreground app's bundle id via WDA `GET /wda/activeAppInfo`, a coarse
50
+ * input for the screen signature. Best-effort: returns "" on any failure (the
51
+ * signature degrades to a bundleId-less coarse token, and the run never depends
52
+ * on this read).
53
+ */
54
+ export declare function activeBundleId(udid: string): Promise<string>;
48
55
  export declare function uiTap(udid: string, x: number, y: number): Promise<void>;
49
56
  export declare function uiLongPress(udid: string, x: number, y: number, durationMs?: number): Promise<void>;
50
57
  export declare function uiSwipe(udid: string, x1: number, y1: number, x2: number, y2: number, durationMs?: number): Promise<void>;
@@ -250,6 +250,22 @@ export async function describeAll(udid) {
250
250
  const json = await wdaCall(s.port, "GET", `/session/${s.sessionId}/source?format=json`);
251
251
  return JSON.stringify(json);
252
252
  }
253
+ /**
254
+ * The foreground app's bundle id via WDA `GET /wda/activeAppInfo`, a coarse
255
+ * input for the screen signature. Best-effort: returns "" on any failure (the
256
+ * signature degrades to a bundleId-less coarse token, and the run never depends
257
+ * on this read).
258
+ */
259
+ export async function activeBundleId(udid) {
260
+ try {
261
+ const s = await getSession(udid);
262
+ const v = unwrap(await wdaCall(s.port, "GET", "/wda/activeAppInfo"));
263
+ return typeof v.bundleId === "string" ? v.bundleId : "";
264
+ }
265
+ catch {
266
+ return "";
267
+ }
268
+ }
253
269
  // ── Gestures (W3C pointer actions; coordinates in POINTS) ────────────────────
254
270
  function pointerAction(steps) {
255
271
  return {
@@ -97,8 +97,13 @@ export function iterationHasContent(details, modality) {
97
97
  const ep = readExternalChatbotEndpoint(details);
98
98
  return !!ep.endpoint || !!ep.chatbot_endpoint_id;
99
99
  }
100
- // interactive (default)
101
- return typeof d.url === "string" && d.url.length > 0;
100
+ // interactive (default): browser/figma iterations carry a `url`; native
101
+ // (ios/android) iterations carry an `app_artifact` instead and need no URL
102
+ // (the backend made url optional for native — ish-backend 9708e06e). Either
103
+ // satisfies "has content".
104
+ const hasUrl = typeof d.url === "string" && d.url.length > 0;
105
+ const hasApp = typeof d.app_artifact === "string" && d.app_artifact.length > 0;
106
+ return hasUrl || hasApp;
102
107
  }
103
108
  /** The flag fragment a user should pass to populate content for a given modality. */
104
109
  export function describeRequiredContentFlag(modality, chatMode) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ishlabs/cli",
3
- "version": "0.26.1",
3
+ "version": "0.27.0",
4
4
  "description": "The command-line interface for ish",
5
5
  "type": "module",
6
6
  "bin": {