@ishlabs/cli 0.25.0 → 0.26.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/commands/doctor.d.ts +42 -0
  2. package/dist/commands/doctor.js +359 -0
  3. package/dist/commands/iteration.js +23 -5
  4. package/dist/commands/study-participant.js +1 -1
  5. package/dist/commands/study-run.js +26 -1
  6. package/dist/commands/study-screenshots.js +38 -5
  7. package/dist/index.js +2 -0
  8. package/dist/lib/api-client.d.ts +3 -0
  9. package/dist/lib/api-client.js +6 -1
  10. package/dist/lib/docs.js +15 -3
  11. package/dist/lib/local-sim/actions.d.ts +18 -0
  12. package/dist/lib/local-sim/actions.js +32 -0
  13. package/dist/lib/local-sim/adb.d.ts +33 -0
  14. package/dist/lib/local-sim/adb.js +121 -17
  15. package/dist/lib/local-sim/android.d.ts +7 -1
  16. package/dist/lib/local-sim/android.js +21 -1
  17. package/dist/lib/local-sim/coordinates.d.ts +4 -4
  18. package/dist/lib/local-sim/coordinates.js +4 -4
  19. package/dist/lib/local-sim/device.d.ts +21 -2
  20. package/dist/lib/local-sim/device.js +1 -1
  21. package/dist/lib/local-sim/ios.d.ts +33 -10
  22. package/dist/lib/local-sim/ios.js +88 -20
  23. package/dist/lib/local-sim/loop.js +134 -25
  24. package/dist/lib/local-sim/native-a11y.d.ts +21 -7
  25. package/dist/lib/local-sim/native-a11y.js +82 -47
  26. package/dist/lib/local-sim/simctl.d.ts +28 -43
  27. package/dist/lib/local-sim/simctl.js +53 -142
  28. package/dist/lib/local-sim/types.d.ts +13 -2
  29. package/dist/lib/local-sim/xcuitest.d.ts +60 -0
  30. package/dist/lib/local-sim/xcuitest.js +303 -0
  31. package/dist/lib/paths.d.ts +14 -0
  32. package/dist/lib/paths.js +21 -0
  33. package/dist/lib/report-readiness.d.ts +44 -0
  34. package/dist/lib/report-readiness.js +74 -0
  35. package/dist/lib/skill-content.js +2 -0
  36. package/package.json +1 -1
@@ -1,14 +1,15 @@
1
1
  /**
2
- * IOSDevice — drives a local iOS simulator via `xcrun simctl` + `idb`,
2
+ * IOSDevice — drives a local iOS simulator via `xcrun simctl` (lifecycle +
3
+ * screenshot) and WebDriverAgent/XCUITest (UI + a11y; see xcuitest.ts),
3
4
  * implementing the SimulationDevice surface the loop expects. Mirrors
4
5
  * AndroidDevice; the one substantive difference is the coordinate space.
5
6
  *
6
7
  * Two resolution paths, mirroring the browser:
7
- * - ELEMENT (preferred): observe() reads the `idb ui describe-all` a11y tree,
8
- * serializes it to the `[id] role "label"` string the backend DOMLocator
9
- * reasons over, and keeps a local `shortId → bounds` map (bounds in POINTS).
10
- * The backend returns a `node_id`; executeAction() looks the bounds up and
11
- * taps the element's CENTER.
8
+ * - ELEMENT (preferred): observe() reads WDA's `/source` a11y tree, serializes
9
+ * it to the `[id] role "label"` string the backend DOMLocator reasons over,
10
+ * and keeps a local `shortId → bounds` map (bounds in POINTS). The backend
11
+ * returns a `node_id`; executeAction() looks the bounds up and taps the
12
+ * element's CENTER.
12
13
  * - VISION (fallback): when the tree is empty/sparse, observe() returns an
13
14
  * empty tree so the backend takes its vision branch and returns NORMALIZED
14
15
  * 0-1000 coordinates. Also taken per-action whenever node_id is absent.
@@ -16,7 +17,7 @@
16
17
  * COORDINATE SPACE — two spaces, the key difference from Android (where
17
18
  * screencap and tap share one pixel space):
18
19
  * `simctl io booted screenshot` is in PIXELS (e.g. 1179x2556 @3x), but
19
- * `idb ui tap/swipe` AND the `describe-all` a11y frames are POINTS (393x852).
20
+ * WDA taps/swipes AND the `/source` a11y frames are POINTS (393x852).
20
21
  * The invariant in BOTH paths: TAP in points, RECORD in pixels, because the
21
22
  * loop re-normalizes the recorded coord against dimensions() (PIXELS).
22
23
  * - VISION: tap pt = round(n/1000 * pointSize); record px = round(n/1000 * pixelSize).
@@ -30,10 +31,12 @@
30
31
  * backend never converts coords with screen_width/height.
31
32
  */
32
33
  import { resolveTextValue } from "./actions.js";
33
- import { requireOneBootedSimulator, describeScreen, describeAll, screenshotPng, uiTap, uiLongPress, uiSwipe, uiText, uiKey, HID_KEY_RETURN, terminateApp, launchApp, installApp, isAppInstalled, bundleIdFromApp, } from "./simctl.js";
34
+ import { requireOneBootedSimulator, screenshotPng, terminateApp, launchApp, installApp, isAppInstalled, bundleIdFromApp, appBuildFromSimulator, } from "./simctl.js";
35
+ // iOS UI interaction + a11y run through WebDriverAgent (XCUITest), not idb.
36
+ import { ensureWda, closeWda, describeScreen, describeAll, uiTap, uiLongPress, uiSwipe, uiText, uiKey, HID_KEY_RETURN, } from "./xcuitest.js";
34
37
  import { isLocalPath } from "../upload.js";
35
38
  import { deNormalizePoint, deNormalizeDrag, pointToPixel } from "./coordinates.js";
36
- import { parseIdbDescribeAll, serializeNativeTree, boundsCenter } from "./native-a11y.js";
39
+ import { parseXcuiHierarchy, serializeNativeTree, boundsCenter } from "./native-a11y.js";
37
40
  // Let animations/transitions settle before the next observation so the
38
41
  // screenshot the LLM reasons over reflects the action's result.
39
42
  const POST_GESTURE_SETTLE_MS = 500;
@@ -61,6 +64,8 @@ export class IOSDevice {
61
64
  appPath;
62
65
  /** udid of the single booted simulator we drive. */
63
66
  udid = "";
67
+ /** Set once the WebDriverAgent runner is up, so the startup note logs once. */
68
+ wdaStarted = false;
64
69
  /** POINT size — what WDA taps/swipes consume (de-normalization basis for TAPS). */
65
70
  pointWidth = 0;
66
71
  pointHeight = 0;
@@ -96,6 +101,14 @@ export class IOSDevice {
96
101
  this.bundleId = await this.resolveBundleId(target);
97
102
  }
98
103
  const bundleId = this.bundleId;
104
+ // Bring up the WebDriverAgent runner (install + simctl-launch the prebuilt
105
+ // xctrunner, open a session). Idempotent and reused across participants, so
106
+ // the ~30-60s first-launch cost is paid once per run.
107
+ if (!this.wdaStarted) {
108
+ this.log("Starting the iOS automation runner (WebDriverAgent); first launch can take ~30-60s...");
109
+ }
110
+ await ensureWda(this.udid);
111
+ this.wdaStarted = true;
99
112
  // Prime screen geometry (points) before the first de-normalization.
100
113
  await this.refreshScreen();
101
114
  // Per-participant reset: terminate then relaunch from a clean state.
@@ -103,6 +116,21 @@ export class IOSDevice {
103
116
  await launchApp(this.udid, bundleId);
104
117
  await settle(1500); // cold start needs longer than a gesture settle
105
118
  }
119
+ /**
120
+ * The installed app's version/build, read off the simulator after
121
+ * launchOrReset has resolved the bundle id. Best-effort — null until the
122
+ * bundle id is known, or if simctl/plutil can't report it.
123
+ */
124
+ async appBuild() {
125
+ if (!this.bundleId || !this.udid)
126
+ return null;
127
+ const meta = await appBuildFromSimulator(this.udid, this.bundleId);
128
+ return {
129
+ package: this.bundleId,
130
+ version: meta?.version ?? null,
131
+ build: meta?.build ?? null,
132
+ };
133
+ }
106
134
  /**
107
135
  * Resolve the bundle id to drive, returning a non-null id or throwing.
108
136
  * Installs a local `.app` first and reads its CFBundleIdentifier from
@@ -171,7 +199,7 @@ export class IOSDevice {
171
199
  };
172
200
  }
173
201
  /**
174
- * Read + serialize the idb describe-all a11y tree (bounds in POINTS). Any
202
+ * Read + serialize WDA's /source a11y tree (bounds in POINTS). Any
175
203
  * failure (retries exhausted on a trivial tree, parse error) degrades to an
176
204
  * empty tree so the backend falls back to vision — a missing tree must never
177
205
  * abort the observation.
@@ -179,7 +207,7 @@ export class IOSDevice {
179
207
  async dumpTree() {
180
208
  try {
181
209
  const json = await describeAll(this.udid);
182
- const nodes = parseIdbDescribeAll(json);
210
+ const nodes = parseXcuiHierarchy(json);
183
211
  const tree = serializeNativeTree(nodes);
184
212
  this.log(`a11y tree: ${tree.nodeMap.size} node(s)`);
185
213
  return tree;
@@ -206,7 +234,7 @@ export class IOSDevice {
206
234
  // separately in points (see toPoints()).
207
235
  return { width: this.pixelWidth, height: this.pixelHeight };
208
236
  }
209
- /** Normalized 0-1000 → POINT space (idb ui tap/swipe take points). */
237
+ /** Normalized 0-1000 → POINT space (WDA taps/swipes take points). */
210
238
  toPoints(c) {
211
239
  return deNormalizePoint(c, this.pointWidth, this.pointHeight);
212
240
  }
@@ -227,7 +255,7 @@ export class IOSDevice {
227
255
  const bounds = this.lastNodeMap.get(action.node_id);
228
256
  if (!bounds)
229
257
  return { pt: null, px: null, stale: true };
230
- const pt = boundsCenter(bounds); // POINTS — idb taps directly
258
+ const pt = boundsCenter(bounds); // POINTS — WDA taps directly
231
259
  const px = pointToPixel(pt, this.pointWidth, this.pointHeight, this.pixelWidth, this.pixelHeight);
232
260
  return { pt, px, stale: false };
233
261
  }
@@ -237,7 +265,7 @@ export class IOSDevice {
237
265
  }
238
266
  async executeAction(action) {
239
267
  try {
240
- // pt drives the idb TAP (points); px is what we RECORD (pixels). ELEMENT
268
+ // pt drives the WDA TAP (points); px is what we RECORD (pixels). ELEMENT
241
269
  // path: pt = bounds-center, px = that center scaled to pixels. VISION
242
270
  // path: both derive from the same normalized coord. Either way the tap
243
271
  // lands right and the recorded px round-trips against dimensions().
@@ -279,6 +307,11 @@ export class IOSDevice {
279
307
  await this.navigateBack();
280
308
  break;
281
309
  }
310
+ case "open_system_panel": {
311
+ // Element-less, like navigate_back: best-effort top-edge pull-down.
312
+ await this.openSystemPanel(action.panel === "quick_settings" ? "quick_settings" : "notifications");
313
+ break;
314
+ }
282
315
  case "drag": {
283
316
  // A drag GRABS an element and RELEASES it elsewhere ("click the
284
317
  // element, move, let go") — distinct from a swipe (element-less
@@ -364,7 +397,7 @@ export class IOSDevice {
364
397
  await settle(250);
365
398
  }
366
399
  const text = resolveTextValue(action, this.contextValues);
367
- // idb ui text appends to the focused field; for click_type (replace) there
400
+ // WDA text input appends to the focused field; for click_type (replace) there
368
401
  // is no "clear" wired on the native driver, so we rely on the field being empty after focus. The
369
402
  // vision agent typically taps an empty field, so this matches Android's
370
403
  // common path; a true select-all clear isn't wired on the native driver.
@@ -470,10 +503,10 @@ export class IOSDevice {
470
503
  * do drive the system gesture) when no back button is visible.
471
504
  */
472
505
  async navigateBack() {
473
- const nodes = parseIdbDescribeAll(await describeAll(this.udid));
506
+ const nodes = parseXcuiHierarchy(await describeAll(this.udid));
474
507
  const back = this.findBackButton(nodes);
475
508
  if (back) {
476
- const c = boundsCenter(back.bounds); // POINTS — idb taps directly
509
+ const c = boundsCenter(back.bounds); // POINTS — WDA taps directly
477
510
  await uiTap(this.udid, c.x, c.y);
478
511
  return;
479
512
  }
@@ -483,6 +516,39 @@ export class IOSDevice {
483
516
  const midY = Math.round(this.pointHeight / 2);
484
517
  await uiSwipe(this.udid, 1, midY, Math.round(this.pointWidth * 0.5), midY, 300);
485
518
  }
519
+ /**
520
+ * Best-effort open of an iOS system panel by swiping down from the top edge.
521
+ * iOS has no `cmd statusbar` equivalent, so on a Face-ID layout:
522
+ * - notifications → Notification Center: swipe down from the top-CENTER.
523
+ * - quick_settings → Control Center: swipe down from the top-RIGHT corner.
524
+ * Coordinates are POINTS (WDA consumes points; see toPoints()/the swipe()
525
+ * helper). This is FLAKY on the simulator — the synthetic touch frequently
526
+ * doesn't trigger the system edge gesture (the same limitation navigateBack's
527
+ * edge-swipe hits). We compare a before/after screenshot and log LOUDLY when
528
+ * the screen didn't change, rather than silently reporting success, so a
529
+ * no-op is visible in the run. The executeAction caller still returns
530
+ * success:true (the gesture was attempted); the loud log is the signal.
531
+ */
532
+ async openSystemPanel(panel) {
533
+ const before = await screenshotPng();
534
+ const w = this.pointWidth;
535
+ const h = this.pointHeight;
536
+ // Start ON the top edge and travel a third of the screen down. Control
537
+ // Center lives under the top-right (battery/status) corner on Face-ID
538
+ // devices; Notification Center under the top-center notch area.
539
+ const startX = panel === "quick_settings" ? Math.round(w * 0.92) : Math.round(w * 0.5);
540
+ const startY = 1;
541
+ const endY = Math.round(h * 0.35);
542
+ await uiSwipe(this.udid, startX, startY, startX, endY, 350);
543
+ await settle();
544
+ // Loudly surface a no-op: the simulator's synthetic touch often can't drive
545
+ // the system edge gesture. An identical screenshot means the panel didn't open.
546
+ const after = await screenshotPng();
547
+ if (before.equals(after)) {
548
+ this.log(`open_system_panel (${panel}): top-edge swipe produced no visible change — ` +
549
+ `the simulator's synthetic touch likely didn't trigger the system gesture (flaky on the simulator).`);
550
+ }
551
+ }
486
552
  /**
487
553
  * The nav-bar back button: the leading (leftmost) actionable button in the
488
554
  * top nav-bar band. iOS HIG guarantees "back" is the leading nav item in a
@@ -526,7 +592,7 @@ export class IOSDevice {
526
592
  // the agent can adapt.
527
593
  const hint = {
528
594
  pinch_zoom: "no multi-touch on the native driver; the agent should zoom via double_tap",
529
- rotate_device: "idb exposes no clean rotate; leave orientation as-is",
595
+ rotate_device: "rotation is not wired on the native driver; leave orientation as-is",
530
596
  keyboard_shortcut: "no hardware keyboard on the native driver; use on-screen taps/text_input",
531
597
  switch_tab: "tabs are a browser concept; the native app has a single window",
532
598
  close_tab: "tabs are a browser concept; the native app has a single window",
@@ -540,7 +606,9 @@ export class IOSDevice {
540
606
  return "";
541
607
  }
542
608
  async close() {
543
- // Leave the app installed/running; the simulator is shared and the next run
544
- // resets via launchOrReset. Nothing to tear down (no IME state on iOS).
609
+ // Tear down the WebDriverAgent session (the runner is left installed on the
610
+ // shared simulator for the next run). The app resets via launchOrReset; no
611
+ // IME state to restore on iOS.
612
+ await closeWda(this.udid);
545
613
  }
546
614
  }
@@ -7,8 +7,9 @@
7
7
  */
8
8
  import { launchSharedBrowser, FULL_PAGE_HEIGHT_CAP_PX_MOBILE, FULL_PAGE_HEIGHT_CAP_PX_DESKTOP, } from "./browser.js";
9
9
  import { uploadScreenshot } from "./upload.js";
10
- import { detectNoVisibleChange, describeAction } from "./actions.js";
10
+ import { detectNoVisibleChange, describeAction, classifyStepKind } from "./actions.js";
11
11
  import { createDevice } from "./device.js";
12
+ import pkg from "../../../package.json" with { type: "json" };
12
13
  import { enableDebug, isDebugEnabled, debugRawResponse, debugNormalizedActions, debugActionExecution, debugForwards, debugStepSummary, debugRecord, } from "./debug.js";
13
14
  /**
14
15
  * Native (mobile) platforms drive a single physical device via screenshot →
@@ -49,6 +50,7 @@ export function flattenAction(raw, nodeId = null, nodeDescription = null) {
49
50
  key: a.key ?? null,
50
51
  tab_id: a.tab_id ?? null,
51
52
  orientation: a.orientation ?? null,
53
+ panel: a.panel ?? null,
52
54
  scale: a.scale ?? null,
53
55
  // Native path: ResolvedAction.coordinates (top level of the resolved_actions
54
56
  // entry) is the single {x,y} execution point. Fall back to the nested action
@@ -112,6 +114,34 @@ const SENTIMENT_ICONS = {
112
114
  Positive: "+", Negative: "-", Neutral: "~",
113
115
  Frustrated: "!", Confused: "?", Delighted: "*",
114
116
  };
117
+ const CLI_VERSION = pkg.version;
118
+ /**
119
+ * Stamp the app build this run drove onto the iteration, so the web app's
120
+ * run-settings card can show which build the iteration is on. Best-effort:
121
+ * a native run never depends on this landing, so failures are warned, not
122
+ * thrown. Only native platforms carry a build.
123
+ */
124
+ async function reportObservedApp(client, iterationId, platform, build, log) {
125
+ if (platform !== "ios" && platform !== "android")
126
+ return;
127
+ try {
128
+ await client.post(`/iterations/${iterationId}/observed-app`, {
129
+ platform,
130
+ package: build.package,
131
+ version: build.version,
132
+ build: build.build,
133
+ cli_version: CLI_VERSION,
134
+ });
135
+ const label = [build.version, build.build ? `(${build.build})` : null]
136
+ .filter(Boolean)
137
+ .join(" ");
138
+ log(`Recorded app build${label ? `: ${label}` : ""}`);
139
+ }
140
+ catch (err) {
141
+ const msg = err instanceof Error ? err.message : String(err);
142
+ console.warn(`Could not record app build for the iteration: ${msg}`);
143
+ }
144
+ }
115
145
  /**
116
146
  * Run local simulations — parallel when multiple participants, sequential by default.
117
147
  * Use --parallel <n> to control concurrency (default: number of participants).
@@ -138,6 +168,16 @@ export async function runLocalSimulations(client, opts) {
138
168
  log("Native (android/ios) runs drive a single device — running sequentially.");
139
169
  }
140
170
  const concurrency = isNativeRun ? 1 : (opts.parallel ?? opts.participantIds.length);
171
+ // Native runs stamp the app build onto the iteration once — every
172
+ // participant in a run drives the same installed build, so dedupe to a
173
+ // single best-effort POST after the first device resolves its app.
174
+ let appBuildReported = false;
175
+ const reportAppBuild = (build, platform) => {
176
+ if (appBuildReported)
177
+ return;
178
+ appBuildReported = true;
179
+ void reportObservedApp(client, opts.iterationId, platform, build, log);
180
+ };
141
181
  try {
142
182
  if (concurrency <= 1 || opts.participantIds.length <= 1) {
143
183
  // Sequential execution — each participant owns its own browser
@@ -148,7 +188,7 @@ export async function runLocalSimulations(client, opts) {
148
188
  log(`\nStarting local simulation for ${participantName}...`);
149
189
  try {
150
190
  const participantLog = (msg) => log(`[${participantName}] ${msg}`);
151
- await runSingleSimulation(client, participantId, participantName, opts, participantLog, () => cancelled);
191
+ await runSingleSimulation(client, participantId, participantName, opts, participantLog, () => cancelled, reportAppBuild);
152
192
  log(`Completed: ${participantName}`);
153
193
  }
154
194
  catch (err) {
@@ -182,7 +222,7 @@ export async function runLocalSimulations(client, opts) {
182
222
  const participantLog = (msg) => log(`[${participantName}] ${msg}`);
183
223
  participantLog("Starting...");
184
224
  try {
185
- await runSingleSimulation(client, participantId, participantName, opts, participantLog, () => cancelled, sharedBrowser);
225
+ await runSingleSimulation(client, participantId, participantName, opts, participantLog, () => cancelled, reportAppBuild, sharedBrowser);
186
226
  participantLog("Completed");
187
227
  }
188
228
  catch (err) {
@@ -202,7 +242,7 @@ export async function runLocalSimulations(client, opts) {
202
242
  process.off("SIGINT", onSigint);
203
243
  }
204
244
  }
205
- async function runSingleSimulation(client, participantId, participantName, opts, log, isCancelled, sharedBrowser) {
245
+ async function runSingleSimulation(client, participantId, participantName, opts, log, isCancelled, onAppBuild, sharedBrowser) {
206
246
  // Step 1: Initialize session
207
247
  const initResponse = await client.localSimInit({
208
248
  participant_id: participantId,
@@ -273,6 +313,19 @@ async function runSingleSimulation(client, participantId, participantName, opts,
273
313
  try {
274
314
  // Step 3: Launch / navigate the target to its starting point.
275
315
  await device.launchOrReset(launchTarget);
316
+ // Step 3b: Capture the installed app's build (native only). Best-effort —
317
+ // the dedupe in runLocalSimulations keeps this to one POST per run, and a
318
+ // failed read or report never disturbs the simulation.
319
+ if (onAppBuild) {
320
+ try {
321
+ const observed = await device.appBuild?.();
322
+ if (observed)
323
+ onAppBuild(observed, platform);
324
+ }
325
+ catch {
326
+ // ignore — build capture is non-essential
327
+ }
328
+ }
276
329
  // Step 4: Run assignment loop
277
330
  for (let assignmentIdx = 0; assignmentIdx < session.assignments.length; assignmentIdx++) {
278
331
  const assignment = session.assignments[assignmentIdx];
@@ -283,6 +336,12 @@ async function runSingleSimulation(client, participantId, participantName, opts,
283
336
  // status when the loop ends because the agent terminated (completed vs
284
337
  // abandoned). Stays "in_progress" if the loop hits max_steps.
285
338
  let lastAssignmentStatus = "in_progress";
339
+ // Frame continuity (native): carry the PREVIOUS step's logical-screen
340
+ // classification + matched frame forward, so this step's match-frame call
341
+ // can tell the backend to reuse the frame when the screen didn't change
342
+ // (pure scroll / non-submitting keyboard). Reset per assignment.
343
+ let lastStepKind = "none";
344
+ let lastFrameVersionId;
286
345
  while (step < maxSteps && !assignmentCompleted && !isCancelled()) {
287
346
  // OBSERVE — the device refreshes its own active surface (popup /
288
347
  // switch_tab for browser) before capturing. (The browser device emits
@@ -394,12 +453,16 @@ async function runSingleSimulation(client, participantId, participantName, opts,
394
453
  const actionDescs = [];
395
454
  const elementNames = [];
396
455
  const actionDebugEntries = [];
456
+ // Per-action success (index-aligned with stepResponse.actions), used to
457
+ // classify this step's logical-screen kind for frame continuity.
458
+ const perActionSuccess = [];
397
459
  const preActionScreenshot = await device.captureScreenshot();
398
460
  for (let i = 0; i < stepResponse.actions.length; i++) {
399
461
  if (isCancelled())
400
462
  break;
401
463
  const action = stepResponse.actions[i];
402
464
  const result = await device.executeAction(action);
465
+ perActionSuccess[i] = result.success;
403
466
  const desc = describeAction(action);
404
467
  debugActionExecution(i, action, result, action.node_id ? "cdp" : "playwright");
405
468
  const openedNewTab = result.openedNewTab;
@@ -414,28 +477,44 @@ async function runSingleSimulation(client, participantId, participantName, opts,
414
477
  const actionType = action.type || "unknown";
415
478
  const INTERNAL_ACTIONS = new Set(["think"]);
416
479
  if (!INTERNAL_ACTIONS.has(actionType)) {
480
+ // Pack `data` to match the hosted sim's map_action_to_db so native
481
+ // rows render identically. value_type lets the FE flag var/secret;
482
+ // drag's full path goes under data.coordinates (0-1000), not a
483
+ // bespoke drag_end. Secret `value` stays masked (it's the variable
484
+ // key, not the resolved secret — masking is strictly safer than the
485
+ // web path, and value_type now drives the FE lock glyph).
486
+ const actionData = {
487
+ ...(action.value !== undefined && action.value !== null && { value: action.value_type === "secret" ? "***" : action.value }),
488
+ ...(action.value_type && { value_type: action.value_type }),
489
+ ...(action.mode && { mode: action.mode }),
490
+ ...(action.submit && { submit: action.submit }),
491
+ ...(action.direction && { direction: action.direction }),
492
+ ...(action.amount && { amount: action.amount }),
493
+ ...(action.count && action.count > 1 && { count: action.count }),
494
+ ...(action.duration_ms && { duration_ms: action.duration_ms }),
495
+ ...(action.modifiers?.length && { modifiers: action.modifiers }),
496
+ ...(action.key && { key: action.key }),
497
+ ...(action.tab_id && { tab_id: action.tab_id }),
498
+ ...(action.orientation && { orientation: action.orientation }),
499
+ ...(action.panel && { panel: action.panel }),
500
+ ...(action.drag && {
501
+ coordinates: {
502
+ startX: action.drag.startX,
503
+ startY: action.drag.startY,
504
+ endX: action.drag.endX,
505
+ endY: action.drag.endY,
506
+ },
507
+ }),
508
+ ...(openedNewTab && { opened_new_tab: true }),
509
+ };
417
510
  actionDatas.push({
418
511
  action_type: actionType,
419
512
  element_label: action.element_name ?? null,
420
513
  element_type: action.element_type ?? null,
421
- coordinates: normalizedCoords,
422
- data: {
423
- ...(action.value !== undefined && action.value !== null && { value: action.value_type === "secret" ? "***" : action.value }),
424
- ...(action.mode && { mode: action.mode }),
425
- ...(action.submit && { submit: action.submit }),
426
- ...(action.direction && { direction: action.direction }),
427
- ...(action.amount && { amount: action.amount }),
428
- ...(action.count && action.count > 1 && { count: action.count }),
429
- ...(action.duration_ms && { duration_ms: action.duration_ms }),
430
- ...(action.modifiers?.length && { modifiers: action.modifiers }),
431
- ...(action.key && { key: action.key }),
432
- ...(action.tab_id && { tab_id: action.tab_id }),
433
- ...(action.orientation && { orientation: action.orientation }),
434
- // The recorded `coordinates` is the drag START; persist the END
435
- // (normalized 0-1000) too so the journey captures the full path.
436
- ...(action.drag && { drag_end: { x: action.drag.endX, y: action.drag.endY } }),
437
- ...(openedNewTab && { opened_new_tab: true }),
438
- },
514
+ // Drag's path lives in data.coordinates; the hosted sim leaves the
515
+ // top-level coordinates null for a drag.
516
+ coordinates: action.drag ? null : normalizedCoords,
517
+ data: Object.keys(actionData).length ? actionData : null,
439
518
  order: i,
440
519
  });
441
520
  }
@@ -492,6 +571,15 @@ async function runSingleSimulation(client, participantId, participantName, opts,
492
571
  // Native: drive FrameSourceType.ANDROID/IOS directly; browser falls
493
572
  // back to screen_format server-side.
494
573
  platform,
574
+ // Frame continuity: these describe the transition INTO this
575
+ // observation, produced by the PREVIOUS step's action. When that
576
+ // step was a pure scroll / non-submitting keyboard on a native
577
+ // device, the logical screen didn't change — tell the backend to
578
+ // reuse the previous frame instead of minting a new one off the
579
+ // shifted pixels. Carried from lastStepKind / lastFrameVersionId,
580
+ // updated AFTER this call for the next iteration.
581
+ ...(isNative && lastFrameVersionId ? { previous_frame_version_id: lastFrameVersionId } : {}),
582
+ same_screen_continuation: isNative && (lastStepKind === "scroll" || lastStepKind === "keyboard"),
495
583
  });
496
584
  frameVersionId = matchResult.frame_version_id;
497
585
  }
@@ -499,6 +587,11 @@ async function runSingleSimulation(client, participantId, participantName, opts,
499
587
  const msg = err instanceof Error ? err.message : String(err);
500
588
  log(` Warning: frame matching failed — ${msg}`);
501
589
  }
590
+ // Carry THIS step's logical-screen classification + matched frame
591
+ // forward for the NEXT iteration's match-frame call (consumed above as
592
+ // last*). Classify after the call so ordering is consume-then-update.
593
+ lastStepKind = classifyStepKind(stepResponse.actions, perActionSuccess);
594
+ lastFrameVersionId = frameVersionId;
502
595
  // Debug-only: capture post-action screenshot to show result
503
596
  let postActionBase64;
504
597
  if (isDebugEnabled()) {
@@ -518,7 +611,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
518
611
  forwards.push({ type: "LOOP_DETECTED", content: "A repetitive action cycle was detected. Try a different approach." });
519
612
  }
520
613
  // Record interaction (1-indexed step for backend)
521
- interactions.push({
614
+ const interaction = {
522
615
  step: step + 1,
523
616
  assignment_id: assignment.id,
524
617
  ...(screenshotUrl ? { screenshot_url: screenshotUrl } : { screenshot_base64: obsBase64 }),
@@ -542,7 +635,24 @@ async function runSingleSimulation(client, participantId, participantName, opts,
542
635
  // Server reduces this to Interaction.tab when N >= 2; omit on
543
636
  // single-tab steps to keep the payload (and DB column) null.
544
637
  ...(tabsSnapshot.length >= 2 ? { tabs: tabsSnapshot } : {}),
545
- });
638
+ };
639
+ // Keep the in-memory array for the debug HTML report.
640
+ interactions.push(interaction);
641
+ // Stream this interaction live so the backend persists + commits it
642
+ // immediately and fires INTERACTION_CREATED in realtime. A streaming
643
+ // failure must never abort the run — log and continue (the run-end
644
+ // finalize call still records the terminal state).
645
+ try {
646
+ await client.localSimRecordInteraction({
647
+ participant_id: session.participant_id,
648
+ product_id: session.product_id,
649
+ interaction,
650
+ });
651
+ }
652
+ catch (err) {
653
+ const msg = err instanceof Error ? err.message : String(err);
654
+ log(` Warning: failed to stream interaction ${step + 1} — ${msg}`);
655
+ }
546
656
  // Update history for next step
547
657
  history.push({
548
658
  comment: stepResponse.comment,
@@ -633,7 +743,6 @@ async function runSingleSimulation(client, participantId, participantName, opts,
633
743
  await client.localSimRecord({
634
744
  participant_id: session.participant_id,
635
745
  product_id: session.product_id,
636
- interactions,
637
746
  final_status: finalStatus,
638
747
  assignment_statuses: assignmentStatuses,
639
748
  });
@@ -12,7 +12,7 @@
12
12
  *
13
13
  * COORDINATE SPACE — carried, not converted, by this module:
14
14
  * - Android `uiautomator dump` bounds are screencap PIXELS (`space: "px"`).
15
- * - iOS `idb ui describe-all` frames are POINTS (`space: "points"`).
15
+ * - iOS WebDriverAgent /source frames are POINTS (`space: "points"`).
16
16
  * The device de-normalizes/taps in its own space (AndroidDevice taps pixels;
17
17
  * IOSDevice taps points), so the `space` tag tells the caller which dimension a
18
18
  * node's bounds-center belongs to. This module never mixes the two.
@@ -65,15 +65,29 @@ export interface NativeTree {
65
65
  */
66
66
  export declare function parseUiautomatorXml(xml: string): NativeNode[];
67
67
  /**
68
- * Parse `idb ui describe-all` JSON (a FLAT array of elements, each with a `frame`
69
- * in POINTS) into NativeNodes in array order. iOS is already a flat,
70
- * properly-labeled list no ancestor walk needed — so `clickable` is derived
71
- * from the element's role/type and whether it carries a usable label.
68
+ * Parse WDA's `GET /source?format=json` — a NESTED accessibility tree — into the
69
+ * FLAT, depth-first `NativeNode[]` (POINTS) that the former idb parser produced,
70
+ * so `serializeNativeTree` consumes it unchanged. WDA's `type` matches idb's iOS
71
+ * types (Button/StaticText/SearchField/Cell/Image/Application…), so
72
+ * `normalizeRole`/`IOS_ACTIONABLE_TYPES`/`frameToBounds` all apply as-is.
73
+ *
74
+ * KEY: WDA's `/source` is the FULL XCUIElement tree — every container and leaf —
75
+ * NOT idb's clean accessibility-elements list. iOS settings rows surface as an
76
+ * accessible `Button` ("General", isAccessible=1) that ALSO contains a duplicate
77
+ * inner `StaticText` ("General", isAccessible=0) and is wrapped in a `Cell`
78
+ * (isAccessible=0). Emitting all three yields "General General" + empty
79
+ * listitems. So we emit ONLY `isAccessible && isVisible` nodes — exactly the
80
+ * VoiceOver-exposed set idb returned: the labeled Button is both the label and
81
+ * the tap target; the duplicate StaticText and the wrapping Cell are pruned. A
82
+ * sparse a11y tree degrades to the loop's vision fallback, so strict filtering
83
+ * never strands the run.
84
+ *
85
+ * Accepts either the raw tree or the W3C `{ value: <tree> }` envelope WDA returns.
72
86
  */
73
- export declare function parseIdbDescribeAll(json: string): NativeNode[];
87
+ export declare function parseXcuiHierarchy(json: string): NativeNode[];
74
88
  /**
75
89
  * Serialize a flat NativeNode list (from `parseUiautomatorXml` /
76
- * `parseIdbDescribeAll`) into the `[id] role "label"` string the DOMLocator
90
+ * `parseXcuiHierarchy`) into the `[id] role "label"` string the DOMLocator
77
91
  * reasons over, plus a `shortId → bounds` map for local tap resolution.
78
92
  *
79
93
  * Emission rules (kept tight, like the DOM serializer):