@ishlabs/cli 0.24.1 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/commands/ask.js +3 -3
  2. package/dist/commands/doctor.d.ts +26 -0
  3. package/dist/commands/doctor.js +334 -0
  4. package/dist/commands/iteration.js +1 -1
  5. package/dist/commands/study-analyze.js +1 -1
  6. package/dist/commands/study-run.js +80 -12
  7. package/dist/commands/study.js +11 -7
  8. package/dist/index.js +2 -0
  9. package/dist/lib/alias-store.js +1 -1
  10. package/dist/lib/api-client.d.ts +2 -0
  11. package/dist/lib/docs.js +57 -42
  12. package/dist/lib/local-sim/actions.d.ts +10 -2
  13. package/dist/lib/local-sim/actions.js +18 -11
  14. package/dist/lib/local-sim/adb.d.ts +113 -0
  15. package/dist/lib/local-sim/adb.js +366 -0
  16. package/dist/lib/local-sim/android.d.ts +111 -0
  17. package/dist/lib/local-sim/android.js +504 -0
  18. package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
  19. package/dist/lib/local-sim/apk-manifest.js +210 -0
  20. package/dist/lib/local-sim/browser.d.ts +22 -0
  21. package/dist/lib/local-sim/browser.js +65 -0
  22. package/dist/lib/local-sim/coordinates.d.ts +69 -0
  23. package/dist/lib/local-sim/coordinates.js +59 -0
  24. package/dist/lib/local-sim/device.d.ts +143 -0
  25. package/dist/lib/local-sim/device.js +152 -0
  26. package/dist/lib/local-sim/ios.d.ts +185 -0
  27. package/dist/lib/local-sim/ios.js +599 -0
  28. package/dist/lib/local-sim/loop.d.ts +14 -2
  29. package/dist/lib/local-sim/loop.js +168 -73
  30. package/dist/lib/local-sim/native-a11y.d.ts +111 -0
  31. package/dist/lib/local-sim/native-a11y.js +419 -0
  32. package/dist/lib/local-sim/simctl.d.ts +55 -0
  33. package/dist/lib/local-sim/simctl.js +144 -0
  34. package/dist/lib/local-sim/types.d.ts +39 -2
  35. package/dist/lib/local-sim/upload.d.ts +1 -1
  36. package/dist/lib/local-sim/upload.js +9 -6
  37. package/dist/lib/local-sim/xcuitest.d.ts +60 -0
  38. package/dist/lib/local-sim/xcuitest.js +303 -0
  39. package/dist/lib/output.js +58 -12
  40. package/dist/lib/paths.d.ts +8 -0
  41. package/dist/lib/paths.js +12 -0
  42. package/dist/lib/skill-content.js +10 -9
  43. package/package.json +2 -1
@@ -1,19 +1,31 @@
1
1
  /**
2
2
  * Local simulation loop orchestrator.
3
3
  *
4
- * Runs the observe → reason (remote) → act (local) loop for each
5
- * participant against a local Playwright browser.
4
+ * Runs the observe → reason (remote) → act (local) loop for each participant
5
+ * against a SimulationDevice (a Playwright browser today; a native Android
6
+ * emulator next). The loop is device-agnostic — see device.ts.
6
7
  */
7
- import { launchBrowser, launchSharedBrowser, createTab, captureObservation, takeScreenshot, takeScreenshotJpeg, navigateWithRetry, closeBrowser } from "./browser.js";
8
+ import { launchSharedBrowser, FULL_PAGE_HEIGHT_CAP_PX_MOBILE, FULL_PAGE_HEIGHT_CAP_PX_DESKTOP, } from "./browser.js";
8
9
  import { uploadScreenshot } from "./upload.js";
9
- import { executeAction, detectNoVisibleChange, describeAction } from "./actions.js";
10
- import { TabManager } from "./tabs.js";
11
- import { enableDebug, isDebugEnabled, debugObservation, debugRawResponse, debugNormalizedActions, debugActionExecution, debugForwards, debugStepSummary, debugRecord, } from "./debug.js";
10
+ import { detectNoVisibleChange, describeAction } from "./actions.js";
11
+ import { createDevice } from "./device.js";
12
+ import { enableDebug, isDebugEnabled, debugRawResponse, debugNormalizedActions, debugActionExecution, debugForwards, debugStepSummary, debugRecord, } from "./debug.js";
13
+ /**
14
+ * Native (mobile) platforms drive a single physical device via screenshot →
15
+ * normalized-coordinate taps, with no accessibility tree or URL. Browser/web is
16
+ * everything else.
17
+ */
18
+ function isNativePlatform(platform) {
19
+ return platform === "android" || platform === "ios";
20
+ }
12
21
  /**
13
22
  * Convert a raw action (from either resolved_actions or output.action.actions)
14
- * into the flat LocalStepAction shape used by the executor.
23
+ * into the flat LocalStepAction shape used by the executor. Exported for unit
24
+ * tests of the native drag coordinate-shape split (the nested action's
25
+ * `coordinates` is a {x,y} tap point for most actions but a
26
+ * {startX,...,endY} path for a drag).
15
27
  */
16
- function flattenAction(raw, nodeId = null, nodeDescription = null) {
28
+ export function flattenAction(raw, nodeId = null, nodeDescription = null) {
17
29
  // resolved_actions nest the action inside an "action" key
18
30
  const a = (raw.action ?? raw);
19
31
  const element = a.element;
@@ -36,8 +48,36 @@ function flattenAction(raw, nodeId = null, nodeDescription = null) {
36
48
  modifiers: Array.isArray(a.modifiers) ? a.modifiers : null,
37
49
  key: a.key ?? null,
38
50
  tab_id: a.tab_id ?? null,
51
+ orientation: a.orientation ?? null,
52
+ panel: a.panel ?? null,
53
+ scale: a.scale ?? null,
54
+ // Native path: ResolvedAction.coordinates (top level of the resolved_actions
55
+ // entry) is the single {x,y} execution point. Fall back to the nested action
56
+ // for raw output — but only a point-shaped {x,y}; a drag's nested
57
+ // coordinates is the {startX,...,endY} path (extracted into `drag` below),
58
+ // not a tap point, so guard against mis-assigning it here.
59
+ coordinates: pickPoint(raw.coordinates) ?? pickPoint(a.coordinates) ?? null,
60
+ // drag: the gesture path lives on the nested action's coordinates as
61
+ // {startX, startY, endX, endY} (DragCoordinates serialized by_alias).
62
+ drag: pickDrag(a.coordinates) ?? null,
39
63
  };
40
64
  }
65
+ /** A nested action's coordinates only when it's the {x,y} tap-point shape. */
66
+ function pickPoint(c) {
67
+ if (c && typeof c === "object" && "x" in c && "y" in c) {
68
+ const p = c;
69
+ return { x: p.x, y: p.y };
70
+ }
71
+ return null;
72
+ }
73
+ /** A nested action's coordinates only when it's the DragCoordinates shape. */
74
+ function pickDrag(c) {
75
+ if (c && typeof c === "object" && "startX" in c && "endX" in c) {
76
+ const d = c;
77
+ return { startX: d.startX, startY: d.startY, endX: d.endX, endY: d.endY };
78
+ }
79
+ return null;
80
+ }
41
81
  /**
42
82
  * Normalize the raw backend step response into the flat structure used by the loop.
43
83
  * Backend returns { output: { ... }, resolved_actions: [...], loop_detected }.
@@ -61,7 +101,10 @@ function normalizeStepResponse(raw) {
61
101
  sentiment_intensity: out.sentiment_intensity ?? 0,
62
102
  current_location: out.current_location,
63
103
  effort_seconds: out.effort_seconds,
64
- assignment_completed: out.assignment_completed,
104
+ assignment_status: out.assignment_status,
105
+ // Terminate on completed OR abandoned — a stuck agent that gives up
106
+ // should stop the loop just like a finished one.
107
+ assignment_completed: out.assignment_status === "completed" || out.assignment_status === "abandoned",
65
108
  actions,
66
109
  loop_detected: raw.loop_detected,
67
110
  };
@@ -89,7 +132,13 @@ export async function runLocalSimulations(client, opts) {
89
132
  log("\nCancelling after current step...");
90
133
  };
91
134
  process.on("SIGINT", onSigint);
92
- const concurrency = opts.parallel ?? opts.participantIds.length;
135
+ // Native runs share ONE physical device (emulator / simulator), so they
136
+ // can't run in parallel — force sequential regardless of --parallel.
137
+ const isNativeRun = isNativePlatform(opts.platform);
138
+ if (isNativeRun && (opts.parallel ?? 1) > 1) {
139
+ log("Native (android/ios) runs drive a single device — running sequentially.");
140
+ }
141
+ const concurrency = isNativeRun ? 1 : (opts.parallel ?? opts.participantIds.length);
93
142
  try {
94
143
  if (concurrency <= 1 || opts.participantIds.length <= 1) {
95
144
  // Sequential execution — each participant owns its own browser
@@ -162,12 +211,20 @@ async function runSingleSimulation(client, participantId, participantName, opts,
162
211
  product_id: opts.workspaceId,
163
212
  iteration_id: opts.iterationId,
164
213
  });
165
- // Resolve URL and browser config from iteration details (with CLI fallback)
214
+ // Resolve target + config from iteration details (with CLI fallback).
215
+ // Platform precedence: --platform flag > iteration's stored platform > web.
166
216
  const iterDetails = initResponse.iteration_details;
217
+ const platform = opts.platform ?? iterDetails?.platform ?? "web";
218
+ const isNative = isNativePlatform(platform);
219
+ // Browser needs a URL to navigate; native uses the app package (from --app or
220
+ // the iteration target) and has no URL requirement.
167
221
  const navigationUrl = iterDetails?.url ?? opts.url;
168
- if (!navigationUrl) {
222
+ if (!isNative && !navigationUrl) {
169
223
  throw new Error("No URL available: backend did not return iteration_details and no --url flag was provided.");
170
224
  }
225
+ // For native, launchOrReset() receives the app package (iteration target);
226
+ // the AndroidDevice prefers --app over this.
227
+ const launchTarget = isNative ? (navigationUrl ?? "") : navigationUrl;
171
228
  const screenFormat = opts.screenFormat ?? iterDetails?.screen_format ?? "desktop";
172
229
  const locale = opts.locale ?? iterDetails?.locale;
173
230
  // Cache session state for per-step requests
@@ -189,7 +246,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
189
246
  const stepContextValues = session.context_values.map(cv => cv.type === "secret" ? { ...cv, value: null } : cv);
190
247
  const maxSteps = opts.maxInteractions ?? session.max_interactions;
191
248
  const viewport = { width: 1440, height: 900 }; // TODO: extract from config
192
- // Step 2: Launch browser
249
+ // Step 2: Build the target device (per-platform dispatch).
250
+ // Browser today; AndroidDevice (adb) slots in via createDevice() later.
193
251
  const browserOpts = {
194
252
  headed: opts.headed,
195
253
  slowMo: opts.slowMo,
@@ -198,15 +256,13 @@ async function runSingleSimulation(client, participantId, participantName, opts,
198
256
  locale,
199
257
  screenFormat,
200
258
  };
201
- // Use shared browser if available (parallel mode), otherwise launch standalone
202
- const ownsTheBrowser = !sharedBrowser;
203
- const browserSession = sharedBrowser
204
- ? await createTab(sharedBrowser, browserOpts)
205
- : await launchBrowser(browserOpts);
206
- // Active page can swap when a popup auto-focuses or the LLM issues
207
- // switch_tab/close_tab. TabManager wires the context popup listener.
208
- const tabs = new TabManager(browserSession.context, browserSession.page);
209
- let page = tabs.activePage();
259
+ const device = await createDevice(platform, {
260
+ browserOpts,
261
+ contextValues: session.context_values,
262
+ sharedBrowser,
263
+ appPath: opts.appPath,
264
+ log,
265
+ });
210
266
  const history = [];
211
267
  const interactions = [];
212
268
  const debugSteps = [];
@@ -216,24 +272,51 @@ async function runSingleSimulation(client, participantId, participantName, opts,
216
272
  let accumulatedEffortMs = 0;
217
273
  let finalStatus = "completed";
218
274
  try {
219
- // Step 3: Navigate to URL
220
- await navigateWithRetry(page, navigationUrl);
275
+ // Step 3: Launch / navigate the target to its starting point.
276
+ await device.launchOrReset(launchTarget);
221
277
  // Step 4: Run assignment loop
222
278
  for (let assignmentIdx = 0; assignmentIdx < session.assignments.length; assignmentIdx++) {
223
279
  const assignment = session.assignments[assignmentIdx];
224
280
  log(` Assignment ${assignmentIdx + 1}/${session.assignments.length}: ${assignment.name}`);
225
281
  let step = 0;
226
282
  let assignmentCompleted = false;
283
+ // The agent's last per-turn status, used to pick the terminal run-level
284
+ // status when the loop ends because the agent terminated (completed vs
285
+ // abandoned). Stays "in_progress" if the loop hits max_steps.
286
+ let lastAssignmentStatus = "in_progress";
227
287
  while (step < maxSteps && !assignmentCompleted && !isCancelled()) {
228
- // OBSERVE — refresh active page in case a popup or switch_tab changed it
229
- page = tabs.activePage();
230
- const obs = await captureObservation(page);
231
- const lastTreeData = obs.treeData;
288
+ // OBSERVE — the device refreshes its own active surface (popup /
289
+ // switch_tab for browser) before capturing. (The browser device emits
290
+ // its own richer debugObservation with tree/scroll detail.)
291
+ // TODO(perf): backend can downscale before the vision LLM; full-res sent for now.
292
+ const obs = await device.observe();
232
293
  const currentScreenshot = obs.screenshot;
233
- debugObservation(obs);
234
294
  // Capture JPEG of observation for upload and recording (pre-action)
235
- const obsJpeg = await takeScreenshotJpeg(page);
295
+ const obsJpeg = await device.captureScreenshotJpeg();
236
296
  const obsBase64 = obsJpeg.toString("base64");
297
+ // Capture a height-capped full-page JPEG (pre-action, so it reflects
298
+ // the same screen the LLM reasons over). Sent to the backend as the
299
+ // PDQ basis + Frame representative_screenshot, matching the hosted
300
+ // run's full-page behavior. The per-interaction screenshot_url /
301
+ // recording stays the VIEWPORT (obsBase64) — unchanged.
302
+ // Degrade silently to omitting the field if capture fails: a frame is
303
+ // still created from the viewport.
304
+ const fullPageCap = screenFormat === "mobile_portrait"
305
+ ? FULL_PAGE_HEIGHT_CAP_PX_MOBILE
306
+ : FULL_PAGE_HEIGHT_CAP_PX_DESKTOP;
307
+ let fullPageBase64;
308
+ try {
309
+ // Browser-only: native devices omit captureFullPageJpeg, so the
310
+ // field is dropped and the frame is created from the viewport.
311
+ fullPageBase64 = await device.captureFullPageJpeg?.({
312
+ documentHeight: obs.documentHeight,
313
+ cap: fullPageCap,
314
+ });
315
+ }
316
+ catch (err) {
317
+ const msg = err instanceof Error ? err.message : String(err);
318
+ log(` Warning: full-page screenshot capture failed — ${msg}`);
319
+ }
237
320
  // Detect no-visible-change: compare this step's observation with the
238
321
  // PREVIOUS step's observation (not the post-action screenshot).
239
322
  // This tells us whether the previous step's action changed the page.
@@ -243,10 +326,9 @@ async function runSingleSimulation(client, participantId, participantName, opts,
243
326
  previousObsScreenshot = currentScreenshot;
244
327
  if (forwards.length > 0)
245
328
  debugForwards(forwards);
246
- const viewportSize = page.viewportSize() ?? viewport;
247
329
  // Snapshot open tabs so the backend can prompt the LLM with tab ids
248
330
  // (used by switch_tab/close_tab and to disambiguate cmd+click results).
249
- const tabsSnapshot = await tabs.list();
331
+ const tabsSnapshot = obs.tabs;
250
332
  // REASON (remote)
251
333
  let stepResponse;
252
334
  try {
@@ -256,10 +338,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
256
338
  assignment_name: assignment.name,
257
339
  assignment_instructions: assignment.instructions,
258
340
  screenshot: obs.screenshot,
259
- accessibility_tree: obs.treeData.simplified,
341
+ accessibility_tree: obs.accessibilityTree,
260
342
  current_url: obs.url,
261
- screen_width: viewportSize.width,
262
- screen_height: viewportSize.height,
343
+ screen_width: obs.width,
344
+ screen_height: obs.height,
263
345
  interaction_count: step,
264
346
  history,
265
347
  forwards,
@@ -276,7 +358,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
276
358
  catch (err) {
277
359
  const msg = err instanceof Error ? err.message : String(err);
278
360
  log(` Step ${step + 1}: API error — ${msg}`);
279
- await page.waitForTimeout(2000);
361
+ await new Promise((r) => setTimeout(r, 2000));
280
362
  try {
281
363
  const stepReqBody = {
282
364
  participant_id: session.participant_id,
@@ -284,10 +366,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
284
366
  assignment_name: assignment.name,
285
367
  assignment_instructions: assignment.instructions,
286
368
  screenshot: obs.screenshot,
287
- accessibility_tree: obs.treeData.simplified,
369
+ accessibility_tree: obs.accessibilityTree,
288
370
  current_url: obs.url,
289
- screen_width: viewportSize.width,
290
- screen_height: viewportSize.height,
371
+ screen_width: obs.width,
372
+ screen_height: obs.height,
291
373
  interaction_count: step,
292
374
  history,
293
375
  forwards,
@@ -313,27 +395,21 @@ async function runSingleSimulation(client, participantId, participantName, opts,
313
395
  const actionDescs = [];
314
396
  const elementNames = [];
315
397
  const actionDebugEntries = [];
316
- const preActionScreenshot = await takeScreenshot(page);
398
+ const preActionScreenshot = await device.captureScreenshot();
317
399
  for (let i = 0; i < stepResponse.actions.length; i++) {
318
400
  if (isCancelled())
319
401
  break;
320
- // Pick up popup auto-switch / explicit tab switch from prior actions.
321
- page = tabs.activePage();
322
402
  const action = stepResponse.actions[i];
323
- const tabsBefore = (await tabs.list()).length;
324
- const result = await executeAction(page, action, lastTreeData, session.context_values, tabs);
403
+ const result = await device.executeAction(action);
325
404
  const desc = describeAction(action);
326
405
  debugActionExecution(i, action, result, action.node_id ? "cdp" : "playwright");
327
- // The action may have flipped the active tab — re-read.
328
- page = tabs.activePage();
329
- const tabsAfter = (await tabs.list()).length;
330
- const openedNewTab = action.type === "tap" && tabsAfter > tabsBefore;
406
+ const openedNewTab = result.openedNewTab;
331
407
  let normalizedCoords = null;
332
408
  if (result.coordinates) {
333
- const vp = page.viewportSize() ?? viewport;
409
+ const dims = device.dimensions();
334
410
  normalizedCoords = {
335
- x: Math.round((result.coordinates.x / vp.width) * 1000),
336
- y: Math.round((result.coordinates.y / vp.height) * 1000),
411
+ x: Math.round((result.coordinates.x / dims.width) * 1000),
412
+ y: Math.round((result.coordinates.y / dims.height) * 1000),
337
413
  };
338
414
  }
339
415
  const actionType = action.type || "unknown";
@@ -355,6 +431,11 @@ async function runSingleSimulation(client, participantId, participantName, opts,
355
431
  ...(action.modifiers?.length && { modifiers: action.modifiers }),
356
432
  ...(action.key && { key: action.key }),
357
433
  ...(action.tab_id && { tab_id: action.tab_id }),
434
+ ...(action.orientation && { orientation: action.orientation }),
435
+ ...(action.panel && { panel: action.panel }),
436
+ // The recorded `coordinates` is the drag START; persist the END
437
+ // (normalized 0-1000) too so the journey captures the full path.
438
+ ...(action.drag && { drag_end: { x: action.drag.endX, y: action.drag.endY } }),
358
439
  ...(openedNewTab && { opened_new_tab: true }),
359
440
  },
360
441
  order: i,
@@ -376,7 +457,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
376
457
  }
377
458
  // Check if UI changed significantly (skip for last action in batch)
378
459
  if (i < stepResponse.actions.length - 1) {
379
- const midScreenshot = await takeScreenshot(page);
460
+ const midScreenshot = await device.captureScreenshot();
380
461
  if (!detectNoVisibleChange(preActionScreenshot, midScreenshot)) {
381
462
  const blockedCount = stepResponse.actions.length - 1 - i;
382
463
  forwards.push({
@@ -387,10 +468,12 @@ async function runSingleSimulation(client, participantId, participantName, opts,
387
468
  }
388
469
  }
389
470
  }
390
- // Upload observation JPEG (pre-action — matches coordinates and LLM context)
471
+ // Upload observation screenshot (pre-action — matches coordinates and
472
+ // LLM context). Browser captures JPEG; native screencap is PNG.
473
+ const obsContentType = isNative ? "image/png" : "image/jpeg";
391
474
  let screenshotUrl;
392
475
  try {
393
- const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg);
476
+ const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg, obsContentType);
394
477
  screenshotUrl = uploadResult.screenshotUrl;
395
478
  }
396
479
  catch (err) {
@@ -407,6 +490,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
407
490
  screenshot_url: screenshotUrl,
408
491
  location_name: stepResponse.current_location,
409
492
  screen_format: screenFormat,
493
+ ...(fullPageBase64 ? { full_page_screenshot_base64: fullPageBase64 } : {}),
494
+ // Native: drive FrameSourceType.ANDROID/IOS directly; browser falls
495
+ // back to screen_format server-side.
496
+ platform,
410
497
  });
411
498
  frameVersionId = matchResult.frame_version_id;
412
499
  }
@@ -417,7 +504,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
417
504
  // Debug-only: capture post-action screenshot to show result
418
505
  let postActionBase64;
419
506
  if (isDebugEnabled()) {
420
- const postJpeg = await takeScreenshotJpeg(page);
507
+ const postJpeg = await device.captureScreenshotJpeg();
421
508
  postActionBase64 = postJpeg.toString("base64");
422
509
  }
423
510
  // Accumulate effort (cumulative, not wall-clock)
@@ -437,10 +524,15 @@ async function runSingleSimulation(client, participantId, participantName, opts,
437
524
  step: step + 1,
438
525
  assignment_id: assignment.id,
439
526
  ...(screenshotUrl ? { screenshot_url: screenshotUrl } : { screenshot_base64: obsBase64 }),
527
+ // Dimensions of THIS step's screenshot (from observe()) so the backend
528
+ // can populate the screenshot ref even when only screenshot_url is
529
+ // sent (native) and it can't read the bytes for dims.
530
+ screen_width: obs.width,
531
+ screen_height: obs.height,
440
532
  frame_version_id: frameVersionId,
441
533
  timestamp_ms: accumulatedEffortMs,
442
534
  comment: stepResponse.comment,
443
- url: page.url(),
535
+ url: device.currentUrl(),
444
536
  sentiment: {
445
537
  label: stepResponse.sentiment,
446
538
  valence: stepResponse.sentiment_valence,
@@ -448,7 +540,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
448
540
  },
449
541
  actions: actionDatas,
450
542
  current_location: stepResponse.current_location,
451
- assignment_completed: stepResponse.assignment_completed,
543
+ assignment_status: stepResponse.assignment_status,
452
544
  // Server reduces this to Interaction.tab when N >= 2; omit on
453
545
  // single-tab steps to keep the payload (and DB column) null.
454
546
  ...(tabsSnapshot.length >= 2 ? { tabs: tabsSnapshot } : {}),
@@ -467,7 +559,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
467
559
  assignmentName: assignment.name,
468
560
  screenshotBase64: obsBase64,
469
561
  postActionScreenshotBase64: postActionBase64,
470
- url: page.url(),
562
+ url: device.currentUrl(),
471
563
  actions: actionDebugEntries,
472
564
  comment: stepResponse.comment,
473
565
  sentiment: {
@@ -480,6 +572,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
480
572
  effortSeconds: stepResponse.effort_seconds,
481
573
  });
482
574
  assignmentCompleted = stepResponse.assignment_completed;
575
+ lastAssignmentStatus = stepResponse.assignment_status;
483
576
  step++;
484
577
  }
485
578
  if (isCancelled()) {
@@ -491,16 +584,25 @@ async function runSingleSimulation(client, participantId, participantName, opts,
491
584
  });
492
585
  break;
493
586
  }
587
+ // When the agent terminated, persist its ACTUAL terminal status
588
+ // (completed vs abandoned) rather than always "completed". When it
589
+ // didn't terminate, the loop hit max_steps.
590
+ const terminalStatus = assignmentCompleted
591
+ ? lastAssignmentStatus
592
+ : "max_steps_reached";
494
593
  assignmentStatuses.push({
495
594
  assignment_id: assignment.id,
496
- status: assignmentCompleted ? "completed" : "max_steps_reached",
595
+ status: terminalStatus,
497
596
  step_count: step,
498
597
  });
499
- if (assignmentCompleted) {
500
- log(` Assignment completed in ${step} steps`);
598
+ if (!assignmentCompleted) {
599
+ log(` Assignment reached max steps (${maxSteps})`);
600
+ }
601
+ else if (lastAssignmentStatus === "abandoned") {
602
+ log(` Assignment abandoned by agent after ${step} steps`);
501
603
  }
502
604
  else {
503
- log(` Assignment reached max steps (${maxSteps})`);
605
+ log(` Assignment completed in ${step} steps`);
504
606
  }
505
607
  }
506
608
  }
@@ -518,7 +620,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
518
620
  generateDebugReport(debugSteps, {
519
621
  participantId: session.participant_id,
520
622
  participantName,
521
- url: navigationUrl,
623
+ url: launchTarget,
522
624
  screenFormat,
523
625
  finalStatus,
524
626
  assignmentStatuses,
@@ -542,15 +644,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
542
644
  const msg = err instanceof Error ? err.message : String(err);
543
645
  log(` Warning: failed to record results — ${msg}`);
544
646
  }
545
- if (ownsTheBrowser) {
546
- await closeBrowser(browserSession);
547
- }
548
- else {
549
- // Shared mode: close just the tab, not the context or browser
550
- try {
551
- await browserSession.page.close();
552
- }
553
- catch { }
554
- }
647
+ // Device owns its own teardown (full browser vs. just-the-tab for shared
648
+ // mode, app/emulator cleanup for native).
649
+ await device.close();
555
650
  }
556
651
  }
@@ -0,0 +1,111 @@
1
+ /**
2
+ * Pure parser/serializer for native (Android/iOS) accessibility trees — the
3
+ * native counterpart of the browser's DOM-locator tree. It turns a raw device
4
+ * a11y dump into the SAME `[id] role "name"` string the backend's DOMLocator
5
+ * reasons over, plus a local `shortId → bounds` map the device taps the CENTER
6
+ * of. No bounds ship to the backend; like the browser path, the CLI keeps the
7
+ * map and resolves the LLM's returned short id locally.
8
+ *
9
+ * FCIS: this module is pure (string in, structs out) — no `adb`/`idb` I/O — so
10
+ * it's unit-testable without a device, exactly like `coordinates.ts`. The I/O
11
+ * lives in `adb.ts`/`simctl.ts`; the parse/serialize math lives here.
12
+ *
13
+ * COORDINATE SPACE — carried, not converted, by this module:
14
+ * - Android `uiautomator dump` bounds are screencap PIXELS (`space: "px"`).
15
+ * - iOS WebDriverAgent /source frames are POINTS (`space: "points"`).
16
+ * The device de-normalizes/taps in its own space (AndroidDevice taps pixels;
17
+ * IOSDevice taps points), so the `space` tag tells the caller which dimension a
18
+ * node's bounds-center belongs to. This module never mixes the two.
19
+ *
20
+ * ANCESTOR-VS-LEAF (the hard part): on Android the visible label
21
+ * ("Network & internet") sits on a `clickable=false` TextView nested inside the
22
+ * clickable PARENT row. Tapping the leaf's center misses the row's hit logic and
23
+ * lands "slightly off"; the click target is the row. So the serializer walks to
24
+ * the nearest clickable ANCESTOR, aggregates its descendants' text/content-desc
25
+ * into ONE label, and emits the CLICKABLE node WITH THE ROW'S BOUNDS — never the
26
+ * leaf. iOS Buttons are already labeled + actionable, so they emit directly.
27
+ */
28
+ export type CoordinateSpace = "px" | "points";
29
+ export interface Bounds {
30
+ x: number;
31
+ y: number;
32
+ width: number;
33
+ height: number;
34
+ }
35
+ /**
36
+ * One parsed native a11y node. `bounds` are in `space` (Android px, iOS points).
37
+ * `clickable` marks an actionable hit target. `resourceId` is the Android
38
+ * resource-id / iOS AXUniqueId when present (diagnostic; not used for tapping).
39
+ */
40
+ export interface NativeNode {
41
+ role: string;
42
+ label: string;
43
+ bounds: Bounds;
44
+ clickable: boolean;
45
+ /** True for nodes whose own text/desc is a label (used to aggregate onto rows). */
46
+ hasOwnLabel: boolean;
47
+ resourceId?: string;
48
+ space: CoordinateSpace;
49
+ }
50
+ export interface NativeTree {
51
+ /** `[id] role "label"` lines, one per emitted actionable node. */
52
+ simplified: string;
53
+ /** shortId → bounds (in the platform's space). The device taps the center. */
54
+ nodeMap: Map<string, Bounds>;
55
+ }
56
+ /**
57
+ * Parse a uiautomator XML dump into a flat list of leaf-significant nodes in
58
+ * document order. The dump is a single line of nested `<node ...>` tags; we
59
+ * rebuild the parent/child nesting from the open/close-tag stream (mirroring the
60
+ * "break after `>`" split the oracle scripts use, but tracking depth so the
61
+ * ancestor-aggregation in `serializeNativeTree` has the real tree).
62
+ *
63
+ * Returns the FLATTENED set of nodes (depth-first, document order) with their
64
+ * raw fields; the serializer decides which to emit and how to aggregate.
65
+ */
66
+ export declare function parseUiautomatorXml(xml: string): NativeNode[];
67
+ /**
68
+ * Parse WDA's `GET /source?format=json` — a NESTED accessibility tree — into the
69
+ * FLAT, depth-first `NativeNode[]` (POINTS) that `parseXcuiHierarchy` produces,
70
+ * so `serializeNativeTree` consumes it unchanged. WDA's `type` matches idb's iOS
71
+ * types (Button/StaticText/SearchField/Cell/Image/Application…), so
72
+ * `normalizeRole`/`IOS_ACTIONABLE_TYPES`/`frameToBounds` all apply as-is.
73
+ *
74
+ * KEY: WDA's `/source` is the FULL XCUIElement tree — every container and leaf —
75
+ * NOT idb's clean accessibility-elements list. iOS settings rows surface as an
76
+ * accessible `Button` ("General", isAccessible=1) that ALSO contains a duplicate
77
+ * inner `StaticText` ("General", isAccessible=0) and is wrapped in a `Cell`
78
+ * (isAccessible=0). Emitting all three yields "General General" + empty
79
+ * listitems. So we emit ONLY `isAccessible && isVisible` nodes — exactly the
80
+ * VoiceOver-exposed set idb returned: the labeled Button is both the label and
81
+ * the tap target; the duplicate StaticText and the wrapping Cell are pruned. A
82
+ * sparse a11y tree degrades to the loop's vision fallback, so strict filtering
83
+ * never strands the run.
84
+ *
85
+ * Accepts either the raw tree or the W3C `{ value: <tree> }` envelope WDA returns.
86
+ */
87
+ export declare function parseXcuiHierarchy(json: string): NativeNode[];
88
+ /**
89
+ * Serialize a flat NativeNode list (from `parseUiautomatorXml` /
90
+ * `parseXcuiHierarchy`) into the `[id] role "label"` string the DOMLocator
91
+ * reasons over, plus a `shortId → bounds` map for local tap resolution.
92
+ *
93
+ * Emission rules (kept tight, like the DOM serializer):
94
+ * - ANCESTOR-VS-LEAF: a CLICKABLE node absorbs its descendants' labels and is
95
+ * emitted with ITS OWN bounds (the tappable row). The descendant
96
+ * label-bearing leaves are then NOT emitted on their own — their text lives
97
+ * on the row. A label-bearing leaf with NO clickable ancestor (e.g. a
98
+ * standalone heading) is emitted directly so on-screen text isn't lost.
99
+ * - Skip pure decoration: a node that is neither clickable nor label-bearing,
100
+ * and a generic/application container that didn't aggregate a label.
101
+ *
102
+ * The input list is depth-first / document order, which is the order the raw
103
+ * parsers produce; we recover ancestry from that order using bounds containment
104
+ * (Android leaves nest inside their clickable row's rect; iOS is already flat).
105
+ */
106
+ export declare function serializeNativeTree(nodes: NativeNode[]): NativeTree;
107
+ /** Center of a node's bounds — the point the device taps. */
108
+ export declare function boundsCenter(b: Bounds): {
109
+ x: number;
110
+ y: number;
111
+ };