@ishlabs/cli 0.24.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/commands/ask.js +3 -3
  2. package/dist/commands/iteration.js +1 -1
  3. package/dist/commands/study-analyze.js +1 -1
  4. package/dist/commands/study-run.js +80 -12
  5. package/dist/commands/study.js +11 -7
  6. package/dist/lib/alias-store.js +1 -1
  7. package/dist/lib/api-client.d.ts +2 -0
  8. package/dist/lib/docs.js +57 -42
  9. package/dist/lib/local-sim/actions.d.ts +10 -2
  10. package/dist/lib/local-sim/actions.js +16 -11
  11. package/dist/lib/local-sim/adb.d.ts +103 -0
  12. package/dist/lib/local-sim/adb.js +352 -0
  13. package/dist/lib/local-sim/android.d.ts +111 -0
  14. package/dist/lib/local-sim/android.js +499 -0
  15. package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
  16. package/dist/lib/local-sim/apk-manifest.js +210 -0
  17. package/dist/lib/local-sim/browser.d.ts +22 -0
  18. package/dist/lib/local-sim/browser.js +65 -0
  19. package/dist/lib/local-sim/coordinates.d.ts +69 -0
  20. package/dist/lib/local-sim/coordinates.js +59 -0
  21. package/dist/lib/local-sim/device.d.ts +143 -0
  22. package/dist/lib/local-sim/device.js +152 -0
  23. package/dist/lib/local-sim/ios.d.ts +168 -0
  24. package/dist/lib/local-sim/ios.js +546 -0
  25. package/dist/lib/local-sim/loop.d.ts +14 -2
  26. package/dist/lib/local-sim/loop.js +166 -73
  27. package/dist/lib/local-sim/native-a11y.d.ts +97 -0
  28. package/dist/lib/local-sim/native-a11y.js +384 -0
  29. package/dist/lib/local-sim/simctl.d.ts +85 -0
  30. package/dist/lib/local-sim/simctl.js +273 -0
  31. package/dist/lib/local-sim/types.d.ts +37 -2
  32. package/dist/lib/local-sim/upload.d.ts +1 -1
  33. package/dist/lib/local-sim/upload.js +9 -6
  34. package/dist/lib/output.js +58 -12
  35. package/dist/lib/skill-content.js +10 -9
  36. package/package.json +2 -1
@@ -1,19 +1,31 @@
1
1
  /**
2
2
  * Local simulation loop orchestrator.
3
3
  *
4
- * Runs the observe → reason (remote) → act (local) loop for each
5
- * participant against a local Playwright browser.
4
+ * Runs the observe → reason (remote) → act (local) loop for each participant
5
+ * against a SimulationDevice (a Playwright browser today; a native Android
6
+ * emulator next). The loop is device-agnostic — see device.ts.
6
7
  */
7
- import { launchBrowser, launchSharedBrowser, createTab, captureObservation, takeScreenshot, takeScreenshotJpeg, navigateWithRetry, closeBrowser } from "./browser.js";
8
+ import { launchSharedBrowser, FULL_PAGE_HEIGHT_CAP_PX_MOBILE, FULL_PAGE_HEIGHT_CAP_PX_DESKTOP, } from "./browser.js";
8
9
  import { uploadScreenshot } from "./upload.js";
9
- import { executeAction, detectNoVisibleChange, describeAction } from "./actions.js";
10
- import { TabManager } from "./tabs.js";
11
- import { enableDebug, isDebugEnabled, debugObservation, debugRawResponse, debugNormalizedActions, debugActionExecution, debugForwards, debugStepSummary, debugRecord, } from "./debug.js";
10
+ import { detectNoVisibleChange, describeAction } from "./actions.js";
11
+ import { createDevice } from "./device.js";
12
+ import { enableDebug, isDebugEnabled, debugRawResponse, debugNormalizedActions, debugActionExecution, debugForwards, debugStepSummary, debugRecord, } from "./debug.js";
13
+ /**
14
+ * Native (mobile) platforms drive a single physical device via screenshot →
15
+ * normalized-coordinate taps, with no accessibility tree or URL. Browser/web is
16
+ * everything else.
17
+ */
18
+ function isNativePlatform(platform) {
19
+ return platform === "android" || platform === "ios";
20
+ }
12
21
  /**
13
22
  * Convert a raw action (from either resolved_actions or output.action.actions)
14
- * into the flat LocalStepAction shape used by the executor.
23
+ * into the flat LocalStepAction shape used by the executor. Exported for unit
24
+ * tests of the native drag coordinate-shape split (the nested action's
25
+ * `coordinates` is a {x,y} tap point for most actions but a
26
+ * {startX,...,endY} path for a drag).
15
27
  */
16
- function flattenAction(raw, nodeId = null, nodeDescription = null) {
28
+ export function flattenAction(raw, nodeId = null, nodeDescription = null) {
17
29
  // resolved_actions nest the action inside an "action" key
18
30
  const a = (raw.action ?? raw);
19
31
  const element = a.element;
@@ -36,8 +48,35 @@ function flattenAction(raw, nodeId = null, nodeDescription = null) {
36
48
  modifiers: Array.isArray(a.modifiers) ? a.modifiers : null,
37
49
  key: a.key ?? null,
38
50
  tab_id: a.tab_id ?? null,
51
+ orientation: a.orientation ?? null,
52
+ scale: a.scale ?? null,
53
+ // Native path: ResolvedAction.coordinates (top level of the resolved_actions
54
+ // entry) is the single {x,y} execution point. Fall back to the nested action
55
+ // for raw output — but only a point-shaped {x,y}; a drag's nested
56
+ // coordinates is the {startX,...,endY} path (extracted into `drag` below),
57
+ // not a tap point, so guard against mis-assigning it here.
58
+ coordinates: pickPoint(raw.coordinates) ?? pickPoint(a.coordinates) ?? null,
59
+ // drag: the gesture path lives on the nested action's coordinates as
60
+ // {startX, startY, endX, endY} (DragCoordinates serialized by_alias).
61
+ drag: pickDrag(a.coordinates) ?? null,
39
62
  };
40
63
  }
64
+ /** A nested action's coordinates only when it's the {x,y} tap-point shape. */
65
+ function pickPoint(c) {
66
+ if (c && typeof c === "object" && "x" in c && "y" in c) {
67
+ const p = c;
68
+ return { x: p.x, y: p.y };
69
+ }
70
+ return null;
71
+ }
72
+ /** A nested action's coordinates only when it's the DragCoordinates shape. */
73
+ function pickDrag(c) {
74
+ if (c && typeof c === "object" && "startX" in c && "endX" in c) {
75
+ const d = c;
76
+ return { startX: d.startX, startY: d.startY, endX: d.endX, endY: d.endY };
77
+ }
78
+ return null;
79
+ }
41
80
  /**
42
81
  * Normalize the raw backend step response into the flat structure used by the loop.
43
82
  * Backend returns { output: { ... }, resolved_actions: [...], loop_detected }.
@@ -61,7 +100,10 @@ function normalizeStepResponse(raw) {
61
100
  sentiment_intensity: out.sentiment_intensity ?? 0,
62
101
  current_location: out.current_location,
63
102
  effort_seconds: out.effort_seconds,
64
- assignment_completed: out.assignment_completed,
103
+ assignment_status: out.assignment_status,
104
+ // Terminate on completed OR abandoned — a stuck agent that gives up
105
+ // should stop the loop just like a finished one.
106
+ assignment_completed: out.assignment_status === "completed" || out.assignment_status === "abandoned",
65
107
  actions,
66
108
  loop_detected: raw.loop_detected,
67
109
  };
@@ -89,7 +131,13 @@ export async function runLocalSimulations(client, opts) {
89
131
  log("\nCancelling after current step...");
90
132
  };
91
133
  process.on("SIGINT", onSigint);
92
- const concurrency = opts.parallel ?? opts.participantIds.length;
134
+ // Native runs share ONE physical device (emulator / simulator), so they
135
+ // can't run in parallel — force sequential regardless of --parallel.
136
+ const isNativeRun = isNativePlatform(opts.platform);
137
+ if (isNativeRun && (opts.parallel ?? 1) > 1) {
138
+ log("Native (android/ios) runs drive a single device — running sequentially.");
139
+ }
140
+ const concurrency = isNativeRun ? 1 : (opts.parallel ?? opts.participantIds.length);
93
141
  try {
94
142
  if (concurrency <= 1 || opts.participantIds.length <= 1) {
95
143
  // Sequential execution — each participant owns its own browser
@@ -162,12 +210,20 @@ async function runSingleSimulation(client, participantId, participantName, opts,
162
210
  product_id: opts.workspaceId,
163
211
  iteration_id: opts.iterationId,
164
212
  });
165
- // Resolve URL and browser config from iteration details (with CLI fallback)
213
+ // Resolve target + config from iteration details (with CLI fallback).
214
+ // Platform precedence: --platform flag > iteration's stored platform > web.
166
215
  const iterDetails = initResponse.iteration_details;
216
+ const platform = opts.platform ?? iterDetails?.platform ?? "web";
217
+ const isNative = isNativePlatform(platform);
218
+ // Browser needs a URL to navigate; native uses the app package (from --app or
219
+ // the iteration target) and has no URL requirement.
167
220
  const navigationUrl = iterDetails?.url ?? opts.url;
168
- if (!navigationUrl) {
221
+ if (!isNative && !navigationUrl) {
169
222
  throw new Error("No URL available: backend did not return iteration_details and no --url flag was provided.");
170
223
  }
224
+ // For native, launchOrReset() receives the app package (iteration target);
225
+ // the AndroidDevice prefers --app over this.
226
+ const launchTarget = isNative ? (navigationUrl ?? "") : navigationUrl;
171
227
  const screenFormat = opts.screenFormat ?? iterDetails?.screen_format ?? "desktop";
172
228
  const locale = opts.locale ?? iterDetails?.locale;
173
229
  // Cache session state for per-step requests
@@ -189,7 +245,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
189
245
  const stepContextValues = session.context_values.map(cv => cv.type === "secret" ? { ...cv, value: null } : cv);
190
246
  const maxSteps = opts.maxInteractions ?? session.max_interactions;
191
247
  const viewport = { width: 1440, height: 900 }; // TODO: extract from config
192
- // Step 2: Launch browser
248
+ // Step 2: Build the target device (per-platform dispatch).
249
+ // Browser today; AndroidDevice (adb) slots in via createDevice() later.
193
250
  const browserOpts = {
194
251
  headed: opts.headed,
195
252
  slowMo: opts.slowMo,
@@ -198,15 +255,13 @@ async function runSingleSimulation(client, participantId, participantName, opts,
198
255
  locale,
199
256
  screenFormat,
200
257
  };
201
- // Use shared browser if available (parallel mode), otherwise launch standalone
202
- const ownsTheBrowser = !sharedBrowser;
203
- const browserSession = sharedBrowser
204
- ? await createTab(sharedBrowser, browserOpts)
205
- : await launchBrowser(browserOpts);
206
- // Active page can swap when a popup auto-focuses or the LLM issues
207
- // switch_tab/close_tab. TabManager wires the context popup listener.
208
- const tabs = new TabManager(browserSession.context, browserSession.page);
209
- let page = tabs.activePage();
258
+ const device = await createDevice(platform, {
259
+ browserOpts,
260
+ contextValues: session.context_values,
261
+ sharedBrowser,
262
+ appPath: opts.appPath,
263
+ log,
264
+ });
210
265
  const history = [];
211
266
  const interactions = [];
212
267
  const debugSteps = [];
@@ -216,24 +271,51 @@ async function runSingleSimulation(client, participantId, participantName, opts,
216
271
  let accumulatedEffortMs = 0;
217
272
  let finalStatus = "completed";
218
273
  try {
219
- // Step 3: Navigate to URL
220
- await navigateWithRetry(page, navigationUrl);
274
+ // Step 3: Launch / navigate the target to its starting point.
275
+ await device.launchOrReset(launchTarget);
221
276
  // Step 4: Run assignment loop
222
277
  for (let assignmentIdx = 0; assignmentIdx < session.assignments.length; assignmentIdx++) {
223
278
  const assignment = session.assignments[assignmentIdx];
224
279
  log(` Assignment ${assignmentIdx + 1}/${session.assignments.length}: ${assignment.name}`);
225
280
  let step = 0;
226
281
  let assignmentCompleted = false;
282
+ // The agent's last per-turn status, used to pick the terminal run-level
283
+ // status when the loop ends because the agent terminated (completed vs
284
+ // abandoned). Stays "in_progress" if the loop hits max_steps.
285
+ let lastAssignmentStatus = "in_progress";
227
286
  while (step < maxSteps && !assignmentCompleted && !isCancelled()) {
228
- // OBSERVE — refresh active page in case a popup or switch_tab changed it
229
- page = tabs.activePage();
230
- const obs = await captureObservation(page);
231
- const lastTreeData = obs.treeData;
287
+ // OBSERVE — the device refreshes its own active surface (popup /
288
+ // switch_tab for browser) before capturing. (The browser device emits
289
+ // its own richer debugObservation with tree/scroll detail.)
290
+ // TODO(perf): backend can downscale before the vision LLM; full-res sent for now.
291
+ const obs = await device.observe();
232
292
  const currentScreenshot = obs.screenshot;
233
- debugObservation(obs);
234
293
  // Capture JPEG of observation for upload and recording (pre-action)
235
- const obsJpeg = await takeScreenshotJpeg(page);
294
+ const obsJpeg = await device.captureScreenshotJpeg();
236
295
  const obsBase64 = obsJpeg.toString("base64");
296
+ // Capture a height-capped full-page JPEG (pre-action, so it reflects
297
+ // the same screen the LLM reasons over). Sent to the backend as the
298
+ // PDQ basis + Frame representative_screenshot, matching the hosted
299
+ // run's full-page behavior. The per-interaction screenshot_url /
300
+ // recording stays the VIEWPORT (obsBase64) — unchanged.
301
+ // Degrade silently to omitting the field if capture fails: a frame is
302
+ // still created from the viewport.
303
+ const fullPageCap = screenFormat === "mobile_portrait"
304
+ ? FULL_PAGE_HEIGHT_CAP_PX_MOBILE
305
+ : FULL_PAGE_HEIGHT_CAP_PX_DESKTOP;
306
+ let fullPageBase64;
307
+ try {
308
+ // Browser-only: native devices omit captureFullPageJpeg, so the
309
+ // field is dropped and the frame is created from the viewport.
310
+ fullPageBase64 = await device.captureFullPageJpeg?.({
311
+ documentHeight: obs.documentHeight,
312
+ cap: fullPageCap,
313
+ });
314
+ }
315
+ catch (err) {
316
+ const msg = err instanceof Error ? err.message : String(err);
317
+ log(` Warning: full-page screenshot capture failed — ${msg}`);
318
+ }
237
319
  // Detect no-visible-change: compare this step's observation with the
238
320
  // PREVIOUS step's observation (not the post-action screenshot).
239
321
  // This tells us whether the previous step's action changed the page.
@@ -243,10 +325,9 @@ async function runSingleSimulation(client, participantId, participantName, opts,
243
325
  previousObsScreenshot = currentScreenshot;
244
326
  if (forwards.length > 0)
245
327
  debugForwards(forwards);
246
- const viewportSize = page.viewportSize() ?? viewport;
247
328
  // Snapshot open tabs so the backend can prompt the LLM with tab ids
248
329
  // (used by switch_tab/close_tab and to disambiguate cmd+click results).
249
- const tabsSnapshot = await tabs.list();
330
+ const tabsSnapshot = obs.tabs;
250
331
  // REASON (remote)
251
332
  let stepResponse;
252
333
  try {
@@ -256,10 +337,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
256
337
  assignment_name: assignment.name,
257
338
  assignment_instructions: assignment.instructions,
258
339
  screenshot: obs.screenshot,
259
- accessibility_tree: obs.treeData.simplified,
340
+ accessibility_tree: obs.accessibilityTree,
260
341
  current_url: obs.url,
261
- screen_width: viewportSize.width,
262
- screen_height: viewportSize.height,
342
+ screen_width: obs.width,
343
+ screen_height: obs.height,
263
344
  interaction_count: step,
264
345
  history,
265
346
  forwards,
@@ -276,7 +357,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
276
357
  catch (err) {
277
358
  const msg = err instanceof Error ? err.message : String(err);
278
359
  log(` Step ${step + 1}: API error — ${msg}`);
279
- await page.waitForTimeout(2000);
360
+ await new Promise((r) => setTimeout(r, 2000));
280
361
  try {
281
362
  const stepReqBody = {
282
363
  participant_id: session.participant_id,
@@ -284,10 +365,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
284
365
  assignment_name: assignment.name,
285
366
  assignment_instructions: assignment.instructions,
286
367
  screenshot: obs.screenshot,
287
- accessibility_tree: obs.treeData.simplified,
368
+ accessibility_tree: obs.accessibilityTree,
288
369
  current_url: obs.url,
289
- screen_width: viewportSize.width,
290
- screen_height: viewportSize.height,
370
+ screen_width: obs.width,
371
+ screen_height: obs.height,
291
372
  interaction_count: step,
292
373
  history,
293
374
  forwards,
@@ -313,27 +394,21 @@ async function runSingleSimulation(client, participantId, participantName, opts,
313
394
  const actionDescs = [];
314
395
  const elementNames = [];
315
396
  const actionDebugEntries = [];
316
- const preActionScreenshot = await takeScreenshot(page);
397
+ const preActionScreenshot = await device.captureScreenshot();
317
398
  for (let i = 0; i < stepResponse.actions.length; i++) {
318
399
  if (isCancelled())
319
400
  break;
320
- // Pick up popup auto-switch / explicit tab switch from prior actions.
321
- page = tabs.activePage();
322
401
  const action = stepResponse.actions[i];
323
- const tabsBefore = (await tabs.list()).length;
324
- const result = await executeAction(page, action, lastTreeData, session.context_values, tabs);
402
+ const result = await device.executeAction(action);
325
403
  const desc = describeAction(action);
326
404
  debugActionExecution(i, action, result, action.node_id ? "cdp" : "playwright");
327
- // The action may have flipped the active tab — re-read.
328
- page = tabs.activePage();
329
- const tabsAfter = (await tabs.list()).length;
330
- const openedNewTab = action.type === "tap" && tabsAfter > tabsBefore;
405
+ const openedNewTab = result.openedNewTab;
331
406
  let normalizedCoords = null;
332
407
  if (result.coordinates) {
333
- const vp = page.viewportSize() ?? viewport;
408
+ const dims = device.dimensions();
334
409
  normalizedCoords = {
335
- x: Math.round((result.coordinates.x / vp.width) * 1000),
336
- y: Math.round((result.coordinates.y / vp.height) * 1000),
410
+ x: Math.round((result.coordinates.x / dims.width) * 1000),
411
+ y: Math.round((result.coordinates.y / dims.height) * 1000),
337
412
  };
338
413
  }
339
414
  const actionType = action.type || "unknown";
@@ -355,6 +430,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
355
430
  ...(action.modifiers?.length && { modifiers: action.modifiers }),
356
431
  ...(action.key && { key: action.key }),
357
432
  ...(action.tab_id && { tab_id: action.tab_id }),
433
+ ...(action.orientation && { orientation: action.orientation }),
434
+ // The recorded `coordinates` is the drag START; persist the END
435
+ // (normalized 0-1000) too so the journey captures the full path.
436
+ ...(action.drag && { drag_end: { x: action.drag.endX, y: action.drag.endY } }),
358
437
  ...(openedNewTab && { opened_new_tab: true }),
359
438
  },
360
439
  order: i,
@@ -376,7 +455,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
376
455
  }
377
456
  // Check if UI changed significantly (skip for last action in batch)
378
457
  if (i < stepResponse.actions.length - 1) {
379
- const midScreenshot = await takeScreenshot(page);
458
+ const midScreenshot = await device.captureScreenshot();
380
459
  if (!detectNoVisibleChange(preActionScreenshot, midScreenshot)) {
381
460
  const blockedCount = stepResponse.actions.length - 1 - i;
382
461
  forwards.push({
@@ -387,10 +466,12 @@ async function runSingleSimulation(client, participantId, participantName, opts,
387
466
  }
388
467
  }
389
468
  }
390
- // Upload observation JPEG (pre-action — matches coordinates and LLM context)
469
+ // Upload observation screenshot (pre-action — matches coordinates and
470
+ // LLM context). Browser captures JPEG; native screencap is PNG.
471
+ const obsContentType = isNative ? "image/png" : "image/jpeg";
391
472
  let screenshotUrl;
392
473
  try {
393
- const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg);
474
+ const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg, obsContentType);
394
475
  screenshotUrl = uploadResult.screenshotUrl;
395
476
  }
396
477
  catch (err) {
@@ -407,6 +488,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
407
488
  screenshot_url: screenshotUrl,
408
489
  location_name: stepResponse.current_location,
409
490
  screen_format: screenFormat,
491
+ ...(fullPageBase64 ? { full_page_screenshot_base64: fullPageBase64 } : {}),
492
+ // Native: drive FrameSourceType.ANDROID/IOS directly; browser falls
493
+ // back to screen_format server-side.
494
+ platform,
410
495
  });
411
496
  frameVersionId = matchResult.frame_version_id;
412
497
  }
@@ -417,7 +502,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
417
502
  // Debug-only: capture post-action screenshot to show result
418
503
  let postActionBase64;
419
504
  if (isDebugEnabled()) {
420
- const postJpeg = await takeScreenshotJpeg(page);
505
+ const postJpeg = await device.captureScreenshotJpeg();
421
506
  postActionBase64 = postJpeg.toString("base64");
422
507
  }
423
508
  // Accumulate effort (cumulative, not wall-clock)
@@ -437,10 +522,15 @@ async function runSingleSimulation(client, participantId, participantName, opts,
437
522
  step: step + 1,
438
523
  assignment_id: assignment.id,
439
524
  ...(screenshotUrl ? { screenshot_url: screenshotUrl } : { screenshot_base64: obsBase64 }),
525
+ // Dimensions of THIS step's screenshot (from observe()) so the backend
526
+ // can populate the screenshot ref even when only screenshot_url is
527
+ // sent (native) and it can't read the bytes for dims.
528
+ screen_width: obs.width,
529
+ screen_height: obs.height,
440
530
  frame_version_id: frameVersionId,
441
531
  timestamp_ms: accumulatedEffortMs,
442
532
  comment: stepResponse.comment,
443
- url: page.url(),
533
+ url: device.currentUrl(),
444
534
  sentiment: {
445
535
  label: stepResponse.sentiment,
446
536
  valence: stepResponse.sentiment_valence,
@@ -448,7 +538,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
448
538
  },
449
539
  actions: actionDatas,
450
540
  current_location: stepResponse.current_location,
451
- assignment_completed: stepResponse.assignment_completed,
541
+ assignment_status: stepResponse.assignment_status,
452
542
  // Server reduces this to Interaction.tab when N >= 2; omit on
453
543
  // single-tab steps to keep the payload (and DB column) null.
454
544
  ...(tabsSnapshot.length >= 2 ? { tabs: tabsSnapshot } : {}),
@@ -467,7 +557,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
467
557
  assignmentName: assignment.name,
468
558
  screenshotBase64: obsBase64,
469
559
  postActionScreenshotBase64: postActionBase64,
470
- url: page.url(),
560
+ url: device.currentUrl(),
471
561
  actions: actionDebugEntries,
472
562
  comment: stepResponse.comment,
473
563
  sentiment: {
@@ -480,6 +570,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
480
570
  effortSeconds: stepResponse.effort_seconds,
481
571
  });
482
572
  assignmentCompleted = stepResponse.assignment_completed;
573
+ lastAssignmentStatus = stepResponse.assignment_status;
483
574
  step++;
484
575
  }
485
576
  if (isCancelled()) {
@@ -491,16 +582,25 @@ async function runSingleSimulation(client, participantId, participantName, opts,
491
582
  });
492
583
  break;
493
584
  }
585
+ // When the agent terminated, persist its ACTUAL terminal status
586
+ // (completed vs abandoned) rather than always "completed". When it
587
+ // didn't terminate, the loop hit max_steps.
588
+ const terminalStatus = assignmentCompleted
589
+ ? lastAssignmentStatus
590
+ : "max_steps_reached";
494
591
  assignmentStatuses.push({
495
592
  assignment_id: assignment.id,
496
- status: assignmentCompleted ? "completed" : "max_steps_reached",
593
+ status: terminalStatus,
497
594
  step_count: step,
498
595
  });
499
- if (assignmentCompleted) {
500
- log(` Assignment completed in ${step} steps`);
596
+ if (!assignmentCompleted) {
597
+ log(` Assignment reached max steps (${maxSteps})`);
598
+ }
599
+ else if (lastAssignmentStatus === "abandoned") {
600
+ log(` Assignment abandoned by agent after ${step} steps`);
501
601
  }
502
602
  else {
503
- log(` Assignment reached max steps (${maxSteps})`);
603
+ log(` Assignment completed in ${step} steps`);
504
604
  }
505
605
  }
506
606
  }
@@ -518,7 +618,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
518
618
  generateDebugReport(debugSteps, {
519
619
  participantId: session.participant_id,
520
620
  participantName,
521
- url: navigationUrl,
621
+ url: launchTarget,
522
622
  screenFormat,
523
623
  finalStatus,
524
624
  assignmentStatuses,
@@ -542,15 +642,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
542
642
  const msg = err instanceof Error ? err.message : String(err);
543
643
  log(` Warning: failed to record results — ${msg}`);
544
644
  }
545
- if (ownsTheBrowser) {
546
- await closeBrowser(browserSession);
547
- }
548
- else {
549
- // Shared mode: close just the tab, not the context or browser
550
- try {
551
- await browserSession.page.close();
552
- }
553
- catch { }
554
- }
645
+ // Device owns its own teardown (full browser vs. just-the-tab for shared
646
+ // mode, app/emulator cleanup for native).
647
+ await device.close();
555
648
  }
556
649
  }
@@ -0,0 +1,97 @@
1
+ /**
2
+ * Pure parser/serializer for native (Android/iOS) accessibility trees — the
3
+ * native counterpart of the browser's DOM-locator tree. It turns a raw device
4
+ * a11y dump into the SAME `[id] role "name"` string the backend's DOMLocator
5
+ * reasons over, plus a local `shortId → bounds` map the device taps the CENTER
6
+ * of. No bounds ship to the backend; like the browser path, the CLI keeps the
7
+ * map and resolves the LLM's returned short id locally.
8
+ *
9
+ * FCIS: this module is pure (string in, structs out) — no `adb`/`idb` I/O — so
10
+ * it's unit-testable without a device, exactly like `coordinates.ts`. The I/O
11
+ * lives in `adb.ts`/`simctl.ts`; the parse/serialize math lives here.
12
+ *
13
+ * COORDINATE SPACE — carried, not converted, by this module:
14
+ * - Android `uiautomator dump` bounds are screencap PIXELS (`space: "px"`).
15
+ * - iOS `idb ui describe-all` frames are POINTS (`space: "points"`).
16
+ * The device de-normalizes/taps in its own space (AndroidDevice taps pixels;
17
+ * IOSDevice taps points), so the `space` tag tells the caller which dimension a
18
+ * node's bounds-center belongs to. This module never mixes the two.
19
+ *
20
+ * ANCESTOR-VS-LEAF (the hard part): on Android the visible label
21
+ * ("Network & internet") sits on a `clickable=false` TextView nested inside the
22
+ * clickable PARENT row. Tapping the leaf's center misses the row's hit logic and
23
+ * lands "slightly off"; the click target is the row. So the serializer walks to
24
+ * the nearest clickable ANCESTOR, aggregates its descendants' text/content-desc
25
+ * into ONE label, and emits the CLICKABLE node WITH THE ROW'S BOUNDS — never the
26
+ * leaf. iOS Buttons are already labeled + actionable, so they emit directly.
27
+ */
28
+ export type CoordinateSpace = "px" | "points";
29
+ export interface Bounds {
30
+ x: number;
31
+ y: number;
32
+ width: number;
33
+ height: number;
34
+ }
35
+ /**
36
+ * One parsed native a11y node. `bounds` are in `space` (Android px, iOS points).
37
+ * `clickable` marks an actionable hit target. `resourceId` is the Android
38
+ * resource-id / iOS AXUniqueId when present (diagnostic; not used for tapping).
39
+ */
40
+ export interface NativeNode {
41
+ role: string;
42
+ label: string;
43
+ bounds: Bounds;
44
+ clickable: boolean;
45
+ /** True for nodes whose own text/desc is a label (used to aggregate onto rows). */
46
+ hasOwnLabel: boolean;
47
+ resourceId?: string;
48
+ space: CoordinateSpace;
49
+ }
50
+ export interface NativeTree {
51
+ /** `[id] role "label"` lines, one per emitted actionable node. */
52
+ simplified: string;
53
+ /** shortId → bounds (in the platform's space). The device taps the center. */
54
+ nodeMap: Map<string, Bounds>;
55
+ }
56
+ /**
57
+ * Parse a uiautomator XML dump into a flat list of leaf-significant nodes in
58
+ * document order. The dump is a single line of nested `<node ...>` tags; we
59
+ * rebuild the parent/child nesting from the open/close-tag stream (mirroring the
60
+ * "break after `>`" split the oracle scripts use, but tracking depth so the
61
+ * ancestor-aggregation in `serializeNativeTree` has the real tree).
62
+ *
63
+ * Returns the FLATTENED set of nodes (depth-first, document order) with their
64
+ * raw fields; the serializer decides which to emit and how to aggregate.
65
+ */
66
+ export declare function parseUiautomatorXml(xml: string): NativeNode[];
67
+ /**
68
+ * Parse `idb ui describe-all` JSON (a FLAT array of elements, each with a `frame`
69
+ * in POINTS) into NativeNodes in array order. iOS is already a flat,
70
+ * properly-labeled list — no ancestor walk needed — so `clickable` is derived
71
+ * from the element's role/type and whether it carries a usable label.
72
+ */
73
+ export declare function parseIdbDescribeAll(json: string): NativeNode[];
74
+ /**
75
+ * Serialize a flat NativeNode list (from `parseUiautomatorXml` /
76
+ * `parseIdbDescribeAll`) into the `[id] role "label"` string the DOMLocator
77
+ * reasons over, plus a `shortId → bounds` map for local tap resolution.
78
+ *
79
+ * Emission rules (kept tight, like the DOM serializer):
80
+ * - ANCESTOR-VS-LEAF: a CLICKABLE node absorbs its descendants' labels and is
81
+ * emitted with ITS OWN bounds (the tappable row). The descendant
82
+ * label-bearing leaves are then NOT emitted on their own — their text lives
83
+ * on the row. A label-bearing leaf with NO clickable ancestor (e.g. a
84
+ * standalone heading) is emitted directly so on-screen text isn't lost.
85
+ * - Skip pure decoration: a node that is neither clickable nor label-bearing,
86
+ * and a generic/application container that didn't aggregate a label.
87
+ *
88
+ * The input list is depth-first / document order, which is the order the raw
89
+ * parsers produce; we recover ancestry from that order using bounds containment
90
+ * (Android leaves nest inside their clickable row's rect; iOS is already flat).
91
+ */
92
+ export declare function serializeNativeTree(nodes: NativeNode[]): NativeTree;
93
+ /** Center of a node's bounds — the point the device taps. */
94
+ export declare function boundsCenter(b: Bounds): {
95
+ x: number;
96
+ y: number;
97
+ };