@ishlabs/cli 0.24.1 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/commands/ask.js +3 -3
  2. package/dist/commands/doctor.d.ts +26 -0
  3. package/dist/commands/doctor.js +334 -0
  4. package/dist/commands/iteration.js +1 -1
  5. package/dist/commands/study-analyze.js +1 -1
  6. package/dist/commands/study-run.js +80 -12
  7. package/dist/commands/study.js +11 -7
  8. package/dist/index.js +2 -0
  9. package/dist/lib/alias-store.js +1 -1
  10. package/dist/lib/api-client.d.ts +2 -0
  11. package/dist/lib/docs.js +57 -42
  12. package/dist/lib/local-sim/actions.d.ts +10 -2
  13. package/dist/lib/local-sim/actions.js +18 -11
  14. package/dist/lib/local-sim/adb.d.ts +113 -0
  15. package/dist/lib/local-sim/adb.js +366 -0
  16. package/dist/lib/local-sim/android.d.ts +111 -0
  17. package/dist/lib/local-sim/android.js +504 -0
  18. package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
  19. package/dist/lib/local-sim/apk-manifest.js +210 -0
  20. package/dist/lib/local-sim/browser.d.ts +22 -0
  21. package/dist/lib/local-sim/browser.js +65 -0
  22. package/dist/lib/local-sim/coordinates.d.ts +69 -0
  23. package/dist/lib/local-sim/coordinates.js +59 -0
  24. package/dist/lib/local-sim/device.d.ts +143 -0
  25. package/dist/lib/local-sim/device.js +152 -0
  26. package/dist/lib/local-sim/ios.d.ts +185 -0
  27. package/dist/lib/local-sim/ios.js +599 -0
  28. package/dist/lib/local-sim/loop.d.ts +14 -2
  29. package/dist/lib/local-sim/loop.js +168 -73
  30. package/dist/lib/local-sim/native-a11y.d.ts +111 -0
  31. package/dist/lib/local-sim/native-a11y.js +419 -0
  32. package/dist/lib/local-sim/simctl.d.ts +55 -0
  33. package/dist/lib/local-sim/simctl.js +144 -0
  34. package/dist/lib/local-sim/types.d.ts +39 -2
  35. package/dist/lib/local-sim/upload.d.ts +1 -1
  36. package/dist/lib/local-sim/upload.js +9 -6
  37. package/dist/lib/local-sim/xcuitest.d.ts +60 -0
  38. package/dist/lib/local-sim/xcuitest.js +303 -0
  39. package/dist/lib/output.js +58 -12
  40. package/dist/lib/paths.d.ts +8 -0
  41. package/dist/lib/paths.js +12 -0
  42. package/dist/lib/skill-content.js +10 -9
  43. package/package.json +2 -1
@@ -0,0 +1,599 @@
1
+ /**
2
+ * IOSDevice — drives a local iOS simulator via `xcrun simctl` (lifecycle +
3
+ * screenshot) and WebDriverAgent/XCUITest (UI + a11y; see xcuitest.ts),
4
+ * implementing the SimulationDevice surface the loop expects. Mirrors
5
+ * AndroidDevice; the one substantive difference is the coordinate space.
6
+ *
7
+ * Two resolution paths, mirroring the browser:
8
+ * - ELEMENT (preferred): observe() reads WDA's `/source` a11y tree, serializes
9
+ * it to the `[id] role "label"` string the backend DOMLocator reasons over,
10
+ * and keeps a local `shortId → bounds` map (bounds in POINTS). The backend
11
+ * returns a `node_id`; executeAction() looks the bounds up and taps the
12
+ * element's CENTER.
13
+ * - VISION (fallback): when the tree is empty/sparse, observe() returns an
14
+ * empty tree so the backend takes its vision branch and returns NORMALIZED
15
+ * 0-1000 coordinates. Also taken per-action whenever node_id is absent.
16
+ *
17
+ * COORDINATE SPACE — two spaces, the key difference from Android (where
18
+ * screencap and tap share one pixel space):
19
+ * `simctl io booted screenshot` is in PIXELS (e.g. 1179x2556 @3x), but
20
+ * WDA taps/swipes AND the `/source` a11y frames are POINTS (393x852).
21
+ * The invariant in BOTH paths: TAP in points, RECORD in pixels, because the
22
+ * loop re-normalizes the recorded coord against dimensions() (PIXELS).
23
+ * - VISION: tap pt = round(n/1000 * pointSize); record px = round(n/1000 * pixelSize).
24
+ * - ELEMENT: tap = bounds-center (already POINTS); record = that center
25
+ * scaled POINTS→PIXELS via pointToPixel() (the @Nx scale).
26
+ * dimensions() returns the PIXEL size, so the loop re-normalizes the recorded
27
+ * px back to a stable 0-1000. Recording in points would drift: the point grid
28
+ * (393) is coarser than the 0-1000 grid, so a points round-trip double-rounds
29
+ * (500→197→501). Pixels (1179 > 1000) are finer → identity. The vision model
30
+ * is resolution-independent (0-1000 is a fraction of the image), so the
31
+ * backend never converts coords with screen_width/height.
32
+ */
33
+ import { resolveTextValue } from "./actions.js";
34
+ import { requireOneBootedSimulator, screenshotPng, terminateApp, launchApp, installApp, isAppInstalled, bundleIdFromApp, } from "./simctl.js";
35
+ // iOS UI interaction + a11y run through WebDriverAgent (XCUITest), not idb.
36
+ import { ensureWda, closeWda, describeScreen, describeAll, uiTap, uiLongPress, uiSwipe, uiText, uiKey, HID_KEY_RETURN, } from "./xcuitest.js";
37
+ import { isLocalPath } from "../upload.js";
38
+ import { deNormalizePoint, deNormalizeDrag, pointToPixel } from "./coordinates.js";
39
+ import { parseXcuiHierarchy, serializeNativeTree, boundsCenter } from "./native-a11y.js";
40
+ // Let animations/transitions settle before the next observation so the
41
+ // screenshot the LLM reasons over reflects the action's result.
42
+ const POST_GESTURE_SETTLE_MS = 500;
43
+ // Leading nav-bar labels that are NOT a back affordance — used to keep
44
+ // navigate_back's back-button resolver from tapping a destructive/wrong
45
+ // control (Cancel/Close discard work; Edit/Done/Add/Save/Menu are actions,
46
+ // not navigation). A stock back button is labeled with the parent screen's
47
+ // title, so it never collides with this set.
48
+ const NON_BACK_LEADING_LABELS = new Set([
49
+ "cancel",
50
+ "close",
51
+ "done",
52
+ "edit",
53
+ "add",
54
+ "save",
55
+ "menu",
56
+ ]);
57
+ async function settle(ms = POST_GESTURE_SETTLE_MS) {
58
+ await new Promise((r) => setTimeout(r, ms));
59
+ }
60
+ export class IOSDevice {
61
+ contextValues;
62
+ log;
63
+ bundleId;
64
+ appPath;
65
+ /** udid of the single booted simulator we drive. */
66
+ udid = "";
67
+ /** Set once the WebDriverAgent runner is up, so the startup note logs once. */
68
+ wdaStarted = false;
69
+ /** POINT size — what WDA taps/swipes consume (de-normalization basis for TAPS). */
70
+ pointWidth = 0;
71
+ pointHeight = 0;
72
+ /**
73
+ * PIXEL size — the screenshot resolution and the RECORDED coord space.
74
+ * Recording in pixels (not points) keeps the loop's round-trip exact: the
75
+ * point grid (e.g. 393) is coarser than the 0-1000 normalized grid, so a
76
+ * points round-trip double-rounds and drifts; pixels (e.g. 1179 > 1000) are
77
+ * finer, so de-normalize-then-re-normalize is an identity.
78
+ */
79
+ pixelWidth = 0;
80
+ pixelHeight = 0;
81
+ /**
82
+ * shortId → bounds (POINTS — WDA `/source` a11y frames) from the last observe(),
83
+ * the local counterpart of BrowserDevice.lastTreeData. executeAction()
84
+ * resolves a backend `node_id` against this; the bounds-center is the POINT
85
+ * tap target (recorded in pixels via pointToPixel).
86
+ */
87
+ lastNodeMap = new Map();
88
+ constructor(opts) {
89
+ this.contextValues = opts.contextValues;
90
+ this.log = opts.log ?? (() => { });
91
+ this.bundleId = opts.bundleId ?? null;
92
+ this.appPath = opts.appPath;
93
+ }
94
+ async launchOrReset(target) {
95
+ this.udid = await requireOneBootedSimulator();
96
+ // First call: install the .app (if --app is a local path) and resolve the
97
+ // bundle id to terminate/relaunch on. `target` is the iteration's platform
98
+ // target (a bundle id) when no --app is supplied. Throws (rather than
99
+ // silently driving the foreground) if the bundle id can't be resolved.
100
+ if (!this.bundleId) {
101
+ this.bundleId = await this.resolveBundleId(target);
102
+ }
103
+ const bundleId = this.bundleId;
104
+ // Bring up the WebDriverAgent runner (install + simctl-launch the prebuilt
105
+ // xctrunner, open a session). Idempotent and reused across participants, so
106
+ // the ~30-60s first-launch cost is paid once per run.
107
+ if (!this.wdaStarted) {
108
+ this.log("Starting the iOS automation runner (WebDriverAgent); first launch can take ~30-60s...");
109
+ }
110
+ await ensureWda(this.udid);
111
+ this.wdaStarted = true;
112
+ // Prime screen geometry (points) before the first de-normalization.
113
+ await this.refreshScreen();
114
+ // Per-participant reset: terminate then relaunch from a clean state.
115
+ await terminateApp(this.udid, bundleId);
116
+ await launchApp(this.udid, bundleId);
117
+ await settle(1500); // cold start needs longer than a gesture settle
118
+ }
119
+ /**
120
+ * Resolve the bundle id to drive, returning a non-null id or throwing.
121
+ * Installs a local `.app` first and reads its CFBundleIdentifier from
122
+ * Info.plist (no list-diff needed — a .app carries its id). A non-.app local
123
+ * value is treated as an already-installed bundle id.
124
+ */
125
+ async resolveBundleId(target) {
126
+ const appSpec = this.appPath ?? (target && target.trim() ? target.trim() : null);
127
+ if (!appSpec) {
128
+ throw new Error("No app to drive: pass --app <path-to.app | installed.bundle.id>, or set the iteration's " +
129
+ "platform target to an installed bundle id.");
130
+ }
131
+ // `isLocalPath` returns false for http(s):// and throws on other schemes.
132
+ const local = isLocalPath(appSpec);
133
+ if (!local) {
134
+ throw new Error(`--app received a URL (${appSpec}). Installing a hosted .app on the simulator is not supported yet — ` +
135
+ `pass a local .app path or an already-installed bundle id.`);
136
+ }
137
+ if (appSpec.toLowerCase().endsWith(".app")) {
138
+ const id = await bundleIdFromApp(appSpec);
139
+ if (!id) {
140
+ throw new Error(`Could not read CFBundleIdentifier from "${appSpec}/Info.plist". ` +
141
+ `Pass --app <bundle.id> explicitly if the .app layout is unusual.`);
142
+ }
143
+ this.log(`Installing ${appSpec} (${id})...`);
144
+ await installApp(this.udid, appSpec);
145
+ return id;
146
+ }
147
+ // Local non-.app value: treat as an installed bundle id.
148
+ if (await isAppInstalled(this.udid, appSpec)) {
149
+ return appSpec;
150
+ }
151
+ throw new Error(`App "${appSpec}" is not installed on the simulator and is not a local .app path. ` +
152
+ `Pass --app <path-to.app> to install it, or install it first.`);
153
+ }
154
+ async refreshScreen() {
155
+ const screen = await describeScreen(this.udid);
156
+ this.pointWidth = screen.pointWidth;
157
+ this.pointHeight = screen.pointHeight;
158
+ this.pixelWidth = screen.pixelWidth;
159
+ this.pixelHeight = screen.pixelHeight;
160
+ return screen;
161
+ }
162
+ async observe() {
163
+ // Refresh geometry each step (orientation can change), then capture the
164
+ // pixel screenshot and the a11y tree in parallel (independent reads). The
165
+ // dump is wrapped so a failure degrades to the vision path (empty tree).
166
+ await this.refreshScreen();
167
+ const [png, tree] = await Promise.all([
168
+ screenshotPng(),
169
+ this.dumpTree(),
170
+ ]);
171
+ this.lastNodeMap = tree.nodeMap;
172
+ return {
173
+ screenshot: png.toString("base64"),
174
+ // Element path when describe-all produced a tree; "" → backend vision.
175
+ accessibilityTree: tree.simplified,
176
+ url: "",
177
+ // PIXELS — match dimensions() and the pixel screenshot we send, so the
178
+ // loop's coordinate round-trip is exact (see dimensions()/toPixels()).
179
+ width: this.pixelWidth,
180
+ height: this.pixelHeight,
181
+ // Native has no scrollable document; the screen IS the page.
182
+ documentHeight: this.pixelHeight,
183
+ tabs: [],
184
+ };
185
+ }
186
+ /**
187
+ * Read + serialize WDA's /source a11y tree (bounds in POINTS). Any
188
+ * failure (retries exhausted on a trivial tree, parse error) degrades to an
189
+ * empty tree so the backend falls back to vision — a missing tree must never
190
+ * abort the observation.
191
+ */
192
+ async dumpTree() {
193
+ try {
194
+ const json = await describeAll(this.udid);
195
+ const nodes = parseXcuiHierarchy(json);
196
+ const tree = serializeNativeTree(nodes);
197
+ this.log(`a11y tree: ${tree.nodeMap.size} node(s)`);
198
+ return tree;
199
+ }
200
+ catch (err) {
201
+ const msg = err instanceof Error ? err.message : String(err);
202
+ this.log(`a11y describe-all failed, falling back to vision: ${msg}`);
203
+ return { simplified: "", nodeMap: new Map() };
204
+ }
205
+ }
206
+ async captureScreenshot() {
207
+ const png = await screenshotPng();
208
+ return png.toString("base64");
209
+ }
210
+ async captureScreenshotJpeg() {
211
+ // simctl screenshot only emits PNG. We return the PNG bytes; the upload/
212
+ // record path treats them as opaque image bytes (PDQ frame-matching works
213
+ // on PNG). The loop labels native uploads image/png.
214
+ return screenshotPng();
215
+ }
216
+ dimensions() {
217
+ // PIXELS — the space the loop re-normalizes the recorded coord against.
218
+ // Pixels (finer than the 0-1000 grid) make that round-trip exact; WDA taps
219
+ // separately in points (see toPoints()).
220
+ return { width: this.pixelWidth, height: this.pixelHeight };
221
+ }
222
+ /** Normalized 0-1000 → POINT space (WDA taps/swipes take points). */
223
+ toPoints(c) {
224
+ return deNormalizePoint(c, this.pointWidth, this.pointHeight);
225
+ }
226
+ /** Normalized 0-1000 → PIXEL space (the recorded/reported coord). */
227
+ toPixels(c) {
228
+ return deNormalizePoint(c, this.pixelWidth, this.pixelHeight);
229
+ }
230
+ /**
231
+ * Resolve the POINT tap target + PIXEL record coord for a positional action.
232
+ * ELEMENT path (node_id): the bounds-center is the POINT tap; the recorded
233
+ * pixel coord is that center scaled POINTS→PIXELS so it round-trips against
234
+ * dimensions() (pixels). VISION path: de-normalize the 0-1000 coord into both
235
+ * spaces. Returns {stale:true} for a node_id with no bounds (tree moved); the
236
+ * caller fails the action so the loop forwards DOM_ELEMENT_NOT_FOUND.
237
+ */
238
+ resolveTarget(action) {
239
+ if (action.node_id) {
240
+ const bounds = this.lastNodeMap.get(action.node_id);
241
+ if (!bounds)
242
+ return { pt: null, px: null, stale: true };
243
+ const pt = boundsCenter(bounds); // POINTS — WDA taps directly
244
+ const px = pointToPixel(pt, this.pointWidth, this.pointHeight, this.pixelWidth, this.pixelHeight);
245
+ return { pt, px, stale: false };
246
+ }
247
+ const pt = action.coordinates ? this.toPoints(action.coordinates) : null;
248
+ const px = action.coordinates ? this.toPixels(action.coordinates) : null;
249
+ return { pt, px, stale: false };
250
+ }
251
+ async executeAction(action) {
252
+ try {
253
+ // pt drives the WDA TAP (points); px is what we RECORD (pixels). ELEMENT
254
+ // path: pt = bounds-center, px = that center scaled to pixels. VISION
255
+ // path: both derive from the same normalized coord. Either way the tap
256
+ // lands right and the recorded px round-trips against dimensions().
257
+ const resolved = this.resolveTarget(action);
258
+ if (resolved.stale)
259
+ return this.failStaleNode(action);
260
+ const { pt, px } = resolved;
261
+ switch (action.type) {
262
+ case "tap":
263
+ case "double_tap": {
264
+ if (!pt)
265
+ return this.failNoCoords(action);
266
+ const count = action.type === "double_tap" ? 2 : action.count ?? 1;
267
+ for (let i = 0; i < count; i++)
268
+ await uiTap(this.udid, pt.x, pt.y);
269
+ break;
270
+ }
271
+ case "long_press": {
272
+ if (!pt)
273
+ return this.failNoCoords(action);
274
+ await uiLongPress(this.udid, pt.x, pt.y, action.duration_ms ?? 600);
275
+ break;
276
+ }
277
+ case "text_input": {
278
+ await this.typeText(action, pt);
279
+ break;
280
+ }
281
+ case "scroll": {
282
+ await this.scroll(action);
283
+ break;
284
+ }
285
+ case "swipe":
286
+ case "pull_to_refresh": {
287
+ await this.swipe(action.direction ?? (action.type === "pull_to_refresh" ? "down" : "up"));
288
+ break;
289
+ }
290
+ case "navigate_back": {
291
+ // iOS has no hardware back; navigateBack() taps the nav-bar back button and falls back to a left-edge swipe.
292
+ await this.navigateBack();
293
+ break;
294
+ }
295
+ case "open_system_panel": {
296
+ // Element-less, like navigate_back: best-effort top-edge pull-down.
297
+ await this.openSystemPanel(action.panel === "quick_settings" ? "quick_settings" : "notifications");
298
+ break;
299
+ }
300
+ case "drag": {
301
+ // A drag GRABS an element and RELEASES it elsewhere ("click the
302
+ // element, move, let go") — distinct from a swipe (element-less
303
+ // directional). Press the resolved element center (pt — the same
304
+ // element path a tap uses, in POINTS), move to the drop point,
305
+ // release. Record the grab point→PIXELS so it round-trips against
306
+ // dimensions() (pixels).
307
+ const recorded = await this.drag(action, pt);
308
+ if (!recorded)
309
+ return this.failNoCoords(action);
310
+ await settle();
311
+ return {
312
+ success: true,
313
+ elementName: action.element_name,
314
+ coordinates: recorded,
315
+ openedNewTab: false,
316
+ };
317
+ }
318
+ case "wait": {
319
+ await settle(action.duration_ms ?? 1000);
320
+ break;
321
+ }
322
+ case "think": {
323
+ // Reasoning-only: no device interaction.
324
+ return { success: true, elementName: action.element_name, coordinates: null, openedNewTab: false };
325
+ }
326
+ case "pinch_zoom":
327
+ case "rotate_device":
328
+ case "keyboard_shortcut":
329
+ case "switch_tab":
330
+ case "close_tab": {
331
+ // Capabilities the single-app iOS driver genuinely can't perform:
332
+ // true multi-touch pinch, sensor rotation (not wired on this driver),
333
+ // browser tabs, desktop keyboard shortcuts. Fail LOUDLY (not silently)
334
+ // so the loop forwards it and the agent can adapt.
335
+ return this.failUnsupported(action);
336
+ }
337
+ default: {
338
+ this.log(`Unknown native action: ${action.type}`);
339
+ return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
340
+ }
341
+ }
342
+ await settle();
343
+ return {
344
+ success: true,
345
+ elementName: action.element_name,
346
+ // Report PIXEL coords so the loop re-normalizes them against the pixel
347
+ // dimensions() (points would drift since 393 < 1000). The tap itself
348
+ // used points — either the de-normalized 0-1000 (vision) or the
349
+ // bounds-center (element); px is the matching pixel coord for each.
350
+ coordinates: px,
351
+ openedNewTab: false,
352
+ };
353
+ }
354
+ catch (err) {
355
+ const msg = err instanceof Error ? err.message : String(err);
356
+ this.log(`Action ${action.type} failed: ${msg}`);
357
+ return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
358
+ }
359
+ }
360
+ failNoCoords(action) {
361
+ // The backend couldn't vision-locate a target (coordinates=null). Skip the
362
+ // action (don't crash, don't silently succeed) and surface it like the
363
+ // browser path's unresolved-element case — success:false makes the loop
364
+ // push a DOM_ELEMENT_NOT_FOUND forward so the LLM learns the target missed.
365
+ const target = action.element_description || action.element_name || "(no description)";
366
+ this.log(`Skipping native action with no resolved target: ${action.type} ${target}`);
367
+ return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
368
+ }
369
+ failStaleNode(action) {
370
+ // The backend resolved a node_id that's no longer in the latest tree (the
371
+ // screen changed between observe() and act). Fail like the browser's
372
+ // unresolved-element case — success:false forwards DOM_ELEMENT_NOT_FOUND so
373
+ // the loop re-observes and the agent retries against a fresh tree.
374
+ const target = action.element_description || action.element_name || action.node_id || "(unknown)";
375
+ this.log(`Stale node_id "${action.node_id}" not in current a11y tree: ${action.type} ${target}`);
376
+ return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
377
+ }
378
+ async typeText(action, pt) {
379
+ // Focus the field first if the model gave a target.
380
+ if (pt) {
381
+ await uiTap(this.udid, pt.x, pt.y);
382
+ await settle(250);
383
+ }
384
+ const text = resolveTextValue(action, this.contextValues);
385
+ // WDA text input appends to the focused field; for click_type (replace) there
386
+ // is no "clear" helper in use here, so we rely on the field being empty after focus. The
387
+ // vision agent typically taps an empty field, so this matches Android's
388
+ // common path; a true select-all clear isn't available through the helpers imported here.
389
+ if (text)
390
+ await uiText(this.udid, text);
391
+ if (action.submit) {
392
+ await settle(150);
393
+ await uiKey(this.udid, HID_KEY_RETURN);
394
+ }
395
+ }
396
+ async scroll(action) {
397
+ const w = this.pointWidth;
398
+ const h = this.pointHeight;
399
+ const cx = Math.round(w / 2);
400
+ const amountMap = {
401
+ small: 0.25, medium: 0.45, large: 0.7, extra_large: 0.9,
402
+ };
403
+ const frac = amountMap[action.amount ?? "medium"] ?? 0.45;
404
+ const dist = Math.round(h * frac);
405
+ const mid = Math.round(h / 2);
406
+ switch (action.direction) {
407
+ case "up":
408
+ // Reveal content above: swipe finger downward.
409
+ await uiSwipe(this.udid, cx, mid - dist / 2, cx, mid + dist / 2);
410
+ break;
411
+ case "to_top":
412
+ await uiSwipe(this.udid, cx, Math.round(h * 0.2), cx, Math.round(h * 0.9), 400);
413
+ break;
414
+ case "to_bottom":
415
+ await uiSwipe(this.udid, cx, Math.round(h * 0.9), cx, Math.round(h * 0.2), 400);
416
+ break;
417
+ case "down":
418
+ case "to_element":
419
+ default:
420
+ // Reveal content below: swipe finger upward.
421
+ await uiSwipe(this.udid, cx, mid + dist / 2, cx, mid - dist / 2);
422
+ break;
423
+ }
424
+ }
425
+ async swipe(direction) {
426
+ const w = this.pointWidth;
427
+ const h = this.pointHeight;
428
+ const cx = Math.round(w / 2);
429
+ const cy = Math.round(h / 2);
430
+ const d = Math.round(h * 0.4);
431
+ const dx = Math.round(w * 0.4);
432
+ switch (direction) {
433
+ case "up":
434
+ await uiSwipe(this.udid, cx, cy + d / 2, cx, cy - d / 2);
435
+ break;
436
+ case "down":
437
+ await uiSwipe(this.udid, cx, cy - d / 2, cx, cy + d / 2);
438
+ break;
439
+ case "left":
440
+ await uiSwipe(this.udid, cx + dx / 2, cy, cx - dx / 2, cy);
441
+ break;
442
+ case "right":
443
+ await uiSwipe(this.udid, cx - dx / 2, cy, cx + dx / 2, cy);
444
+ break;
445
+ default:
446
+ await uiSwipe(this.udid, cx, cy + d / 2, cx, cy - d / 2);
447
+ break;
448
+ }
449
+ }
450
+ /**
451
+ * Perform a drag: press the GRABBED element, move to the drop point, release.
452
+ * A drag is "click an element and let it go", so the press lands element-
453
+ * center (the resolved `grab` in POINTS — node_id bounds center, or the
454
+ * vision coordinate when the tree is blind), NOT the backend's vision-
455
+ * estimated start. The release point is the drag END (drag.endX/endY). A
456
+ * ~0.8s uiSwipe reads as a drag, not a flick. Returns the grab point scaled
457
+ * to PIXELS (pointToPixel) to record so it round-trips against dimensions()
458
+ * (pixels), or null if there's no end to drag toward.
459
+ *
460
+ * idb LIMITATION: `idb ui swipe` only exposes --duration/--delta — it has no
461
+ * press-and-HOLD-then-move primitive (unlike Android's `input draganddrop`).
462
+ * So this drives the immediate-drag surfaces (sliders, drag-to-dismiss, drag
463
+ * handles that pick up on touch-move) but does NOT trigger a long-press
464
+ * pickup (home-screen jiggle mode, in-app reorder that needs a hold first) —
465
+ * verified on-device: a long uiSwipe leaves home-screen icons unmoved. The
466
+ * grab/release SEMANTICS are still correct; the gap is purely the missing
467
+ * hold, which idb can't perform in one continuous gesture.
468
+ */
469
+ async drag(action, grab) {
470
+ if (!action.drag)
471
+ return null;
472
+ const { start, end } = deNormalizeDrag(action.drag, this.pointWidth, this.pointHeight); // POINTS
473
+ // Grab the resolved element center; fall back to the backend's own start
474
+ // only when nothing resolved (no node_id and no vision coordinate).
475
+ const press = grab ?? start;
476
+ await uiSwipe(this.udid, press.x, press.y, end.x, end.y, 800);
477
+ return pointToPixel(press, this.pointWidth, this.pointHeight, this.pixelWidth, this.pixelHeight);
478
+ }
479
+ /**
480
+ * iOS has no hardware back. The system interactive-pop (left-edge swipe) is
481
+ * NOT reliably triggerable through idb's synthetic touch — verified on the
482
+ * simulator: no edge-swipe variant (start x, travel, duration, delta) pops
483
+ * the view. So we resolve and TAP the nav-bar back button instead: iOS HIG
484
+ * places "back" as the LEADING (leftmost) button in the top nav bar of any
485
+ * pushed view, so the leftmost button in the nav-bar band is it — verified to
486
+ * pop a Settings sub-screen back to root. The left-edge swipe remains a
487
+ * best-effort fallback for real devices (where idb sends real HID events that
488
+ * do drive the system gesture) when no back button is visible.
489
+ */
490
+ async navigateBack() {
491
+ const nodes = parseXcuiHierarchy(await describeAll(this.udid));
492
+ const back = this.findBackButton(nodes);
493
+ if (back) {
494
+ const c = boundsCenter(back.bounds); // POINTS — WDA taps directly
495
+ await uiTap(this.udid, c.x, c.y);
496
+ return;
497
+ }
498
+ // No nav-bar back button (root screen, or a custom chrome): fall back to the
499
+ // system edge-swipe — works on real devices, a no-op on the simulator.
500
+ this.log("navigate_back: no nav-bar back button found; trying left-edge swipe");
501
+ const midY = Math.round(this.pointHeight / 2);
502
+ await uiSwipe(this.udid, 1, midY, Math.round(this.pointWidth * 0.5), midY, 300);
503
+ }
504
+ /**
505
+ * Best-effort open of an iOS system panel by swiping down from the top edge.
506
+ * iOS has no `cmd statusbar` equivalent, so on a Face-ID layout:
507
+ * - notifications → Notification Center: swipe down from the top-CENTER.
508
+ * - quick_settings → Control Center: swipe down from the top-RIGHT corner.
509
+ * Coordinates are POINTS (WDA consumes points; see toPoints()/the swipe()
510
+ * helper). This is FLAKY on the simulator — synthetic touch frequently
511
+ * doesn't trigger the system edge gesture (the same limitation navigateBack's
512
+ * edge-swipe hits). We compare a before/after screenshot and log LOUDLY when
513
+ * the screen didn't change, rather than silently reporting success, so a
514
+ * no-op is visible in the run. The executeAction caller still returns
515
+ * success:true (the gesture was attempted); the loud log is the signal.
516
+ */
517
+ async openSystemPanel(panel) {
518
+ const before = await screenshotPng();
519
+ const w = this.pointWidth;
520
+ const h = this.pointHeight;
521
+ // Start ON the top edge and travel a third of the screen down. Control
522
+ // Center lives under the top-right (battery/status) corner on Face-ID
523
+ // devices; Notification Center under the top-center notch area.
524
+ const startX = panel === "quick_settings" ? Math.round(w * 0.92) : Math.round(w * 0.5);
525
+ const startY = 1;
526
+ const endY = Math.round(h * 0.35);
527
+ await uiSwipe(this.udid, startX, startY, startX, endY, 350);
528
+ await settle();
529
+ // Loudly surface a no-op: the simulator's synthetic touch often can't drive
530
+ // the system edge gesture. An identical screenshot means the panel didn't open.
531
+ const after = await screenshotPng();
532
+ if (before.equals(after)) {
533
+ this.log(`open_system_panel (${panel}): top-edge swipe produced no visible change — ` +
534
+ `the simulator's synthetic touch likely didn't trigger the system gesture (flaky on the simulator).`);
535
+ }
536
+ }
537
+ /**
538
+ * The nav-bar back button: the leading (leftmost) actionable button in the
539
+ * top nav-bar band. iOS HIG guarantees "back" is the leading nav item in a
540
+ * pushed view, so the leftmost button high on the screen is it. Returns null
541
+ * on root screens (no leading back item) so the caller can fall back.
542
+ *
543
+ * The geometry alone (leftmost-top) would mis-fire on a modal whose LEADING
544
+ * item is Cancel/Close, or a root with a leading Edit/menu — and tapping
545
+ * Cancel/Close can DISCARD work. A stock back button is labeled with the
546
+ * PARENT screen's title (e.g. "Settings"), not "Back", so there's no reliable
547
+ * positive label signal; instead we exclude the known non-back leading
548
+ * labels. If every leading button is one of those, we return null and let the
549
+ * caller fall back rather than tap a destructive control.
550
+ *
551
+ * Known limitation: a glyph-only leading button with NO accessible label
552
+ * (e.g. a hamburger/avatar/logo) isn't in the deny-list, so on a screen whose
553
+ * leading control is an unlabeled non-back icon this can tap the wrong control
554
+ * (silently — it returns success). Acceptable for the common case (stock nav
555
+ * bars have a labeled back button), but it's why pushed views, not root/menu
556
+ * screens, are where navigate_back is reliable.
557
+ */
558
+ findBackButton(nodes) {
559
+ const navBandBottom = this.pointHeight * 0.15;
560
+ const leftZone = this.pointWidth * 0.3;
561
+ const candidates = nodes.filter((n) => n.role === "button" &&
562
+ n.clickable &&
563
+ n.bounds.width > 0 &&
564
+ n.bounds.height > 0 &&
565
+ n.bounds.y < navBandBottom &&
566
+ n.bounds.x < leftZone &&
567
+ !NON_BACK_LEADING_LABELS.has(n.label.trim().toLowerCase()));
568
+ if (candidates.length === 0)
569
+ return null;
570
+ // Leftmost wins; tie-break by topmost.
571
+ candidates.sort((a, b) => a.bounds.x - b.bounds.x || a.bounds.y - b.bounds.y);
572
+ return candidates[0];
573
+ }
574
+ failUnsupported(action) {
575
+ // A capability the single-app iOS driver genuinely can't perform. Fail with
576
+ // a clear, diagnosable reason — NOT a silent false — so it's visible WHY and
577
+ // the agent can adapt.
578
+ const hint = {
579
+ pinch_zoom: "no multi-touch on the native driver; the agent should zoom via double_tap",
580
+ rotate_device: "rotation is not wired on the native driver; leave orientation as-is",
581
+ keyboard_shortcut: "no hardware keyboard on the native driver; use on-screen taps/text_input",
582
+ switch_tab: "tabs are a browser concept; the native app has a single window",
583
+ close_tab: "tabs are a browser concept; the native app has a single window",
584
+ };
585
+ this.log(`${action.type} not supported by the iOS native driver — ${hint[action.type] ?? "no native equivalent"}`);
586
+ return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
587
+ }
588
+ currentUrl() {
589
+ // Native has no URL; recording stores "" (current_location comes from the
590
+ // backend's reasoning output, not the device).
591
+ return "";
592
+ }
593
+ async close() {
594
+ // Tear down the WebDriverAgent session (the runner is left installed on the
595
+ // shared simulator for the next run). The app resets via launchOrReset; no
596
+ // IME state to restore on iOS.
597
+ await closeWda(this.udid);
598
+ }
599
+ }
@@ -1,10 +1,12 @@
1
1
  /**
2
2
  * Local simulation loop orchestrator.
3
3
  *
4
- * Runs the observe → reason (remote) → act (local) loop for each
5
- * participant against a local Playwright browser.
4
+ * Runs the observe → reason (remote) → act (local) loop for each participant
5
+ * against a SimulationDevice (a Playwright browser today; a native Android
6
+ * emulator next). The loop is device-agnostic — see device.ts.
6
7
  */
7
8
  import type { ApiClient } from "../api-client.js";
9
+ import type { LocalStepAction } from "./types.js";
8
10
  export interface DebugStep {
9
11
  step: number;
10
12
  assignmentName: string;
@@ -35,6 +37,14 @@ export interface DebugStep {
35
37
  assignmentCompleted: boolean;
36
38
  effortSeconds: number;
37
39
  }
40
+ /**
41
+ * Convert a raw action (from either resolved_actions or output.action.actions)
42
+ * into the flat LocalStepAction shape used by the executor. Exported for unit
43
+ * tests of the native drag coordinate-shape split (the nested action's
44
+ * `coordinates` is a {x,y} tap point for most actions but a
45
+ * {startX,...,endY} path for a drag).
46
+ */
47
+ export declare function flattenAction(raw: Record<string, unknown>, nodeId?: string | null, nodeDescription?: string | null): LocalStepAction;
38
48
  export interface LocalSimRunOptions {
39
49
  workspaceId: string;
40
50
  studyId: string;
@@ -52,6 +62,8 @@ export interface LocalSimRunOptions {
52
62
  json?: boolean;
53
63
  debug?: boolean;
54
64
  parallel?: number;
65
+ platform?: string;
66
+ appPath?: string;
55
67
  }
56
68
  /**
57
69
  * Run local simulations — parallel when multiple participants, sequential by default.