@ishlabs/cli 0.24.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/ask.js +3 -3
- package/dist/commands/iteration.js +1 -1
- package/dist/commands/study-analyze.js +1 -1
- package/dist/commands/study-run.js +80 -12
- package/dist/commands/study.js +11 -7
- package/dist/lib/alias-store.js +1 -1
- package/dist/lib/api-client.d.ts +2 -0
- package/dist/lib/docs.js +57 -42
- package/dist/lib/local-sim/actions.d.ts +10 -2
- package/dist/lib/local-sim/actions.js +16 -11
- package/dist/lib/local-sim/adb.d.ts +103 -0
- package/dist/lib/local-sim/adb.js +352 -0
- package/dist/lib/local-sim/android.d.ts +111 -0
- package/dist/lib/local-sim/android.js +499 -0
- package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
- package/dist/lib/local-sim/apk-manifest.js +210 -0
- package/dist/lib/local-sim/browser.d.ts +22 -0
- package/dist/lib/local-sim/browser.js +65 -0
- package/dist/lib/local-sim/coordinates.d.ts +69 -0
- package/dist/lib/local-sim/coordinates.js +59 -0
- package/dist/lib/local-sim/device.d.ts +143 -0
- package/dist/lib/local-sim/device.js +152 -0
- package/dist/lib/local-sim/ios.d.ts +168 -0
- package/dist/lib/local-sim/ios.js +546 -0
- package/dist/lib/local-sim/loop.d.ts +14 -2
- package/dist/lib/local-sim/loop.js +166 -73
- package/dist/lib/local-sim/native-a11y.d.ts +97 -0
- package/dist/lib/local-sim/native-a11y.js +384 -0
- package/dist/lib/local-sim/simctl.d.ts +85 -0
- package/dist/lib/local-sim/simctl.js +273 -0
- package/dist/lib/local-sim/types.d.ts +37 -2
- package/dist/lib/local-sim/upload.d.ts +1 -1
- package/dist/lib/local-sim/upload.js +9 -6
- package/dist/lib/output.js +58 -12
- package/dist/lib/skill-content.js +10 -9
- package/package.json +2 -1
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* IOSDevice — drives a local iOS simulator via `xcrun simctl` + `idb`,
|
|
3
|
+
* implementing the SimulationDevice surface the loop expects. Mirrors
|
|
4
|
+
* AndroidDevice; the one substantive difference is the coordinate space.
|
|
5
|
+
*
|
|
6
|
+
* Two resolution paths, mirroring the browser:
|
|
7
|
+
* - ELEMENT (preferred): observe() reads the `idb ui describe-all` a11y tree,
|
|
8
|
+
* serializes it to the `[id] role "label"` string the backend DOMLocator
|
|
9
|
+
* reasons over, and keeps a local `shortId → bounds` map (bounds in POINTS).
|
|
10
|
+
* The backend returns a `node_id`; executeAction() looks the bounds up and
|
|
11
|
+
* taps the element's CENTER.
|
|
12
|
+
* - VISION (fallback): when the tree is empty/sparse, observe() returns an
|
|
13
|
+
* empty tree so the backend takes its vision branch and returns NORMALIZED
|
|
14
|
+
* 0-1000 coordinates. Also taken per-action whenever node_id is absent.
|
|
15
|
+
*
|
|
16
|
+
* COORDINATE SPACE — two spaces, the key difference from Android (where
|
|
17
|
+
* screencap and tap share one pixel space):
|
|
18
|
+
* `simctl io booted screenshot` is in PIXELS (e.g. 1179x2556 @3x), but
|
|
19
|
+
* `idb ui tap/swipe` AND the `describe-all` a11y frames are POINTS (393x852).
|
|
20
|
+
* The invariant in BOTH paths: TAP in points, RECORD in pixels, because the
|
|
21
|
+
* loop re-normalizes the recorded coord against dimensions() (PIXELS).
|
|
22
|
+
* - VISION: tap pt = round(n/1000 * pointSize); record px = round(n/1000 * pixelSize).
|
|
23
|
+
* - ELEMENT: tap = bounds-center (already POINTS); record = that center
|
|
24
|
+
* scaled POINTS→PIXELS via pointToPixel() (the @Nx scale).
|
|
25
|
+
* dimensions() returns the PIXEL size, so the loop re-normalizes the recorded
|
|
26
|
+
* px back to a stable 0-1000. Recording in points would drift: the point grid
|
|
27
|
+
* (393) is coarser than the 0-1000 grid, so a points round-trip double-rounds
|
|
28
|
+
* (500→197→501). Pixels (1179 > 1000) are finer → identity. The vision model
|
|
29
|
+
* is resolution-independent (0-1000 is a fraction of the image), so the
|
|
30
|
+
* backend never converts coords with screen_width/height.
|
|
31
|
+
*/
|
|
32
|
+
import { resolveTextValue } from "./actions.js";
|
|
33
|
+
import { requireOneBootedSimulator, describeScreen, describeAll, screenshotPng, uiTap, uiLongPress, uiSwipe, uiText, uiKey, HID_KEY_RETURN, terminateApp, launchApp, installApp, isAppInstalled, bundleIdFromApp, } from "./simctl.js";
|
|
34
|
+
import { isLocalPath } from "../upload.js";
|
|
35
|
+
import { deNormalizePoint, deNormalizeDrag, pointToPixel } from "./coordinates.js";
|
|
36
|
+
import { parseIdbDescribeAll, serializeNativeTree, boundsCenter } from "./native-a11y.js";
|
|
37
|
+
/**
 * Default pause (ms) after a gesture, giving animations/transitions time to
 * finish so the next screenshot reflects the result of the action.
 */
const POST_GESTURE_SETTLE_MS = 500;
/**
 * Lower-cased labels of leading nav-bar buttons that must NOT be treated as a
 * "back" affordance by navigate_back's back-button resolver: Cancel/Close
 * discard work, and Edit/Done/Add/Save/Menu are actions rather than
 * navigation. A stock back button carries the parent screen's title, so it
 * does not collide with this deny-list.
 */
const NON_BACK_LEADING_LABELS = new Set(["cancel", "close", "done", "edit", "add", "save", "menu"]);
|
|
54
|
+
/**
 * Pause for `ms` milliseconds (defaults to POST_GESTURE_SETTLE_MS) so the UI
 * can settle before the caller continues.
 */
async function settle(ms = POST_GESTURE_SETTLE_MS) {
    await new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
|
|
57
|
+
export class IOSDevice {
|
|
58
|
+
contextValues;
|
|
59
|
+
log;
|
|
60
|
+
bundleId;
|
|
61
|
+
appPath;
|
|
62
|
+
/** udid of the single booted simulator we drive. */
|
|
63
|
+
udid = "";
|
|
64
|
+
/** POINT size — what idb ui tap/swipe consume (de-normalization basis for TAPS). */
|
|
65
|
+
pointWidth = 0;
|
|
66
|
+
pointHeight = 0;
|
|
67
|
+
/**
|
|
68
|
+
* PIXEL size — the screenshot resolution and the RECORDED coord space.
|
|
69
|
+
* Recording in pixels (not points) keeps the loop's round-trip exact: the
|
|
70
|
+
* point grid (e.g. 393) is coarser than the 0-1000 normalized grid, so a
|
|
71
|
+
* points round-trip double-rounds and drifts; pixels (e.g. 1179 > 1000) are
|
|
72
|
+
* finer, so de-normalize-then-re-normalize is an identity.
|
|
73
|
+
*/
|
|
74
|
+
pixelWidth = 0;
|
|
75
|
+
pixelHeight = 0;
|
|
76
|
+
/**
|
|
77
|
+
* shortId → bounds (POINTS — idb describe-all frames) from the last observe(),
|
|
78
|
+
* the local counterpart of BrowserDevice.lastTreeData. executeAction()
|
|
79
|
+
* resolves a backend `node_id` against this; the bounds-center is the POINT
|
|
80
|
+
* tap target (recorded in pixels via pointToPixel).
|
|
81
|
+
*/
|
|
82
|
+
lastNodeMap = new Map();
|
|
83
|
+
constructor(opts) {
|
|
84
|
+
this.contextValues = opts.contextValues;
|
|
85
|
+
this.log = opts.log ?? (() => { });
|
|
86
|
+
this.bundleId = opts.bundleId ?? null;
|
|
87
|
+
this.appPath = opts.appPath;
|
|
88
|
+
}
|
|
89
|
+
async launchOrReset(target) {
|
|
90
|
+
this.udid = await requireOneBootedSimulator();
|
|
91
|
+
// First call: install the .app (if --app is a local path) and resolve the
|
|
92
|
+
// bundle id to terminate/relaunch on. `target` is the iteration's platform
|
|
93
|
+
// target (a bundle id) when no --app is supplied. Throws (rather than
|
|
94
|
+
// silently driving the foreground) if the bundle id can't be resolved.
|
|
95
|
+
if (!this.bundleId) {
|
|
96
|
+
this.bundleId = await this.resolveBundleId(target);
|
|
97
|
+
}
|
|
98
|
+
const bundleId = this.bundleId;
|
|
99
|
+
// Prime screen geometry (points) before the first de-normalization.
|
|
100
|
+
await this.refreshScreen();
|
|
101
|
+
// Per-participant reset: terminate then relaunch from a clean state.
|
|
102
|
+
await terminateApp(this.udid, bundleId);
|
|
103
|
+
await launchApp(this.udid, bundleId);
|
|
104
|
+
await settle(1500); // cold start needs longer than a gesture settle
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Resolve the bundle id to drive, returning a non-null id or throwing.
|
|
108
|
+
* Installs a local `.app` first and reads its CFBundleIdentifier from
|
|
109
|
+
* Info.plist (no list-diff needed — a .app carries its id). A non-.app local
|
|
110
|
+
* value is treated as an already-installed bundle id.
|
|
111
|
+
*/
|
|
112
|
+
async resolveBundleId(target) {
|
|
113
|
+
const appSpec = this.appPath ?? (target && target.trim() ? target.trim() : null);
|
|
114
|
+
if (!appSpec) {
|
|
115
|
+
throw new Error("No app to drive: pass --app <path-to.app | installed.bundle.id>, or set the iteration's " +
|
|
116
|
+
"platform target to an installed bundle id.");
|
|
117
|
+
}
|
|
118
|
+
// `isLocalPath` returns false for http(s):// and throws on other schemes.
|
|
119
|
+
const local = isLocalPath(appSpec);
|
|
120
|
+
if (!local) {
|
|
121
|
+
throw new Error(`--app received a URL (${appSpec}). Installing a hosted .app on the simulator is not supported yet — ` +
|
|
122
|
+
`pass a local .app path or an already-installed bundle id.`);
|
|
123
|
+
}
|
|
124
|
+
if (appSpec.toLowerCase().endsWith(".app")) {
|
|
125
|
+
const id = await bundleIdFromApp(appSpec);
|
|
126
|
+
if (!id) {
|
|
127
|
+
throw new Error(`Could not read CFBundleIdentifier from "${appSpec}/Info.plist". ` +
|
|
128
|
+
`Pass --app <bundle.id> explicitly if the .app layout is unusual.`);
|
|
129
|
+
}
|
|
130
|
+
this.log(`Installing ${appSpec} (${id})...`);
|
|
131
|
+
await installApp(this.udid, appSpec);
|
|
132
|
+
return id;
|
|
133
|
+
}
|
|
134
|
+
// Local non-.app value: treat as an installed bundle id.
|
|
135
|
+
if (await isAppInstalled(this.udid, appSpec)) {
|
|
136
|
+
return appSpec;
|
|
137
|
+
}
|
|
138
|
+
throw new Error(`App "${appSpec}" is not installed on the simulator and is not a local .app path. ` +
|
|
139
|
+
`Pass --app <path-to.app> to install it, or install it first.`);
|
|
140
|
+
}
|
|
141
|
+
async refreshScreen() {
|
|
142
|
+
const screen = await describeScreen(this.udid);
|
|
143
|
+
this.pointWidth = screen.pointWidth;
|
|
144
|
+
this.pointHeight = screen.pointHeight;
|
|
145
|
+
this.pixelWidth = screen.pixelWidth;
|
|
146
|
+
this.pixelHeight = screen.pixelHeight;
|
|
147
|
+
return screen;
|
|
148
|
+
}
|
|
149
|
+
async observe() {
|
|
150
|
+
// Refresh geometry each step (orientation can change), then capture the
|
|
151
|
+
// pixel screenshot and the a11y tree in parallel (independent reads). The
|
|
152
|
+
// dump is wrapped so a failure degrades to the vision path (empty tree).
|
|
153
|
+
await this.refreshScreen();
|
|
154
|
+
const [png, tree] = await Promise.all([
|
|
155
|
+
screenshotPng(),
|
|
156
|
+
this.dumpTree(),
|
|
157
|
+
]);
|
|
158
|
+
this.lastNodeMap = tree.nodeMap;
|
|
159
|
+
return {
|
|
160
|
+
screenshot: png.toString("base64"),
|
|
161
|
+
// Element path when describe-all produced a tree; "" → backend vision.
|
|
162
|
+
accessibilityTree: tree.simplified,
|
|
163
|
+
url: "",
|
|
164
|
+
// PIXELS — match dimensions() and the pixel screenshot we send, so the
|
|
165
|
+
// loop's coordinate round-trip is exact (see dimensions()/toPixels()).
|
|
166
|
+
width: this.pixelWidth,
|
|
167
|
+
height: this.pixelHeight,
|
|
168
|
+
// Native has no scrollable document; the screen IS the page.
|
|
169
|
+
documentHeight: this.pixelHeight,
|
|
170
|
+
tabs: [],
|
|
171
|
+
};
|
|
172
|
+
}
|
|
173
|
+
/**
|
|
174
|
+
* Read + serialize the idb describe-all a11y tree (bounds in POINTS). Any
|
|
175
|
+
* failure (retries exhausted on a trivial tree, parse error) degrades to an
|
|
176
|
+
* empty tree so the backend falls back to vision — a missing tree must never
|
|
177
|
+
* abort the observation.
|
|
178
|
+
*/
|
|
179
|
+
async dumpTree() {
|
|
180
|
+
try {
|
|
181
|
+
const json = await describeAll(this.udid);
|
|
182
|
+
const nodes = parseIdbDescribeAll(json);
|
|
183
|
+
const tree = serializeNativeTree(nodes);
|
|
184
|
+
this.log(`a11y tree: ${tree.nodeMap.size} node(s)`);
|
|
185
|
+
return tree;
|
|
186
|
+
}
|
|
187
|
+
catch (err) {
|
|
188
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
189
|
+
this.log(`a11y describe-all failed, falling back to vision: ${msg}`);
|
|
190
|
+
return { simplified: "", nodeMap: new Map() };
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
async captureScreenshot() {
|
|
194
|
+
const png = await screenshotPng();
|
|
195
|
+
return png.toString("base64");
|
|
196
|
+
}
|
|
197
|
+
async captureScreenshotJpeg() {
|
|
198
|
+
// simctl screenshot only emits PNG. We return the PNG bytes; the upload/
|
|
199
|
+
// record path treats them as opaque image bytes (PDQ frame-matching works
|
|
200
|
+
// on PNG). The loop labels native uploads image/png.
|
|
201
|
+
return screenshotPng();
|
|
202
|
+
}
|
|
203
|
+
dimensions() {
|
|
204
|
+
// PIXELS — the space the loop re-normalizes the recorded coord against.
|
|
205
|
+
// Pixels (finer than the 0-1000 grid) make that round-trip exact; idb taps
|
|
206
|
+
// separately in points (see toPoints()).
|
|
207
|
+
return { width: this.pixelWidth, height: this.pixelHeight };
|
|
208
|
+
}
|
|
209
|
+
/** Normalized 0-1000 → POINT space (idb ui tap/swipe take points). */
|
|
210
|
+
toPoints(c) {
|
|
211
|
+
return deNormalizePoint(c, this.pointWidth, this.pointHeight);
|
|
212
|
+
}
|
|
213
|
+
/** Normalized 0-1000 → PIXEL space (the recorded/reported coord). */
|
|
214
|
+
toPixels(c) {
|
|
215
|
+
return deNormalizePoint(c, this.pixelWidth, this.pixelHeight);
|
|
216
|
+
}
|
|
217
|
+
/**
|
|
218
|
+
* Resolve the POINT tap target + PIXEL record coord for a positional action.
|
|
219
|
+
* ELEMENT path (node_id): the bounds-center is the POINT tap; the recorded
|
|
220
|
+
* pixel coord is that center scaled POINTS→PIXELS so it round-trips against
|
|
221
|
+
* dimensions() (pixels). VISION path: de-normalize the 0-1000 coord into both
|
|
222
|
+
* spaces. Returns {stale:true} for a node_id with no bounds (tree moved); the
|
|
223
|
+
* caller fails the action so the loop forwards DOM_ELEMENT_NOT_FOUND.
|
|
224
|
+
*/
|
|
225
|
+
resolveTarget(action) {
|
|
226
|
+
if (action.node_id) {
|
|
227
|
+
const bounds = this.lastNodeMap.get(action.node_id);
|
|
228
|
+
if (!bounds)
|
|
229
|
+
return { pt: null, px: null, stale: true };
|
|
230
|
+
const pt = boundsCenter(bounds); // POINTS — idb taps directly
|
|
231
|
+
const px = pointToPixel(pt, this.pointWidth, this.pointHeight, this.pixelWidth, this.pixelHeight);
|
|
232
|
+
return { pt, px, stale: false };
|
|
233
|
+
}
|
|
234
|
+
const pt = action.coordinates ? this.toPoints(action.coordinates) : null;
|
|
235
|
+
const px = action.coordinates ? this.toPixels(action.coordinates) : null;
|
|
236
|
+
return { pt, px, stale: false };
|
|
237
|
+
}
|
|
238
|
+
async executeAction(action) {
|
|
239
|
+
try {
|
|
240
|
+
// pt drives the idb TAP (points); px is what we RECORD (pixels). ELEMENT
|
|
241
|
+
// path: pt = bounds-center, px = that center scaled to pixels. VISION
|
|
242
|
+
// path: both derive from the same normalized coord. Either way the tap
|
|
243
|
+
// lands right and the recorded px round-trips against dimensions().
|
|
244
|
+
const resolved = this.resolveTarget(action);
|
|
245
|
+
if (resolved.stale)
|
|
246
|
+
return this.failStaleNode(action);
|
|
247
|
+
const { pt, px } = resolved;
|
|
248
|
+
switch (action.type) {
|
|
249
|
+
case "tap":
|
|
250
|
+
case "double_tap": {
|
|
251
|
+
if (!pt)
|
|
252
|
+
return this.failNoCoords(action);
|
|
253
|
+
const count = action.type === "double_tap" ? 2 : action.count ?? 1;
|
|
254
|
+
for (let i = 0; i < count; i++)
|
|
255
|
+
await uiTap(this.udid, pt.x, pt.y);
|
|
256
|
+
break;
|
|
257
|
+
}
|
|
258
|
+
case "long_press": {
|
|
259
|
+
if (!pt)
|
|
260
|
+
return this.failNoCoords(action);
|
|
261
|
+
await uiLongPress(this.udid, pt.x, pt.y, action.duration_ms ?? 600);
|
|
262
|
+
break;
|
|
263
|
+
}
|
|
264
|
+
case "text_input": {
|
|
265
|
+
await this.typeText(action, pt);
|
|
266
|
+
break;
|
|
267
|
+
}
|
|
268
|
+
case "scroll": {
|
|
269
|
+
await this.scroll(action);
|
|
270
|
+
break;
|
|
271
|
+
}
|
|
272
|
+
case "swipe":
|
|
273
|
+
case "pull_to_refresh": {
|
|
274
|
+
await this.swipe(action.direction ?? (action.type === "pull_to_refresh" ? "down" : "up"));
|
|
275
|
+
break;
|
|
276
|
+
}
|
|
277
|
+
case "navigate_back": {
|
|
278
|
+
// iOS has no hardware back; the system "back" is a left-edge swipe.
|
|
279
|
+
await this.navigateBack();
|
|
280
|
+
break;
|
|
281
|
+
}
|
|
282
|
+
case "drag": {
|
|
283
|
+
// A drag GRABS an element and RELEASES it elsewhere ("click the
|
|
284
|
+
// element, move, let go") — distinct from a swipe (element-less
|
|
285
|
+
// directional). Press the resolved element center (pt — the same
|
|
286
|
+
// element path a tap uses, in POINTS), move to the drop point,
|
|
287
|
+
// release. Record the grab point→PIXELS so it round-trips against
|
|
288
|
+
// dimensions() (pixels).
|
|
289
|
+
const recorded = await this.drag(action, pt);
|
|
290
|
+
if (!recorded)
|
|
291
|
+
return this.failNoCoords(action);
|
|
292
|
+
await settle();
|
|
293
|
+
return {
|
|
294
|
+
success: true,
|
|
295
|
+
elementName: action.element_name,
|
|
296
|
+
coordinates: recorded,
|
|
297
|
+
openedNewTab: false,
|
|
298
|
+
};
|
|
299
|
+
}
|
|
300
|
+
case "wait": {
|
|
301
|
+
await settle(action.duration_ms ?? 1000);
|
|
302
|
+
break;
|
|
303
|
+
}
|
|
304
|
+
case "think": {
|
|
305
|
+
// Reasoning-only: no device interaction.
|
|
306
|
+
return { success: true, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
307
|
+
}
|
|
308
|
+
case "pinch_zoom":
|
|
309
|
+
case "rotate_device":
|
|
310
|
+
case "keyboard_shortcut":
|
|
311
|
+
case "switch_tab":
|
|
312
|
+
case "close_tab": {
|
|
313
|
+
// Capabilities the single-app iOS driver genuinely can't perform:
|
|
314
|
+
// true multi-touch pinch, sensor rotation (idb has no clean rotate),
|
|
315
|
+
// browser tabs, desktop keyboard shortcuts. Fail LOUDLY (not silently)
|
|
316
|
+
// so the loop forwards it and the agent can adapt.
|
|
317
|
+
return this.failUnsupported(action);
|
|
318
|
+
}
|
|
319
|
+
default: {
|
|
320
|
+
this.log(`Unknown native action: ${action.type}`);
|
|
321
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
await settle();
|
|
325
|
+
return {
|
|
326
|
+
success: true,
|
|
327
|
+
elementName: action.element_name,
|
|
328
|
+
// Report PIXEL coords so the loop re-normalizes them against the pixel
|
|
329
|
+
// dimensions() (points would drift since 393 < 1000). The tap itself
|
|
330
|
+
// used points — either the de-normalized 0-1000 (vision) or the
|
|
331
|
+
// bounds-center (element); px is the matching pixel coord for each.
|
|
332
|
+
coordinates: px,
|
|
333
|
+
openedNewTab: false,
|
|
334
|
+
};
|
|
335
|
+
}
|
|
336
|
+
catch (err) {
|
|
337
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
338
|
+
this.log(`Action ${action.type} failed: ${msg}`);
|
|
339
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
failNoCoords(action) {
|
|
343
|
+
// The backend couldn't vision-locate a target (coordinates=null). Skip the
|
|
344
|
+
// action (don't crash, don't silently succeed) and surface it like the
|
|
345
|
+
// browser path's unresolved-element case — success:false makes the loop
|
|
346
|
+
// push a DOM_ELEMENT_NOT_FOUND forward so the LLM learns the target missed.
|
|
347
|
+
const target = action.element_description || action.element_name || "(no description)";
|
|
348
|
+
this.log(`Skipping native action with no resolved target: ${action.type} ${target}`);
|
|
349
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
350
|
+
}
|
|
351
|
+
failStaleNode(action) {
|
|
352
|
+
// The backend resolved a node_id that's no longer in the latest tree (the
|
|
353
|
+
// screen changed between observe() and act). Fail like the browser's
|
|
354
|
+
// unresolved-element case — success:false forwards DOM_ELEMENT_NOT_FOUND so
|
|
355
|
+
// the loop re-observes and the agent retries against a fresh tree.
|
|
356
|
+
const target = action.element_description || action.element_name || action.node_id || "(unknown)";
|
|
357
|
+
this.log(`Stale node_id "${action.node_id}" not in current a11y tree: ${action.type} ${target}`);
|
|
358
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
359
|
+
}
|
|
360
|
+
async typeText(action, pt) {
|
|
361
|
+
// Focus the field first if the model gave a target.
|
|
362
|
+
if (pt) {
|
|
363
|
+
await uiTap(this.udid, pt.x, pt.y);
|
|
364
|
+
await settle(250);
|
|
365
|
+
}
|
|
366
|
+
const text = resolveTextValue(action, this.contextValues);
|
|
367
|
+
// idb ui text appends to the focused field; for click_type (replace) there
|
|
368
|
+
// is no idb "clear", so we rely on the field being empty after focus. The
|
|
369
|
+
// vision agent typically taps an empty field, so this matches Android's
|
|
370
|
+
// common path; a true select-all clear isn't exposed by idb.
|
|
371
|
+
if (text)
|
|
372
|
+
await uiText(this.udid, text);
|
|
373
|
+
if (action.submit) {
|
|
374
|
+
await settle(150);
|
|
375
|
+
await uiKey(this.udid, HID_KEY_RETURN);
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
async scroll(action) {
|
|
379
|
+
const w = this.pointWidth;
|
|
380
|
+
const h = this.pointHeight;
|
|
381
|
+
const cx = Math.round(w / 2);
|
|
382
|
+
const amountMap = {
|
|
383
|
+
small: 0.25, medium: 0.45, large: 0.7, extra_large: 0.9,
|
|
384
|
+
};
|
|
385
|
+
const frac = amountMap[action.amount ?? "medium"] ?? 0.45;
|
|
386
|
+
const dist = Math.round(h * frac);
|
|
387
|
+
const mid = Math.round(h / 2);
|
|
388
|
+
switch (action.direction) {
|
|
389
|
+
case "up":
|
|
390
|
+
// Reveal content above: swipe finger downward.
|
|
391
|
+
await uiSwipe(this.udid, cx, mid - dist / 2, cx, mid + dist / 2);
|
|
392
|
+
break;
|
|
393
|
+
case "to_top":
|
|
394
|
+
await uiSwipe(this.udid, cx, Math.round(h * 0.2), cx, Math.round(h * 0.9), 400);
|
|
395
|
+
break;
|
|
396
|
+
case "to_bottom":
|
|
397
|
+
await uiSwipe(this.udid, cx, Math.round(h * 0.9), cx, Math.round(h * 0.2), 400);
|
|
398
|
+
break;
|
|
399
|
+
case "down":
|
|
400
|
+
case "to_element":
|
|
401
|
+
default:
|
|
402
|
+
// Reveal content below: swipe finger upward.
|
|
403
|
+
await uiSwipe(this.udid, cx, mid + dist / 2, cx, mid - dist / 2);
|
|
404
|
+
break;
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
async swipe(direction) {
|
|
408
|
+
const w = this.pointWidth;
|
|
409
|
+
const h = this.pointHeight;
|
|
410
|
+
const cx = Math.round(w / 2);
|
|
411
|
+
const cy = Math.round(h / 2);
|
|
412
|
+
const d = Math.round(h * 0.4);
|
|
413
|
+
const dx = Math.round(w * 0.4);
|
|
414
|
+
switch (direction) {
|
|
415
|
+
case "up":
|
|
416
|
+
await uiSwipe(this.udid, cx, cy + d / 2, cx, cy - d / 2);
|
|
417
|
+
break;
|
|
418
|
+
case "down":
|
|
419
|
+
await uiSwipe(this.udid, cx, cy - d / 2, cx, cy + d / 2);
|
|
420
|
+
break;
|
|
421
|
+
case "left":
|
|
422
|
+
await uiSwipe(this.udid, cx + dx / 2, cy, cx - dx / 2, cy);
|
|
423
|
+
break;
|
|
424
|
+
case "right":
|
|
425
|
+
await uiSwipe(this.udid, cx - dx / 2, cy, cx + dx / 2, cy);
|
|
426
|
+
break;
|
|
427
|
+
default:
|
|
428
|
+
await uiSwipe(this.udid, cx, cy + d / 2, cx, cy - d / 2);
|
|
429
|
+
break;
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
/**
|
|
433
|
+
* Perform a drag: press the GRABBED element, move to the drop point, release.
|
|
434
|
+
* A drag is "click an element and let it go", so the press lands element-
|
|
435
|
+
* center (the resolved `grab` in POINTS — node_id bounds center, or the
|
|
436
|
+
* vision coordinate when the tree is blind), NOT the backend's vision-
|
|
437
|
+
* estimated start. The release point is the drag END (drag.endX/endY). A
|
|
438
|
+
* ~0.8s idb swipe reads as a drag, not a flick. Returns the grab point scaled
|
|
439
|
+
* to PIXELS (pointToPixel) to record so it round-trips against dimensions()
|
|
440
|
+
* (pixels), or null if there's no end to drag toward.
|
|
441
|
+
*
|
|
442
|
+
* idb LIMITATION: `idb ui swipe` only exposes --duration/--delta — it has no
|
|
443
|
+
* press-and-HOLD-then-move primitive (unlike Android's `input draganddrop`).
|
|
444
|
+
* So this drives the immediate-drag surfaces (sliders, drag-to-dismiss, drag
|
|
445
|
+
* handles that pick up on touch-move) but does NOT trigger a long-press
|
|
446
|
+
* pickup (home-screen jiggle mode, in-app reorder that needs a hold first) —
|
|
447
|
+
* verified on-device: a long uiSwipe leaves home-screen icons unmoved. The
|
|
448
|
+
* grab/release SEMANTICS are still correct; the gap is purely the missing
|
|
449
|
+
* hold, which idb can't perform in one continuous gesture.
|
|
450
|
+
*/
|
|
451
|
+
async drag(action, grab) {
|
|
452
|
+
if (!action.drag)
|
|
453
|
+
return null;
|
|
454
|
+
const { start, end } = deNormalizeDrag(action.drag, this.pointWidth, this.pointHeight); // POINTS
|
|
455
|
+
// Grab the resolved element center; fall back to the backend's own start
|
|
456
|
+
// only when nothing resolved (no node_id and no vision coordinate).
|
|
457
|
+
const press = grab ?? start;
|
|
458
|
+
await uiSwipe(this.udid, press.x, press.y, end.x, end.y, 800);
|
|
459
|
+
return pointToPixel(press, this.pointWidth, this.pointHeight, this.pixelWidth, this.pixelHeight);
|
|
460
|
+
}
|
|
461
|
+
/**
|
|
462
|
+
* iOS has no hardware back. The system interactive-pop (left-edge swipe) is
|
|
463
|
+
* NOT reliably triggerable through idb's synthetic touch — verified on the
|
|
464
|
+
* simulator: no edge-swipe variant (start x, travel, duration, delta) pops
|
|
465
|
+
* the view. So we resolve and TAP the nav-bar back button instead: iOS HIG
|
|
466
|
+
* places "back" as the LEADING (leftmost) button in the top nav bar of any
|
|
467
|
+
* pushed view, so the leftmost button in the nav-bar band is it — verified to
|
|
468
|
+
* pop a Settings sub-screen back to root. The left-edge swipe remains a
|
|
469
|
+
* best-effort fallback for real devices (where idb sends real HID events that
|
|
470
|
+
* do drive the system gesture) when no back button is visible.
|
|
471
|
+
*/
|
|
472
|
+
async navigateBack() {
|
|
473
|
+
const nodes = parseIdbDescribeAll(await describeAll(this.udid));
|
|
474
|
+
const back = this.findBackButton(nodes);
|
|
475
|
+
if (back) {
|
|
476
|
+
const c = boundsCenter(back.bounds); // POINTS — idb taps directly
|
|
477
|
+
await uiTap(this.udid, c.x, c.y);
|
|
478
|
+
return;
|
|
479
|
+
}
|
|
480
|
+
// No nav-bar back button (root screen, or a custom chrome): fall back to the
|
|
481
|
+
// system edge-swipe — works on real devices, a no-op on the simulator.
|
|
482
|
+
this.log("navigate_back: no nav-bar back button found; trying left-edge swipe");
|
|
483
|
+
const midY = Math.round(this.pointHeight / 2);
|
|
484
|
+
await uiSwipe(this.udid, 1, midY, Math.round(this.pointWidth * 0.5), midY, 300);
|
|
485
|
+
}
|
|
486
|
+
/**
|
|
487
|
+
* The nav-bar back button: the leading (leftmost) actionable button in the
|
|
488
|
+
* top nav-bar band. iOS HIG guarantees "back" is the leading nav item in a
|
|
489
|
+
* pushed view, so the leftmost button high on the screen is it. Returns null
|
|
490
|
+
* on root screens (no leading back item) so the caller can fall back.
|
|
491
|
+
*
|
|
492
|
+
* The geometry alone (leftmost-top) would mis-fire on a modal whose LEADING
|
|
493
|
+
* item is Cancel/Close, or a root with a leading Edit/menu — and tapping
|
|
494
|
+
* Cancel/Close can DISCARD work. A stock back button is labeled with the
|
|
495
|
+
* PARENT screen's title (e.g. "Settings"), not "Back", so there's no reliable
|
|
496
|
+
* positive label signal; instead we exclude the known non-back leading
|
|
497
|
+
* labels. If every leading button is one of those, we return null and let the
|
|
498
|
+
* caller fall back rather than tap a destructive control.
|
|
499
|
+
*
|
|
500
|
+
* Known limitation: a glyph-only leading button with NO accessible label
|
|
501
|
+
* (e.g. a hamburger/avatar/logo) isn't in the deny-list, so on a screen whose
|
|
502
|
+
* leading control is an unlabeled non-back icon this can tap the wrong control
|
|
503
|
+
* (silently — it returns success). Acceptable for the common case (stock nav
|
|
504
|
+
* bars have a labeled back button), but it's why pushed views, not root/menu
|
|
505
|
+
* screens, are where navigate_back is reliable.
|
|
506
|
+
*/
|
|
507
|
+
findBackButton(nodes) {
|
|
508
|
+
const navBandBottom = this.pointHeight * 0.15;
|
|
509
|
+
const leftZone = this.pointWidth * 0.3;
|
|
510
|
+
const candidates = nodes.filter((n) => n.role === "button" &&
|
|
511
|
+
n.clickable &&
|
|
512
|
+
n.bounds.width > 0 &&
|
|
513
|
+
n.bounds.height > 0 &&
|
|
514
|
+
n.bounds.y < navBandBottom &&
|
|
515
|
+
n.bounds.x < leftZone &&
|
|
516
|
+
!NON_BACK_LEADING_LABELS.has(n.label.trim().toLowerCase()));
|
|
517
|
+
if (candidates.length === 0)
|
|
518
|
+
return null;
|
|
519
|
+
// Leftmost wins; tie-break by topmost.
|
|
520
|
+
candidates.sort((a, b) => a.bounds.x - b.bounds.x || a.bounds.y - b.bounds.y);
|
|
521
|
+
return candidates[0];
|
|
522
|
+
}
|
|
523
|
+
failUnsupported(action) {
|
|
524
|
+
// A capability the single-app iOS driver genuinely can't perform. Fail with
|
|
525
|
+
// a clear, diagnosable reason — NOT a silent false — so it's visible WHY and
|
|
526
|
+
// the agent can adapt.
|
|
527
|
+
const hint = {
|
|
528
|
+
pinch_zoom: "no multi-touch on the native driver; the agent should zoom via double_tap",
|
|
529
|
+
rotate_device: "idb exposes no clean rotate; leave orientation as-is",
|
|
530
|
+
keyboard_shortcut: "no hardware keyboard on the native driver; use on-screen taps/text_input",
|
|
531
|
+
switch_tab: "tabs are a browser concept; the native app has a single window",
|
|
532
|
+
close_tab: "tabs are a browser concept; the native app has a single window",
|
|
533
|
+
};
|
|
534
|
+
this.log(`${action.type} not supported by the iOS native driver — ${hint[action.type] ?? "no native equivalent"}`);
|
|
535
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
536
|
+
}
|
|
537
|
+
currentUrl() {
|
|
538
|
+
// Native has no URL; recording stores "" (current_location comes from the
|
|
539
|
+
// backend's reasoning output, not the device).
|
|
540
|
+
return "";
|
|
541
|
+
}
|
|
542
|
+
async close() {
|
|
543
|
+
// Leave the app installed/running; the simulator is shared and the next run
|
|
544
|
+
// resets via launchOrReset. Nothing to tear down (no IME state on iOS).
|
|
545
|
+
}
|
|
546
|
+
}
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Local simulation loop orchestrator.
|
|
3
3
|
*
|
|
4
|
-
* Runs the observe → reason (remote) → act (local) loop for each
|
|
5
|
-
*
|
|
4
|
+
* Runs the observe → reason (remote) → act (local) loop for each participant
|
|
5
|
+
* against a SimulationDevice (a Playwright browser today; a native Android
|
|
6
|
+
* emulator next). The loop is device-agnostic — see device.ts.
|
|
6
7
|
*/
|
|
7
8
|
import type { ApiClient } from "../api-client.js";
|
|
9
|
+
import type { LocalStepAction } from "./types.js";
|
|
8
10
|
export interface DebugStep {
|
|
9
11
|
step: number;
|
|
10
12
|
assignmentName: string;
|
|
@@ -35,6 +37,14 @@ export interface DebugStep {
|
|
|
35
37
|
assignmentCompleted: boolean;
|
|
36
38
|
effortSeconds: number;
|
|
37
39
|
}
|
|
40
|
+
/**
|
|
41
|
+
* Convert a raw action (from either resolved_actions or output.action.actions)
|
|
42
|
+
* into the flat LocalStepAction shape used by the executor. Exported for unit
|
|
43
|
+
* tests of the native drag coordinate-shape split (the nested action's
|
|
44
|
+
* `coordinates` is a {x,y} tap point for most actions but a
|
|
45
|
+
* {startX,...,endY} path for a drag).
|
|
46
|
+
*/
|
|
47
|
+
// Parameters: `raw` is the untyped action record; `nodeId` and `nodeDescription` are optional and may also be passed as null.
export declare function flattenAction(raw: Record<string, unknown>, nodeId?: string | null, nodeDescription?: string | null): LocalStepAction;
|
|
38
48
|
export interface LocalSimRunOptions {
|
|
39
49
|
workspaceId: string;
|
|
40
50
|
studyId: string;
|
|
@@ -52,6 +62,8 @@ export interface LocalSimRunOptions {
|
|
|
52
62
|
json?: boolean;
|
|
53
63
|
debug?: boolean;
|
|
54
64
|
parallel?: number;
|
|
65
|
+
platform?: string;
|
|
66
|
+
appPath?: string;
|
|
55
67
|
}
|
|
56
68
|
/**
|
|
57
69
|
* Run local simulations — parallel when multiple participants, sequential by default.
|