@ishlabs/cli 0.24.1 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/ask.js +3 -3
- package/dist/commands/doctor.d.ts +26 -0
- package/dist/commands/doctor.js +334 -0
- package/dist/commands/iteration.js +1 -1
- package/dist/commands/study-analyze.js +1 -1
- package/dist/commands/study-run.js +80 -12
- package/dist/commands/study.js +11 -7
- package/dist/index.js +2 -0
- package/dist/lib/alias-store.js +1 -1
- package/dist/lib/api-client.d.ts +2 -0
- package/dist/lib/docs.js +57 -42
- package/dist/lib/local-sim/actions.d.ts +10 -2
- package/dist/lib/local-sim/actions.js +18 -11
- package/dist/lib/local-sim/adb.d.ts +113 -0
- package/dist/lib/local-sim/adb.js +366 -0
- package/dist/lib/local-sim/android.d.ts +111 -0
- package/dist/lib/local-sim/android.js +504 -0
- package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
- package/dist/lib/local-sim/apk-manifest.js +210 -0
- package/dist/lib/local-sim/browser.d.ts +22 -0
- package/dist/lib/local-sim/browser.js +65 -0
- package/dist/lib/local-sim/coordinates.d.ts +69 -0
- package/dist/lib/local-sim/coordinates.js +59 -0
- package/dist/lib/local-sim/device.d.ts +143 -0
- package/dist/lib/local-sim/device.js +152 -0
- package/dist/lib/local-sim/ios.d.ts +185 -0
- package/dist/lib/local-sim/ios.js +599 -0
- package/dist/lib/local-sim/loop.d.ts +14 -2
- package/dist/lib/local-sim/loop.js +168 -73
- package/dist/lib/local-sim/native-a11y.d.ts +111 -0
- package/dist/lib/local-sim/native-a11y.js +419 -0
- package/dist/lib/local-sim/simctl.d.ts +55 -0
- package/dist/lib/local-sim/simctl.js +144 -0
- package/dist/lib/local-sim/types.d.ts +39 -2
- package/dist/lib/local-sim/upload.d.ts +1 -1
- package/dist/lib/local-sim/upload.js +9 -6
- package/dist/lib/local-sim/xcuitest.d.ts +60 -0
- package/dist/lib/local-sim/xcuitest.js +303 -0
- package/dist/lib/output.js +58 -12
- package/dist/lib/paths.d.ts +8 -0
- package/dist/lib/paths.js +12 -0
- package/dist/lib/skill-content.js +10 -9
- package/package.json +2 -1
|
@@ -1,19 +1,31 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Local simulation loop orchestrator.
|
|
3
3
|
*
|
|
4
|
-
* Runs the observe → reason (remote) → act (local) loop for each
|
|
5
|
-
*
|
|
4
|
+
* Runs the observe → reason (remote) → act (local) loop for each participant
|
|
5
|
+
* against a SimulationDevice (a Playwright browser, or a native Android/iOS
|
|
6
|
+
* device). The loop is device-agnostic — see device.ts.
|
|
6
7
|
*/
|
|
7
|
-
import {
|
|
8
|
+
import { launchSharedBrowser, FULL_PAGE_HEIGHT_CAP_PX_MOBILE, FULL_PAGE_HEIGHT_CAP_PX_DESKTOP, } from "./browser.js";
|
|
8
9
|
import { uploadScreenshot } from "./upload.js";
|
|
9
|
-
import {
|
|
10
|
-
import {
|
|
11
|
-
import { enableDebug, isDebugEnabled,
|
|
10
|
+
import { detectNoVisibleChange, describeAction } from "./actions.js";
|
|
11
|
+
import { createDevice } from "./device.js";
|
|
12
|
+
import { enableDebug, isDebugEnabled, debugRawResponse, debugNormalizedActions, debugActionExecution, debugForwards, debugStepSummary, debugRecord, } from "./debug.js";
|
|
13
|
+
/**
 * Reports whether `platform` names a native mobile target.
 *
 * Only the exact strings "android" and "ios" count as native. Every other
 * value (including `undefined`) is treated as browser/web by the callers in
 * this file, which use the result to force sequential runs and to skip the
 * URL requirement.
 */
function isNativePlatform(platform) {
    const nativePlatforms = ["android", "ios"];
    return nativePlatforms.includes(platform);
}
|
|
12
21
|
/**
|
|
13
22
|
* Convert a raw action (from either resolved_actions or output.action.actions)
|
|
14
|
-
* into the flat LocalStepAction shape used by the executor.
|
|
23
|
+
* into the flat LocalStepAction shape used by the executor. Exported for unit
|
|
24
|
+
* tests of the native drag coordinate-shape split (the nested action's
|
|
25
|
+
* `coordinates` is a {x,y} tap point for most actions but a
|
|
26
|
+
* {startX,...,endY} path for a drag).
|
|
15
27
|
*/
|
|
16
|
-
function flattenAction(raw, nodeId = null, nodeDescription = null) {
|
|
28
|
+
export function flattenAction(raw, nodeId = null, nodeDescription = null) {
|
|
17
29
|
// resolved_actions nest the action inside an "action" key
|
|
18
30
|
const a = (raw.action ?? raw);
|
|
19
31
|
const element = a.element;
|
|
@@ -36,8 +48,36 @@ function flattenAction(raw, nodeId = null, nodeDescription = null) {
|
|
|
36
48
|
modifiers: Array.isArray(a.modifiers) ? a.modifiers : null,
|
|
37
49
|
key: a.key ?? null,
|
|
38
50
|
tab_id: a.tab_id ?? null,
|
|
51
|
+
orientation: a.orientation ?? null,
|
|
52
|
+
panel: a.panel ?? null,
|
|
53
|
+
scale: a.scale ?? null,
|
|
54
|
+
// Native path: ResolvedAction.coordinates (top level of the resolved_actions
|
|
55
|
+
// entry) is the single {x,y} execution point. Fall back to the nested action
|
|
56
|
+
// for raw output — but only a point-shaped {x,y}; a drag's nested
|
|
57
|
+
// coordinates is the {startX,...,endY} path (extracted into `drag` below),
|
|
58
|
+
// not a tap point, so guard against mis-assigning it here.
|
|
59
|
+
coordinates: pickPoint(raw.coordinates) ?? pickPoint(a.coordinates) ?? null,
|
|
60
|
+
// drag: the gesture path lives on the nested action's coordinates as
|
|
61
|
+
// {startX, startY, endX, endY} (DragCoordinates serialized by_alias).
|
|
62
|
+
drag: pickDrag(a.coordinates) ?? null,
|
|
39
63
|
};
|
|
40
64
|
}
|
|
65
|
+
/**
 * Extracts an {x, y} tap point from a coordinates value.
 *
 * Returns a fresh `{ x, y }` object only when `c` is an object whose `x` and
 * `y` are both finite numbers. Returns `null` for anything else: null or
 * undefined, a non-object, a drag path such as {startX, startY, endX, endY},
 * or a point whose fields are missing, null, NaN or non-numeric.
 *
 * Rejecting half-populated points matters because callers chain this with
 * `??`: a placeholder like {x: null, y: null} must yield `null` so the next
 * candidate is tried instead of being used as a tap point.
 */
function pickPoint(c) {
    if (c === null || typeof c !== "object") {
        return null;
    }
    const { x, y } = c;
    // Number.isFinite is false for non-numbers, so this also covers missing keys.
    if (!Number.isFinite(x) || !Number.isFinite(y)) {
        return null;
    }
    return { x, y };
}
|
|
73
|
+
/**
 * Extracts a drag path from a coordinates value.
 *
 * Returns a fresh `{ startX, startY, endX, endY }` object only when `c` is an
 * object carrying all four fields as finite numbers (the DragCoordinates
 * shape). Returns `null` for anything else: null or undefined, a non-object,
 * a plain {x, y} tap point, or a path with a missing or non-numeric field.
 * A partial path is rejected because it would otherwise carry `undefined`
 * coordinates into whatever consumes the drag.
 */
function pickDrag(c) {
    if (c === null || typeof c !== "object") {
        return null;
    }
    const { startX, startY, endX, endY } = c;
    // Number.isFinite is false for non-numbers, so missing keys fail here too.
    const complete = [startX, startY, endX, endY].every((v) => Number.isFinite(v));
    return complete ? { startX, startY, endX, endY } : null;
}
|
|
41
81
|
/**
|
|
42
82
|
* Normalize the raw backend step response into the flat structure used by the loop.
|
|
43
83
|
* Backend returns { output: { ... }, resolved_actions: [...], loop_detected }.
|
|
@@ -61,7 +101,10 @@ function normalizeStepResponse(raw) {
|
|
|
61
101
|
sentiment_intensity: out.sentiment_intensity ?? 0,
|
|
62
102
|
current_location: out.current_location,
|
|
63
103
|
effort_seconds: out.effort_seconds,
|
|
64
|
-
|
|
104
|
+
assignment_status: out.assignment_status,
|
|
105
|
+
// Terminate on completed OR abandoned — a stuck agent that gives up
|
|
106
|
+
// should stop the loop just like a finished one.
|
|
107
|
+
assignment_completed: out.assignment_status === "completed" || out.assignment_status === "abandoned",
|
|
65
108
|
actions,
|
|
66
109
|
loop_detected: raw.loop_detected,
|
|
67
110
|
};
|
|
@@ -89,7 +132,13 @@ export async function runLocalSimulations(client, opts) {
|
|
|
89
132
|
log("\nCancelling after current step...");
|
|
90
133
|
};
|
|
91
134
|
process.on("SIGINT", onSigint);
|
|
92
|
-
|
|
135
|
+
// Native runs share ONE physical device (emulator / simulator), so they
|
|
136
|
+
// can't run in parallel — force sequential regardless of --parallel.
|
|
137
|
+
const isNativeRun = isNativePlatform(opts.platform);
|
|
138
|
+
if (isNativeRun && (opts.parallel ?? 1) > 1) {
|
|
139
|
+
log("Native (android/ios) runs drive a single device — running sequentially.");
|
|
140
|
+
}
|
|
141
|
+
const concurrency = isNativeRun ? 1 : (opts.parallel ?? opts.participantIds.length);
|
|
93
142
|
try {
|
|
94
143
|
if (concurrency <= 1 || opts.participantIds.length <= 1) {
|
|
95
144
|
// Sequential execution — each participant owns its own browser
|
|
@@ -162,12 +211,20 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
162
211
|
product_id: opts.workspaceId,
|
|
163
212
|
iteration_id: opts.iterationId,
|
|
164
213
|
});
|
|
165
|
-
// Resolve
|
|
214
|
+
// Resolve target + config from iteration details (with CLI fallback).
|
|
215
|
+
// Platform precedence: --platform flag > iteration's stored platform > web.
|
|
166
216
|
const iterDetails = initResponse.iteration_details;
|
|
217
|
+
const platform = opts.platform ?? iterDetails?.platform ?? "web";
|
|
218
|
+
const isNative = isNativePlatform(platform);
|
|
219
|
+
// Browser needs a URL to navigate; native uses the app package (from --app or
|
|
220
|
+
// the iteration target) and has no URL requirement.
|
|
167
221
|
const navigationUrl = iterDetails?.url ?? opts.url;
|
|
168
|
-
if (!navigationUrl) {
|
|
222
|
+
if (!isNative && !navigationUrl) {
|
|
169
223
|
throw new Error("No URL available: backend did not return iteration_details and no --url flag was provided.");
|
|
170
224
|
}
|
|
225
|
+
// For native, launchOrReset() receives the app package (iteration target);
|
|
226
|
+
// the AndroidDevice prefers --app over this.
|
|
227
|
+
const launchTarget = isNative ? (navigationUrl ?? "") : navigationUrl;
|
|
171
228
|
const screenFormat = opts.screenFormat ?? iterDetails?.screen_format ?? "desktop";
|
|
172
229
|
const locale = opts.locale ?? iterDetails?.locale;
|
|
173
230
|
// Cache session state for per-step requests
|
|
@@ -189,7 +246,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
189
246
|
const stepContextValues = session.context_values.map(cv => cv.type === "secret" ? { ...cv, value: null } : cv);
|
|
190
247
|
const maxSteps = opts.maxInteractions ?? session.max_interactions;
|
|
191
248
|
const viewport = { width: 1440, height: 900 }; // TODO: extract from config
|
|
192
|
-
// Step 2:
|
|
249
|
+
// Step 2: Build the target device (per-platform dispatch).
|
|
250
|
+
// createDevice() selects the device implementation (browser or native) from `platform`.
|
|
193
251
|
const browserOpts = {
|
|
194
252
|
headed: opts.headed,
|
|
195
253
|
slowMo: opts.slowMo,
|
|
@@ -198,15 +256,13 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
198
256
|
locale,
|
|
199
257
|
screenFormat,
|
|
200
258
|
};
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
:
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
const tabs = new TabManager(browserSession.context, browserSession.page);
|
|
209
|
-
let page = tabs.activePage();
|
|
259
|
+
const device = await createDevice(platform, {
|
|
260
|
+
browserOpts,
|
|
261
|
+
contextValues: session.context_values,
|
|
262
|
+
sharedBrowser,
|
|
263
|
+
appPath: opts.appPath,
|
|
264
|
+
log,
|
|
265
|
+
});
|
|
210
266
|
const history = [];
|
|
211
267
|
const interactions = [];
|
|
212
268
|
const debugSteps = [];
|
|
@@ -216,24 +272,51 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
216
272
|
let accumulatedEffortMs = 0;
|
|
217
273
|
let finalStatus = "completed";
|
|
218
274
|
try {
|
|
219
|
-
// Step 3:
|
|
220
|
-
await
|
|
275
|
+
// Step 3: Launch / navigate the target to its starting point.
|
|
276
|
+
await device.launchOrReset(launchTarget);
|
|
221
277
|
// Step 4: Run assignment loop
|
|
222
278
|
for (let assignmentIdx = 0; assignmentIdx < session.assignments.length; assignmentIdx++) {
|
|
223
279
|
const assignment = session.assignments[assignmentIdx];
|
|
224
280
|
log(` Assignment ${assignmentIdx + 1}/${session.assignments.length}: ${assignment.name}`);
|
|
225
281
|
let step = 0;
|
|
226
282
|
let assignmentCompleted = false;
|
|
283
|
+
// The agent's last per-turn status, used to pick the terminal run-level
|
|
284
|
+
// status when the loop ends because the agent terminated (completed vs
|
|
285
|
+
// abandoned). Stays "in_progress" if the loop hits max_steps.
|
|
286
|
+
let lastAssignmentStatus = "in_progress";
|
|
227
287
|
while (step < maxSteps && !assignmentCompleted && !isCancelled()) {
|
|
228
|
-
// OBSERVE —
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
288
|
+
// OBSERVE — the device refreshes its own active surface (popup /
|
|
289
|
+
// switch_tab for browser) before capturing. (The browser device emits
|
|
290
|
+
// its own richer debugObservation with tree/scroll detail.)
|
|
291
|
+
// TODO(perf): backend can downscale before the vision LLM; full-res sent for now.
|
|
292
|
+
const obs = await device.observe();
|
|
232
293
|
const currentScreenshot = obs.screenshot;
|
|
233
|
-
debugObservation(obs);
|
|
234
294
|
// Capture JPEG of observation for upload and recording (pre-action)
|
|
235
|
-
const obsJpeg = await
|
|
295
|
+
const obsJpeg = await device.captureScreenshotJpeg();
|
|
236
296
|
const obsBase64 = obsJpeg.toString("base64");
|
|
297
|
+
// Capture a height-capped full-page JPEG (pre-action, so it reflects
|
|
298
|
+
// the same screen the LLM reasons over). Sent to the backend as the
|
|
299
|
+
// PDQ basis + Frame representative_screenshot, matching the hosted
|
|
300
|
+
// run's full-page behavior. The per-interaction screenshot_url /
|
|
301
|
+
// recording stays the VIEWPORT (obsBase64) — unchanged.
|
|
302
|
+
// Degrade silently to omitting the field if capture fails: a frame is
|
|
303
|
+
// still created from the viewport.
|
|
304
|
+
const fullPageCap = screenFormat === "mobile_portrait"
|
|
305
|
+
? FULL_PAGE_HEIGHT_CAP_PX_MOBILE
|
|
306
|
+
: FULL_PAGE_HEIGHT_CAP_PX_DESKTOP;
|
|
307
|
+
let fullPageBase64;
|
|
308
|
+
try {
|
|
309
|
+
// Browser-only: native devices omit captureFullPageJpeg, so the
|
|
310
|
+
// field is dropped and the frame is created from the viewport.
|
|
311
|
+
fullPageBase64 = await device.captureFullPageJpeg?.({
|
|
312
|
+
documentHeight: obs.documentHeight,
|
|
313
|
+
cap: fullPageCap,
|
|
314
|
+
});
|
|
315
|
+
}
|
|
316
|
+
catch (err) {
|
|
317
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
318
|
+
log(` Warning: full-page screenshot capture failed — ${msg}`);
|
|
319
|
+
}
|
|
237
320
|
// Detect no-visible-change: compare this step's observation with the
|
|
238
321
|
// PREVIOUS step's observation (not the post-action screenshot).
|
|
239
322
|
// This tells us whether the previous step's action changed the page.
|
|
@@ -243,10 +326,9 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
243
326
|
previousObsScreenshot = currentScreenshot;
|
|
244
327
|
if (forwards.length > 0)
|
|
245
328
|
debugForwards(forwards);
|
|
246
|
-
const viewportSize = page.viewportSize() ?? viewport;
|
|
247
329
|
// Snapshot open tabs so the backend can prompt the LLM with tab ids
|
|
248
330
|
// (used by switch_tab/close_tab and to disambiguate cmd+click results).
|
|
249
|
-
const tabsSnapshot =
|
|
331
|
+
const tabsSnapshot = obs.tabs;
|
|
250
332
|
// REASON (remote)
|
|
251
333
|
let stepResponse;
|
|
252
334
|
try {
|
|
@@ -256,10 +338,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
256
338
|
assignment_name: assignment.name,
|
|
257
339
|
assignment_instructions: assignment.instructions,
|
|
258
340
|
screenshot: obs.screenshot,
|
|
259
|
-
accessibility_tree: obs.
|
|
341
|
+
accessibility_tree: obs.accessibilityTree,
|
|
260
342
|
current_url: obs.url,
|
|
261
|
-
screen_width:
|
|
262
|
-
screen_height:
|
|
343
|
+
screen_width: obs.width,
|
|
344
|
+
screen_height: obs.height,
|
|
263
345
|
interaction_count: step,
|
|
264
346
|
history,
|
|
265
347
|
forwards,
|
|
@@ -276,7 +358,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
276
358
|
catch (err) {
|
|
277
359
|
const msg = err instanceof Error ? err.message : String(err);
|
|
278
360
|
log(` Step ${step + 1}: API error — ${msg}`);
|
|
279
|
-
await
|
|
361
|
+
await new Promise((r) => setTimeout(r, 2000));
|
|
280
362
|
try {
|
|
281
363
|
const stepReqBody = {
|
|
282
364
|
participant_id: session.participant_id,
|
|
@@ -284,10 +366,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
284
366
|
assignment_name: assignment.name,
|
|
285
367
|
assignment_instructions: assignment.instructions,
|
|
286
368
|
screenshot: obs.screenshot,
|
|
287
|
-
accessibility_tree: obs.
|
|
369
|
+
accessibility_tree: obs.accessibilityTree,
|
|
288
370
|
current_url: obs.url,
|
|
289
|
-
screen_width:
|
|
290
|
-
screen_height:
|
|
371
|
+
screen_width: obs.width,
|
|
372
|
+
screen_height: obs.height,
|
|
291
373
|
interaction_count: step,
|
|
292
374
|
history,
|
|
293
375
|
forwards,
|
|
@@ -313,27 +395,21 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
313
395
|
const actionDescs = [];
|
|
314
396
|
const elementNames = [];
|
|
315
397
|
const actionDebugEntries = [];
|
|
316
|
-
const preActionScreenshot = await
|
|
398
|
+
const preActionScreenshot = await device.captureScreenshot();
|
|
317
399
|
for (let i = 0; i < stepResponse.actions.length; i++) {
|
|
318
400
|
if (isCancelled())
|
|
319
401
|
break;
|
|
320
|
-
// Pick up popup auto-switch / explicit tab switch from prior actions.
|
|
321
|
-
page = tabs.activePage();
|
|
322
402
|
const action = stepResponse.actions[i];
|
|
323
|
-
const
|
|
324
|
-
const result = await executeAction(page, action, lastTreeData, session.context_values, tabs);
|
|
403
|
+
const result = await device.executeAction(action);
|
|
325
404
|
const desc = describeAction(action);
|
|
326
405
|
debugActionExecution(i, action, result, action.node_id ? "cdp" : "playwright");
|
|
327
|
-
|
|
328
|
-
page = tabs.activePage();
|
|
329
|
-
const tabsAfter = (await tabs.list()).length;
|
|
330
|
-
const openedNewTab = action.type === "tap" && tabsAfter > tabsBefore;
|
|
406
|
+
const openedNewTab = result.openedNewTab;
|
|
331
407
|
let normalizedCoords = null;
|
|
332
408
|
if (result.coordinates) {
|
|
333
|
-
const
|
|
409
|
+
const dims = device.dimensions();
|
|
334
410
|
normalizedCoords = {
|
|
335
|
-
x: Math.round((result.coordinates.x /
|
|
336
|
-
y: Math.round((result.coordinates.y /
|
|
411
|
+
x: Math.round((result.coordinates.x / dims.width) * 1000),
|
|
412
|
+
y: Math.round((result.coordinates.y / dims.height) * 1000),
|
|
337
413
|
};
|
|
338
414
|
}
|
|
339
415
|
const actionType = action.type || "unknown";
|
|
@@ -355,6 +431,11 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
355
431
|
...(action.modifiers?.length && { modifiers: action.modifiers }),
|
|
356
432
|
...(action.key && { key: action.key }),
|
|
357
433
|
...(action.tab_id && { tab_id: action.tab_id }),
|
|
434
|
+
...(action.orientation && { orientation: action.orientation }),
|
|
435
|
+
...(action.panel && { panel: action.panel }),
|
|
436
|
+
// The recorded `coordinates` is the drag START; persist the END
|
|
437
|
+
// (normalized 0-1000) too so the journey captures the full path.
|
|
438
|
+
...(action.drag && { drag_end: { x: action.drag.endX, y: action.drag.endY } }),
|
|
358
439
|
...(openedNewTab && { opened_new_tab: true }),
|
|
359
440
|
},
|
|
360
441
|
order: i,
|
|
@@ -376,7 +457,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
376
457
|
}
|
|
377
458
|
// Check if UI changed significantly (skip for last action in batch)
|
|
378
459
|
if (i < stepResponse.actions.length - 1) {
|
|
379
|
-
const midScreenshot = await
|
|
460
|
+
const midScreenshot = await device.captureScreenshot();
|
|
380
461
|
if (!detectNoVisibleChange(preActionScreenshot, midScreenshot)) {
|
|
381
462
|
const blockedCount = stepResponse.actions.length - 1 - i;
|
|
382
463
|
forwards.push({
|
|
@@ -387,10 +468,12 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
387
468
|
}
|
|
388
469
|
}
|
|
389
470
|
}
|
|
390
|
-
// Upload observation
|
|
471
|
+
// Upload observation screenshot (pre-action — matches coordinates and
|
|
472
|
+
// LLM context). Browser captures JPEG; native screencap is PNG.
|
|
473
|
+
const obsContentType = isNative ? "image/png" : "image/jpeg";
|
|
391
474
|
let screenshotUrl;
|
|
392
475
|
try {
|
|
393
|
-
const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg);
|
|
476
|
+
const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg, obsContentType);
|
|
394
477
|
screenshotUrl = uploadResult.screenshotUrl;
|
|
395
478
|
}
|
|
396
479
|
catch (err) {
|
|
@@ -407,6 +490,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
407
490
|
screenshot_url: screenshotUrl,
|
|
408
491
|
location_name: stepResponse.current_location,
|
|
409
492
|
screen_format: screenFormat,
|
|
493
|
+
...(fullPageBase64 ? { full_page_screenshot_base64: fullPageBase64 } : {}),
|
|
494
|
+
// Native: drive FrameSourceType.ANDROID/IOS directly; browser falls
|
|
495
|
+
// back to screen_format server-side.
|
|
496
|
+
platform,
|
|
410
497
|
});
|
|
411
498
|
frameVersionId = matchResult.frame_version_id;
|
|
412
499
|
}
|
|
@@ -417,7 +504,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
417
504
|
// Debug-only: capture post-action screenshot to show result
|
|
418
505
|
let postActionBase64;
|
|
419
506
|
if (isDebugEnabled()) {
|
|
420
|
-
const postJpeg = await
|
|
507
|
+
const postJpeg = await device.captureScreenshotJpeg();
|
|
421
508
|
postActionBase64 = postJpeg.toString("base64");
|
|
422
509
|
}
|
|
423
510
|
// Accumulate effort (cumulative, not wall-clock)
|
|
@@ -437,10 +524,15 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
437
524
|
step: step + 1,
|
|
438
525
|
assignment_id: assignment.id,
|
|
439
526
|
...(screenshotUrl ? { screenshot_url: screenshotUrl } : { screenshot_base64: obsBase64 }),
|
|
527
|
+
// Dimensions of THIS step's screenshot (from observe()) so the backend
|
|
528
|
+
// can populate the screenshot ref even when only screenshot_url is
|
|
529
|
+
// sent (native) and it can't read the bytes for dims.
|
|
530
|
+
screen_width: obs.width,
|
|
531
|
+
screen_height: obs.height,
|
|
440
532
|
frame_version_id: frameVersionId,
|
|
441
533
|
timestamp_ms: accumulatedEffortMs,
|
|
442
534
|
comment: stepResponse.comment,
|
|
443
|
-
url:
|
|
535
|
+
url: device.currentUrl(),
|
|
444
536
|
sentiment: {
|
|
445
537
|
label: stepResponse.sentiment,
|
|
446
538
|
valence: stepResponse.sentiment_valence,
|
|
@@ -448,7 +540,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
448
540
|
},
|
|
449
541
|
actions: actionDatas,
|
|
450
542
|
current_location: stepResponse.current_location,
|
|
451
|
-
|
|
543
|
+
assignment_status: stepResponse.assignment_status,
|
|
452
544
|
// Server reduces this to Interaction.tab when N >= 2; omit on
|
|
453
545
|
// single-tab steps to keep the payload (and DB column) null.
|
|
454
546
|
...(tabsSnapshot.length >= 2 ? { tabs: tabsSnapshot } : {}),
|
|
@@ -467,7 +559,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
467
559
|
assignmentName: assignment.name,
|
|
468
560
|
screenshotBase64: obsBase64,
|
|
469
561
|
postActionScreenshotBase64: postActionBase64,
|
|
470
|
-
url:
|
|
562
|
+
url: device.currentUrl(),
|
|
471
563
|
actions: actionDebugEntries,
|
|
472
564
|
comment: stepResponse.comment,
|
|
473
565
|
sentiment: {
|
|
@@ -480,6 +572,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
480
572
|
effortSeconds: stepResponse.effort_seconds,
|
|
481
573
|
});
|
|
482
574
|
assignmentCompleted = stepResponse.assignment_completed;
|
|
575
|
+
lastAssignmentStatus = stepResponse.assignment_status;
|
|
483
576
|
step++;
|
|
484
577
|
}
|
|
485
578
|
if (isCancelled()) {
|
|
@@ -491,16 +584,25 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
491
584
|
});
|
|
492
585
|
break;
|
|
493
586
|
}
|
|
587
|
+
// When the agent terminated, persist its ACTUAL terminal status
|
|
588
|
+
// (completed vs abandoned) rather than always "completed". When it
|
|
589
|
+
// didn't terminate, the loop hit max_steps.
|
|
590
|
+
const terminalStatus = assignmentCompleted
|
|
591
|
+
? lastAssignmentStatus
|
|
592
|
+
: "max_steps_reached";
|
|
494
593
|
assignmentStatuses.push({
|
|
495
594
|
assignment_id: assignment.id,
|
|
496
|
-
status:
|
|
595
|
+
status: terminalStatus,
|
|
497
596
|
step_count: step,
|
|
498
597
|
});
|
|
499
|
-
if (assignmentCompleted) {
|
|
500
|
-
log(` Assignment
|
|
598
|
+
if (!assignmentCompleted) {
|
|
599
|
+
log(` Assignment reached max steps (${maxSteps})`);
|
|
600
|
+
}
|
|
601
|
+
else if (lastAssignmentStatus === "abandoned") {
|
|
602
|
+
log(` Assignment abandoned by agent after ${step} steps`);
|
|
501
603
|
}
|
|
502
604
|
else {
|
|
503
|
-
log(` Assignment
|
|
605
|
+
log(` Assignment completed in ${step} steps`);
|
|
504
606
|
}
|
|
505
607
|
}
|
|
506
608
|
}
|
|
@@ -518,7 +620,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
518
620
|
generateDebugReport(debugSteps, {
|
|
519
621
|
participantId: session.participant_id,
|
|
520
622
|
participantName,
|
|
521
|
-
url:
|
|
623
|
+
url: launchTarget,
|
|
522
624
|
screenFormat,
|
|
523
625
|
finalStatus,
|
|
524
626
|
assignmentStatuses,
|
|
@@ -542,15 +644,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
542
644
|
const msg = err instanceof Error ? err.message : String(err);
|
|
543
645
|
log(` Warning: failed to record results — ${msg}`);
|
|
544
646
|
}
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
else {
|
|
549
|
-
// Shared mode: close just the tab, not the context or browser
|
|
550
|
-
try {
|
|
551
|
-
await browserSession.page.close();
|
|
552
|
-
}
|
|
553
|
-
catch { }
|
|
554
|
-
}
|
|
647
|
+
// Device owns its own teardown (full browser vs. just-the-tab for shared
|
|
648
|
+
// mode, app/emulator cleanup for native).
|
|
649
|
+
await device.close();
|
|
555
650
|
}
|
|
556
651
|
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure parser/serializer for native (Android/iOS) accessibility trees — the
|
|
3
|
+
* native counterpart of the browser's DOM-locator tree. It turns a raw device
|
|
4
|
+
* a11y dump into the SAME `[id] role "name"` string the backend's DOMLocator
|
|
5
|
+
* reasons over, plus a local `shortId → bounds` map the device taps the CENTER
|
|
6
|
+
* of. No bounds ship to the backend; like the browser path, the CLI keeps the
|
|
7
|
+
* map and resolves the LLM's returned short id locally.
|
|
8
|
+
*
|
|
9
|
+
* FCIS: this module is pure (string in, structs out) — no `adb`/`idb` I/O — so
|
|
10
|
+
* it's unit-testable without a device, exactly like `coordinates.ts`. The I/O
|
|
11
|
+
* lives in `adb.ts`/`simctl.ts`; the parse/serialize math lives here.
|
|
12
|
+
*
|
|
13
|
+
* COORDINATE SPACE — carried, not converted, by this module:
|
|
14
|
+
* - Android `uiautomator dump` bounds are screencap PIXELS (`space: "px"`).
|
|
15
|
+
* - iOS WebDriverAgent /source frames are POINTS (`space: "points"`).
|
|
16
|
+
* The device de-normalizes/taps in its own space (AndroidDevice taps pixels;
|
|
17
|
+
* IOSDevice taps points), so the `space` tag tells the caller which dimension a
|
|
18
|
+
* node's bounds-center belongs to. This module never mixes the two.
|
|
19
|
+
*
|
|
20
|
+
* ANCESTOR-VS-LEAF (the hard part): on Android the visible label
|
|
21
|
+
* ("Network & internet") sits on a `clickable=false` TextView nested inside the
|
|
22
|
+
* clickable PARENT row. Tapping the leaf's center misses the row's hit logic and
|
|
23
|
+
* lands "slightly off"; the click target is the row. So the serializer walks to
|
|
24
|
+
* the nearest clickable ANCESTOR, aggregates its descendants' text/content-desc
|
|
25
|
+
* into ONE label, and emits the CLICKABLE node WITH THE ROW'S BOUNDS — never the
|
|
26
|
+
* leaf. iOS Buttons are already labeled + actionable, so they emit directly.
|
|
27
|
+
*/
|
|
28
|
+
export type CoordinateSpace = "px" | "points";
|
|
29
|
+
export interface Bounds {
|
|
30
|
+
x: number;
|
|
31
|
+
y: number;
|
|
32
|
+
width: number;
|
|
33
|
+
height: number;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* One parsed native a11y node. `bounds` are in `space` (Android px, iOS points).
|
|
37
|
+
* `clickable` marks an actionable hit target. `resourceId` is the Android
|
|
38
|
+
* resource-id / iOS AXUniqueId when present (diagnostic; not used for tapping).
|
|
39
|
+
*/
|
|
40
|
+
export interface NativeNode {
|
|
41
|
+
role: string;
|
|
42
|
+
label: string;
|
|
43
|
+
bounds: Bounds;
|
|
44
|
+
clickable: boolean;
|
|
45
|
+
/** True for nodes whose own text/desc is a label (used to aggregate onto rows). */
|
|
46
|
+
hasOwnLabel: boolean;
|
|
47
|
+
resourceId?: string;
|
|
48
|
+
space: CoordinateSpace;
|
|
49
|
+
}
|
|
50
|
+
export interface NativeTree {
|
|
51
|
+
/** `[id] role "label"` lines, one per emitted actionable node. */
|
|
52
|
+
simplified: string;
|
|
53
|
+
/** shortId → bounds (in the platform's space). The device taps the center. */
|
|
54
|
+
nodeMap: Map<string, Bounds>;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Parse a uiautomator XML dump into a flat list of leaf-significant nodes in
|
|
58
|
+
* document order. The dump is a single line of nested `<node ...>` tags; we
|
|
59
|
+
* rebuild the parent/child nesting from the open/close-tag stream (mirroring the
|
|
60
|
+
* "break after `>`" split the oracle scripts use, but tracking depth so the
|
|
61
|
+
* ancestor-aggregation in `serializeNativeTree` has the real tree).
|
|
62
|
+
*
|
|
63
|
+
* Returns the FLATTENED set of nodes (depth-first, document order) with their
|
|
64
|
+
* raw fields; the serializer decides which to emit and how to aggregate.
|
|
65
|
+
*/
|
|
66
|
+
export declare function parseUiautomatorXml(xml: string): NativeNode[];
|
|
67
|
+
/**
|
|
68
|
+
* Parse WDA's `GET /source?format=json` — a NESTED accessibility tree — into the
|
|
69
|
+
* FLAT, depth-first `NativeNode[]` (in POINTS) — the same flat shape `parseUiautomatorXml` returns —
|
|
70
|
+
* so `serializeNativeTree` consumes it unchanged. WDA's `type` matches idb's iOS
|
|
71
|
+
* types (Button/StaticText/SearchField/Cell/Image/Application…), so
|
|
72
|
+
* `normalizeRole`/`IOS_ACTIONABLE_TYPES`/`frameToBounds` all apply as-is.
|
|
73
|
+
*
|
|
74
|
+
* KEY: WDA's `/source` is the FULL XCUIElement tree — every container and leaf —
|
|
75
|
+
* NOT idb's clean accessibility-elements list. iOS settings rows surface as an
|
|
76
|
+
* accessible `Button` ("General", isAccessible=1) that ALSO contains a duplicate
|
|
77
|
+
* inner `StaticText` ("General", isAccessible=0) and is wrapped in a `Cell`
|
|
78
|
+
* (isAccessible=0). Emitting all three yields "General General" + empty
|
|
79
|
+
* listitems. So we emit ONLY `isAccessible && isVisible` nodes — exactly the
|
|
80
|
+
* VoiceOver-exposed set idb returned: the labeled Button is both the label and
|
|
81
|
+
* the tap target; the duplicate StaticText and the wrapping Cell are pruned. A
|
|
82
|
+
* sparse a11y tree degrades to the loop's vision fallback, so strict filtering
|
|
83
|
+
* never strands the run.
|
|
84
|
+
*
|
|
85
|
+
* Accepts either the raw tree or the W3C `{ value: <tree> }` envelope WDA returns.
|
|
86
|
+
*/
|
|
87
|
+
export declare function parseXcuiHierarchy(json: string): NativeNode[];
|
|
88
|
+
/**
|
|
89
|
+
* Serialize a flat NativeNode list (from `parseUiautomatorXml` /
|
|
90
|
+
* `parseXcuiHierarchy`) into the `[id] role "label"` string the DOMLocator
|
|
91
|
+
* reasons over, plus a `shortId → bounds` map for local tap resolution.
|
|
92
|
+
*
|
|
93
|
+
* Emission rules (kept tight, like the DOM serializer):
|
|
94
|
+
* - ANCESTOR-VS-LEAF: a CLICKABLE node absorbs its descendants' labels and is
|
|
95
|
+
* emitted with ITS OWN bounds (the tappable row). The descendant
|
|
96
|
+
* label-bearing leaves are then NOT emitted on their own — their text lives
|
|
97
|
+
* on the row. A label-bearing leaf with NO clickable ancestor (e.g. a
|
|
98
|
+
* standalone heading) is emitted directly so on-screen text isn't lost.
|
|
99
|
+
* - Skip pure decoration: a node that is neither clickable nor label-bearing,
|
|
100
|
+
* and a generic/application container that didn't aggregate a label.
|
|
101
|
+
*
|
|
102
|
+
* The input list is depth-first / document order, which is the order the raw
|
|
103
|
+
* parsers produce; we recover ancestry from that order using bounds containment
|
|
104
|
+
* (Android leaves nest inside their clickable row's rect; iOS is already flat).
|
|
105
|
+
*/
|
|
106
|
+
export declare function serializeNativeTree(nodes: NativeNode[]): NativeTree;
|
|
107
|
+
/** Center of a node's bounds — the point the device taps. */
|
|
108
|
+
export declare function boundsCenter(b: Bounds): {
|
|
109
|
+
x: number;
|
|
110
|
+
y: number;
|
|
111
|
+
};
|