@ishlabs/cli 0.24.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/ask.js +3 -3
- package/dist/commands/iteration.js +1 -1
- package/dist/commands/study-analyze.js +1 -1
- package/dist/commands/study-run.js +80 -12
- package/dist/commands/study.js +11 -7
- package/dist/lib/alias-store.js +1 -1
- package/dist/lib/api-client.d.ts +2 -0
- package/dist/lib/docs.js +57 -42
- package/dist/lib/local-sim/actions.d.ts +10 -2
- package/dist/lib/local-sim/actions.js +16 -11
- package/dist/lib/local-sim/adb.d.ts +103 -0
- package/dist/lib/local-sim/adb.js +352 -0
- package/dist/lib/local-sim/android.d.ts +111 -0
- package/dist/lib/local-sim/android.js +499 -0
- package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
- package/dist/lib/local-sim/apk-manifest.js +210 -0
- package/dist/lib/local-sim/browser.d.ts +22 -0
- package/dist/lib/local-sim/browser.js +65 -0
- package/dist/lib/local-sim/coordinates.d.ts +69 -0
- package/dist/lib/local-sim/coordinates.js +59 -0
- package/dist/lib/local-sim/device.d.ts +143 -0
- package/dist/lib/local-sim/device.js +152 -0
- package/dist/lib/local-sim/ios.d.ts +168 -0
- package/dist/lib/local-sim/ios.js +546 -0
- package/dist/lib/local-sim/loop.d.ts +14 -2
- package/dist/lib/local-sim/loop.js +166 -73
- package/dist/lib/local-sim/native-a11y.d.ts +97 -0
- package/dist/lib/local-sim/native-a11y.js +384 -0
- package/dist/lib/local-sim/simctl.d.ts +85 -0
- package/dist/lib/local-sim/simctl.js +273 -0
- package/dist/lib/local-sim/types.d.ts +37 -2
- package/dist/lib/local-sim/upload.d.ts +1 -1
- package/dist/lib/local-sim/upload.js +9 -6
- package/dist/lib/output.js +58 -12
- package/dist/lib/skill-content.js +10 -9
- package/package.json +2 -1
|
@@ -1,19 +1,31 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Local simulation loop orchestrator.
|
|
3
3
|
*
|
|
4
|
-
* Runs the observe → reason (remote) → act (local) loop for each
|
|
5
|
-
*
|
|
4
|
+
* Runs the observe → reason (remote) → act (local) loop for each participant
|
|
5
|
+
* against a SimulationDevice (a Playwright browser, or a native Android
|
|
6
|
+
* emulator / iOS simulator). The loop is device-agnostic — see device.ts.
|
|
6
7
|
*/
|
|
7
|
-
import {
|
|
8
|
+
import { launchSharedBrowser, FULL_PAGE_HEIGHT_CAP_PX_MOBILE, FULL_PAGE_HEIGHT_CAP_PX_DESKTOP, } from "./browser.js";
|
|
8
9
|
import { uploadScreenshot } from "./upload.js";
|
|
9
|
-
import {
|
|
10
|
-
import {
|
|
11
|
-
import { enableDebug, isDebugEnabled,
|
|
10
|
+
import { detectNoVisibleChange, describeAction } from "./actions.js";
|
|
11
|
+
import { createDevice } from "./device.js";
|
|
12
|
+
import { enableDebug, isDebugEnabled, debugRawResponse, debugNormalizedActions, debugActionExecution, debugForwards, debugStepSummary, debugRecord, } from "./debug.js";
|
|
13
|
+
/**
|
|
14
|
+
* Native (mobile) platforms drive a single physical device via screenshot →
|
|
15
|
+
* normalized-coordinate taps, with no accessibility tree or URL. Browser/web is
|
|
16
|
+
* everything else.
|
|
17
|
+
*/
|
|
18
|
+
function isNativePlatform(platform) {
|
|
19
|
+
return platform === "android" || platform === "ios";
|
|
20
|
+
}
|
|
12
21
|
/**
|
|
13
22
|
* Convert a raw action (from either resolved_actions or output.action.actions)
|
|
14
|
-
* into the flat LocalStepAction shape used by the executor.
|
|
23
|
+
* into the flat LocalStepAction shape used by the executor. Exported for unit
|
|
24
|
+
* tests of the native drag coordinate-shape split (the nested action's
|
|
25
|
+
* `coordinates` is a {x,y} tap point for most actions but a
|
|
26
|
+
* {startX,...,endY} path for a drag).
|
|
15
27
|
*/
|
|
16
|
-
function flattenAction(raw, nodeId = null, nodeDescription = null) {
|
|
28
|
+
export function flattenAction(raw, nodeId = null, nodeDescription = null) {
|
|
17
29
|
// resolved_actions nest the action inside an "action" key
|
|
18
30
|
const a = (raw.action ?? raw);
|
|
19
31
|
const element = a.element;
|
|
@@ -36,8 +48,35 @@ function flattenAction(raw, nodeId = null, nodeDescription = null) {
|
|
|
36
48
|
modifiers: Array.isArray(a.modifiers) ? a.modifiers : null,
|
|
37
49
|
key: a.key ?? null,
|
|
38
50
|
tab_id: a.tab_id ?? null,
|
|
51
|
+
orientation: a.orientation ?? null,
|
|
52
|
+
scale: a.scale ?? null,
|
|
53
|
+
// Native path: ResolvedAction.coordinates (top level of the resolved_actions
|
|
54
|
+
// entry) is the single {x,y} execution point. Fall back to the nested action
|
|
55
|
+
// for raw output — but only a point-shaped {x,y}; a drag's nested
|
|
56
|
+
// coordinates is the {startX,...,endY} path (extracted into `drag` below),
|
|
57
|
+
// not a tap point, so guard against mis-assigning it here.
|
|
58
|
+
coordinates: pickPoint(raw.coordinates) ?? pickPoint(a.coordinates) ?? null,
|
|
59
|
+
// drag: the gesture path lives on the nested action's coordinates as
|
|
60
|
+
// {startX, startY, endX, endY} (DragCoordinates serialized by_alias).
|
|
61
|
+
drag: pickDrag(a.coordinates) ?? null,
|
|
39
62
|
};
|
|
40
63
|
}
|
|
64
|
+
/** A nested action's coordinates only when it's the {x,y} tap-point shape. */
|
|
65
|
+
function pickPoint(c) {
|
|
66
|
+
if (c && typeof c === "object" && "x" in c && "y" in c) {
|
|
67
|
+
const p = c;
|
|
68
|
+
return { x: p.x, y: p.y };
|
|
69
|
+
}
|
|
70
|
+
return null;
|
|
71
|
+
}
|
|
72
|
+
/** A nested action's coordinates only when it's the DragCoordinates shape. */
|
|
73
|
+
function pickDrag(c) {
|
|
74
|
+
if (c && typeof c === "object" && "startX" in c && "endX" in c) {
|
|
75
|
+
const d = c;
|
|
76
|
+
return { startX: d.startX, startY: d.startY, endX: d.endX, endY: d.endY };
|
|
77
|
+
}
|
|
78
|
+
return null;
|
|
79
|
+
}
|
|
41
80
|
/**
|
|
42
81
|
* Normalize the raw backend step response into the flat structure used by the loop.
|
|
43
82
|
* Backend returns { output: { ... }, resolved_actions: [...], loop_detected }.
|
|
@@ -61,7 +100,10 @@ function normalizeStepResponse(raw) {
|
|
|
61
100
|
sentiment_intensity: out.sentiment_intensity ?? 0,
|
|
62
101
|
current_location: out.current_location,
|
|
63
102
|
effort_seconds: out.effort_seconds,
|
|
64
|
-
|
|
103
|
+
assignment_status: out.assignment_status,
|
|
104
|
+
// Terminate on completed OR abandoned — a stuck agent that gives up
|
|
105
|
+
// should stop the loop just like a finished one.
|
|
106
|
+
assignment_completed: out.assignment_status === "completed" || out.assignment_status === "abandoned",
|
|
65
107
|
actions,
|
|
66
108
|
loop_detected: raw.loop_detected,
|
|
67
109
|
};
|
|
@@ -89,7 +131,13 @@ export async function runLocalSimulations(client, opts) {
|
|
|
89
131
|
log("\nCancelling after current step...");
|
|
90
132
|
};
|
|
91
133
|
process.on("SIGINT", onSigint);
|
|
92
|
-
|
|
134
|
+
// Native runs share ONE physical device (emulator / simulator), so they
|
|
135
|
+
// can't run in parallel — force sequential regardless of --parallel.
|
|
136
|
+
const isNativeRun = isNativePlatform(opts.platform);
|
|
137
|
+
if (isNativeRun && (opts.parallel ?? 1) > 1) {
|
|
138
|
+
log("Native (android/ios) runs drive a single device — running sequentially.");
|
|
139
|
+
}
|
|
140
|
+
const concurrency = isNativeRun ? 1 : (opts.parallel ?? opts.participantIds.length);
|
|
93
141
|
try {
|
|
94
142
|
if (concurrency <= 1 || opts.participantIds.length <= 1) {
|
|
95
143
|
// Sequential execution — each participant owns its own browser
|
|
@@ -162,12 +210,20 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
162
210
|
product_id: opts.workspaceId,
|
|
163
211
|
iteration_id: opts.iterationId,
|
|
164
212
|
});
|
|
165
|
-
// Resolve
|
|
213
|
+
// Resolve target + config from iteration details (with CLI fallback).
|
|
214
|
+
// Platform precedence: --platform flag > iteration's stored platform > web.
|
|
166
215
|
const iterDetails = initResponse.iteration_details;
|
|
216
|
+
const platform = opts.platform ?? iterDetails?.platform ?? "web";
|
|
217
|
+
const isNative = isNativePlatform(platform);
|
|
218
|
+
// Browser needs a URL to navigate; native uses the app package (from --app or
|
|
219
|
+
// the iteration target) and has no URL requirement.
|
|
167
220
|
const navigationUrl = iterDetails?.url ?? opts.url;
|
|
168
|
-
if (!navigationUrl) {
|
|
221
|
+
if (!isNative && !navigationUrl) {
|
|
169
222
|
throw new Error("No URL available: backend did not return iteration_details and no --url flag was provided.");
|
|
170
223
|
}
|
|
224
|
+
// For native, launchOrReset() receives the app package (iteration target);
|
|
225
|
+
// the AndroidDevice prefers --app over this.
|
|
226
|
+
const launchTarget = isNative ? (navigationUrl ?? "") : navigationUrl;
|
|
171
227
|
const screenFormat = opts.screenFormat ?? iterDetails?.screen_format ?? "desktop";
|
|
172
228
|
const locale = opts.locale ?? iterDetails?.locale;
|
|
173
229
|
// Cache session state for per-step requests
|
|
@@ -189,7 +245,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
189
245
|
const stepContextValues = session.context_values.map(cv => cv.type === "secret" ? { ...cv, value: null } : cv);
|
|
190
246
|
const maxSteps = opts.maxInteractions ?? session.max_interactions;
|
|
191
247
|
const viewport = { width: 1440, height: 900 }; // TODO: extract from config
|
|
192
|
-
// Step 2:
|
|
248
|
+
// Step 2: Build the target device (per-platform dispatch).
|
|
249
|
+
// Browser today; AndroidDevice (adb) slots in via createDevice() later.
|
|
193
250
|
const browserOpts = {
|
|
194
251
|
headed: opts.headed,
|
|
195
252
|
slowMo: opts.slowMo,
|
|
@@ -198,15 +255,13 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
198
255
|
locale,
|
|
199
256
|
screenFormat,
|
|
200
257
|
};
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
:
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
const tabs = new TabManager(browserSession.context, browserSession.page);
|
|
209
|
-
let page = tabs.activePage();
|
|
258
|
+
const device = await createDevice(platform, {
|
|
259
|
+
browserOpts,
|
|
260
|
+
contextValues: session.context_values,
|
|
261
|
+
sharedBrowser,
|
|
262
|
+
appPath: opts.appPath,
|
|
263
|
+
log,
|
|
264
|
+
});
|
|
210
265
|
const history = [];
|
|
211
266
|
const interactions = [];
|
|
212
267
|
const debugSteps = [];
|
|
@@ -216,24 +271,51 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
216
271
|
let accumulatedEffortMs = 0;
|
|
217
272
|
let finalStatus = "completed";
|
|
218
273
|
try {
|
|
219
|
-
// Step 3:
|
|
220
|
-
await
|
|
274
|
+
// Step 3: Launch / navigate the target to its starting point.
|
|
275
|
+
await device.launchOrReset(launchTarget);
|
|
221
276
|
// Step 4: Run assignment loop
|
|
222
277
|
for (let assignmentIdx = 0; assignmentIdx < session.assignments.length; assignmentIdx++) {
|
|
223
278
|
const assignment = session.assignments[assignmentIdx];
|
|
224
279
|
log(` Assignment ${assignmentIdx + 1}/${session.assignments.length}: ${assignment.name}`);
|
|
225
280
|
let step = 0;
|
|
226
281
|
let assignmentCompleted = false;
|
|
282
|
+
// The agent's last per-turn status, used to pick the terminal run-level
|
|
283
|
+
// status when the loop ends because the agent terminated (completed vs
|
|
284
|
+
// abandoned). Stays "in_progress" if the loop hits max_steps.
|
|
285
|
+
let lastAssignmentStatus = "in_progress";
|
|
227
286
|
while (step < maxSteps && !assignmentCompleted && !isCancelled()) {
|
|
228
|
-
// OBSERVE —
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
287
|
+
// OBSERVE — the device refreshes its own active surface (popup /
|
|
288
|
+
// switch_tab for browser) before capturing. (The browser device emits
|
|
289
|
+
// its own richer debugObservation with tree/scroll detail.)
|
|
290
|
+
// TODO(perf): backend can downscale before the vision LLM; full-res sent for now.
|
|
291
|
+
const obs = await device.observe();
|
|
232
292
|
const currentScreenshot = obs.screenshot;
|
|
233
|
-
debugObservation(obs);
|
|
234
293
|
// Capture JPEG of observation for upload and recording (pre-action)
|
|
235
|
-
const obsJpeg = await
|
|
294
|
+
const obsJpeg = await device.captureScreenshotJpeg();
|
|
236
295
|
const obsBase64 = obsJpeg.toString("base64");
|
|
296
|
+
// Capture a height-capped full-page JPEG (pre-action, so it reflects
|
|
297
|
+
// the same screen the LLM reasons over). Sent to the backend as the
|
|
298
|
+
// PDQ basis + Frame representative_screenshot, matching the hosted
|
|
299
|
+
// run's full-page behavior. The per-interaction screenshot_url /
|
|
300
|
+
// recording stays the VIEWPORT (obsBase64) — unchanged.
|
|
301
|
+
// Degrade silently to omitting the field if capture fails: a frame is
|
|
302
|
+
// still created from the viewport.
|
|
303
|
+
const fullPageCap = screenFormat === "mobile_portrait"
|
|
304
|
+
? FULL_PAGE_HEIGHT_CAP_PX_MOBILE
|
|
305
|
+
: FULL_PAGE_HEIGHT_CAP_PX_DESKTOP;
|
|
306
|
+
let fullPageBase64;
|
|
307
|
+
try {
|
|
308
|
+
// Browser-only: native devices omit captureFullPageJpeg, so the
|
|
309
|
+
// field is dropped and the frame is created from the viewport.
|
|
310
|
+
fullPageBase64 = await device.captureFullPageJpeg?.({
|
|
311
|
+
documentHeight: obs.documentHeight,
|
|
312
|
+
cap: fullPageCap,
|
|
313
|
+
});
|
|
314
|
+
}
|
|
315
|
+
catch (err) {
|
|
316
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
317
|
+
log(` Warning: full-page screenshot capture failed — ${msg}`);
|
|
318
|
+
}
|
|
237
319
|
// Detect no-visible-change: compare this step's observation with the
|
|
238
320
|
// PREVIOUS step's observation (not the post-action screenshot).
|
|
239
321
|
// This tells us whether the previous step's action changed the page.
|
|
@@ -243,10 +325,9 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
243
325
|
previousObsScreenshot = currentScreenshot;
|
|
244
326
|
if (forwards.length > 0)
|
|
245
327
|
debugForwards(forwards);
|
|
246
|
-
const viewportSize = page.viewportSize() ?? viewport;
|
|
247
328
|
// Snapshot open tabs so the backend can prompt the LLM with tab ids
|
|
248
329
|
// (used by switch_tab/close_tab and to disambiguate cmd+click results).
|
|
249
|
-
const tabsSnapshot =
|
|
330
|
+
const tabsSnapshot = obs.tabs;
|
|
250
331
|
// REASON (remote)
|
|
251
332
|
let stepResponse;
|
|
252
333
|
try {
|
|
@@ -256,10 +337,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
256
337
|
assignment_name: assignment.name,
|
|
257
338
|
assignment_instructions: assignment.instructions,
|
|
258
339
|
screenshot: obs.screenshot,
|
|
259
|
-
accessibility_tree: obs.
|
|
340
|
+
accessibility_tree: obs.accessibilityTree,
|
|
260
341
|
current_url: obs.url,
|
|
261
|
-
screen_width:
|
|
262
|
-
screen_height:
|
|
342
|
+
screen_width: obs.width,
|
|
343
|
+
screen_height: obs.height,
|
|
263
344
|
interaction_count: step,
|
|
264
345
|
history,
|
|
265
346
|
forwards,
|
|
@@ -276,7 +357,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
276
357
|
catch (err) {
|
|
277
358
|
const msg = err instanceof Error ? err.message : String(err);
|
|
278
359
|
log(` Step ${step + 1}: API error — ${msg}`);
|
|
279
|
-
await
|
|
360
|
+
await new Promise((r) => setTimeout(r, 2000));
|
|
280
361
|
try {
|
|
281
362
|
const stepReqBody = {
|
|
282
363
|
participant_id: session.participant_id,
|
|
@@ -284,10 +365,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
284
365
|
assignment_name: assignment.name,
|
|
285
366
|
assignment_instructions: assignment.instructions,
|
|
286
367
|
screenshot: obs.screenshot,
|
|
287
|
-
accessibility_tree: obs.
|
|
368
|
+
accessibility_tree: obs.accessibilityTree,
|
|
288
369
|
current_url: obs.url,
|
|
289
|
-
screen_width:
|
|
290
|
-
screen_height:
|
|
370
|
+
screen_width: obs.width,
|
|
371
|
+
screen_height: obs.height,
|
|
291
372
|
interaction_count: step,
|
|
292
373
|
history,
|
|
293
374
|
forwards,
|
|
@@ -313,27 +394,21 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
313
394
|
const actionDescs = [];
|
|
314
395
|
const elementNames = [];
|
|
315
396
|
const actionDebugEntries = [];
|
|
316
|
-
const preActionScreenshot = await
|
|
397
|
+
const preActionScreenshot = await device.captureScreenshot();
|
|
317
398
|
for (let i = 0; i < stepResponse.actions.length; i++) {
|
|
318
399
|
if (isCancelled())
|
|
319
400
|
break;
|
|
320
|
-
// Pick up popup auto-switch / explicit tab switch from prior actions.
|
|
321
|
-
page = tabs.activePage();
|
|
322
401
|
const action = stepResponse.actions[i];
|
|
323
|
-
const
|
|
324
|
-
const result = await executeAction(page, action, lastTreeData, session.context_values, tabs);
|
|
402
|
+
const result = await device.executeAction(action);
|
|
325
403
|
const desc = describeAction(action);
|
|
326
404
|
debugActionExecution(i, action, result, action.node_id ? "cdp" : "playwright");
|
|
327
|
-
|
|
328
|
-
page = tabs.activePage();
|
|
329
|
-
const tabsAfter = (await tabs.list()).length;
|
|
330
|
-
const openedNewTab = action.type === "tap" && tabsAfter > tabsBefore;
|
|
405
|
+
const openedNewTab = result.openedNewTab;
|
|
331
406
|
let normalizedCoords = null;
|
|
332
407
|
if (result.coordinates) {
|
|
333
|
-
const
|
|
408
|
+
const dims = device.dimensions();
|
|
334
409
|
normalizedCoords = {
|
|
335
|
-
x: Math.round((result.coordinates.x /
|
|
336
|
-
y: Math.round((result.coordinates.y /
|
|
410
|
+
x: Math.round((result.coordinates.x / dims.width) * 1000),
|
|
411
|
+
y: Math.round((result.coordinates.y / dims.height) * 1000),
|
|
337
412
|
};
|
|
338
413
|
}
|
|
339
414
|
const actionType = action.type || "unknown";
|
|
@@ -355,6 +430,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
355
430
|
...(action.modifiers?.length && { modifiers: action.modifiers }),
|
|
356
431
|
...(action.key && { key: action.key }),
|
|
357
432
|
...(action.tab_id && { tab_id: action.tab_id }),
|
|
433
|
+
...(action.orientation && { orientation: action.orientation }),
|
|
434
|
+
// The recorded `coordinates` is the drag START; persist the END
|
|
435
|
+
// (normalized 0-1000) too so the journey captures the full path.
|
|
436
|
+
...(action.drag && { drag_end: { x: action.drag.endX, y: action.drag.endY } }),
|
|
358
437
|
...(openedNewTab && { opened_new_tab: true }),
|
|
359
438
|
},
|
|
360
439
|
order: i,
|
|
@@ -376,7 +455,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
376
455
|
}
|
|
377
456
|
// Check if UI changed significantly (skip for last action in batch)
|
|
378
457
|
if (i < stepResponse.actions.length - 1) {
|
|
379
|
-
const midScreenshot = await
|
|
458
|
+
const midScreenshot = await device.captureScreenshot();
|
|
380
459
|
if (!detectNoVisibleChange(preActionScreenshot, midScreenshot)) {
|
|
381
460
|
const blockedCount = stepResponse.actions.length - 1 - i;
|
|
382
461
|
forwards.push({
|
|
@@ -387,10 +466,12 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
387
466
|
}
|
|
388
467
|
}
|
|
389
468
|
}
|
|
390
|
-
// Upload observation
|
|
469
|
+
// Upload observation screenshot (pre-action — matches coordinates and
|
|
470
|
+
// LLM context). Browser captures JPEG; native screencap is PNG.
|
|
471
|
+
const obsContentType = isNative ? "image/png" : "image/jpeg";
|
|
391
472
|
let screenshotUrl;
|
|
392
473
|
try {
|
|
393
|
-
const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg);
|
|
474
|
+
const uploadResult = await uploadScreenshot(client, session.product_id, obsJpeg, obsContentType);
|
|
394
475
|
screenshotUrl = uploadResult.screenshotUrl;
|
|
395
476
|
}
|
|
396
477
|
catch (err) {
|
|
@@ -407,6 +488,10 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
407
488
|
screenshot_url: screenshotUrl,
|
|
408
489
|
location_name: stepResponse.current_location,
|
|
409
490
|
screen_format: screenFormat,
|
|
491
|
+
...(fullPageBase64 ? { full_page_screenshot_base64: fullPageBase64 } : {}),
|
|
492
|
+
// Native: drive FrameSourceType.ANDROID/IOS directly; browser falls
|
|
493
|
+
// back to screen_format server-side.
|
|
494
|
+
platform,
|
|
410
495
|
});
|
|
411
496
|
frameVersionId = matchResult.frame_version_id;
|
|
412
497
|
}
|
|
@@ -417,7 +502,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
417
502
|
// Debug-only: capture post-action screenshot to show result
|
|
418
503
|
let postActionBase64;
|
|
419
504
|
if (isDebugEnabled()) {
|
|
420
|
-
const postJpeg = await
|
|
505
|
+
const postJpeg = await device.captureScreenshotJpeg();
|
|
421
506
|
postActionBase64 = postJpeg.toString("base64");
|
|
422
507
|
}
|
|
423
508
|
// Accumulate effort (cumulative, not wall-clock)
|
|
@@ -437,10 +522,15 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
437
522
|
step: step + 1,
|
|
438
523
|
assignment_id: assignment.id,
|
|
439
524
|
...(screenshotUrl ? { screenshot_url: screenshotUrl } : { screenshot_base64: obsBase64 }),
|
|
525
|
+
// Dimensions of THIS step's screenshot (from observe()) so the backend
|
|
526
|
+
// can populate the screenshot ref even when only screenshot_url is
|
|
527
|
+
// sent (native) and it can't read the bytes for dims.
|
|
528
|
+
screen_width: obs.width,
|
|
529
|
+
screen_height: obs.height,
|
|
440
530
|
frame_version_id: frameVersionId,
|
|
441
531
|
timestamp_ms: accumulatedEffortMs,
|
|
442
532
|
comment: stepResponse.comment,
|
|
443
|
-
url:
|
|
533
|
+
url: device.currentUrl(),
|
|
444
534
|
sentiment: {
|
|
445
535
|
label: stepResponse.sentiment,
|
|
446
536
|
valence: stepResponse.sentiment_valence,
|
|
@@ -448,7 +538,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
448
538
|
},
|
|
449
539
|
actions: actionDatas,
|
|
450
540
|
current_location: stepResponse.current_location,
|
|
451
|
-
|
|
541
|
+
assignment_status: stepResponse.assignment_status,
|
|
452
542
|
// Server reduces this to Interaction.tab when N >= 2; omit on
|
|
453
543
|
// single-tab steps to keep the payload (and DB column) null.
|
|
454
544
|
...(tabsSnapshot.length >= 2 ? { tabs: tabsSnapshot } : {}),
|
|
@@ -467,7 +557,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
467
557
|
assignmentName: assignment.name,
|
|
468
558
|
screenshotBase64: obsBase64,
|
|
469
559
|
postActionScreenshotBase64: postActionBase64,
|
|
470
|
-
url:
|
|
560
|
+
url: device.currentUrl(),
|
|
471
561
|
actions: actionDebugEntries,
|
|
472
562
|
comment: stepResponse.comment,
|
|
473
563
|
sentiment: {
|
|
@@ -480,6 +570,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
480
570
|
effortSeconds: stepResponse.effort_seconds,
|
|
481
571
|
});
|
|
482
572
|
assignmentCompleted = stepResponse.assignment_completed;
|
|
573
|
+
lastAssignmentStatus = stepResponse.assignment_status;
|
|
483
574
|
step++;
|
|
484
575
|
}
|
|
485
576
|
if (isCancelled()) {
|
|
@@ -491,16 +582,25 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
491
582
|
});
|
|
492
583
|
break;
|
|
493
584
|
}
|
|
585
|
+
// When the agent terminated, persist its ACTUAL terminal status
|
|
586
|
+
// (completed vs abandoned) rather than always "completed". When it
|
|
587
|
+
// didn't terminate, the loop hit max_steps.
|
|
588
|
+
const terminalStatus = assignmentCompleted
|
|
589
|
+
? lastAssignmentStatus
|
|
590
|
+
: "max_steps_reached";
|
|
494
591
|
assignmentStatuses.push({
|
|
495
592
|
assignment_id: assignment.id,
|
|
496
|
-
status:
|
|
593
|
+
status: terminalStatus,
|
|
497
594
|
step_count: step,
|
|
498
595
|
});
|
|
499
|
-
if (assignmentCompleted) {
|
|
500
|
-
log(` Assignment
|
|
596
|
+
if (!assignmentCompleted) {
|
|
597
|
+
log(` Assignment reached max steps (${maxSteps})`);
|
|
598
|
+
}
|
|
599
|
+
else if (lastAssignmentStatus === "abandoned") {
|
|
600
|
+
log(` Assignment abandoned by agent after ${step} steps`);
|
|
501
601
|
}
|
|
502
602
|
else {
|
|
503
|
-
log(` Assignment
|
|
603
|
+
log(` Assignment completed in ${step} steps`);
|
|
504
604
|
}
|
|
505
605
|
}
|
|
506
606
|
}
|
|
@@ -518,7 +618,7 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
518
618
|
generateDebugReport(debugSteps, {
|
|
519
619
|
participantId: session.participant_id,
|
|
520
620
|
participantName,
|
|
521
|
-
url:
|
|
621
|
+
url: launchTarget,
|
|
522
622
|
screenFormat,
|
|
523
623
|
finalStatus,
|
|
524
624
|
assignmentStatuses,
|
|
@@ -542,15 +642,8 @@ async function runSingleSimulation(client, participantId, participantName, opts,
|
|
|
542
642
|
const msg = err instanceof Error ? err.message : String(err);
|
|
543
643
|
log(` Warning: failed to record results — ${msg}`);
|
|
544
644
|
}
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
else {
|
|
549
|
-
// Shared mode: close just the tab, not the context or browser
|
|
550
|
-
try {
|
|
551
|
-
await browserSession.page.close();
|
|
552
|
-
}
|
|
553
|
-
catch { }
|
|
554
|
-
}
|
|
645
|
+
// Device owns its own teardown (full browser vs. just-the-tab for shared
|
|
646
|
+
// mode, app/emulator cleanup for native).
|
|
647
|
+
await device.close();
|
|
555
648
|
}
|
|
556
649
|
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure parser/serializer for native (Android/iOS) accessibility trees — the
|
|
3
|
+
* native counterpart of the browser's DOM-locator tree. It turns a raw device
|
|
4
|
+
* a11y dump into the SAME `[id] role "name"` string the backend's DOMLocator
|
|
5
|
+
* reasons over, plus a local `shortId → bounds` map the device taps the CENTER
|
|
6
|
+
* of. No bounds ship to the backend; like the browser path, the CLI keeps the
|
|
7
|
+
* map and resolves the LLM's returned short id locally.
|
|
8
|
+
*
|
|
9
|
+
* FCIS: this module is pure (string in, structs out) — no `adb`/`idb` I/O — so
|
|
10
|
+
* it's unit-testable without a device, exactly like `coordinates.ts`. The I/O
|
|
11
|
+
* lives in `adb.ts`/`simctl.ts`; the parse/serialize math lives here.
|
|
12
|
+
*
|
|
13
|
+
* COORDINATE SPACE — carried, not converted, by this module:
|
|
14
|
+
* - Android `uiautomator dump` bounds are screencap PIXELS (`space: "px"`).
|
|
15
|
+
* - iOS `idb ui describe-all` frames are POINTS (`space: "points"`).
|
|
16
|
+
* The device de-normalizes/taps in its own space (AndroidDevice taps pixels;
|
|
17
|
+
* IOSDevice taps points), so the `space` tag tells the caller which dimension a
|
|
18
|
+
* node's bounds-center belongs to. This module never mixes the two.
|
|
19
|
+
*
|
|
20
|
+
* ANCESTOR-VS-LEAF (the hard part): on Android the visible label
|
|
21
|
+
* ("Network & internet") sits on a `clickable=false` TextView nested inside the
|
|
22
|
+
* clickable PARENT row. Tapping the leaf's center misses the row's hit logic and
|
|
23
|
+
* lands "slightly off"; the click target is the row. So the serializer walks to
|
|
24
|
+
* the nearest clickable ANCESTOR, aggregates its descendants' text/content-desc
|
|
25
|
+
* into ONE label, and emits the CLICKABLE node WITH THE ROW'S BOUNDS — never the
|
|
26
|
+
* leaf. iOS Buttons are already labeled + actionable, so they emit directly.
|
|
27
|
+
*/
|
|
28
|
+
export type CoordinateSpace = "px" | "points";
|
|
29
|
+
export interface Bounds {
|
|
30
|
+
x: number;
|
|
31
|
+
y: number;
|
|
32
|
+
width: number;
|
|
33
|
+
height: number;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* One parsed native a11y node. `bounds` are in `space` (Android px, iOS points).
|
|
37
|
+
* `clickable` marks an actionable hit target. `resourceId` is the Android
|
|
38
|
+
* resource-id / iOS AXUniqueId when present (diagnostic; not used for tapping).
|
|
39
|
+
*/
|
|
40
|
+
export interface NativeNode {
|
|
41
|
+
role: string;
|
|
42
|
+
label: string;
|
|
43
|
+
bounds: Bounds;
|
|
44
|
+
clickable: boolean;
|
|
45
|
+
/** True for nodes whose own text/desc is a label (used to aggregate onto rows). */
|
|
46
|
+
hasOwnLabel: boolean;
|
|
47
|
+
resourceId?: string;
|
|
48
|
+
space: CoordinateSpace;
|
|
49
|
+
}
|
|
50
|
+
export interface NativeTree {
|
|
51
|
+
/** `[id] role "label"` lines, one per emitted actionable node. */
|
|
52
|
+
simplified: string;
|
|
53
|
+
/** shortId → bounds (in the platform's space). The device taps the center. */
|
|
54
|
+
nodeMap: Map<string, Bounds>;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Parse a uiautomator XML dump into a flat list of leaf-significant nodes in
|
|
58
|
+
* document order. The dump is a single line of nested `<node ...>` tags; we
|
|
59
|
+
* rebuild the parent/child nesting from the open/close-tag stream (mirroring the
|
|
60
|
+
* "break after `>`" split the oracle scripts use, but tracking depth so the
|
|
61
|
+
* ancestor-aggregation in `serializeNativeTree` has the real tree).
|
|
62
|
+
*
|
|
63
|
+
* Returns the FLATTENED set of nodes (depth-first, document order) with their
|
|
64
|
+
* raw fields; the serializer decides which to emit and how to aggregate.
|
|
65
|
+
*/
|
|
66
|
+
export declare function parseUiautomatorXml(xml: string): NativeNode[];
|
|
67
|
+
/**
|
|
68
|
+
* Parse `idb ui describe-all` JSON (a FLAT array of elements, each with a `frame`
|
|
69
|
+
* in POINTS) into NativeNodes in array order. iOS is already a flat,
|
|
70
|
+
* properly-labeled list — no ancestor walk needed — so `clickable` is derived
|
|
71
|
+
* from the element's role/type and whether it carries a usable label.
|
|
72
|
+
*/
|
|
73
|
+
export declare function parseIdbDescribeAll(json: string): NativeNode[];
|
|
74
|
+
/**
|
|
75
|
+
* Serialize a flat NativeNode list (from `parseUiautomatorXml` /
|
|
76
|
+
* `parseIdbDescribeAll`) into the `[id] role "label"` string the DOMLocator
|
|
77
|
+
* reasons over, plus a `shortId → bounds` map for local tap resolution.
|
|
78
|
+
*
|
|
79
|
+
* Emission rules (kept tight, like the DOM serializer):
|
|
80
|
+
* - ANCESTOR-VS-LEAF: a CLICKABLE node absorbs its descendants' labels and is
|
|
81
|
+
* emitted with ITS OWN bounds (the tappable row). The descendant
|
|
82
|
+
* label-bearing leaves are then NOT emitted on their own — their text lives
|
|
83
|
+
* on the row. A label-bearing leaf with NO clickable ancestor (e.g. a
|
|
84
|
+
* standalone heading) is emitted directly so on-screen text isn't lost.
|
|
85
|
+
* - Skip pure decoration: a node that is neither clickable nor label-bearing,
|
|
86
|
+
* and a generic/application container that didn't aggregate a label.
|
|
87
|
+
*
|
|
88
|
+
* The input list is depth-first / document order, which is the order the raw
|
|
89
|
+
* parsers produce; we recover ancestry from that order using bounds containment
|
|
90
|
+
* (Android leaves nest inside their clickable row's rect; iOS is already flat).
|
|
91
|
+
*/
|
|
92
|
+
export declare function serializeNativeTree(nodes: NativeNode[]): NativeTree;
|
|
93
|
+
/** Center of a node's bounds — the point the device taps. */
|
|
94
|
+
export declare function boundsCenter(b: Bounds): {
|
|
95
|
+
x: number;
|
|
96
|
+
y: number;
|
|
97
|
+
};
|