@ishlabs/cli 0.24.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/commands/ask.js +3 -3
  2. package/dist/commands/iteration.js +1 -1
  3. package/dist/commands/study-analyze.js +1 -1
  4. package/dist/commands/study-run.js +80 -12
  5. package/dist/commands/study.js +11 -7
  6. package/dist/lib/alias-store.js +1 -1
  7. package/dist/lib/api-client.d.ts +2 -0
  8. package/dist/lib/docs.js +57 -42
  9. package/dist/lib/local-sim/actions.d.ts +10 -2
  10. package/dist/lib/local-sim/actions.js +16 -11
  11. package/dist/lib/local-sim/adb.d.ts +103 -0
  12. package/dist/lib/local-sim/adb.js +352 -0
  13. package/dist/lib/local-sim/android.d.ts +111 -0
  14. package/dist/lib/local-sim/android.js +499 -0
  15. package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
  16. package/dist/lib/local-sim/apk-manifest.js +210 -0
  17. package/dist/lib/local-sim/browser.d.ts +22 -0
  18. package/dist/lib/local-sim/browser.js +65 -0
  19. package/dist/lib/local-sim/coordinates.d.ts +69 -0
  20. package/dist/lib/local-sim/coordinates.js +59 -0
  21. package/dist/lib/local-sim/device.d.ts +143 -0
  22. package/dist/lib/local-sim/device.js +152 -0
  23. package/dist/lib/local-sim/ios.d.ts +168 -0
  24. package/dist/lib/local-sim/ios.js +546 -0
  25. package/dist/lib/local-sim/loop.d.ts +14 -2
  26. package/dist/lib/local-sim/loop.js +166 -73
  27. package/dist/lib/local-sim/native-a11y.d.ts +97 -0
  28. package/dist/lib/local-sim/native-a11y.js +384 -0
  29. package/dist/lib/local-sim/simctl.d.ts +85 -0
  30. package/dist/lib/local-sim/simctl.js +273 -0
  31. package/dist/lib/local-sim/types.d.ts +37 -2
  32. package/dist/lib/local-sim/upload.d.ts +1 -1
  33. package/dist/lib/local-sim/upload.js +9 -6
  34. package/dist/lib/output.js +58 -12
  35. package/dist/lib/skill-content.js +10 -9
  36. package/package.json +2 -1
@@ -0,0 +1,210 @@
1
+ /**
2
+ * Extract an APK's package name from its binary AndroidManifest.xml — pure,
3
+ * dependency-free (no aapt, no SDK), so `--app <apk>` resolves the package even
4
+ * when the apk is ALREADY installed (the install-list diff is empty then).
5
+ *
6
+ * Two steps, both on in-memory bytes:
7
+ * 1. Pull AndroidManifest.xml out of the APK (a ZIP) — locate its entry via
8
+ * the central directory, then inflate (raw deflate) or copy (stored) its data.
9
+ * 2. Parse the binary XML (AXML): read the string pool, find the <manifest>
10
+ * START_ELEMENT, and read its `package` attribute's string value.
11
+ *
12
+ * We only need the package string, so this is a deliberately minimal AXML
13
+ * reader (not a general decoder). Returns null on anything unexpected — the
14
+ * caller falls back to other resolution and never crashes on a weird apk.
15
+ */
16
+ import { inflateRawSync } from "node:zlib";
17
+ import { readFile } from "node:fs/promises";
// --- ZIP: pull AndroidManifest.xml bytes ---
const SIG_EOCD = 0x06054b50; // End Of Central Directory "PK\x05\x06"
const SIG_CENTRAL = 0x02014b50; // Central directory file header "PK\x01\x02"
const MANIFEST_NAME = "AndroidManifest.xml";
const LOCAL_HEADER_FIXED = 30; // bytes before the local header's name field
/**
 * Extract AndroidManifest.xml via the CENTRAL DIRECTORY (not local headers).
 * Modern APKs (v2-signed / zipaligned) flag the manifest entry with a data
 * descriptor, so its local-header compressed size is 0 — only the central
 * directory carries the real size. We find the EOCD, walk the central
 * directory for the manifest entry, then read its data from the local header.
 *
 * Only the 32-bit (non-ZIP64) fields are read. Every offset taken from the
 * archive is bounds-checked before use, so a truncated or corrupt archive
 * yields null instead of a RangeError.
 *
 * @param apk Buffer holding the whole APK file.
 * @returns The manifest's uncompressed bytes, or null when the archive has no
 *   readable manifest entry (no EOCD, entry missing, offsets out of range,
 *   unsupported compression method, or inflate failure).
 */
function extractManifestBytes(apk) {
    // EOCD is near the end; scan backward over the (≤64KB) comment for its sig.
    let eocd = -1;
    const minEocd = Math.max(0, apk.length - 22 - 0xffff);
    for (let i = apk.length - 22; i >= minEocd; i--) {
        if (apk.readUInt32LE(i) === SIG_EOCD) {
            eocd = i;
            break;
        }
    }
    if (eocd < 0)
        return null;
    const centralCount = apk.readUInt16LE(eocd + 10);
    const centralOffset = apk.readUInt32LE(eocd + 16);
    let p = centralOffset;
    for (let n = 0; n < centralCount; n++) {
        if (p + 46 > apk.length || apk.readUInt32LE(p) !== SIG_CENTRAL)
            break;
        const method = apk.readUInt16LE(p + 10);
        const compSize = apk.readUInt32LE(p + 20);
        const nameLen = apk.readUInt16LE(p + 28);
        const extraLen = apk.readUInt16LE(p + 30);
        const commentLen = apk.readUInt16LE(p + 32);
        const localOffset = apk.readUInt32LE(p + 42);
        const name = apk.toString("utf8", p + 46, p + 46 + nameLen);
        if (name === MANIFEST_NAME) {
            // The local header must lie fully inside the file before we read it;
            // a bogus offset would otherwise throw from readUInt16LE.
            if (localOffset + LOCAL_HEADER_FIXED > apk.length)
                return null;
            // Read the data from the LOCAL header (its name/extra lengths may differ
            // from the central record, so re-read them at localOffset).
            const lNameLen = apk.readUInt16LE(localOffset + 26);
            const lExtraLen = apk.readUInt16LE(localOffset + 28);
            const dataStart = localOffset + LOCAL_HEADER_FIXED + lNameLen + lExtraLen;
            const dataEnd = dataStart + compSize;
            // subarray() would silently clamp; treat a short entry as corrupt.
            if (dataEnd > apk.length)
                return null;
            const data = apk.subarray(dataStart, dataEnd);
            try {
                if (method === 0)
                    return Buffer.from(data); // stored
                if (method === 8)
                    return inflateRawSync(data); // deflate
            }
            catch {
                return null;
            }
            return null; // unsupported compression method
        }
        p += 46 + nameLen + extraLen + commentLen;
    }
    return null;
}
// --- AXML: read the string pool + find <manifest package="..."> ---
const RES_STRING_POOL_TYPE = 0x0001;
const RES_XML_START_ELEMENT_TYPE = 0x0102;
const UTF8_FLAG = 1 << 8;
/**
 * Parse the AXML string pool chunk at `poolOffset`; returns a lazy accessor.
 *
 * @param buf Buffer holding the binary XML.
 * @param poolOffset Byte offset of the string pool chunk inside `buf`.
 * @returns An object whose `get(index)` decodes (and caches) one string, or
 *   null when the chunk header is out of range or is not a string pool.
 *   `get` returns null for an out-of-range index and for any entry whose
 *   offset, length prefix or payload would run past the end of `buf`.
 */
function parseStringPool(buf, poolOffset) {
    // chunk: type(2) headerSize(2) chunkSize(4) stringCount(4) styleCount(4)
    // flags(4) stringsStart(4) stylesStart(4) then stringCount offsets.
    // The fixed header is 28 bytes; refuse to read it from a shorter buffer.
    if (poolOffset < 0 || poolOffset + 28 > buf.length)
        return null;
    const type = buf.readUInt16LE(poolOffset);
    if (type !== RES_STRING_POOL_TYPE)
        return null;
    const stringCount = buf.readUInt32LE(poolOffset + 8);
    const flags = buf.readUInt32LE(poolOffset + 16);
    const stringsStart = buf.readUInt32LE(poolOffset + 20);
    const isUtf8 = (flags & UTF8_FLAG) !== 0;
    const offsetsStart = poolOffset + 28;
    const stringDataBase = poolOffset + stringsStart;
    const cache = new Map();
    return {
        get(index) {
            if (index < 0 || index >= stringCount)
                return null;
            if (cache.has(index))
                return cache.get(index);
            // stringCount comes from the file, so the offset-table slot itself
            // may lie beyond the buffer.
            const slot = offsetsStart + index * 4;
            if (slot + 4 > buf.length)
                return null;
            const strOff = stringDataBase + buf.readUInt32LE(slot);
            let value;
            if (isUtf8) {
                // UTF-8: [u16-ish charLen][u16-ish byteLen] then bytes. Lengths use a
                // high-bit continuation; we only need the byte length to slice.
                let p = strOff;
                if (p >= buf.length)
                    return null;
                // skip the char count (1 or 2 bytes)
                p += buf[p] & 0x80 ? 2 : 1;
                if (p >= buf.length)
                    return null;
                let byteLen = buf[p];
                if (byteLen & 0x80) {
                    if (p + 1 >= buf.length)
                        return null;
                    byteLen = ((byteLen & 0x7f) << 8) | buf[p + 1];
                    p += 2;
                }
                else {
                    p += 1;
                }
                if (p + byteLen > buf.length)
                    return null;
                value = buf.toString("utf8", p, p + byteLen);
            }
            else {
                // UTF-16LE: [u16 charLen (high-bit continuation)] then 2*len bytes.
                let p = strOff;
                if (p + 2 > buf.length)
                    return null;
                let charLen = buf.readUInt16LE(p);
                p += 2;
                if (charLen & 0x8000) {
                    if (p + 2 > buf.length)
                        return null;
                    charLen = ((charLen & 0x7fff) << 16) | buf.readUInt16LE(p);
                    p += 2;
                }
                if (p + charLen * 2 > buf.length)
                    return null;
                value = buf.toString("utf16le", p, p + charLen * 2);
            }
            cache.set(index, value);
            return value;
        },
    };
}
/**
 * Parse binary AXML and return the <manifest> element's `package` attribute.
 * Returns null if the structure isn't what we expect — including truncated or
 * corrupt input, where an out-of-range Buffer read would otherwise throw.
 *
 * @param axml Buffer holding the binary AndroidManifest.xml.
 * @returns The package name, or null when it cannot be determined.
 */
export function parseAxmlPackage(axml) {
    try {
        return findManifestPackage(axml);
    }
    catch {
        // Buffer read* methods throw RangeError past the end of the buffer;
        // callers rely on null (never an exception) for a weird apk.
        return null;
    }
}
/** Walk the AXML chunks and read `package` off the first <manifest> element; may throw on out-of-range reads. */
function findManifestPackage(axml) {
    // Res_value data type for "data is a string-pool index".
    const TYPE_STRING = 0x03;
    if (axml.length < 8)
        return null;
    // file header: type(2) headerSize(2) fileSize(4). String pool follows at 8.
    const pool = parseStringPool(axml, 8);
    if (!pool)
        return null;
    // Advance past the string pool using its chunkSize, then walk the chunks
    // that follow looking for the first START_ELEMENT named "manifest".
    const poolChunkSize = axml.readUInt32LE(8 + 4);
    let off = 8 + poolChunkSize;
    while (off + 8 <= axml.length) {
        const type = axml.readUInt16LE(off);
        const headerSize = axml.readUInt16LE(off + 2);
        const chunkSize = axml.readUInt32LE(off + 4);
        // A chunk smaller than its own header would never advance the walk.
        if (chunkSize < 8 || off + chunkSize > axml.length)
            break;
        if (type === RES_XML_START_ELEMENT_TYPE) {
            // START_ELEMENT body (after the standard node header lineNo/comment):
            // ns(4) name(4) attrStart(2) attrSize(2) attrCount(2) ... then attrs.
            // The chunk header is `headerSize` bytes; the element fields start there.
            const base = off + headerSize;
            const nameIdx = axml.readInt32LE(base + 4);
            const elementName = nameIdx >= 0 ? pool.get(nameIdx) : null;
            if (elementName === "manifest") {
                const attrStart = axml.readUInt16LE(base + 8);
                const attrSize = axml.readUInt16LE(base + 10);
                const attrCount = axml.readUInt16LE(base + 12);
                const attrsBase = base + attrStart;
                for (let i = 0; i < attrCount; i++) {
                    const a = attrsBase + i * attrSize;
                    // attr: ns(4) name(4) rawValue(4) typedValue{size(2) res0(1) type(1) data(4)}
                    const attrNameIdx = axml.readInt32LE(a + 4);
                    const attrName = attrNameIdx >= 0 ? pool.get(attrNameIdx) : null;
                    if (attrName !== "package")
                        continue;
                    const rawValueIdx = axml.readInt32LE(a + 8);
                    if (rawValueIdx >= 0) {
                        const pkg = pool.get(rawValueIdx);
                        if (pkg)
                            return pkg;
                    }
                    // Fallback: the typed value's data is a string-pool index, but
                    // only when the value is actually typed as a string.
                    if (axml.readUInt8(a + 15) !== TYPE_STRING)
                        return null;
                    const dataIdx = axml.readInt32LE(a + 16);
                    return dataIdx >= 0 ? pool.get(dataIdx) : null;
                }
                return null; // manifest found but no package attr
            }
        }
        off += chunkSize;
    }
    return null;
}
/**
 * Read an APK file and return its package name, or null if it can't be parsed.
 *
 * Never rejects: a failed read, a missing manifest entry, and any exception
 * raised while extracting or parsing the manifest all resolve to null so the
 * caller can fall back to another way of resolving the package.
 *
 * @param apkPath Path of the .apk file on disk.
 * @returns Promise resolving to the package name or null.
 */
export async function packageNameFromApk(apkPath) {
    try {
        const apk = await readFile(apkPath);
        const manifest = extractManifestBytes(apk);
        if (!manifest)
            return null;
        return parseAxmlPackage(manifest);
    }
    catch {
        // Covers both I/O errors and out-of-range reads on a malformed archive.
        return null;
    }
}
@@ -58,5 +58,27 @@ export declare function resolveNodeToBoundingBox(page: Page, nodeId: string, tre
58
58
  export declare function resolveNodeToXPath(page: Page, nodeId: string, treeData: TreeData): Promise<string | null>;
59
59
  export declare function takeScreenshot(page: Page): Promise<string>;
60
60
  export declare function takeScreenshotJpeg(page: Page, quality?: number): Promise<Buffer>;
/** Full-page capture height cap used for mobile captures. */
export declare const FULL_PAGE_HEIGHT_CAP_PX_MOBILE: 12000;
/** Full-page capture height cap used for desktop captures. */
export declare const FULL_PAGE_HEIGHT_CAP_PX_DESKTOP: 16000;
/** What `takeFullPageJpeg` resolves with. */
export interface FullPageJpegResult {
    /** The captured JPEG, base64-encoded. */
    base64: string;
    /** Whether the capture was cut off at the height cap. */
    clipped: boolean;
}
/**
 * Take a full-page JPEG whose height is limited to `opts.cap`, following the
 * hosted backend's ``take_full_page_with_navbar`` (screenshots.py). The result
 * is the PDQ basis and is stored as the Frame's representative_screenshot.
 *
 * The agent's scroll position is preserved: Playwright's
 * ``page.screenshot({ fullPage: true })`` resets ``window.scrollY`` to 0 on
 * long pages, so scrollY is saved before the capture and put back afterwards
 * when it changed (as _safe_scroll_y / _restore_scroll_y do in
 * screenshots.py). Neither the save nor the restore can throw.
 *
 * @param page Page to capture.
 * @param opts `documentHeight` is compared against `cap` to decide whether to
 *   clip; `viewportWidth` is the width of the clip rectangle.
 * @param quality JPEG quality passed to the screenshot call.
 */
export declare function takeFullPageJpeg(
    page: Page,
    opts: { documentHeight: number; cap: number; viewportWidth: number },
    quality?: number,
): Promise<FullPageJpegResult>;
61
83
  export declare function navigateWithRetry(page: Page, url: string, maxRetries?: number): Promise<void>;
62
84
  export declare function closeBrowser(session: BrowserSession): Promise<void>;
@@ -304,6 +304,71 @@ export async function takeScreenshot(page) {
/**
 * Capture the page as a JPEG.
 *
 * @param page Object exposing `screenshot(options)`.
 * @param quality JPEG quality forwarded to the screenshot call (default 85).
 * @returns Whatever `page.screenshot` resolves with.
 */
export async function takeScreenshotJpeg(page, quality = 85) {
    const jpegOptions = { type: "jpeg", quality };
    return page.screenshot(jpegOptions);
}
// Maximum full-page capture heights, in CSS px. These mirror the hosted
// backend's FULL_PAGE_HEIGHT_CAP_PX_MOBILE / _DESKTOP constants in
// app/interactive/computers/browser/screenshots.py and are compared against
// document.documentElement.scrollHeight, like the hosted ``exceeds_cap``
// branch. Clipping at capture time keeps very tall pages from OOMing the
// renderer.
export const FULL_PAGE_HEIGHT_CAP_PX_MOBILE = 12_000;
export const FULL_PAGE_HEIGHT_CAP_PX_DESKTOP = 16_000;
/**
 * Take a full-page JPEG whose height is limited to `opts.cap`, following the
 * hosted backend's ``take_full_page_with_navbar`` (screenshots.py). The result
 * is the PDQ basis and is stored as the Frame's representative_screenshot.
 *
 * The agent is mid-task, so its scroll position must survive the capture:
 * scrollY is read first and, if the capture moved it, written back afterwards
 * (as _safe_scroll_y / _restore_scroll_y do in screenshots.py). A failed read
 * simply disables the restore, and a failed restore is swallowed.
 *
 * @param page Object exposing `evaluate` and `screenshot`.
 * @param opts `documentHeight` vs `cap` decides clipping; `viewportWidth` is
 *   the clip rectangle's width.
 * @param quality JPEG quality (default 85).
 * @returns `{ base64, clipped }` for the captured image.
 */
export async function takeFullPageJpeg(page, opts, quality = 85) {
    const readScrollY = async () => Math.round(await page.evaluate(() => window.scrollY));
    // null means "could not read scrollY, so do not try to restore it".
    let scrollBefore = null;
    try {
        scrollBefore = await readScrollY();
    }
    catch {
        scrollBefore = null;
    }
    try {
        const clipped = opts.documentHeight > opts.cap;
        // ``fullPage: true`` stays set in the clipped case too: per the original
        // implementation notes it lets the clip extend past the current viewport
        // (same as the hosted Python cap-clip branch) instead of being
        // intersected with the visible area.
        const shotOptions = clipped
            ? {
                fullPage: true,
                type: "jpeg",
                quality,
                clip: { x: 0, y: 0, width: opts.viewportWidth, height: opts.cap },
            }
            : { fullPage: true, type: "jpeg", quality };
        const jpeg = await page.screenshot(shotOptions);
        return { base64: jpeg.toString("base64"), clipped };
    }
    finally {
        // Put scrollY back if the capture moved it; never throw from here.
        if (scrollBefore !== null) {
            try {
                const scrollAfter = await readScrollY();
                if (scrollAfter !== scrollBefore) {
                    await page.evaluate((y) => window.scrollTo(0, y), scrollBefore);
                }
            }
            catch {
                // Best-effort restore; a failure here must not unwind the capture.
            }
        }
    }
}
307
372
  export async function navigateWithRetry(page, url, maxRetries = 3) {
308
373
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
309
374
  try {
@@ -0,0 +1,69 @@
1
+ /**
2
+ * Pure coordinate math for the native (vision) device path.
3
+ *
4
+ * The backend's vision locator returns NORMALIZED 0-1000 coordinates. A native
5
+ * device de-normalizes them against a concrete dimension (screencap pixels for
6
+ * Android; idb POINTS for the iOS tap, screenshot PIXELS for the iOS record),
7
+ * and the loop later re-normalizes the recorded coordinate back to 0-1000.
8
+ *
9
+ * Pure and side-effect-free so the round-trip can be unit-tested without a
10
+ * device (FCIS — the I/O lives in adb.ts/simctl.ts, the math lives here).
11
+ *
12
+ * ROUND-TRIP IDENTITY: deNormalize→reNormalize is an identity only when the
13
+ * target dimension is >= the normalized scale (1000). When the dim is finer
14
+ * (pixels, e.g. 1080/1179 > 1000) the recorded coord round-trips exactly; when
15
+ * it's coarser (iOS points 393 < 1000) double-rounding can drift by 1 unit —
16
+ * which is why iOS records in PIXELS and only TAPS in points.
17
+ */
/** Upper bound of the normalized coordinate range (coordinates run 0-1000). */
export declare const NORMALIZED_SCALE: 1000;
/**
 * Map a normalized 0-1000 value onto a concrete dimension.
 *
 * @param n Normalized coordinate.
 * @param dim Target dimension, in pixels or points.
 */
export declare function deNormalize(n: number, dim: number): number;
/**
 * Map a concrete coordinate back to the normalized 0-1000 range, which is the
 * form the loop records.
 *
 * @param coord Coordinate in pixels or points.
 * @param dim Dimension the coordinate was measured against.
 */
export declare function reNormalize(coord: number, dim: number): number;
/**
 * De-normalize both axes of a point: `x` against `width`, `y` against `height`.
 */
export declare function deNormalizePoint(
    c: { x: number; y: number },
    width: number,
    height: number,
): { x: number; y: number };
/**
 * De-normalize the two ends of a drag gesture. Start and end are each given in
 * normalized 0-1000 coordinates and are BOTH converted against the same device
 * dimension (Android: screencap pixels; iOS: idb POINTS), yielding the
 * {start,end} pair the drivers pass to a slow swipe. Kept pure so the
 * two-ended conversion can be unit-tested without a device.
 */
export declare function deNormalizeDrag(
    drag: { startX: number; startY: number; endX: number; endY: number },
    width: number,
    height: number,
): {
    start: { x: number; y: number };
    end: { x: number; y: number };
};
/**
 * Convert an iOS POINT coordinate to PIXEL space for the element path.
 *
 * `idb` reports a11y frames — and therefore the tappable bounds-center — in
 * POINTS, while the loop records, and re-normalizes against `dimensions()`, in
 * PIXELS. The element path therefore taps the point-center as-is (idb consumes
 * points) but must RECORD a pixel-center; this function scales each axis by
 * the point→pixel ratio (the @Nx scale). Pure, so it is unit-testable without
 * a simulator. Android has no equivalent because its screencap and tap share a
 * single pixel space.
 */
export declare function pointToPixel(
    c: { x: number; y: number },
    pointWidth: number,
    pointHeight: number,
    pixelWidth: number,
    pixelHeight: number,
): { x: number; y: number };
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Pure coordinate math for the native (vision) device path.
3
+ *
4
+ * The backend's vision locator returns NORMALIZED 0-1000 coordinates. A native
5
+ * device de-normalizes them against a concrete dimension (screencap pixels for
6
+ * Android; idb POINTS for the iOS tap, screenshot PIXELS for the iOS record),
7
+ * and the loop later re-normalizes the recorded coordinate back to 0-1000.
8
+ *
9
+ * Pure and side-effect-free so the round-trip can be unit-tested without a
10
+ * device (FCIS — the I/O lives in adb.ts/simctl.ts, the math lives here).
11
+ *
12
+ * ROUND-TRIP IDENTITY: deNormalize→reNormalize is an identity only when the
13
+ * target dimension is >= the normalized scale (1000). When the dim is finer
14
+ * (pixels, e.g. 1080/1179 > 1000) the recorded coord round-trips exactly; when
15
+ * it's coarser (iOS points 393 < 1000) double-rounding can drift by 1 unit —
16
+ * which is why iOS records in PIXELS and only TAPS in points.
17
+ */
/** Upper bound of the normalized coordinate range (coordinates run 0-1000). */
export const NORMALIZED_SCALE = 1000;
/**
 * Map a normalized 0-1000 value onto a concrete dimension (pixels or points),
 * rounded to the nearest integer.
 */
export function deNormalize(n, dim) {
    const fraction = n / NORMALIZED_SCALE;
    return Math.round(fraction * dim);
}
/**
 * Map a concrete coordinate (pixels or points) back to the normalized 0-1000
 * range the loop records, rounded to the nearest integer.
 */
export function reNormalize(coord, dim) {
    const fraction = coord / dim;
    return Math.round(fraction * NORMALIZED_SCALE);
}
/** De-normalize both axes of a point: `x` against `width`, `y` against `height`. */
export function deNormalizePoint(c, width, height) {
    const x = deNormalize(c.x, width);
    const y = deNormalize(c.y, height);
    return { x, y };
}
/**
 * De-normalize the two ends of a drag gesture. Start and end are each given in
 * normalized 0-1000 coordinates and are BOTH converted against the same device
 * dimension (Android: screencap pixels; iOS: idb POINTS), yielding the
 * {start,end} pair the drivers pass to a slow swipe. Kept pure so the
 * two-ended conversion can be unit-tested without a device.
 */
export function deNormalizeDrag(drag, width, height) {
    const from = { x: drag.startX, y: drag.startY };
    const start = deNormalizePoint(from, width, height);
    const to = { x: drag.endX, y: drag.endY };
    const end = deNormalizePoint(to, width, height);
    return { start, end };
}
/**
 * Convert an iOS POINT coordinate to PIXEL space for the element path.
 *
 * `idb` reports a11y frames — and therefore the tappable bounds-center — in
 * POINTS, while the loop records, and re-normalizes against `dimensions()`, in
 * PIXELS. The element path therefore taps the point-center as-is (idb consumes
 * points) but must RECORD a pixel-center; this function scales each axis by
 * the point→pixel ratio (the @Nx scale). Pure, so it is unit-testable without
 * a simulator. Android has no equivalent because its screencap and tap share a
 * single pixel space.
 */
export function pointToPixel(c, pointWidth, pointHeight, pixelWidth, pixelHeight) {
    const rescale = (value, pointDim, pixelDim) => Math.round((value / pointDim) * pixelDim);
    const x = rescale(c.x, pointWidth, pixelWidth);
    const y = rescale(c.y, pointHeight, pixelHeight);
    return { x, y };
}
@@ -0,0 +1,143 @@
1
+ /**
2
+ * SimulationDevice — the target a local simulation drives.
3
+ *
4
+ * The observe → reason (remote) → act (local) loop in `loop.ts` used to be
5
+ * hardwired to a Playwright `Page`. This interface abstracts exactly what the
6
+ * loop needs from a target so a native Android device (driven by `adb`) can
7
+ * slot in next to the browser. `BrowserDevice` (below) wraps the existing
8
+ * Playwright path in `browser.ts`/`actions.ts`/`tabs.ts`; `AndroidDevice`
9
+ * (added later) implements the same surface via `adb`.
10
+ *
11
+ * Multi-tab handling is browser-specific and stays hidden behind the
12
+ * interface — the loop never touches a `Page` or `TabManager` directly.
13
+ */
14
+ import type { Browser } from "playwright-core";
15
+ import type { LocalStepAction, LocalSimBrowserOptions, LocalTabInfo, ContextValue } from "./types.js";
16
+ import type { BrowserSession } from "./browser.js";
/**
 * Snapshot of the target's state, as produced by one observation.
 *
 * Both the browser (CDP) and native targets (uiautomator / idb describe-all)
 * fill `accessibilityTree`, serialized in the `[id] role "name"` format the
 * backend DOMLocator reasons over. It is "" only when a native dump fails or
 * yields a sparse tree, in which case the backend takes its vision branch.
 * `url` and `tabs` are browser-only ("" and empty for native).
 *
 * The node map needed to resolve later actions is not part of this object; it
 * stays inside the device (BrowserDevice.lastTreeData /
 * AndroidDevice.lastNodeMap / IOSDevice.lastNodeMap).
 */
export interface DeviceObservation {
    /** Current screen as a base64 PNG. */
    screenshot: string;
    /** Simplified accessibility tree (browser + native); "" when a native dump degrades to vision. */
    accessibilityTree: string;
    /** Current URL for the browser; "" for native. */
    url: string;
    width: number;
    height: number;
    /** Full document height for the browser; the same as `height` for native screens. */
    documentHeight: number;
    /** Snapshot of open tabs (browser-only; empty for native). */
    tabs: LocalTabInfo[];
}
/**
 * Outcome of running a single action against the target.
 *
 * `coordinates` use the device's own pixel space (browser viewport px or
 * native screencap px). `openedNewTab` only applies to the browser and is
 * always false for native targets.
 */
export interface DeviceActionResult {
    success: boolean;
    elementName: string | null;
    coordinates: { x: number; y: number } | null;
    openedNewTab: boolean;
}
/**
 * A target the simulation loop can drive. Each implementation manages its own
 * lifecycle and, in the browser's case, its tab bookkeeping.
 */
export interface SimulationDevice {
    /**
     * Launch the target, or reset it to a clean state, and bring it to
     * `target`: a URL for the browser, an app id / apk path for native.
     */
    launchOrReset(target: string): Promise<void>;
    /** Take a full observation of the current screen state. */
    observe(): Promise<DeviceObservation>;
    /** base64 PNG, used for cheap no-visible-change detection between steps. */
    captureScreenshot(): Promise<string>;
    /** JPEG buffer, used for upload, frame-matching, recording, and debug. */
    captureScreenshotJpeg(): Promise<Buffer>;
    /**
     * Height-capped full-page JPEG (base64) serving as the backend's PDQ basis
     * and representative_screenshot. Optional because it is browser-only:
     * native targets have no scrollable document, so they omit it and the
     * frame is created from the viewport.
     */
    captureFullPageJpeg?(opts: { documentHeight: number; cap: number }): Promise<string | undefined>;
    /** Current pixel dimensions of the target (viewport / screencap). */
    dimensions(): { width: number; height: number };
    /** Run one action and report what happened. */
    executeAction(action: LocalStepAction): Promise<DeviceActionResult>;
    /** Location string to record: the URL for the browser, "" for native. */
    currentUrl(): string;
    /** Tear the device down. For shared-browser tabs this closes just the tab. */
    close(): Promise<void>;
}
/**
 * Playwright-backed `SimulationDevice`. It delegates to the existing
 * `browser.ts`/`actions.ts`/`tabs.ts` helpers rather than re-implementing them.
 *
 * The device owns a `BrowserSession` and a `TabManager`. Because the active
 * page can change — a popup auto-focuses, or the LLM issues
 * switch_tab/close_tab — every method re-reads `tabs.activePage()` before
 * acting, as the previous loop did.
 */
export declare class BrowserDevice implements SimulationDevice {
    private readonly session;
    private readonly tabs;
    private readonly opts;
    private readonly contextValues;
    /** False when this device shares a browser process, in which case it only closes its tab. */
    private readonly ownsBrowser;
    /** CDP node map from the most recent observe(); required to resolve actions. */
    private lastTreeData;
    constructor(session: BrowserSession, opts: LocalSimBrowserOptions, contextValues: ContextValue[], ownsBrowser: boolean);
    launchOrReset(target: string): Promise<void>;
    observe(): Promise<DeviceObservation>;
    captureScreenshot(): Promise<string>;
    captureScreenshotJpeg(): Promise<Buffer>;
    captureFullPageJpeg(opts: { documentHeight: number; cap: number }): Promise<string | undefined>;
    dimensions(): { width: number; height: number };
    executeAction(action: LocalStepAction): Promise<DeviceActionResult>;
    currentUrl(): string;
    close(): Promise<void>;
}
/**
 * Create the device for `platform`:
 * - `web` / `browser` / `""` → Playwright `BrowserDevice`
 * - `android` → `AndroidDevice` (adb)
 * - `ios` → `IOSDevice` (simctl + idb)
 *
 * The native implementations are imported dynamically, so the browser path
 * never loads the adb/simctl modules.
 */
export declare function createDevice(
    platform: string,
    opts: {
        browserOpts: LocalSimBrowserOptions;
        contextValues: ContextValue[];
        sharedBrowser?: Browser;
        /** Native only: a local .apk/.app path to install, or a package/bundle id to launch. */
        appPath?: string;
        log?: (msg: string) => void;
    },
): Promise<SimulationDevice>;