@ishlabs/cli 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/ask.js +3 -3
- package/dist/commands/iteration.js +1 -1
- package/dist/commands/study-analyze.js +1 -1
- package/dist/commands/study-run.js +83 -15
- package/dist/commands/study.js +11 -7
- package/dist/lib/alias-store.js +1 -1
- package/dist/lib/api-client.d.ts +2 -0
- package/dist/lib/billing.d.ts +30 -16
- package/dist/lib/billing.js +77 -27
- package/dist/lib/docs.js +57 -42
- package/dist/lib/local-sim/actions.d.ts +10 -2
- package/dist/lib/local-sim/actions.js +16 -11
- package/dist/lib/local-sim/adb.d.ts +103 -0
- package/dist/lib/local-sim/adb.js +352 -0
- package/dist/lib/local-sim/android.d.ts +111 -0
- package/dist/lib/local-sim/android.js +499 -0
- package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
- package/dist/lib/local-sim/apk-manifest.js +210 -0
- package/dist/lib/local-sim/browser.d.ts +22 -0
- package/dist/lib/local-sim/browser.js +65 -0
- package/dist/lib/local-sim/coordinates.d.ts +69 -0
- package/dist/lib/local-sim/coordinates.js +59 -0
- package/dist/lib/local-sim/device.d.ts +143 -0
- package/dist/lib/local-sim/device.js +152 -0
- package/dist/lib/local-sim/ios.d.ts +168 -0
- package/dist/lib/local-sim/ios.js +546 -0
- package/dist/lib/local-sim/loop.d.ts +14 -2
- package/dist/lib/local-sim/loop.js +166 -73
- package/dist/lib/local-sim/native-a11y.d.ts +97 -0
- package/dist/lib/local-sim/native-a11y.js +384 -0
- package/dist/lib/local-sim/simctl.d.ts +85 -0
- package/dist/lib/local-sim/simctl.js +273 -0
- package/dist/lib/local-sim/types.d.ts +37 -2
- package/dist/lib/local-sim/upload.d.ts +1 -1
- package/dist/lib/local-sim/upload.js +9 -6
- package/dist/lib/modality.d.ts +10 -1
- package/dist/lib/modality.js +21 -0
- package/dist/lib/output.js +58 -12
- package/dist/lib/skill-content.js +10 -9
- package/package.json +2 -1
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SimulationDevice — the target a local simulation drives.
|
|
3
|
+
*
|
|
4
|
+
* The observe → reason (remote) → act (local) loop in `loop.ts` used to be
|
|
5
|
+
* hardwired to a Playwright `Page`. This interface abstracts exactly what the
|
|
6
|
+
* loop needs from a target so a native Android device (driven by `adb`) can
|
|
7
|
+
* slot in next to the browser. `BrowserDevice` (below) wraps the existing
|
|
8
|
+
* Playwright path in `browser.ts`/`actions.ts`/`tabs.ts`; `AndroidDevice`
|
|
9
|
+
* (in `android.js`) implements the same surface via `adb`, and `IOSDevice` (in `ios.js`) via simctl + idb.
|
|
10
|
+
*
|
|
11
|
+
* Multi-tab handling is browser-specific and stays hidden behind the
|
|
12
|
+
* interface — the loop never touches a `Page` or `TabManager` directly.
|
|
13
|
+
*/
|
|
14
|
+
import { launchBrowser, createTab, captureObservation, takeScreenshot, takeScreenshotJpeg, takeFullPageJpeg, navigateWithRetry, closeBrowser, } from "./browser.js";
|
|
15
|
+
import { executeAction } from "./actions.js";
|
|
16
|
+
import { TabManager } from "./tabs.js";
|
|
17
|
+
import { debugObservation } from "./debug.js";
|
|
18
|
+
/**
 * Playwright-backed simulation target.
 *
 * This class is a thin adapter: every operation is forwarded to the helpers
 * imported from `browser.js`, `actions.js` and `tabs.js`. It holds a browser
 * session and a `TabManager`. The active page is looked up through
 * `tabs.activePage()` at the start of every method rather than cached, because
 * a popup or a switch_tab/close_tab action can change which page is active
 * between calls.
 */
export class BrowserDevice {
    session;
    tabs;
    opts;
    contextValues;
    /** True when close() should shut the whole browser down; false when only this device's tab is closed. */
    ownsBrowser;
    /** Tree data returned by the most recent observe(); handed to the action executor. */
    lastTreeData = null;

    /**
     * @param session Browser session; its `context` and `page` seed the TabManager.
     * @param opts Browser options; `opts.viewport` is the fallback size when the page reports none.
     * @param contextValues Values forwarded untouched to the action executor.
     * @param ownsBrowser Whether close() tears down the browser (true) or just the tab (false).
     */
    constructor(session, opts, contextValues, ownsBrowser) {
        Object.assign(this, { session, opts, contextValues, ownsBrowser });
        this.tabs = new TabManager(session.context, session.page);
    }

    /** Navigate the currently active page to `target`. */
    async launchOrReset(target) {
        const activePage = this.tabs.activePage();
        await navigateWithRetry(activePage, target);
    }

    /**
     * Capture an observation of the active page, remember its tree data for the
     * next executeAction(), and return the loop-facing observation object.
     */
    async observe() {
        const observation = await captureObservation(this.tabs.activePage());
        this.lastTreeData = observation.treeData;
        debugObservation(observation);
        const openTabs = await this.tabs.list();
        const { screenshot, treeData, url, viewportWidth, viewportHeight, documentHeight } = observation;
        return {
            screenshot,
            accessibilityTree: treeData.simplified,
            url,
            width: viewportWidth,
            height: viewportHeight,
            documentHeight,
            tabs: openTabs,
        };
    }

    /** Screenshot of the active page, as produced by `takeScreenshot`. */
    async captureScreenshot() {
        const activePage = this.tabs.activePage();
        return takeScreenshot(activePage);
    }

    /** Screenshot of the active page, as produced by `takeScreenshotJpeg`. */
    async captureScreenshotJpeg() {
        const activePage = this.tabs.activePage();
        return takeScreenshotJpeg(activePage);
    }

    /**
     * Full-page JPEG of the active page.
     *
     * @param opts `documentHeight` and `cap` are passed straight through to `takeFullPageJpeg`.
     * @returns The `base64` field of the helper's result.
     */
    async captureFullPageJpeg(opts) {
        const activePage = this.tabs.activePage();
        // Prefer the live viewport width; fall back to the configured one when the page reports none.
        const liveViewport = activePage.viewportSize();
        const viewportWidth = liveViewport?.width ?? this.opts.viewport.width;
        const { base64 } = await takeFullPageJpeg(activePage, {
            documentHeight: opts.documentHeight,
            cap: opts.cap,
            viewportWidth,
        });
        return base64;
    }

    /** Viewport size of the active page, or the configured viewport when the page reports none. */
    dimensions() {
        return this.tabs.activePage().viewportSize() ?? this.opts.viewport;
    }

    /**
     * Run one action against the active page.
     *
     * @returns `success`, `elementName` and `coordinates` copied from the executor's
     *   result, plus `openedNewTab`, which is true only for a "tap" that increased
     *   the number of tabs.
     */
    async executeAction(action) {
        // Resolve the page now so a popup auto-switch or tab switch from an earlier action is honoured.
        const pageForAction = this.tabs.activePage();
        // Before any observe() there is no tree yet; use an empty one.
        const treeData = this.lastTreeData ?? { simplified: "", nodeMap: new Map() };
        const { length: tabCountBefore } = await this.tabs.list();
        const outcome = await executeAction(pageForAction, action, treeData, this.contextValues, this.tabs);
        // The action may have changed the active tab; the original re-reads it here, so the call is kept.
        this.tabs.activePage();
        const { length: tabCountAfter } = await this.tabs.list();
        const grewTabs = tabCountAfter > tabCountBefore;
        return {
            success: outcome.success,
            elementName: outcome.elementName,
            coordinates: outcome.coordinates,
            openedNewTab: action.type === "tap" && grewTabs,
        };
    }

    /** URL of the active page. */
    currentUrl() {
        const activePage = this.tabs.activePage();
        return activePage.url();
    }

    /**
     * Release the device. When this device owns the browser the whole session is
     * closed; otherwise only the session's page is closed and the context and
     * browser are left alone.
     */
    async close() {
        if (this.ownsBrowser) {
            await closeBrowser(this.session);
            return;
        }
        try {
            await this.session.page.close();
        }
        catch {
            // already closed
        }
    }
}
|
|
116
|
+
/**
 * Build the simulation device for a platform.
 *
 * - `"web"`, `"browser"` or `""` → a Playwright `BrowserDevice`. When
 *   `opts.sharedBrowser` is set the device gets a tab in that browser and does
 *   not own it; otherwise a browser is launched and the device owns it.
 * - `"android"` → `AndroidDevice` (adb).
 * - `"ios"` → `IOSDevice` (simctl + idb).
 *
 * The native modules are loaded with a dynamic `import()` so the browser path
 * never pulls in the adb/simctl code.
 *
 * @param platform Platform name; matched exactly (case-sensitive).
 * @param opts Browser path reads `sharedBrowser`, `browserOpts`, `contextValues`;
 *   native paths read `appPath`, `contextValues`, `log`.
 * @returns The constructed device.
 * @throws {Error} When `platform` is none of the values above.
 */
export async function createDevice(platform, opts) {
    // Options object shared by both native device constructors; built lazily, after the module loads.
    const nativeOptions = () => ({
        appPath: opts.appPath,
        contextValues: opts.contextValues,
        log: opts.log,
    });

    const isBrowserTarget = platform === "web" || platform === "browser" || platform === "";
    if (isBrowserTarget) {
        const shared = opts.sharedBrowser;
        const session = shared
            ? await createTab(shared, opts.browserOpts)
            : await launchBrowser(opts.browserOpts);
        // A device that joined a shared browser must not shut it down on close().
        return new BrowserDevice(session, opts.browserOpts, opts.contextValues, !shared);
    }

    if (platform === "android") {
        const androidModule = await import("./android.js");
        return new androidModule.AndroidDevice(nativeOptions());
    }

    if (platform === "ios") {
        const iosModule = await import("./ios.js");
        return new iosModule.IOSDevice(nativeOptions());
    }

    throw new Error(`Unsupported platform for local simulation: "${platform}"`);
}
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* IOSDevice — drives a local iOS simulator via `xcrun simctl` + `idb`,
|
|
3
|
+
* implementing the SimulationDevice surface the loop expects. Mirrors
|
|
4
|
+
* AndroidDevice; the one substantive difference is the coordinate space.
|
|
5
|
+
*
|
|
6
|
+
* Two resolution paths, mirroring the browser:
|
|
7
|
+
* - ELEMENT (preferred): observe() reads the `idb ui describe-all` a11y tree,
|
|
8
|
+
* serializes it to the `[id] role "label"` string the backend DOMLocator
|
|
9
|
+
* reasons over, and keeps a local `shortId → bounds` map (bounds in POINTS).
|
|
10
|
+
* The backend returns a `node_id`; executeAction() looks the bounds up and
|
|
11
|
+
* taps the element's CENTER.
|
|
12
|
+
* - VISION (fallback): when the tree is empty/sparse, observe() returns an
|
|
13
|
+
* empty tree so the backend takes its vision branch and returns NORMALIZED
|
|
14
|
+
* 0-1000 coordinates. Also taken per-action whenever node_id is absent.
|
|
15
|
+
*
|
|
16
|
+
* COORDINATE SPACE — two spaces, the key difference from Android (where
|
|
17
|
+
* screencap and tap share one pixel space):
|
|
18
|
+
* `simctl io booted screenshot` is in PIXELS (e.g. 1179x2556 @3x), but
|
|
19
|
+
* `idb ui tap/swipe` AND the `describe-all` a11y frames are POINTS (393x852).
|
|
20
|
+
* The invariant in BOTH paths: TAP in points, RECORD in pixels, because the
|
|
21
|
+
* loop re-normalizes the recorded coord against dimensions() (PIXELS).
|
|
22
|
+
* - VISION: tap pt = round(n/1000 * pointSize); record px = round(n/1000 * pixelSize).
|
|
23
|
+
* - ELEMENT: tap = bounds-center (already POINTS); record = that center
|
|
24
|
+
* scaled POINTS→PIXELS via pointToPixel() (the @Nx scale).
|
|
25
|
+
* dimensions() returns the PIXEL size, so the loop re-normalizes the recorded
|
|
26
|
+
* px back to a stable 0-1000. Recording in points would drift: the point grid
|
|
27
|
+
* (393) is coarser than the 0-1000 grid, so a points round-trip double-rounds
|
|
28
|
+
* (500→197→501). Pixels (1179 > 1000) are finer → identity. The vision model
|
|
29
|
+
* is resolution-independent (0-1000 is a fraction of the image), so the
|
|
30
|
+
* backend never converts coords with screen_width/height.
|
|
31
|
+
*/
|
|
32
|
+
import type { LocalStepAction, ContextValue } from "./types.js";
|
|
33
|
+
import type { SimulationDevice, DeviceObservation, DeviceActionResult } from "./device.js";
|
|
34
|
+
/** Construction options accepted by the `IOSDevice` constructor. */
export interface IosDeviceOptions {
    /** Bundle id to terminate/relaunch between participants. Derived from --app when a .app is given. */
    bundleId?: string;
    /** Local .app path to install before the run, or a bundle id to launch. */
    appPath?: string;
    /**
     * Context values for the run (required).
     * NOTE(review): how the device consumes these is not visible from this
     * declaration — confirm against ios.js.
     */
    contextValues: ContextValue[];
    /** Optional logging callback; called with a single message string. */
    log?: (msg: string) => void;
}
|
|
42
|
+
/**
 * Type declaration for the iOS simulator implementation of `SimulationDevice`.
 * The POINT vs PIXEL coordinate rules that the members below refer to are
 * described in the header comment at the top of this file.
 */
export declare class IOSDevice implements SimulationDevice {
    /** NOTE(review): presumably taken from `IosDeviceOptions.contextValues` — confirm in ios.js. */
    private readonly contextValues;
    /** NOTE(review): presumably taken from `IosDeviceOptions.log` — confirm in ios.js. */
    private readonly log;
    /** Bundle id being driven. Not readonly; see resolveBundleId. */
    private bundleId;
    /** NOTE(review): presumably taken from `IosDeviceOptions.appPath` — confirm in ios.js. */
    private readonly appPath;
    /** udid of the single booted simulator we drive. */
    private udid;
    /** POINT size — what idb ui tap/swipe consume (de-normalization basis for TAPS). */
    private pointWidth;
    private pointHeight;
    /**
     * PIXEL size — the screenshot resolution and the RECORDED coord space.
     * Recording in pixels (not points) keeps the loop's round-trip exact: the
     * point grid (e.g. 393) is coarser than the 0-1000 normalized grid, so a
     * points round-trip double-rounds and drifts; pixels (e.g. 1179 > 1000) are
     * finer, so de-normalize-then-re-normalize is an identity.
     */
    private pixelWidth;
    private pixelHeight;
    /**
     * shortId → bounds (POINTS — idb describe-all frames) from the last observe(),
     * the local counterpart of BrowserDevice.lastTreeData. executeAction()
     * resolves a backend `node_id` against this; the bounds-center is the POINT
     * tap target (recorded in pixels via pointToPixel).
     */
    private lastNodeMap;
    constructor(opts: IosDeviceOptions);
    /**
     * Part of the SimulationDevice surface.
     * NOTE(review): what `target` means for an iOS app is not visible from this
     * declaration — confirm in ios.js.
     */
    launchOrReset(target: string): Promise<void>;
    /**
     * Resolve the bundle id to drive, returning a non-null id or throwing.
     * Installs a local `.app` first and reads its CFBundleIdentifier from
     * Info.plist (no list-diff needed — a .app carries its id). A non-.app local
     * value is treated as an already-installed bundle id.
     */
    private resolveBundleId;
    /** NOTE(review): presumably refreshes the cached point/pixel sizes — confirm in ios.js. */
    private refreshScreen;
    /**
     * Produce the observation the loop consumes. Per the file header, the
     * accessibility tree comes from `idb ui describe-all`, and an empty tree is
     * returned when it is empty/sparse so the backend falls back to vision.
     */
    observe(): Promise<DeviceObservation>;
    /**
     * Read + serialize the idb describe-all a11y tree (bounds in POINTS). Any
     * failure (retries exhausted on a trivial tree, parse error) degrades to an
     * empty tree so the backend falls back to vision — a missing tree must never
     * abort the observation.
     */
    private dumpTree;
    /** Resolves to a string. NOTE(review): presumably an encoded screenshot — confirm the encoding in ios.js. */
    captureScreenshot(): Promise<string>;
    /** Resolves to a Buffer. NOTE(review): presumably JPEG bytes, going by the name — confirm in ios.js. */
    captureScreenshotJpeg(): Promise<Buffer>;
    /** Returns the PIXEL size (per the file header), which the loop re-normalizes recorded coords against. */
    dimensions(): {
        width: number;
        height: number;
    };
    /** Normalized 0-1000 → POINT space (idb ui tap/swipe take points). */
    private toPoints;
    /** Normalized 0-1000 → PIXEL space (the recorded/reported coord). */
    private toPixels;
    /**
     * Resolve the POINT tap target + PIXEL record coord for a positional action.
     * ELEMENT path (node_id): the bounds-center is the POINT tap; the recorded
     * pixel coord is that center scaled POINTS→PIXELS so it round-trips against
     * dimensions() (pixels). VISION path: de-normalize the 0-1000 coord into both
     * spaces. Returns {stale:true} for a node_id with no bounds (tree moved); the
     * caller fails the action so the loop forwards DOM_ELEMENT_NOT_FOUND.
     */
    private resolveTarget;
    /** Execute one loop action on the simulator and report the result. */
    executeAction(action: LocalStepAction): Promise<DeviceActionResult>;
    /** NOTE(review): presumably builds the failure result for an action with no usable coordinates — confirm in ios.js. */
    private failNoCoords;
    /** NOTE(review): presumably builds the failure result for the {stale:true} case of resolveTarget — confirm in ios.js. */
    private failStaleNode;
    private typeText;
    private scroll;
    private swipe;
    /**
     * Perform a drag: press the GRABBED element, move to the drop point, release.
     * A drag is "click an element and let it go", so the press lands element-
     * center (the resolved `grab` in POINTS — node_id bounds center, or the
     * vision coordinate when the tree is blind), NOT the backend's vision-
     * estimated start. The release point is the drag END (drag.endX/endY). A
     * ~0.8s idb swipe reads as a drag, not a flick. Returns the grab point scaled
     * to PIXELS (pointToPixel) to record so it round-trips against dimensions()
     * (pixels), or null if there's no end to drag toward.
     *
     * idb LIMITATION: `idb ui swipe` only exposes --duration/--delta — it has no
     * press-and-HOLD-then-move primitive (unlike Android's `input draganddrop`).
     * So this drives the immediate-drag surfaces (sliders, drag-to-dismiss, drag
     * handles that pick up on touch-move) but does NOT trigger a long-press
     * pickup (home-screen jiggle mode, in-app reorder that needs a hold first) —
     * verified on-device: a long uiSwipe leaves home-screen icons unmoved. The
     * grab/release SEMANTICS are still correct; the gap is purely the missing
     * hold, which idb can't perform in one continuous gesture.
     */
    private drag;
    /**
     * iOS has no hardware back. The system interactive-pop (left-edge swipe) is
     * NOT reliably triggerable through idb's synthetic touch — verified on the
     * simulator: no edge-swipe variant (start x, travel, duration, delta) pops
     * the view. So we resolve and TAP the nav-bar back button instead: iOS HIG
     * places "back" as the LEADING (leftmost) button in the top nav bar of any
     * pushed view, so the leftmost button in the nav-bar band is it — verified to
     * pop a Settings sub-screen back to root. The left-edge swipe remains a
     * best-effort fallback for real devices (where idb sends real HID events that
     * do drive the system gesture) when no back button is visible.
     */
    private navigateBack;
    /**
     * The nav-bar back button: the leading (leftmost) actionable button in the
     * top nav-bar band. iOS HIG guarantees "back" is the leading nav item in a
     * pushed view, so the leftmost button high on the screen is it. Returns null
     * on root screens (no leading back item) so the caller can fall back.
     *
     * The geometry alone (leftmost-top) would mis-fire on a modal whose LEADING
     * item is Cancel/Close, or a root with a leading Edit/menu — and tapping
     * Cancel/Close can DISCARD work. A stock back button is labeled with the
     * PARENT screen's title (e.g. "Settings"), not "Back", so there's no reliable
     * positive label signal; instead we exclude the known non-back leading
     * labels. If every leading button is one of those, we return null and let the
     * caller fall back rather than tap a destructive control.
     *
     * Known limitation: a glyph-only leading button with NO accessible label
     * (e.g. a hamburger/avatar/logo) isn't in the deny-list, so on a screen whose
     * leading control is an unlabeled non-back icon this can tap the wrong control
     * (silently — it returns success). Acceptable for the common case (stock nav
     * bars have a labeled back button), but it's why pushed views, not root/menu
     * screens, are where navigate_back is reliable.
     */
    private findBackButton;
    /** NOTE(review): presumably builds the failure result for an action type this device cannot perform — confirm in ios.js. */
    private failUnsupported;
    /**
     * Part of the SimulationDevice surface.
     * NOTE(review): a native app has no page URL; what string is returned is not
     * visible from this declaration — confirm in ios.js.
     */
    currentUrl(): string;
    /** Release the device. NOTE(review): what is torn down is not visible from this declaration — confirm in ios.js. */
    close(): Promise<void>;
}
|