@ishlabs/cli 0.24.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/ask.js +3 -3
- package/dist/commands/iteration.js +1 -1
- package/dist/commands/study-analyze.js +1 -1
- package/dist/commands/study-run.js +80 -12
- package/dist/commands/study.js +11 -7
- package/dist/lib/alias-store.js +1 -1
- package/dist/lib/api-client.d.ts +2 -0
- package/dist/lib/docs.js +57 -42
- package/dist/lib/local-sim/actions.d.ts +10 -2
- package/dist/lib/local-sim/actions.js +16 -11
- package/dist/lib/local-sim/adb.d.ts +103 -0
- package/dist/lib/local-sim/adb.js +352 -0
- package/dist/lib/local-sim/android.d.ts +111 -0
- package/dist/lib/local-sim/android.js +499 -0
- package/dist/lib/local-sim/apk-manifest.d.ts +22 -0
- package/dist/lib/local-sim/apk-manifest.js +210 -0
- package/dist/lib/local-sim/browser.d.ts +22 -0
- package/dist/lib/local-sim/browser.js +65 -0
- package/dist/lib/local-sim/coordinates.d.ts +69 -0
- package/dist/lib/local-sim/coordinates.js +59 -0
- package/dist/lib/local-sim/device.d.ts +143 -0
- package/dist/lib/local-sim/device.js +152 -0
- package/dist/lib/local-sim/ios.d.ts +168 -0
- package/dist/lib/local-sim/ios.js +546 -0
- package/dist/lib/local-sim/loop.d.ts +14 -2
- package/dist/lib/local-sim/loop.js +166 -73
- package/dist/lib/local-sim/native-a11y.d.ts +97 -0
- package/dist/lib/local-sim/native-a11y.js +384 -0
- package/dist/lib/local-sim/simctl.d.ts +85 -0
- package/dist/lib/local-sim/simctl.js +273 -0
- package/dist/lib/local-sim/types.d.ts +37 -2
- package/dist/lib/local-sim/upload.d.ts +1 -1
- package/dist/lib/local-sim/upload.js +9 -6
- package/dist/lib/output.js +58 -12
- package/dist/lib/skill-content.js +10 -9
- package/package.json +2 -1
|
@@ -0,0 +1,499 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AndroidDevice — drives a local Android emulator/device via `adb`, implementing
|
|
3
|
+
* the SimulationDevice surface the loop expects.
|
|
4
|
+
*
|
|
5
|
+
* Two resolution paths, mirroring the browser:
|
|
6
|
+
* - ELEMENT (preferred): observe() dumps the uiautomator a11y tree, serializes
|
|
7
|
+
* it to the `[id] role "label"` string the backend DOMLocator reasons over,
|
|
8
|
+
* and keeps a local `shortId → bounds` map. The backend returns a `node_id`;
|
|
9
|
+
* executeAction() looks the bounds up locally and taps the row's CENTER.
|
|
10
|
+
* - VISION (fallback): when the dump fails or yields a sparse tree, observe()
|
|
11
|
+
* returns an empty tree so the backend takes its vision branch and returns
|
|
12
|
+
* NORMALIZED 0-1000 coordinates, which we de-normalize and tap (the original
|
|
13
|
+
* path). The vision path is also taken per-action whenever node_id is absent.
|
|
14
|
+
*
|
|
15
|
+
* Coordinate contract (see scripts/mobile-e2e + CROSS-REPO CONTRACT):
|
|
16
|
+
* adb `screencap` and `input tap` share ONE pixel space — NO DPR correction.
|
|
17
|
+
* - Element path: uiautomator bounds are screencap PIXELS, so the bounds-center
|
|
18
|
+
* is already a pixel center — tap and record it as-is.
|
|
19
|
+
* - Vision path: px = round(x / 1000 * screencapWidth); same for y.
|
|
20
|
+
*/
|
|
21
|
+
import { resolveTextValue } from "./actions.js";
|
|
22
|
+
import { requireOneDevice, screencapPng, pngDimensions, dumpUiautomatorXml, inputTap, inputSwipe, inputDrag, inputLongPress, setUserRotation, forceStop, launchApp, installApk, isPackageInstalled, listPackages, isAdbKeyboardInstalled, enableAdbKeyboard, setIme, resetIme, currentIme, adbKeyboardType, adbKeyboardClear, pressKeyEvent, ADB_KEYBOARD_PKG, } from "./adb.js";
|
|
23
|
+
import { isLocalPath } from "../upload.js";
|
|
24
|
+
import { deNormalizePoint, deNormalizeDrag } from "./coordinates.js";
|
|
25
|
+
import { parseUiautomatorXml, serializeNativeTree, boundsCenter } from "./native-a11y.js";
|
|
26
|
+
import { packageNameFromApk } from "./apk-manifest.js";
|
|
27
|
+
// Let animations/IME transitions settle before the next observation so the
|
|
28
|
+
// screenshot the LLM reasons over reflects the action's result.
|
|
29
|
+
const POST_GESTURE_SETTLE_MS = 500;
|
|
30
|
+
/**
 * Pause for `ms` milliseconds before continuing.
 *
 * @param ms - delay in milliseconds; defaults to POST_GESTURE_SETTLE_MS.
 * @returns a promise that resolves once the timer fires.
 */
async function settle(ms = POST_GESTURE_SETTLE_MS) {
    await new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
|
|
33
|
+
export class AndroidDevice {
|
|
34
|
+
contextValues;
|
|
35
|
+
log;
|
|
36
|
+
appPackage;
|
|
37
|
+
appPath;
|
|
38
|
+
/** screencap pixel size from the most recent capture — the de-normalization basis. */
|
|
39
|
+
screenWidth = 0;
|
|
40
|
+
screenHeight = 0;
|
|
41
|
+
/**
|
|
42
|
+
* shortId → bounds (screencap PIXELS) from the last observe(), the local
|
|
43
|
+
* counterpart of BrowserDevice.lastTreeData. executeAction() resolves a
|
|
44
|
+
* backend `node_id` against this and taps the bounds CENTER (element path).
|
|
45
|
+
*/
|
|
46
|
+
lastNodeMap = new Map();
|
|
47
|
+
/** IME to restore on close (null if we never switched it). */
|
|
48
|
+
previousIme = null;
|
|
49
|
+
adbKeyboardActive = false;
|
|
50
|
+
constructor(opts) {
|
|
51
|
+
this.contextValues = opts.contextValues;
|
|
52
|
+
this.log = opts.log ?? (() => { });
|
|
53
|
+
this.appPackage = opts.appPackage ?? null;
|
|
54
|
+
this.appPath = opts.appPath;
|
|
55
|
+
}
|
|
56
|
+
async launchOrReset(target) {
|
|
57
|
+
await requireOneDevice();
|
|
58
|
+
// First call: install the apk (if --app is a local path) and resolve the
|
|
59
|
+
// package name to force-stop/relaunch on. `target` is the iteration's
|
|
60
|
+
// platform target (a package name) when no --app apk is supplied. Throws
|
|
61
|
+
// (rather than silently driving the foreground) if the package can't be
|
|
62
|
+
// resolved — a wrong-app run is worse than a clear error.
|
|
63
|
+
if (!this.appPackage) {
|
|
64
|
+
this.appPackage = await this.resolvePackage(target);
|
|
65
|
+
}
|
|
66
|
+
const pkg = this.appPackage;
|
|
67
|
+
// Set up ADBKeyboard once so text_input works (best-effort — text_input
|
|
68
|
+
// degrades to a no-op-with-warning if the IME isn't installed).
|
|
69
|
+
await this.ensureAdbKeyboard();
|
|
70
|
+
// Per-participant reset: stop then relaunch from a clean state.
|
|
71
|
+
await forceStop(pkg);
|
|
72
|
+
await launchApp(pkg);
|
|
73
|
+
await settle(1500); // cold start needs longer than a gesture settle
|
|
74
|
+
// Prime screencap dimensions for the first de-normalization.
|
|
75
|
+
await this.refreshDimensions();
|
|
76
|
+
}
|
|
77
|
+
/**
|
|
78
|
+
* Resolve which package to drive, returning a non-null package name or
|
|
79
|
+
* throwing. For a local .apk we read the package straight from its binary
|
|
80
|
+
* AndroidManifest (no aapt) — works whether the apk is fresh or already
|
|
81
|
+
* installed. If that parse fails we fall back to diffing the installed-package
|
|
82
|
+
* list across install, then to a foreground read; an unresolvable case throws
|
|
83
|
+
* and asks for an explicit package.
|
|
84
|
+
*/
|
|
85
|
+
async resolvePackage(target) {
|
|
86
|
+
const appSpec = this.appPath ?? (target && target.trim() ? target.trim() : null);
|
|
87
|
+
if (!appSpec) {
|
|
88
|
+
throw new Error("No app to drive: pass --app <path-to.apk | installed.package.name>, or set the iteration's " +
|
|
89
|
+
"platform target to an installed package name.");
|
|
90
|
+
}
|
|
91
|
+
// `isLocalPath` returns false for http(s):// and throws on other schemes.
|
|
92
|
+
const local = isLocalPath(appSpec);
|
|
93
|
+
if (!local) {
|
|
94
|
+
// A hosted apk would need downloading to the host before `adb install`;
|
|
95
|
+
// that path isn't wired yet. Surface it loudly rather than mis-treating
|
|
96
|
+
// the URL as a package name.
|
|
97
|
+
throw new Error(`--app received a URL (${appSpec}). Installing a hosted .apk on the emulator is not supported yet — ` +
|
|
98
|
+
`pass a local .apk path or an already-installed package name.`);
|
|
99
|
+
}
|
|
100
|
+
if (appSpec.toLowerCase().endsWith(".apk")) {
|
|
101
|
+
// Read the package from the apk's manifest BEFORE install — this works
|
|
102
|
+
// even when the apk is already installed (the list-diff is empty then).
|
|
103
|
+
const manifestPkg = await packageNameFromApk(appSpec);
|
|
104
|
+
this.log(`Installing ${appSpec}...`);
|
|
105
|
+
// installApk throws on a Failure[...] result.
|
|
106
|
+
const before = manifestPkg ? null : await listPackages();
|
|
107
|
+
await installApk(appSpec);
|
|
108
|
+
if (manifestPkg) {
|
|
109
|
+
this.log(`Installed ${appSpec} → package ${manifestPkg} (from manifest)`);
|
|
110
|
+
return manifestPkg;
|
|
111
|
+
}
|
|
112
|
+
// Manifest parse failed (unusual apk layout) — fall back to the list
|
|
113
|
+
// diff, which only works on a FRESH install (the new package is the
|
|
114
|
+
// diff). We do NOT read the foreground here: launchApp hasn't run yet, so
|
|
115
|
+
// mCurrentFocus is the launcher/home — returning it would silently drive
|
|
116
|
+
// the WRONG app. When we can't tell, throw a clear error instead.
|
|
117
|
+
const after = await listPackages();
|
|
118
|
+
const added = [...after].filter((p) => !before.has(p));
|
|
119
|
+
if (added.length === 1) {
|
|
120
|
+
this.log(`Installed ${appSpec} → package ${added[0]} (from install diff)`);
|
|
121
|
+
return added[0];
|
|
122
|
+
}
|
|
123
|
+
if (added.length > 1) {
|
|
124
|
+
throw new Error(`Installing "${appSpec}" added ${added.length} packages (${added.join(", ")}); ` +
|
|
125
|
+
`can't tell which to drive. Pass --app <package.name> explicitly.`);
|
|
126
|
+
}
|
|
127
|
+
// No new package (the apk was already installed) and no manifest — there
|
|
128
|
+
// is no reliable signal for the package. Don't guess.
|
|
129
|
+
throw new Error(`Couldn't determine the package for "${appSpec}" — it's already installed (so the install added no ` +
|
|
130
|
+
`new package) and its manifest couldn't be parsed. Pass --app <package.name> explicitly.`);
|
|
131
|
+
}
|
|
132
|
+
// Local non-.apk value: treat as an installed package name.
|
|
133
|
+
if (await isPackageInstalled(appSpec)) {
|
|
134
|
+
return appSpec;
|
|
135
|
+
}
|
|
136
|
+
throw new Error(`App package "${appSpec}" is not installed on the device and is not a local .apk path. ` +
|
|
137
|
+
`Pass --app <path-to.apk> to install it, or install it first.`);
|
|
138
|
+
}
|
|
139
|
+
async ensureAdbKeyboard() {
|
|
140
|
+
try {
|
|
141
|
+
if (!(await isAdbKeyboardInstalled())) {
|
|
142
|
+
this.log(`ADBKeyboard (${ADB_KEYBOARD_PKG}) not installed — text_input actions will be skipped. ` +
|
|
143
|
+
`Install it on the emulator to enable typing.`);
|
|
144
|
+
return;
|
|
145
|
+
}
|
|
146
|
+
this.previousIme = await currentIme();
|
|
147
|
+
await enableAdbKeyboard();
|
|
148
|
+
this.adbKeyboardActive = true;
|
|
149
|
+
}
|
|
150
|
+
catch (err) {
|
|
151
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
152
|
+
this.log(`Could not activate ADBKeyboard — text_input will be skipped: ${msg}`);
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
async refreshDimensions() {
|
|
156
|
+
const png = await screencapPng();
|
|
157
|
+
const { width, height } = pngDimensions(png);
|
|
158
|
+
this.screenWidth = width;
|
|
159
|
+
this.screenHeight = height;
|
|
160
|
+
return png;
|
|
161
|
+
}
|
|
162
|
+
async observe() {
|
|
163
|
+
// Screencap and the a11y dump are independent reads — run them in parallel.
|
|
164
|
+
// The dump is wrapped so a failure degrades to the vision path (empty tree)
|
|
165
|
+
// rather than aborting the observation.
|
|
166
|
+
const [png, tree] = await Promise.all([
|
|
167
|
+
this.refreshDimensions(),
|
|
168
|
+
this.dumpTree(),
|
|
169
|
+
]);
|
|
170
|
+
this.lastNodeMap = tree.nodeMap;
|
|
171
|
+
return {
|
|
172
|
+
screenshot: png.toString("base64"),
|
|
173
|
+
// Element path when the dump produced a tree; "" → backend vision branch.
|
|
174
|
+
accessibilityTree: tree.simplified,
|
|
175
|
+
url: "",
|
|
176
|
+
width: this.screenWidth,
|
|
177
|
+
height: this.screenHeight,
|
|
178
|
+
// Native has no scrollable document; the screen IS the page.
|
|
179
|
+
documentHeight: this.screenHeight,
|
|
180
|
+
tabs: [],
|
|
181
|
+
};
|
|
182
|
+
}
|
|
183
|
+
/**
|
|
184
|
+
* Dump + serialize the uiautomator a11y tree. Any failure (dump retries
|
|
185
|
+
* exhausted, parse error) degrades to an empty tree so the backend falls back
|
|
186
|
+
* to the vision path — a missing tree must never abort the observation.
|
|
187
|
+
*/
|
|
188
|
+
async dumpTree() {
|
|
189
|
+
try {
|
|
190
|
+
const xml = await dumpUiautomatorXml();
|
|
191
|
+
const nodes = parseUiautomatorXml(xml);
|
|
192
|
+
const tree = serializeNativeTree(nodes);
|
|
193
|
+
this.log(`a11y tree: ${tree.nodeMap.size} node(s)`);
|
|
194
|
+
return tree;
|
|
195
|
+
}
|
|
196
|
+
catch (err) {
|
|
197
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
198
|
+
this.log(`a11y dump failed, falling back to vision: ${msg}`);
|
|
199
|
+
return { simplified: "", nodeMap: new Map() };
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
async captureScreenshot() {
|
|
203
|
+
const png = await this.refreshDimensions();
|
|
204
|
+
return png.toString("base64");
|
|
205
|
+
}
|
|
206
|
+
async captureScreenshotJpeg() {
|
|
207
|
+
// adb screencap only emits PNG. We return the PNG bytes; the upload/record
|
|
208
|
+
// path treats them as opaque image bytes (PDQ frame-matching works on PNG).
|
|
209
|
+
return this.refreshDimensions();
|
|
210
|
+
}
|
|
211
|
+
dimensions() {
|
|
212
|
+
return { width: this.screenWidth, height: this.screenHeight };
|
|
213
|
+
}
|
|
214
|
+
/** Normalized 0-1000 → screencap pixel space. NO DPR correction. */
|
|
215
|
+
toPixels(c) {
|
|
216
|
+
return deNormalizePoint(c, this.screenWidth, this.screenHeight);
|
|
217
|
+
}
|
|
218
|
+
/**
|
|
219
|
+
* Resolve the pixel target for a positional action. The ELEMENT path wins when
|
|
220
|
+
* the backend returned a `node_id`: look the bounds up in the last observe()'s
|
|
221
|
+
* nodeMap and tap the row's CENTER (already in screencap pixels). Otherwise the
|
|
222
|
+
* VISION path de-normalizes the backend's 0-1000 coordinates. Returns:
|
|
223
|
+
* - {target} on success,
|
|
224
|
+
* - {stale:true} when a node_id has no bounds (the tree moved under us) — the
|
|
225
|
+
* caller fails the action so the loop forwards DOM_ELEMENT_NOT_FOUND and the
|
|
226
|
+
* agent re-observes/retries,
|
|
227
|
+
* - {target:null} when neither node_id nor coordinates were supplied.
|
|
228
|
+
*/
|
|
229
|
+
resolveTarget(action) {
|
|
230
|
+
if (action.node_id) {
|
|
231
|
+
const bounds = this.lastNodeMap.get(action.node_id);
|
|
232
|
+
if (!bounds)
|
|
233
|
+
return { target: null, stale: true };
|
|
234
|
+
return { target: boundsCenter(bounds), stale: false };
|
|
235
|
+
}
|
|
236
|
+
return { target: action.coordinates ? this.toPixels(action.coordinates) : null, stale: false };
|
|
237
|
+
}
|
|
238
|
+
async executeAction(action) {
|
|
239
|
+
try {
|
|
240
|
+
// ELEMENT path (node_id → bounds center, screencap pixels) or VISION path
|
|
241
|
+
// (0-1000 → pixels). A stale node_id fails like an unresolved target.
|
|
242
|
+
const resolved = this.resolveTarget(action);
|
|
243
|
+
if (resolved.stale)
|
|
244
|
+
return this.failStaleNode(action);
|
|
245
|
+
const px = resolved.target;
|
|
246
|
+
switch (action.type) {
|
|
247
|
+
case "tap":
|
|
248
|
+
case "double_tap": {
|
|
249
|
+
if (!px)
|
|
250
|
+
return this.failNoCoords(action);
|
|
251
|
+
const count = action.type === "double_tap" ? 2 : action.count ?? 1;
|
|
252
|
+
for (let i = 0; i < count; i++)
|
|
253
|
+
await inputTap(px.x, px.y);
|
|
254
|
+
break;
|
|
255
|
+
}
|
|
256
|
+
case "long_press": {
|
|
257
|
+
if (!px)
|
|
258
|
+
return this.failNoCoords(action);
|
|
259
|
+
await inputLongPress(px.x, px.y, action.duration_ms ?? 600);
|
|
260
|
+
break;
|
|
261
|
+
}
|
|
262
|
+
case "text_input": {
|
|
263
|
+
await this.typeText(action, px);
|
|
264
|
+
break;
|
|
265
|
+
}
|
|
266
|
+
case "scroll": {
|
|
267
|
+
await this.scroll(action);
|
|
268
|
+
break;
|
|
269
|
+
}
|
|
270
|
+
case "swipe":
|
|
271
|
+
case "pull_to_refresh": {
|
|
272
|
+
await this.swipe(action.direction ?? (action.type === "pull_to_refresh" ? "down" : "up"));
|
|
273
|
+
break;
|
|
274
|
+
}
|
|
275
|
+
case "navigate_back": {
|
|
276
|
+
await pressKeyEvent("KEYCODE_BACK");
|
|
277
|
+
break;
|
|
278
|
+
}
|
|
279
|
+
case "drag": {
|
|
280
|
+
// A drag GRABS an element and RELEASES it elsewhere ("click the
|
|
281
|
+
// element, move, let go") — distinct from a swipe (element-less
|
|
282
|
+
// directional). Press the resolved element center (px — the same
|
|
283
|
+
// element path a tap uses), move to the drop point, release. Record
|
|
284
|
+
// the grab point as the action's coordinates.
|
|
285
|
+
const start = await this.drag(action, px);
|
|
286
|
+
if (!start)
|
|
287
|
+
return this.failNoCoords(action);
|
|
288
|
+
await settle();
|
|
289
|
+
return {
|
|
290
|
+
success: true,
|
|
291
|
+
elementName: action.element_name,
|
|
292
|
+
coordinates: start,
|
|
293
|
+
openedNewTab: false,
|
|
294
|
+
};
|
|
295
|
+
}
|
|
296
|
+
case "rotate_device": {
|
|
297
|
+
await this.rotate(action.orientation === "landscape" ? "landscape" : "portrait");
|
|
298
|
+
break;
|
|
299
|
+
}
|
|
300
|
+
case "wait": {
|
|
301
|
+
await settle(action.duration_ms ?? 1000);
|
|
302
|
+
break;
|
|
303
|
+
}
|
|
304
|
+
case "think": {
|
|
305
|
+
// Reasoning-only: no device interaction.
|
|
306
|
+
return { success: true, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
307
|
+
}
|
|
308
|
+
case "pinch_zoom":
|
|
309
|
+
case "keyboard_shortcut":
|
|
310
|
+
case "switch_tab":
|
|
311
|
+
case "close_tab": {
|
|
312
|
+
// True multi-touch / browser-tab / desktop concepts with no faithful
|
|
313
|
+
// single-app native equivalent — fail LOUDLY (not silently) so the
|
|
314
|
+
// loop forwards it and the agent can adapt (e.g. double_tap to zoom).
|
|
315
|
+
return this.failUnsupported(action);
|
|
316
|
+
}
|
|
317
|
+
default: {
|
|
318
|
+
this.log(`Unknown native action: ${action.type}`);
|
|
319
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
await settle();
|
|
323
|
+
return {
|
|
324
|
+
success: true,
|
|
325
|
+
elementName: action.element_name,
|
|
326
|
+
// Report the pixel target so the loop re-normalizes it for recording
|
|
327
|
+
// (element path: the bounds-center; vision path: the de-normalized
|
|
328
|
+
// 0-1000 → round-trips exactly against dimensions(), screencap pixels).
|
|
329
|
+
coordinates: px,
|
|
330
|
+
openedNewTab: false,
|
|
331
|
+
};
|
|
332
|
+
}
|
|
333
|
+
catch (err) {
|
|
334
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
335
|
+
this.log(`Action ${action.type} failed: ${msg}`);
|
|
336
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
failNoCoords(action) {
|
|
340
|
+
// The backend couldn't vision-locate a target (coordinates=null). Skip the
|
|
341
|
+
// action (don't crash, don't silently succeed) and surface it like the
|
|
342
|
+
// browser path's unresolved-element case — success:false makes the loop
|
|
343
|
+
// push a DOM_ELEMENT_NOT_FOUND forward so the LLM learns the target missed.
|
|
344
|
+
const target = action.element_description || action.element_name || "(no description)";
|
|
345
|
+
this.log(`Skipping native action with no resolved target: ${action.type} ${target}`);
|
|
346
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
347
|
+
}
|
|
348
|
+
failStaleNode(action) {
|
|
349
|
+
// The backend resolved a node_id that's no longer in the latest tree (the
|
|
350
|
+
// screen changed between observe() and act). Fail like the browser's
|
|
351
|
+
// unresolved-element case — success:false forwards DOM_ELEMENT_NOT_FOUND so
|
|
352
|
+
// the loop re-observes and the agent retries against a fresh tree.
|
|
353
|
+
const target = action.element_description || action.element_name || action.node_id || "(unknown)";
|
|
354
|
+
this.log(`Stale node_id "${action.node_id}" not in current a11y tree: ${action.type} ${target}`);
|
|
355
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
356
|
+
}
|
|
357
|
+
async typeText(action, px) {
|
|
358
|
+
if (!this.adbKeyboardActive) {
|
|
359
|
+
this.log("text_input skipped: ADBKeyboard not active.");
|
|
360
|
+
return;
|
|
361
|
+
}
|
|
362
|
+
// Focus the field first if the model gave a target.
|
|
363
|
+
if (px) {
|
|
364
|
+
await inputTap(px.x, px.y);
|
|
365
|
+
await settle(250);
|
|
366
|
+
}
|
|
367
|
+
const text = resolveTextValue(action, this.contextValues);
|
|
368
|
+
if (action.mode === "click_type") {
|
|
369
|
+
await adbKeyboardClear();
|
|
370
|
+
}
|
|
371
|
+
await adbKeyboardType(text);
|
|
372
|
+
if (action.submit) {
|
|
373
|
+
await settle(150);
|
|
374
|
+
await pressKeyEvent("KEYCODE_ENTER");
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
async scroll(action) {
|
|
378
|
+
const w = this.screenWidth;
|
|
379
|
+
const h = this.screenHeight;
|
|
380
|
+
const cx = Math.round(w / 2);
|
|
381
|
+
const amountMap = {
|
|
382
|
+
small: 0.25, medium: 0.45, large: 0.7, extra_large: 0.9,
|
|
383
|
+
};
|
|
384
|
+
const frac = amountMap[action.amount ?? "medium"] ?? 0.45;
|
|
385
|
+
const dist = Math.round(h * frac);
|
|
386
|
+
const mid = Math.round(h / 2);
|
|
387
|
+
switch (action.direction) {
|
|
388
|
+
case "up":
|
|
389
|
+
// Reveal content above: swipe finger downward.
|
|
390
|
+
await inputSwipe(cx, mid - dist / 2, cx, mid + dist / 2);
|
|
391
|
+
break;
|
|
392
|
+
case "to_top":
|
|
393
|
+
await inputSwipe(cx, Math.round(h * 0.2), cx, Math.round(h * 0.9), 400);
|
|
394
|
+
break;
|
|
395
|
+
case "to_bottom":
|
|
396
|
+
await inputSwipe(cx, Math.round(h * 0.9), cx, Math.round(h * 0.2), 400);
|
|
397
|
+
break;
|
|
398
|
+
case "down":
|
|
399
|
+
case "to_element":
|
|
400
|
+
default:
|
|
401
|
+
// Reveal content below: swipe finger upward.
|
|
402
|
+
await inputSwipe(cx, mid + dist / 2, cx, mid - dist / 2);
|
|
403
|
+
break;
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
async swipe(direction) {
|
|
407
|
+
const w = this.screenWidth;
|
|
408
|
+
const h = this.screenHeight;
|
|
409
|
+
const cx = Math.round(w / 2);
|
|
410
|
+
const cy = Math.round(h / 2);
|
|
411
|
+
const d = Math.round(h * 0.4);
|
|
412
|
+
const dx = Math.round(w * 0.4);
|
|
413
|
+
switch (direction) {
|
|
414
|
+
case "up":
|
|
415
|
+
await inputSwipe(cx, cy + d / 2, cx, cy - d / 2);
|
|
416
|
+
break;
|
|
417
|
+
case "down":
|
|
418
|
+
await inputSwipe(cx, cy - d / 2, cx, cy + d / 2);
|
|
419
|
+
break;
|
|
420
|
+
case "left":
|
|
421
|
+
await inputSwipe(cx + dx / 2, cy, cx - dx / 2, cy);
|
|
422
|
+
break;
|
|
423
|
+
case "right":
|
|
424
|
+
await inputSwipe(cx - dx / 2, cy, cx + dx / 2, cy);
|
|
425
|
+
break;
|
|
426
|
+
default:
|
|
427
|
+
await inputSwipe(cx, cy + d / 2, cx, cy - d / 2);
|
|
428
|
+
break;
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
/**
|
|
432
|
+
* Perform a drag: press the GRABBED element, move to the drop point, release.
|
|
433
|
+
* A drag is "click an element and let it go", so the press lands element-
|
|
434
|
+
* center (the resolved `grab` — node_id bounds center, or the vision
|
|
435
|
+
* coordinate when the tree is blind), NOT the backend's vision-estimated
|
|
436
|
+
* start. The release point is the drag END (drag.endX/endY). Both the grab
|
|
437
|
+
* fallback and the end de-normalize against screencap pixels. `inputDrag`
|
|
438
|
+
* (`input draganddrop`) dwells at the press point first so a long-press
|
|
439
|
+
* pickup registers — a slow swipe would read as a directional swipe instead.
|
|
440
|
+
* Returns the grab pixel point to record, or null if there's no end to drag
|
|
441
|
+
* toward.
|
|
442
|
+
*/
|
|
443
|
+
async drag(action, grab) {
|
|
444
|
+
if (!action.drag)
|
|
445
|
+
return null;
|
|
446
|
+
const { start, end } = deNormalizeDrag(action.drag, this.screenWidth, this.screenHeight);
|
|
447
|
+
// Grab the resolved element center; fall back to the backend's own start
|
|
448
|
+
// only when nothing resolved (no node_id and no vision coordinate).
|
|
449
|
+
const press = grab ?? start;
|
|
450
|
+
await inputDrag(press.x, press.y, end.x, end.y);
|
|
451
|
+
return press;
|
|
452
|
+
}
|
|
453
|
+
async rotate(orientation) {
|
|
454
|
+
await setUserRotation(orientation);
|
|
455
|
+
// Orientation changes the screencap geometry — re-read so the next
|
|
456
|
+
// de-normalization uses the rotated dimensions.
|
|
457
|
+
await settle();
|
|
458
|
+
await this.refreshDimensions();
|
|
459
|
+
}
|
|
460
|
+
failUnsupported(action) {
|
|
461
|
+
// A capability the single-app native driver genuinely can't perform (true
|
|
462
|
+
// multi-touch pinch, browser tabs, desktop keyboard shortcuts). Fail with a
|
|
463
|
+
// clear, diagnosable reason — NOT a silent false — so it's visible WHY and
|
|
464
|
+
// the agent can adapt.
|
|
465
|
+
const hint = {
|
|
466
|
+
pinch_zoom: "no multi-touch on the native driver; the agent should zoom via double_tap",
|
|
467
|
+
keyboard_shortcut: "no hardware keyboard on the native driver; use on-screen taps/text_input",
|
|
468
|
+
switch_tab: "tabs are a browser concept; the native app has a single window",
|
|
469
|
+
close_tab: "tabs are a browser concept; the native app has a single window",
|
|
470
|
+
};
|
|
471
|
+
this.log(`${action.type} not supported by the Android native driver — ${hint[action.type] ?? "no native equivalent"}`);
|
|
472
|
+
return { success: false, elementName: action.element_name, coordinates: null, openedNewTab: false };
|
|
473
|
+
}
|
|
474
|
+
currentUrl() {
|
|
475
|
+
// Native has no URL; recording stores "" (current_location comes from the
|
|
476
|
+
// backend's reasoning output, not the device).
|
|
477
|
+
return "";
|
|
478
|
+
}
|
|
479
|
+
async close() {
|
|
480
|
+
// Restore the IME if we switched it. If we had a prior IME, set it back;
|
|
481
|
+
// if there wasn't one we could read (previousIme null), at least take the
|
|
482
|
+
// shared emulator off ADBKeyboard so we don't leave it on our test IME.
|
|
483
|
+
if (this.adbKeyboardActive) {
|
|
484
|
+
try {
|
|
485
|
+
if (this.previousIme) {
|
|
486
|
+
await setIme(this.previousIme);
|
|
487
|
+
}
|
|
488
|
+
else {
|
|
489
|
+
await resetIme();
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
catch {
|
|
493
|
+
// Non-fatal — don't crash teardown over IME state on a shared device.
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
// Leave the app installed/running; the emulator is shared and the next run
|
|
497
|
+
// resets via launchOrReset. Nothing else to tear down.
|
|
498
|
+
}
|
|
499
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extract an APK's package name from its binary AndroidManifest.xml — pure,
|
|
3
|
+
* dependency-free (no aapt, no SDK), so `--app <apk>` resolves the package even
|
|
4
|
+
* when the apk is ALREADY installed (the install-list diff is empty then).
|
|
5
|
+
*
|
|
6
|
+
* Two steps, both on in-memory bytes:
|
|
7
|
+
* 1. Pull AndroidManifest.xml out of the APK (a ZIP) — find its local file
|
|
8
|
+
* header and inflate (raw deflate) or copy (stored).
|
|
9
|
+
* 2. Parse the binary XML (AXML): read the string pool, find the <manifest>
|
|
10
|
+
* START_ELEMENT, and read its `package` attribute's string value.
|
|
11
|
+
*
|
|
12
|
+
* We only need the package string, so this is a deliberately minimal AXML
|
|
13
|
+
* reader (not a general decoder). Returns null on anything unexpected — the
|
|
14
|
+
* caller falls back to other resolution and never crashes on a weird apk.
|
|
15
|
+
*/
|
|
16
|
+
/**
|
|
17
|
+
* Parse binary AXML and return the <manifest> element's `package` attribute.
|
|
18
|
+
* Returns null if the structure isn't what we expect.
|
|
19
|
+
*/
|
|
20
|
+
/**
 * Parse binary AXML and return the `<manifest>` element's `package` attribute.
 *
 * @param axml - the binary AndroidManifest.xml bytes.
 * @returns the package name, or null if the structure isn't what the reader expects.
 */
export declare function parseAxmlPackage(axml: Buffer): string | null;
|
|
21
|
+
/** Read an APK file and return its package name, or null if it can't be parsed. */
|
|
22
|
+
/**
 * Read an APK file and return its package name.
 *
 * @param apkPath - path to the .apk file.
 * @returns a promise for the package name, or null if the apk can't be parsed.
 */
export declare function packageNameFromApk(apkPath: string): Promise<string | null>;
|