pi-agent-browser-native 0.2.50 → 0.2.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -1
- package/README.md +5 -5
- package/dist/extensions/agent-browser/lib/input-modes/job.js +13 -0
- package/dist/extensions/agent-browser/lib/playbook.js +3 -2
- package/dist/extensions/agent-browser/lib/results/presentation/batch.js +3 -2
- package/dist/extensions/agent-browser/lib/results/presentation/diagnostics.js +27 -9
- package/dist/extensions/agent-browser/lib/results/presentation/large-output.js +26 -1
- package/dist/extensions/agent-browser/lib/results/presentation.js +9 -7
- package/dist/extensions/agent-browser/lib/web-search.js +1 -1
- package/docs/ARCHITECTURE.md +1 -1
- package/docs/COMMAND_REFERENCE.md +11 -11
- package/docs/RELEASE.md +4 -4
- package/docs/REQUIREMENTS.md +1 -1
- package/docs/SUPPORT_MATRIX.md +12 -11
- package/docs/TOOL_CONTRACT.md +6 -5
- package/package.json +8 -7
- package/scripts/agent-browser-capability-baseline.mjs +3 -3
- package/scripts/platform-smoke.mjs +1 -1
- package/scripts/prepare.mjs +68 -0
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,31 @@
|
|
|
2
2
|
|
|
3
3
|
## Unreleased
|
|
4
4
|
|
|
5
|
+
## 0.2.52 - 2026-06-15
|
|
6
|
+
|
|
7
|
+
### Changed
|
|
8
|
+
|
|
9
|
+
- Rebaselined the upstream capability metadata, command reference, support matrix, platform-smoke image tag, and real-upstream output-shape metadata for `agent-browser` `0.27.3` / vercel-labs/agent-browser@2c7991c9eccca1c9db6eee1a26a713414778de5a. This is an install-only upstream update from the prior baseline; no wrapper feature, shim, or inventory-token change was added.
|
|
10
|
+
- Updated the local Pi development baseline to `@earendil-works/*` `0.79.4`, refreshed `.pi-fleet-tested-version`, and refreshed `package-lock.json` with npm 11 while keeping the intentional doctor floor at Pi `0.79.0`.
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
|
|
14
|
+
- Updated the lifecycle release harness prompt-readiness check to accept Pi 0.79.4 footer units such as `1.0M`, avoiding false readiness timeouts after successful startup.
|
|
15
|
+
|
|
16
|
+
### Validation
|
|
17
|
+
|
|
18
|
+
- Ran `npm publish --dry-run` against `agent-browser` `0.27.3` and Pi `0.79.4`; the gate passed default verification, command-reference checks, build, lifecycle verification, packaged Pi smoke, and macOS/Ubuntu/Windows-native platform smoke.
|
|
19
|
+
|
|
20
|
+
## 0.2.51 - 2026-06-11
|
|
21
|
+
|
|
22
|
+
### Fixed
|
|
23
|
+
|
|
24
|
+
- Made the source-package `prepare` lifecycle install dev dependencies with scripts disabled when Pi's `npm install --omit=dev` package path omits the compiler and peer type packages, so GitHub/source installs can still build `dist/` from a clean clone without changing runtime dependency policy.
|
|
25
|
+
|
|
26
|
+
### Validation
|
|
27
|
+
|
|
28
|
+
- Reproduced the `pi install -l --approve https://github.com/fitchmultz/pi-agent-browser-native@v0.2.50` source-install failure, then verified production-dependency source builds, project-local GitHub install, project-local npm install, and release gates before publish.
|
|
29
|
+
|
|
5
30
|
## 0.2.50 - 2026-06-11
|
|
6
31
|
|
|
7
32
|
### Changed
|
|
@@ -318,7 +343,7 @@
|
|
|
318
343
|
### Changed
|
|
319
344
|
- `sourceLookup`, broad `get text`, fill verification, tab/session mismatch, and stale-ref guidance now include Electron-aware context and recovery actions for packaged desktop apps.
|
|
320
345
|
- Verification coverage now includes deterministic Electron lifecycle/probe benchmark scenarios, fake-upstream Electron discovery/lifecycle tests, lifecycle restore/shutdown cleanup checks, and real-app dogfood evidence recorded in the Electron plan.
|
|
321
|
-
- The configured-source lifecycle harness (`npm run verify -- lifecycle`, `scripts/verify-lifecycle.mjs`) now defaults to Pi model `zai/glm-5.… [remainder of removed line truncated in this diff rendering]
|
|
346
|
+
- The configured-source lifecycle harness (`npm run verify -- lifecycle`, `scripts/verify-lifecycle.mjs`) now defaults to Pi model `zai/glm-5.2` with `--model <id>` override; `npm run verify` lifecycle passthrough rejects `--model` without a value.
|
|
322
347
|
- Updated the local Pi development baseline to `@earendil-works/*` `0.75.4` and refreshed the npm lockfile.
|
|
323
348
|
|
|
324
349
|
### Fixed
|
package/README.md
CHANGED
|
@@ -183,7 +183,7 @@ npm exec --yes --package pi-agent-browser-native@latest -- pi-agent-browser-conf
|
|
|
183
183
|
npm exec --yes --package pi-agent-browser-native@latest -- pi-agent-browser-config show
|
|
184
184
|
```
|
|
185
185
|
|
|
186
|
-
The optional `agent_browser_web_search` companion tool is available when a usable Exa or Brave credential source is configured or resolvable from startup config or trusted session config. It is not an `agent_browser` input mode and does not launch a browser; agents may use it whenever current/live external web information helps, then use `agent_browser` when they need page interaction, screenshots, authenticated/profile content, or DOM inspection. If both keys are available, the default provider is Exa because its `/search` endpoint returns agent-friendly highlights and search modes; set `webSearch.preferredProvider` to `"brave"` when you prefer Brave Search.
|
|
186
|
+
The optional `agent_browser_web_search` companion tool is available when a usable Exa or Brave credential source is configured or resolvable from startup config or trusted session config. It is not an `agent_browser` input mode and does not launch a browser; agents may use it whenever current/live external web information helps, then use `agent_browser` when they need page interaction, screenshots, authenticated/profile content, or DOM inspection. Prefer it over automating public search-engine forms such as Google in headless browser jobs: those flows may be redirected to anti-bot or CAPTCHA pages, and this wrapper does not provide or recommend CAPTCHA bypass. If both keys are available, the default provider is Exa because its `/search` endpoint returns agent-friendly highlights and search modes; set `webSearch.preferredProvider` to `"brave"` when you prefer Brave Search.
|
|
187
187
|
|
|
188
188
|
Get an Exa API key from the [Exa dashboard](https://dashboard.exa.ai/api-keys) or a Brave Search API key from the [Brave Search API dashboard](https://api-dashboard.search.brave.com/). Most users can simply export `EXA_API_KEY` or `BRAVE_API_KEY` in the environment that launches `pi`; config is only needed when you want Pi-scoped secret references, a preferred provider, or to disable this built-in search tool.
|
|
189
189
|
|
|
@@ -412,7 +412,7 @@ After either path, use `qa: { "attached": true, ... }` for a current-session smo
|
|
|
412
412
|
|
|
413
413
|
### Lightweight QA preset
|
|
414
414
|
|
|
415
|
-
For a quick smoke/QA pass, use top-level `qa`. It compiles to the same batch path as `job` and uses `batch --bail` so failed readiness/text/selector assertions stop before slower diagnostics can burn the wrapper watchdog. The URL form clears enabled network/console/page-error buffers before opening the target URL, waits for page readiness, checks optional expected text or selector, inspects fresh network requests, console messages, and page errors when preceding assertions pass, and can capture an evidence screenshot. Expected text is checked with bounded visible-text `wait --fn … --timeout 5000` predicates after the requested load state so dense pages can pass on visible headings/copy and missing text becomes crisp QA evidence. The attached form (`qa: { "attached": true }`) runs checks against the current managed session, such as an attached Electron app, rejects `url`, and deliberately preserves existing diagnostics instead of clearing evidence; its diagnostic reads default off so stale buffers do not fail a current-page smoke unless `checkNetwork`, `checkConsole`, or `checkErrors` is explicitly `true`. `loadState` defaults to `"domcontentloaded"`; set it to `"load"` or `"networkidle"` only when the stricter state is useful and the site is not expected to keep background requests alive. For URL-opening QA, `checkNetwork`, `checkConsole`, and `checkErrors` default to true; set one to `false` to skip that diagnostic read. Network failures are classified by likely impact and failed rows are listed first in network previews: actionable document/script/API-style failures still fail QA, while some low-impact browser icon asset misses (for example certain `favicon` or `apple-touch-icon` paths when upstream marks the row failed and resource metadata looks image-like) surface only as warnings instead of failing an otherwise healthy smoke check (`details.qaPreset.warnings`, with human-readable `details.qaPreset.summary` when the preset still passes). 
Exact predicates live in [`docs/TOOL_CONTRACT.md`](docs/TOOL_CONTRACT.md#qa) and `classifyNetworkRequestFailure` in `extensions/agent-browser/lib/results/network.ts` (re-exported from the compatibility barrel).
|
|
415
|
+
For a quick smoke/QA pass, use top-level `qa`. It compiles to the same batch path as `job` and uses `batch --bail` so failed readiness/text/selector assertions stop before slower diagnostics can burn the wrapper watchdog. The URL form clears enabled network/console/page-error buffers before opening the target URL, waits for page readiness, checks optional expected text or selector, inspects fresh network requests, console messages, and page errors when preceding assertions pass, and can capture an evidence screenshot. Successful reset rows are labeled as reset-scoped output and ignored by QA failure analysis so stale pre-target errors do not fail an otherwise healthy target page; real post-open diagnostic rows still fail or warn according to the normal QA rules. Expected text is checked with bounded visible-text `wait --fn … --timeout 5000` predicates after the requested load state so dense pages can pass on visible headings/copy and missing text becomes crisp QA evidence. The attached form (`qa: { "attached": true }`) runs checks against the current managed session, such as an attached Electron app, rejects `url`, and deliberately preserves existing diagnostics instead of clearing evidence; its diagnostic reads default off so stale buffers do not fail a current-page smoke unless `checkNetwork`, `checkConsole`, or `checkErrors` is explicitly `true`. `loadState` defaults to `"domcontentloaded"`; set it to `"load"` or `"networkidle"` only when the stricter state is useful and the site is not expected to keep background requests alive. For URL-opening QA, `checkNetwork`, `checkConsole`, and `checkErrors` default to true; set one to `false` to skip that diagnostic read. 
Network failures are classified by likely impact and failed rows are listed first in network previews: actionable document/script/API-style failures still fail QA, while some low-impact browser icon asset misses (for example certain `favicon` or `apple-touch-icon` paths when upstream marks the row failed and resource metadata looks image-like) surface only as warnings instead of failing an otherwise healthy smoke check (`details.qaPreset.warnings`, with human-readable `details.qaPreset.summary` when the preset still passes). Exact predicates live in [`docs/TOOL_CONTRACT.md`](docs/TOOL_CONTRACT.md#qa) and `classifyNetworkRequestFailure` in `extensions/agent-browser/lib/results/network.ts` (re-exported from the compatibility barrel).
|
|
416
416
|
|
|
417
417
|
```json
|
|
418
418
|
{
|
|
@@ -449,7 +449,7 @@ For asynchronous exports, click first and then wait for the download:
|
|
|
449
449
|
{ "args": ["wait", "--download", "/tmp/report.csv"] }
|
|
450
450
|
```
|
|
451
451
|
|
|
452
|
-
When a user gives exact artifact paths for screenshots, recordings, downloads, PDFs, traces, or HAR files, use those paths or explicitly report why the artifact was unavailable; do not silently substitute a different path in the final report. The wrapper creates missing parent directories for direct artifact paths such as `state save`, screenshots, PDFs, downloads, and `wait --download`. For simple loopback `download <selector> <path>` anchor links with HTTP(S) `href`, it can save the in-page response directly to the requested path before falling back to upstream click/download behavior; non-loopback/profile downloads stay upstream-owned. With upstream `agent-browser 0.27.… [remainder of removed line truncated in this diff rendering]
|
|
452
|
+
When a user gives exact artifact paths for screenshots, recordings, downloads, PDFs, traces, or HAR files, use those paths or explicitly report why the artifact was unavailable; do not silently substitute a different path in the final report. The wrapper creates missing parent directories for direct artifact paths such as `state save`, screenshots, PDFs, downloads, and `wait --download`. For simple loopback `download <selector> <path>` anchor links with HTTP(S) `href`, it can save the in-page response directly to the requested path before falling back to upstream click/download behavior; non-loopback/profile downloads stay upstream-owned. With upstream `agent-browser 0.27.3`, treat `details.savedFilePath` as upstream-reported metadata and confirm `details.artifacts[].exists` / `details.artifactVerification.verified` before relying on the requested `wait --download <path>` file being present on disk; non-file download payloads such as `data:` URLs are not verified local artifacts.
|
|
453
453
|
|
|
454
454
|
For evidence-only screenshots or QA captures, branch on `details.artifactVerification` and `details.artifacts` before reporting PASS/FAIL; inline image attachments are optional when size limits allow—do not require vision review unless the user asked for visual inspection. If the latest prompt names exact required artifact paths, browser close can be blocked with `details.promptGuard` until those artifacts are saved and verified.
|
|
455
455
|
|
|
@@ -575,7 +575,7 @@ For larger local handoffs or PR-ready confidence before expensive release/lifecy
|
|
|
575
575
|
npm run verify -- pre-pr
|
|
576
576
|
```
|
|
577
577
|
|
|
578
|
-
That mode composes the full default gate with `npm run verify -- package`, so package contents and forbidden archived/repo-only files are checked without launching Pi lifecycle, Crabbox, or live dogfood flows. Package, package-pi, lifecycle, platform-target, and startup-profile modes all build `dist/` first so clean checkouts do not validate stale or missing compiled output. GitHub/source installs run the package `prepare` script… [remainder of removed line truncated in this diff rendering]
|
|
578
|
+
That mode composes the full default gate with `npm run verify -- package`, so package contents and forbidden archived/repo-only files are checked without launching Pi lifecycle, Crabbox, or live dogfood flows. Package, package-pi, lifecycle, platform-target, and startup-profile modes all build `dist/` first so clean checkouts do not validate stale or missing compiled output. GitHub/source installs run the package `prepare` script; when Pi installs with `npm install --omit=dev`, that script installs the source-build dev dependencies with lifecycle scripts disabled before building the ignored `dist/` entrypoint that Pi loads.
|
|
579
579
|
|
|
580
580
|
The deterministic agent-efficiency benchmark’s **standalone JSON/Markdown accounting run** is not part of default or pre-PR `npm run verify` (only `npm run verify -- benchmark` or `npm run benchmark:agent-browser` invokes the script). The full unit suite still exercises `test/agent-browser.efficiency-benchmark.test.ts`. Use the script before and after agent-facing abstractions to prove call-count, output-size, stale-ref, artifact, failure-category coverage, success-rate, and elapsed-time effects before changing the wrapper UX:
|
|
581
581
|
|
|
@@ -687,7 +687,7 @@ Configured-source lifecycle validation:
|
|
|
687
687
|
npm run verify -- lifecycle
|
|
688
688
|
```
|
|
689
689
|
|
|
690
|
-
The harness defaults to Pi model `zai/glm-5.… [remainder of removed line truncated in this diff rendering]
|
|
690
|
+
The harness defaults to Pi model `zai/glm-5.2` and **180000 ms** per-step tmux waits; pass `--model <id>` and/or `--timeout-ms <ms>` after `lifecycle` when you need different settings (see [Configured-source lifecycle validation](docs/RELEASE.md#configured-source-lifecycle-validation) in `docs/RELEASE.md`). It launches Pi 0.79 with `--approve` and a deterministic `--session-id`, drives `/reload`, closes Pi, relaunches the exact same session, asserts the JSONL header id, and checks managed-session continuity, compiled-entrypoint pickup after process restart, persisted spill reachability, and real Pi `tool_result` failure-patch behavior.
|
|
691
691
|
|
|
692
692
|
Use lifecycle validation when testing `/reload`, exact-session relaunch, `/resume`, managed-session continuity, or persisted artifact behavior. Branch-backed state and `session_tree` cleanup ownership are covered by focused extension harness tests. Maintainers must run the lifecycle harness before every publish; see [Pre-release checks](docs/RELEASE.md#pre-release-checks).
|
|
693
693
|
|
package/dist/extensions/agent-browser/lib/input-modes/job.js
CHANGED
|
@@ -289,6 +289,9 @@ export function buildQaCompactPassText(options) {
|
|
|
289
289
|
if (pageParts.length > 0)
|
|
290
290
|
lines.push(`Page: ${pageParts.join(" — ")}`);
|
|
291
291
|
lines.push(`Checks run: ${describeQaChecksRun(options.checks)} (${options.batchStepCount} batch step${options.batchStepCount === 1 ? "" : "s"})`);
|
|
292
|
+
if (options.checks.diagnosticsResetAtStart && (options.checks.checkNetwork || options.checks.checkConsole || options.checks.checkErrors)) {
|
|
293
|
+
lines.push("Diagnostic reset: URL QA cleared enabled network/console/page-error buffers before opening the target; reset rows in details.batchSteps are not counted as current-page failures.");
|
|
294
|
+
}
|
|
292
295
|
if (options.checks.attached && !options.checks.diagnosticsResetAtStart && (options.checks.checkNetwork || options.checks.checkConsole || options.checks.checkErrors)) {
|
|
293
296
|
lines.push("Attached diagnostics: existing upstream session console/network/error buffers were preserved; rows may include events from before qa.attached started.");
|
|
294
297
|
}
|
|
@@ -369,6 +372,13 @@ function extractQaTextAssertionResultText(item) {
|
|
|
369
372
|
}
|
|
370
373
|
return undefined;
|
|
371
374
|
}
|
|
375
|
+
function isDiagnosticResetCommand(item) {
|
|
376
|
+
const command = item.command;
|
|
377
|
+
if (!Array.isArray(command) || !command.every((token) => typeof token === "string"))
|
|
378
|
+
return false;
|
|
379
|
+
const [name, subcommand] = command;
|
|
380
|
+
return command.includes("--clear") && (name === "console" || name === "errors" || (name === "network" && subcommand === "requests"));
|
|
381
|
+
}
|
|
372
382
|
export function analyzeQaPresetTimeout(compiled) {
|
|
373
383
|
if (compiled.checks.expectedText.length === 0)
|
|
374
384
|
return undefined;
|
|
@@ -392,6 +402,9 @@ export function analyzeQaPresetResults(data, compiled) {
|
|
|
392
402
|
}
|
|
393
403
|
const result = isRecord(item.result) ? item.result : undefined;
|
|
394
404
|
const commandName = getCommandNameFromBatchItem(item);
|
|
405
|
+
if (compiled?.checks.diagnosticsResetAtStart && isDiagnosticResetCommand(item)) {
|
|
406
|
+
continue;
|
|
407
|
+
}
|
|
395
408
|
if (commandName === "errors" && Array.isArray(result?.errors) && result.errors.length > 0) {
|
|
396
409
|
failedChecks.push(`${result.errors.length} page error(s)`);
|
|
397
410
|
}
|
package/dist/extensions/agent-browser/lib/playbook.js
CHANGED
|
@@ -24,10 +24,11 @@ export const QUICK_START_GUIDELINES = [
|
|
|
24
24
|
"For artifact-producing commands, read the visible artifact block and details.artifactVerification before using files: check requested path, absolute path, existence, size bytes, artifact kind, optional mediaType, status, optional limitation, and verified/missing/pending/unverified counts. details.artifacts contains per-file metadata; record start rows are pending/openRecording until record stop writes the target. The wrapper creates parent directories for direct artifact paths and can save simple loopback HTTP(S) anchor downloads directly to the requested path before upstream download fallback. Browser close does not delete explicit saved files; if close reports details.artifactCleanup, use host file tools to remove paths listed in explicitArtifactPaths (when non-empty) after inspection. If close fails with details.promptGuard.reason=requested-artifacts-missing-before-close, save the exact required artifact path before closing. For annotated screenshots inside batch, put --annotate in top-level args (for example { args: [\"--annotate\", \"batch\"], stdin: \"[[\\\"screenshot\\\",\\\"/tmp/page.png\\\"]]\" }) rather than inside the screenshot step; if annotation labels crowd a dense page, use a scoped or non-annotated screenshot plus snapshot refs instead.",
|
|
25
25
|
"When details.nextActions is present, prefer those exact native agent_browser follow-up payloads over prose guidance; they may include args, stdin, sessionMode, networkSourceLookup, safety notes, or artifactPath for saved files.",
|
|
26
26
|
];
|
|
27
|
-
export const WEB_SEARCH_PROMPT_GUIDELINE = "Use agent_browser_web_search for quick live search/URL discovery; it
|
|
27
|
+
export const WEB_SEARCH_PROMPT_GUIDELINE = "Use agent_browser_web_search for quick live search/URL discovery; prefer it over browser-automating public search-engine forms, which can hit anti-bot/CAPTCHA-gated pages. Use agent_browser for interaction/DOM/screenshots/auth after you have a target URL. One query, inspect, one follow-up max; on HTTP 429 stop/report limits.";
|
|
28
28
|
export const SHARED_BROWSER_PLAYBOOK_GUIDELINES = [
|
|
29
29
|
"Standard workflow: open the page, snapshot -i, interact using current @refs from that snapshot, and re-snapshot after navigation, scrolling, rerendering, or other major DOM changes because refs are page-scoped; the wrapper fails mutation-prone stale/recycled refs before upstream can silently target a different current-page element. On dense pages, use wrapper-side snapshot -i --search <text> or snapshot -i --filter role=<role> to render matching refs while preserving the full ref map in details.refSnapshot, add snapshot --viewport when scroll position or above/below-fold context matters, and add snapshot --diff when a quick before/after ref-map delta would prevent reading a full spill file.",
|
|
30
30
|
"For ordinary forms from one snapshot, batch multiple fill @refs before the submit/click step to avoid serial tool calls; if a fill may autosubmit, navigate, or rerender later fields, split the flow and refresh refs first.",
|
|
31
|
+
"Do not use browser automation to drive public search-engine forms such as Google for discovery; headless jobs that type a query and press Enter can be redirected to anti-bot or CAPTCHA pages. Use agent_browser_web_search when configured, ask for/search from a direct target URL, or navigate to known result URLs. Do not attempt CAPTCHA bypass.",
|
|
31
32
|
"Snapshot choice: prefer snapshot -i for routine clicks/fills (interactive @refs, main-content-first). Use snapshot --compact when you need a denser same-page tree without full spill; use full snapshot (no -i) only when you need the complete accessibility tree. Re-snapshot after navigation or major DOM changes. When snapshot -i compacts because the tree is oversized, scan visible output for Omitted high-value controls and optional details.data.highValueControlRefIds before opening the spill file: those list bounded searchboxes, textboxes, comboboxes, buttons, tabs, checkboxes, radios, options, and menuitems that did not fit the key/other ref previews.",
|
|
32
33
|
"When a visible text or accessible-name target should survive ref churn, prefer find locators such as role, text, label, placeholder, alt, title, or testid with the intended action instead of guessing a CSS selector.",
|
|
33
34
|
"For desktop or host-controlled rich inputs, if semanticAction fill misses, refresh refs and prefer a current editable @ref from details.richInputRecovery or the latest snapshot; focus or click that ref, then use keyboard inserttext or keyboard type with the intended text. Do not auto-submit with Enter or a submit button unless the user flow explicitly calls for it.",
|
|
@@ -44,7 +45,7 @@ export const SHARED_BROWSER_PLAYBOOK_GUIDELINES = [
|
|
|
44
45
|
"For Electron desktop apps, prefer top-level electron for wrapper-owned discovery, isolated launch, status, compact probe, and cleanup: list first, treat likely-sensitive annotations as hints rather than enforcement, launch with the default snapshot handoff unless handoff: \"tabs\" is the safer diagnostic starting point, use electron.probe or snapshot -i/qa.attached for current-session state, and always cleanup the returned launchId when done. electron.launch uses an isolated temporary profile; it does not reuse the app's normal signed-in profile or attach to an already-running authenticated app. For signed-in local app state, host-launch the normal app with --remote-debugging-port when appropriate, then use raw args connect <port|url>; after connect, inspect tab list, select the stable tab id such as tab t2, then run a condition wait or snapshot -i before using refs. close commands (`close`, `quit`, or `exit`) only close the browser/CDP session; leave manually launched app shutdown, profile cleanup, and explicit artifacts to the host owner.",
|
|
45
46
|
"For provider or specialized app workflows, load version-matched upstream guidance with skills get agentcore|electron|slack|dogfood|vercel-sandbox through the native tool; add --full when you need references/templates, and use skills get --all only for broad skill audits. Provider launches such as -p ios, --provider browserbase/kernel/browseruse/browserless/agentcore, and iOS --device are upstream-owned setup paths; use sessionMode fresh when switching providers and expect external credentials or local Appium/Xcode setup to be required.",
|
|
46
47
|
"For dialogs and frames, use dialog status/accept/dismiss and frame <selector|main> through native args; dialog commands and eval snippets that look like alert/confirm/prompt/dialog triggers are shorter-bounded than normal browser calls, and timed-out dialog-like interactions may add inspect-dialog-after-timeout, dismiss-dialog-after-timeout, or recover-fresh-session-after-dialog-timeout nextActions. When --confirm-actions produces a pending confirmation, use details.nextActions or exact confirm <id> / deny <id> calls instead of inventing ids.",
|
|
47
|
-
"If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <tab-id-or-label> / snapshot -i to recover state before retrying different URLs or fallback strategies. For headed demos, put --headed on the first launch with sessionMode=fresh and verify with screenshot/tab/get-url evidence because tool success cannot prove the OS window is visible to the user. For desktop readiness, prefer real conditions first: wait --text, wait --url, wait --fn, wait --load <state>, wait --download, or qa.attached; for disappearance checks in agent-browser 0.27.
|
|
48
|
+
"If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <tab-id-or-label> / snapshot -i to recover state before retrying different URLs or fallback strategies. For headed demos, put --headed on the first launch with sessionMode=fresh and verify with screenshot/tab/get-url evidence because tool success cannot prove the OS window is visible to the user. For desktop readiness, prefer real conditions first: wait --text, wait --url, wait --fn, wait --load <state>, wait --download, or qa.attached; for disappearance checks in agent-browser 0.27.3, use wait --fn predicates instead of stale upstream-help examples like wait <selector> --state hidden. Use electron.probe/status for wrapper-owned launch health or target mismatch. Fixed waits are a last resort: use explicit --timeout or top-level timeoutMs for legitimately slow waits, and treat a successful payload like \"waited\":\"timeout\" as elapsed time only—verify completion with an observed condition, fresh snapshot, or screenshot.",
|
|
48
49
|
"For feed, timeline, or inbox reading tasks, focus on the main timeline/list region and read the first item there rather than unrelated composer or sidebar content.",
|
|
49
50
|
"For read-only browsing tasks, prefer extracting the answer from the current snapshot, structured ref labels, or eval --stdin on the current page before navigating away. Only click into media viewers, detail routes, or new pages when the current view does not contain the needed information.",
|
|
50
51
|
"For downloads, prefer download <selector> <path> when an element click should save a file; simple loopback anchor downloads are saved to the requested path when the wrapper can resolve an HTTP(S) href. Do not rely on click alone when you need the downloaded file on disk.",
|
package/dist/extensions/agent-browser/lib/results/presentation/batch.js
CHANGED
|
@@ -229,13 +229,14 @@ async function buildBatchStepPresentation(options) {
|
|
|
229
229
|
};
|
|
230
230
|
}
|
|
231
231
|
const commandInfo = parseCommandInfo(command ?? []);
|
|
232
|
+
const commandInfoWithTokens = command ? { ...commandInfo, commandTokens: command } : commandInfo;
|
|
232
233
|
const networkRouteDiagnostics = commandInfo.command === "network" && commandInfo.subcommand === "requests"
|
|
233
234
|
? buildNetworkRouteDiagnostics(item.result, networkRoutes)
|
|
234
235
|
: undefined;
|
|
235
236
|
const presentation = await buildNestedToolPresentation({
|
|
236
237
|
artifactManifest,
|
|
237
238
|
artifactRequest,
|
|
238
|
-
commandInfo,
|
|
239
|
+
commandInfo: commandInfoWithTokens,
|
|
239
240
|
cwd,
|
|
240
241
|
args: command,
|
|
241
242
|
envelope: { data: item.result, success: true },
|
|
@@ -264,7 +265,7 @@ async function buildBatchStepPresentation(options) {
|
|
|
264
265
|
});
|
|
265
266
|
const pageChangeSummary = buildPageChangeSummary({
|
|
266
267
|
artifacts: presentation.artifacts,
|
|
267
|
-
commandInfo,
|
|
268
|
+
commandInfo: commandInfoWithTokens,
|
|
268
269
|
data: presentation.data,
|
|
269
270
|
nextActions,
|
|
270
271
|
savedFilePath: presentation.savedFilePath,
|
package/dist/extensions/agent-browser/lib/results/presentation/diagnostics.js
CHANGED
|
@@ -107,6 +107,9 @@ export function enrichStreamStatusData(commandInfo, data) {
|
|
|
107
107
|
wsUrl: getStreamWebSocketUrl(data.port),
|
|
108
108
|
};
|
|
109
109
|
}
|
|
110
|
+
function isClearDiagnosticCommand(commandInfo) {
|
|
111
|
+
return commandInfo.subcommand === "--clear" || commandInfo.commandTokens?.includes("--clear") === true;
|
|
112
|
+
}
|
|
110
113
|
export function formatDiagnosticSummary(commandInfo, data) {
|
|
111
114
|
if (commandInfo.command === "session") {
|
|
112
115
|
const sessions = getArrayField(data, "sessions");
|
|
@@ -181,7 +184,7 @@ export function formatDiagnosticSummary(commandInfo, data) {
|
|
|
181
184
|
if (commandInfo.subcommand === "requests") {
|
|
182
185
|
const requests = getArrayField(data, "requests");
|
|
183
186
|
if (requests)
|
|
184
|
-
return `Network requests: ${requests.length}`;
|
|
187
|
+
return isClearDiagnosticCommand(commandInfo) ? `Network requests reset: ${requests.length} cleared` : `Network requests: ${requests.length}`;
|
|
185
188
|
}
|
|
186
189
|
if (commandInfo.subcommand === "route") {
|
|
187
190
|
const routed = getStringField(data, "routed") ?? getStringField(data, "url") ?? getStringField(data, "pattern");
|
|
@@ -228,12 +231,12 @@ export function formatDiagnosticSummary(commandInfo, data) {
|
|
|
228
231
|
if (commandInfo.command === "console") {
|
|
229
232
|
const messages = getArrayField(data, "messages");
|
|
230
233
|
if (messages)
|
|
231
|
-
return `Console messages: ${messages.length}`;
|
|
234
|
+
return isClearDiagnosticCommand(commandInfo) ? `Console reset: ${messages.length} cleared` : `Console messages: ${messages.length}`;
|
|
232
235
|
}
|
|
233
236
|
if (commandInfo.command === "errors") {
|
|
234
237
|
const errors = getArrayField(data, "errors");
|
|
235
238
|
if (errors)
|
|
236
|
-
return `Page errors: ${errors.length}`;
|
|
239
|
+
return isClearDiagnosticCommand(commandInfo) ? `Page errors reset: ${errors.length} cleared` : `Page errors: ${errors.length}`;
|
|
237
240
|
}
|
|
238
241
|
if (commandInfo.command === "dashboard") {
|
|
239
242
|
if (typeof data.port === "number")
|
|
@@ -344,10 +347,15 @@ function formatNetworkRequestLine(item, index) {
|
|
|
344
347
|
appendNetworkPreview(lines, "Error", getPreviewCandidate(item, NETWORK_PREVIEW_FIELD_CANDIDATES.error), NETWORK_ERROR_PREVIEW_MAX_CHARS);
|
|
345
348
|
return lines;
|
|
346
349
|
}
|
|
347
|
-
function formatNetworkRequestsText(data) {
|
|
350
|
+
function formatNetworkRequestsText(data, commandInfo) {
|
|
348
351
|
const requests = getArrayField(data, "requests");
|
|
349
352
|
if (!requests)
|
|
350
353
|
return undefined;
|
|
354
|
+
if (isClearDiagnosticCommand(commandInfo)) {
|
|
355
|
+
return requests.length === 0
|
|
356
|
+
? "Network request buffer cleared; no prior request rows were returned. This reset output is not evidence of current-page network activity."
|
|
357
|
+
: `Network request buffer cleared; upstream returned ${requests.length} cleared/stale row${requests.length === 1 ? "" : "s"}. Treat these as reset output, not current-page request failures.`;
|
|
358
|
+
}
|
|
351
359
|
if (requests.length === 0)
|
|
352
360
|
return "No network requests captured. Scope: upstream session aggregate unless the upstream command output says it was cleared or filtered for this page.";
|
|
353
361
|
const shown = ["Scope: upstream session aggregate unless the upstream command output says it was cleared or filtered for this page; do not attribute old requests to the current page without URL/time evidence."];
|
|
@@ -584,10 +592,15 @@ export function buildStreamNextActions(commandInfo, data, sessionName) {
|
|
|
584
592
|
},
|
|
585
593
|
];
|
|
586
594
|
}
|
|
587
|
-
function formatConsoleText(data) {
|
|
595
|
+
function formatConsoleText(data, commandInfo) {
|
|
588
596
|
const messages = getArrayField(data, "messages");
|
|
589
597
|
if (!messages)
|
|
590
598
|
return undefined;
|
|
599
|
+
if (isClearDiagnosticCommand(commandInfo)) {
|
|
600
|
+
return messages.length === 0
|
|
601
|
+
? "Console buffer cleared; no prior message rows were returned. This reset output is not evidence of current-page console activity."
|
|
602
|
+
: `Console buffer cleared; upstream returned ${messages.length} cleared/stale message row${messages.length === 1 ? "" : "s"}. Treat these as reset output, not current-page console errors.`;
|
|
603
|
+
}
|
|
591
604
|
if (messages.length === 0)
|
|
592
605
|
return "No console messages. Scope: upstream session aggregate unless the upstream command output says it was cleared or filtered for this page.";
|
|
593
606
|
const shown = ["Scope: upstream session aggregate unless the upstream command output says it was cleared or filtered for this page; do not attribute old messages to the current page without URL/time evidence."];
|
|
@@ -604,10 +617,15 @@ function formatConsoleText(data) {
|
|
|
604
617
|
}
|
|
605
618
|
return shown.join("\n");
|
|
606
619
|
}
|
|
607
|
-
function formatErrorsText(data) {
|
|
620
|
+
function formatErrorsText(data, commandInfo) {
|
|
608
621
|
const errors = getArrayField(data, "errors");
|
|
609
622
|
if (!errors)
|
|
610
623
|
return undefined;
|
|
624
|
+
if (isClearDiagnosticCommand(commandInfo)) {
|
|
625
|
+
return errors.length === 0
|
|
626
|
+
? "Page error buffer cleared; no prior error rows were returned. This reset output is not evidence of current-page errors."
|
|
627
|
+
: `Page error buffer cleared; upstream returned ${errors.length} cleared/stale error row${errors.length === 1 ? "" : "s"}. Treat these as reset output, not current-page errors.`;
|
|
628
|
+
}
|
|
611
629
|
if (errors.length === 0)
|
|
612
630
|
return "No page errors.";
|
|
613
631
|
const shown = errors.slice(0, DIAGNOSTIC_LOG_PREVIEW_LIMIT).map((item, index) => {
|
|
@@ -927,7 +945,7 @@ export function formatDiagnosticText(commandInfo, data) {
|
|
|
927
945
|
if (commandInfo.command === "state")
|
|
928
946
|
return formatStateText(data);
|
|
929
947
|
if (commandInfo.command === "network" && commandInfo.subcommand === "requests")
|
|
930
|
-
return formatNetworkRequestsText(data);
|
|
948
|
+
return formatNetworkRequestsText(data, commandInfo);
|
|
931
949
|
if (commandInfo.command === "network" && commandInfo.subcommand === "request")
|
|
932
950
|
return formatNetworkRequestText(data);
|
|
933
951
|
if (commandInfo.command === "diff")
|
|
@@ -945,9 +963,9 @@ export function formatDiagnosticText(commandInfo, data) {
|
|
|
945
963
|
if (commandInfo.command === "chat")
|
|
946
964
|
return formatChatText(data);
|
|
947
965
|
if (commandInfo.command === "console")
|
|
948
|
-
return formatConsoleText(data);
|
|
966
|
+
return formatConsoleText(data, commandInfo);
|
|
949
967
|
if (commandInfo.command === "errors")
|
|
950
|
-
return formatErrorsText(data);
|
|
968
|
+
return formatErrorsText(data, commandInfo);
|
|
951
969
|
if (commandInfo.command === "dashboard")
|
|
952
970
|
return formatDashboardText(data);
|
|
953
971
|
if (commandInfo.command === "doctor")
|
|
@@ -13,6 +13,8 @@ const LARGE_OUTPUT_INLINE_MAX_CHARS = 8_000;
|
|
|
13
13
|
const LARGE_OUTPUT_INLINE_MAX_LINES = 120;
|
|
14
14
|
const LARGE_OUTPUT_PREVIEW_MAX_CHARS = 2_500;
|
|
15
15
|
const LARGE_OUTPUT_PREVIEW_MAX_LINES = 40;
|
|
16
|
+
const LARGE_OUTPUT_PREVIEW_MAX_LINE_CHARS = 240;
|
|
17
|
+
const LARGE_OUTPUT_FAILURE_COMMAND_MAX_CHARS = 240;
|
|
16
18
|
const LARGE_OUTPUT_FILE_PREFIX = "pi-agent-browser-output";
|
|
17
19
|
function shouldCompactLargeOutput(text) {
|
|
18
20
|
return text.length > LARGE_OUTPUT_INLINE_MAX_CHARS || countLines(text) > LARGE_OUTPUT_INLINE_MAX_LINES;
|
|
@@ -26,7 +28,7 @@ function buildLargeOutputPreview(text) {
|
|
|
26
28
|
break;
|
|
27
29
|
}
|
|
28
30
|
const remainingChars = LARGE_OUTPUT_PREVIEW_MAX_CHARS - previewChars;
|
|
29
|
-
const previewLine = truncateText(line, Math.max(40, remainingChars));
|
|
31
|
+
const previewLine = truncateText(line, Math.min(Math.max(40, remainingChars), LARGE_OUTPUT_PREVIEW_MAX_LINE_CHARS));
|
|
30
32
|
previewLines.push(previewLine);
|
|
31
33
|
previewChars += previewLine.length + 1;
|
|
32
34
|
}
|
|
@@ -35,6 +37,27 @@ function buildLargeOutputPreview(text) {
|
|
|
35
37
|
previewText: previewLines.join("\n"),
|
|
36
38
|
};
|
|
37
39
|
}
|
|
40
|
+
function buildLargeOutputFailureContext(presentation) {
|
|
41
|
+
const failure = presentation.batchFailure;
|
|
42
|
+
if (!failure)
|
|
43
|
+
return [];
|
|
44
|
+
const failedStep = failure.failedStep;
|
|
45
|
+
const commandText = truncateText(failedStep.commandText, LARGE_OUTPUT_FAILURE_COMMAND_MAX_CHARS);
|
|
46
|
+
const lines = [
|
|
47
|
+
"Failure context:",
|
|
48
|
+
`- First failing step: ${failedStep.index + 1} — ${commandText}`,
|
|
49
|
+
`- Batch result: ${failure.successCount}/${failure.totalCount} succeeded${failure.failureCount > 1 ? `; ${failure.failureCount} failed` : ""}`,
|
|
50
|
+
];
|
|
51
|
+
if (failedStep.failureCategory)
|
|
52
|
+
lines.push(`- Failure category: ${failedStep.failureCategory}`);
|
|
53
|
+
const failureText = (failedStep.text || failedStep.summary).replace(/\s+/g, " ").trim();
|
|
54
|
+
if (failureText)
|
|
55
|
+
lines.push(`- Failure detail: ${truncateText(failureText, 700)}`);
|
|
56
|
+
const stepPaths = [failedStep.fullOutputPath, ...(failedStep.fullOutputPaths ?? [])].filter((path, index, paths) => typeof path === "string" && path.length > 0 && paths.indexOf(path) === index);
|
|
57
|
+
if (stepPaths.length > 0)
|
|
58
|
+
lines.push(`- Failed-step spill path${stepPaths.length === 1 ? "" : "s"}: ${stepPaths.join(", ")}`);
|
|
59
|
+
return lines;
|
|
60
|
+
}
|
|
38
61
|
async function writeLargeOutputSpillFile(options) {
|
|
39
62
|
const payload = typeof options.data === "string"
|
|
40
63
|
? redactModelFacingText(options.data)
|
|
@@ -91,8 +114,10 @@ export async function compactLargePresentationOutput(options) {
|
|
|
91
114
|
}
|
|
92
115
|
const { omittedLineCount, previewText } = buildLargeOutputPreview(text);
|
|
93
116
|
const commandLabel = options.commandInfo.command ?? "agent-browser";
|
|
117
|
+
const failureContext = buildLargeOutputFailureContext(options.presentation);
|
|
94
118
|
const lines = [
|
|
95
119
|
`Large ${commandLabel} output compacted.`,
|
|
120
|
+
...(failureContext.length > 0 ? ["", ...failureContext] : []),
|
|
96
121
|
"",
|
|
97
122
|
"Preview:",
|
|
98
123
|
previewText,
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
* Scope: Presentation shaping only; upstream stdout parsing and snapshot compaction internals live in separate modules.
|
|
5
5
|
*/
|
|
6
6
|
import { isRecord } from "../parsing.js";
|
|
7
|
+
import { extractCommandTokens } from "../runtime.js";
|
|
7
8
|
import { buildAgentBrowserNextActions } from "./action-recommendations.js";
|
|
8
9
|
import { buildAgentBrowserResultCategoryDetails } from "./categories.js";
|
|
9
10
|
import { detectConfirmationRequired } from "./confirmation.js";
|
|
@@ -37,16 +38,17 @@ function shouldAddAnnotatedScreenshotGuidance(commandInfo, args) {
|
|
|
37
38
|
}
|
|
38
39
|
export async function buildToolPresentation(options) {
|
|
39
40
|
const { args, artifactManifest, artifactRequest, commandInfo, compiledSemanticAction, cwd, envelope, errorText, networkRouteDiagnostics, networkRoutes, persistentArtifactStore, sessionName, } = options;
|
|
40
|
-
const
|
|
41
|
+
const commandInfoWithTokens = commandInfo.commandTokens || !args ? commandInfo : { ...commandInfo, commandTokens: extractCommandTokens(args) };
|
|
42
|
+
const presentationCommandInfo = resolvePresentationCommandInfo(commandInfoWithTokens, compiledSemanticAction);
|
|
41
43
|
if (errorText) {
|
|
42
44
|
return buildErrorPresentation({ args, commandInfo, errorText, sessionName });
|
|
43
45
|
}
|
|
44
|
-
const data = enrichStreamStatusData(
|
|
45
|
-
const presentationData = redactPresentationData(
|
|
46
|
+
const data = enrichStreamStatusData(commandInfoWithTokens, envelope?.data);
|
|
47
|
+
const presentationData = redactPresentationData(commandInfoWithTokens, data);
|
|
46
48
|
const artifacts = await extractFileArtifacts({ artifactManifest, artifactRequest, commandInfo: presentationCommandInfo, cwd, data, sessionName });
|
|
47
49
|
const artifactVerification = buildArtifactVerificationSummary(artifacts);
|
|
48
50
|
const artifactSummary = formatArtifactSummary(artifacts);
|
|
49
|
-
const summary = artifactSummary ?? formatPresentationSummary(
|
|
51
|
+
const summary = artifactSummary ?? formatPresentationSummary(commandInfoWithTokens, data, compiledSemanticAction);
|
|
50
52
|
const artifactText = artifacts.length > 0 ? formatArtifactMetadataLines(artifacts).join("\n") : undefined;
|
|
51
53
|
let presentation;
|
|
52
54
|
if (commandInfo.command === "batch" && isAgentBrowserBatchResultArray(data)) {
|
|
@@ -69,7 +71,7 @@ export async function buildToolPresentation(options) {
|
|
|
69
71
|
presentation = {
|
|
70
72
|
artifactVerification,
|
|
71
73
|
artifacts: artifacts.length > 0 ? artifacts : undefined,
|
|
72
|
-
content: [{ type: "text", text: artifactText ?? formatPresentationContentText(
|
|
74
|
+
content: [{ type: "text", text: artifactText ?? formatPresentationContentText(commandInfoWithTokens, data, compiledSemanticAction) }],
|
|
73
75
|
data: presentationData,
|
|
74
76
|
summary,
|
|
75
77
|
};
|
|
@@ -160,10 +162,10 @@ export async function buildToolPresentation(options) {
|
|
|
160
162
|
savedFilePath: presentationWithManifest.savedFilePath,
|
|
161
163
|
successCategory: presentationWithManifest.successCategory,
|
|
162
164
|
});
|
|
163
|
-
const networkNextActions =
|
|
165
|
+
const networkNextActions = commandInfoWithTokens.command === "network" && commandInfoWithTokens.subcommand === "requests" && presentationWithManifest.resultCategory === "success"
|
|
164
166
|
? buildNetworkRequestsNextActions(data, sessionName, presentationWithManifest.networkRouteDiagnostics)
|
|
165
167
|
: undefined;
|
|
166
|
-
const streamNextActions = presentationWithManifest.resultCategory === "success" ? buildStreamNextActions(
|
|
168
|
+
const streamNextActions = presentationWithManifest.resultCategory === "success" ? buildStreamNextActions(commandInfoWithTokens, data, sessionName) : undefined;
|
|
167
169
|
presentationWithManifest.nextActions = mergeNextActions(presentationWithManifest.nextActions, genericNextActions, networkNextActions, streamNextActions);
|
|
168
170
|
presentationWithManifest.pageChangeSummary = presentationWithManifest.pageChangeSummary ?? buildPageChangeSummary({
|
|
169
171
|
artifacts: presentationWithManifest.artifacts,
|
|
@@ -507,7 +507,7 @@ export function createAgentBrowserWebSearchTool(configState, options = {}) {
|
|
|
507
507
|
promptGuidelines: [
|
|
508
508
|
"Use agent_browser_web_search when live web search would help answer the task, find current external information, or discover candidate URLs for agent_browser.",
|
|
509
509
|
"agent_browser_web_search chooses Exa or Brave from configured keys; when both are available, Exa is preferred by default unless webSearch.preferredProvider says otherwise. Use provider only when the user/config calls for a specific provider.",
|
|
510
|
-
"Prefer agent_browser_web_search over opening
|
|
510
|
+
"Prefer agent_browser_web_search over opening or typing into public search engine result pages with agent_browser when a quick result list is enough; browser-automated search forms are often anti-bot/CAPTCHA-gated, and this tool is the fallback for discovery rather than a CAPTCHA bypass.",
|
|
511
511
|
"Do not issue parallel or repeated agent_browser_web_search calls; use one high-signal query, inspect the results, then only run a focused follow-up if needed. If the provider returns HTTP 429, stop searching and tell the user the API plan/rate limit needs time or a plan change.",
|
|
512
512
|
"After using agent_browser_web_search, cite result URLs in the final answer when web evidence informed the answer.",
|
|
513
513
|
],
|
package/docs/ARCHITECTURE.md
CHANGED
|
@@ -177,7 +177,7 @@ That failure should include a structured recovery hint pointing to `sessionMode:
|
|
|
177
177
|
Implementation detail lives in `extensions/agent-browser/lib/launch-scoped-flags.ts` (canonical flag metadata shared with playbook/docs assertions), `extensions/agent-browser/lib/argv-descriptor.ts` and `extensions/agent-browser/lib/argv-grammar.ts` (command discovery, `VALUE_FLAGS`, `parseArgvDescriptor`) plus `extensions/agent-browser/lib/runtime.ts` (`getStartupScopedFlags`, `buildExecutionPlan`):
|
|
178
178
|
|
|
179
179
|
- **Command discovery:** Leading argv is scanned with a value-taking allowlist so known global flags and documented command flags consume their values before the upstream command word is identified. Missing-value prevalidation is intentionally limited to upstream global value flags; command-scoped flags and literal text are left to upstream parsing so values like `fill #field --password` are not rejected by wrapper heuristics before the CLI sees them. When upstream adds new global flags that take values ahead of the command, extend both the command-discovery and prevalidation allowlists; when it adds command-specific flags, extend only command discovery/redaction as needed. A smaller set of global boolean flags may be followed by an optional `true`/`false` literal; when present, that literal is consumed as the flag value before command discovery continues.
|
|
180
|
-
- **`--state` disambiguation:** Persisted browser `--state` before the command participates in launch-scoped validation and tab-correction hints. The same flag spelling after a `wait` command is excluded from startup-scoped detection so upstream help examples such as `wait @ref --state hidden` do not spuriously require `sessionMode: "fresh"` while an implicit session is active. As of upstream `agent-browser 0.27.
|
|
180
|
+
- **`--state` disambiguation:** Persisted browser `--state` before the command participates in launch-scoped validation and tab-correction hints. The same flag spelling after a `wait` command is excluded from startup-scoped detection so upstream help examples such as `wait @ref --state hidden` do not spuriously require `sessionMode: "fresh"` while an implicit session is active. As of upstream `agent-browser 0.27.3`, the parser still does not implement those `wait --state` examples as distinct wait modes, so agent-facing docs recommend `wait --fn` predicates for disappearance checks instead.
|
|
181
181
|
- **`--auto-connect`:** Treated as launch-scoped only when enabled (`--auto-connect` bare or `true`). `--auto-connect false` is ignored for startup-scoped blocking so disabled attach hints do not force a fresh launch.
|
|
182
182
|
|
|
183
183
|
**Sessionless inspection and local commands:** Plain-text global help and version probes (`--help`, `-h`, `--version`, `-V`) must never allocate or bind the extension-managed session. The same session-ownership rule applies to read-only upstream `skills list`, `skills get …`, and `skills path …`, local auth profile management (`auth save/list/show/delete/remove`), plus local/setup surfaces such as `profiles`, `dashboard start/stop`, `device list`, `doctor`, `install`, `upgrade`, `session list`, and targeted/all local saved-state maintenance (`state list/show`, `state clear --all`, `state clear -a`, `state clear <session-name>`, `state clean --older-than <days>`, `state rename`). Non-plain-text sessionless commands still run with `--json` for machine-readable output, but the planner does not prepend the implicit managed `--session`, so an agent can inspect local capabilities or start/stop the standalone dashboard without consuming the implicit session slot before a real `open`. Browser-backed, context-dependent, or incomplete commands such as root `session`, untargeted `state clear`, bare `state clean`, `auth login`, `state save`, and `state load` keep normal managed-session injection. Command-shape allowlisting lives in `extensions/agent-browser/lib/command-policy.ts` (`needsManagedSession`), while `extensions/agent-browser/lib/runtime.ts` (`isPlainTextInspectionArgs`, `buildExecutionPlan`) applies that decision to execution planning.
|
|
@@ -18,16 +18,16 @@ This project intentionally blocks normal `agent-browser` bash usage in most agen
|
|
|
18
18
|
|
|
19
19
|
<!-- agent-browser-capability-baseline:start upstream-baseline -->
|
|
20
20
|
<!-- Generated from scripts/agent-browser-capability-baseline.mjs. Run `npm run docs -- command-reference write` to update. Do not edit manually. -->
|
|
21
|
-
This reference is baselined to the locally installed `agent-browser 0.27.
|
|
21
|
+
This reference is baselined to the locally installed `agent-browser 0.27.3` command/help surface, audited against vercel-labs/agent-browser@2c7991c9eccca1c9db6eee1a26a713414778de5a. Upstream `agent-browser` remains the source of truth for command semantics; this file is the local fallback for Pi agent sessions where direct binary help is blocked or discouraged.
|
|
22
22
|
|
|
23
23
|
The lightweight drift check is `npm run verify -- command-reference`. Run it whenever the installed upstream `agent-browser` version changes or this reference is edited.
|
|
24
24
|
|
|
25
25
|
Use `npm run benchmark:agent-browser` or `npm run verify -- benchmark` before and after agent-facing workflow abstractions to measure task success, tool calls, model-visible output size, stale-ref behavior, artifact success, failure-category coverage, and elapsed-time estimates.
|
|
26
26
|
<!-- agent-browser-capability-baseline:end upstream-baseline -->
|
|
27
27
|
|
|
28
|
-
### Upstream 0.27.
|
|
28
|
+
### Upstream 0.27.3 install-only rebaseline
|
|
29
29
|
|
|
30
|
-
The 0.27.
|
|
30
|
+
The 0.27.3 rebaseline is an install-only compatibility update: upstream changed Windows ARM64 installation fallback behavior and did not change the CLI/help surface or browser-command semantics. This wrapper adds no compatibility shim for older upstream releases. The wrapper must still not hide these prior upstream fixes:
|
|
31
31
|
|
|
32
32
|
- click reliability: upstream now scrolls off-viewport elements before coordinate resolution, handles JavaScript dialogs promptly, recovers mouse state after dialog-opening clicks, and reports overlay interception before dispatching input
|
|
33
33
|
- frame-scoped CSS selectors and waits, including cross-process iframe click-coordinate translation
|
|
@@ -140,7 +140,7 @@ Use `vitals [url]` for Core Web Vitals plus React hydration timing when availabl
|
|
|
140
140
|
{ "args": ["pushstate", "/dashboard?tab=settings"] }
|
|
141
141
|
```
|
|
142
142
|
|
|
143
|
-
For first-navigation setup, start on `about:blank`, then stage routes, cookies, or init scripts before navigating. The relevant v0.27.
|
|
143
|
+
For first-navigation setup, start on `about:blank`, then stage routes, cookies, or init scripts before navigating. The relevant v0.27.3 surfaces, unchanged from the prior baseline, are `network route <url> [--abort|--body <json>] [--resource-type <csv>]` and `cookies set --curl <file>`:
|
|
144
144
|
|
|
145
145
|
```json
|
|
146
146
|
{ "args": ["open"], "sessionMode": "fresh" }
|
|
@@ -267,7 +267,7 @@ On app pages that expose a native dropdown, add a `select` step such as `{ "acti
|
|
|
267
267
|
|
|
268
268
|
Use raw `args: ["batch"]` with `stdin` when you need arbitrary upstream commands, flags, or batch failure policies outside the constrained schema. Do not pass `stdin` with `job`, `qa`, `sourceLookup`, `networkSourceLookup`, or `electron`; those modes generate or manage their own input.
|
|
269
269
|
|
|
270
|
-
For quick smoke/QA checks, use top-level `qa`. It clears enabled network/console/page-error buffers before opening the target URL, waits for page readiness, checks expected text/selector, then inspects fresh network requests, console messages, and page errors only if preceding assertions pass, and can capture an evidence screenshot. The preset compiles to `batch --bail` so a missing text/selector assertion fails crisply instead of letting slower diagnostics burn the wrapper watchdog. Expected text compiles to bounded visible-text `wait --fn … --timeout 5000` predicates after load so dense pages can pass on visible headings/copy without dumping `body` text; missing text reports a crisp QA failure. The readiness wait defaults to `loadState: "domcontentloaded"`; set `loadState` to `"load"` or `"networkidle"` only when that stricter state is useful and the site is not expected to keep background requests alive. QA network diagnostics classify failed requests by likely impact and list failed rows first in the network preview: actionable document/script/API-style failures fail the preset, while common low-impact browser icon misses such as `favicon.ico` are surfaced as warnings (`qaPreset.warnings`) so they do not fail an otherwise healthy page. Successful QA with no failed checks returns compact model-visible prose (page URL/title when known, checks run, optional screenshot verification) while keeping the full step matrix in `details.qaPreset` and `details.batchSteps`. Failed QA presets report `details.resultCategory: "failure"`, `failureCategory: "qa-failure"`, keep verbose per-step batch output, and real Pi sessions treat the diagnostic as a failed tool result. Prose output also gets a model-visible result-category line including `Pi tool isError: true`; caller-requested `--json` output keeps the JSON string parseable and relies on the patched `isError` plus `details` fields.
|
|
270
|
+
For quick smoke/QA checks, use top-level `qa`. It clears enabled network/console/page-error buffers before opening the target URL, waits for page readiness, checks expected text/selector, then inspects fresh network requests, console messages, and page errors only if preceding assertions pass, and can capture an evidence screenshot. Successful reset rows are labeled as reset-scoped diagnostic output and are not counted as current-page QA failures; post-open diagnostic rows still fail or warn normally. The preset compiles to `batch --bail` so a missing text/selector assertion fails crisply instead of letting slower diagnostics burn the wrapper watchdog. Expected text compiles to bounded visible-text `wait --fn … --timeout 5000` predicates after load so dense pages can pass on visible headings/copy without dumping `body` text; missing text reports a crisp QA failure. The readiness wait defaults to `loadState: "domcontentloaded"`; set `loadState` to `"load"` or `"networkidle"` only when that stricter state is useful and the site is not expected to keep background requests alive. QA network diagnostics classify failed requests by likely impact and list failed rows first in the network preview: actionable document/script/API-style failures fail the preset, while common low-impact browser icon misses such as `favicon.ico` are surfaced as warnings (`qaPreset.warnings`) so they do not fail an otherwise healthy page. Successful QA with no failed checks returns compact model-visible prose (page URL/title when known, checks run, optional screenshot verification) while keeping the full step matrix in `details.qaPreset` and `details.batchSteps`. Failed QA presets report `details.resultCategory: "failure"`, `failureCategory: "qa-failure"`, keep verbose per-step batch output, and real Pi sessions treat the diagnostic as a failed tool result. 
Prose output also gets a model-visible result-category line including `Pi tool isError: true`; caller-requested `--json` output keeps the JSON string parseable and relies on the patched `isError` plus `details` fields.
|
|
271
271
|
|
|
272
272
|
The same classification drives plain `network requests` presentation: when any row counts as failed (HTTP status ≥ 400, `failed: true`, or a string `error`), model-facing text starts with a line like `Network failure summary: 0 actionable, 1 benign low-impact (1 total).`, and each preview line can end with an impact tag such as `[benign: low-impact browser icon asset]` or `[actionable: document, script, API, or non-benign request failure]`. When safe request IDs are present, `details.nextActions` adds bounded read-only follow-ups such as `network request <id>`, `networkSourceLookup` for actionable failed rows, `network requests --filter <path>`, `network requests --clear` before a repro, and `network har start`; prefer those payloads over rebuilding request-id commands from prose. For aggregate buffers, the wrapper accepts `network requests --current-page` / `--current-origin` to render only rows matching the active page origin, or `--current-url` for exact active document URL matching; it strips those wrapper-only flags before upstream spawn and reports counts in `details.networkRequestsPageFilter`. If the wrapper has seen a prior `network route` in the same session, matching failed, pending, or CORS-looking fetch/XHR rows add `details.networkRouteDiagnostics` plus executable route-mock follow-ups (`inspect-routed-network-request` and `start-network-har-capture-for-route-mock`) so agents do not mistake an unfulfilled mock for a fulfilled mock; same-origin/CORS fixture retry guidance stays in visible prose. `network requests` also hides `data:image` screenshot/artifact noise from the compact preview by default while preserving raw rows in `details.data.requests`. Rules live in `classifyNetworkRequestFailure` / `summarizeNetworkFailures` in `extensions/agent-browser/lib/results/network.ts`; QA aggregation is `analyzeQaPresetResults` in `extensions/agent-browser/index.ts`.
|
|
273
273
|
|
|
@@ -359,7 +359,7 @@ For one-call flows, put the click and wait in `batch`; the wait step keeps the s
|
|
|
359
359
|
{ "args": ["batch"], "stdin": "[[\"click\",\"@export\"],[\"wait\",\"--download\",\"/tmp/report.csv\"]]" }
|
|
360
360
|
```
|
|
361
361
|
|
|
362
|
-
A successful wait-based download renders a readable summary such as `Download completed: /tmp/report.csv` and exposes top-level `details.savedFilePath` plus `details.savedFile` for non-batch calls. With the current upstream `agent-browser 0.27.
|
|
362
|
+
A successful wait-based download renders a readable summary such as `Download completed: /tmp/report.csv` and exposes top-level `details.savedFilePath` plus `details.savedFile` for non-batch calls. With the current upstream `agent-browser 0.27.3`, `wait --download <path>` may report the requested path before this environment can verify that the file was persisted there. Treat `details.savedFilePath` as upstream-reported metadata unless `details.artifacts[].exists` is true. Upstream tracking: [vercel-labs/agent-browser#1300](https://github.com/vercel-labs/agent-browser/issues/1300).
|
|
363
363
|
|
|
364
364
|
### Download, screenshot, and PDF files
|
|
365
365
|
|
|
@@ -639,7 +639,7 @@ For dense pages, the wrapper also accepts `snapshot -i --search <text>` and `sna
|
|
|
639
639
|
| `wait --download [path]` | Wait for a download started by a previous action and optionally save it to `path`; successful wrapper results include upstream-reported `savedFilePath`/`savedFile`, while `details.artifacts[].exists` is the wrapper's on-disk verification signal. |
|
|
640
640
|
| `wait --download [path] --timeout <ms>` | Set download-start timeout in milliseconds. The native Pi wrapper forwards explicit wait timeouts and extends the subprocess watchdog unless the caller supplies top-level `timeoutMs`. |
|
|
641
641
|
|
|
642
|
-
Current v0.27.
|
|
642
|
+
Current v0.27.3 source still does not parse `wait <selector> --state hidden` / `wait <selector> --state detached` as distinct wait modes even though upstream help mentions those examples. Use `wait --fn "!document.querySelector('#spinner')"` or another explicit JavaScript predicate for disappearance/detach checks until upstream parser support exists.
|
|
643
643
|
|
|
644
644
|
### Diff, debug, and streaming
|
|
645
645
|
|
|
@@ -748,7 +748,7 @@ npm exec --yes --package pi-agent-browser-native@latest -- pi-agent-browser-conf
|
|
|
748
748
|
npm exec --yes --package pi-agent-browser-native@latest -- pi-agent-browser-config browser executable set "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"
|
|
749
749
|
```
|
|
750
750
|
|
|
751
|
-
The optional `agent_browser_web_search` tool is available when Exa or Brave credentials are visible from startup config or trusted session config and the runtime config has not set `webSearch.enabled` to `false`. It is a separate custom tool, not an `agent_browser` input mode, and does not launch a browser. Use it when current/live external web information would help; use `agent_browser` for browser interaction, screenshots, authenticated/profile pages, and DOM inspection. Disable scope is explicit: `web-search disable --global` sets the normal user default, `web-search disable --project` disables it for one repo, and a `PI_AGENT_BROWSER_CONFIG` override containing `{ "version": 1, "webSearch": { "enabled": false } }` wins over both for a hard per-run disable. Loaded config may use plaintext, custom env aliases, interpolation literals, malformed-or-late-bound `$` values, and command-backed web-search keys; the resolved secret reaches the provider request while model-facing tool output and status text stay redacted. `web-search set-key`, `set-command`, and `clear` require `--provider`; `set-env` infers Exa/Brave from `EXA_API_KEY` or `BRAVE_API_KEY` unless you pass `--provider`. For Exa, the tool defaults to `searchType: "auto"` with `contents.highlights: true`; use `fast`, `instant`, `deep-lite`, `deep`, or `deep-reasoning` only when the task needs that latency/depth tradeoff.
|
|
751
|
+
The optional `agent_browser_web_search` tool is available when Exa or Brave credentials are visible from startup config or trusted session config and the runtime config has not set `webSearch.enabled` to `false`. It is a separate custom tool, not an `agent_browser` input mode, and does not launch a browser. Use it when current/live external web information would help; use `agent_browser` for browser interaction, screenshots, authenticated/profile pages, and DOM inspection after you have a target URL. Prefer it over driving public search-engine forms such as Google with browser `job`/`type` flows, which can redirect headless automation to anti-bot or CAPTCHA pages; do not attempt CAPTCHA bypass. Disable scope is explicit: `web-search disable --global` sets the normal user default, `web-search disable --project` disables it for one repo, and a `PI_AGENT_BROWSER_CONFIG` override containing `{ "version": 1, "webSearch": { "enabled": false } }` wins over both for a hard per-run disable. Loaded config may use plaintext, custom env aliases, interpolation literals, malformed-or-late-bound `$` values, and command-backed web-search keys; the resolved secret reaches the provider request while model-facing tool output and status text stay redacted. `web-search set-key`, `set-command`, and `clear` require `--provider`; `set-env` infers Exa/Brave from `EXA_API_KEY` or `BRAVE_API_KEY` unless you pass `--provider`. For Exa, the tool defaults to `searchType: "auto"` with `contents.highlights: true`; use `fast`, `instant`, `deep-lite`, `deep`, or `deep-reasoning` only when the task needs that latency/depth tradeoff.
|
|
752
752
|
|
|
753
753
|
Example config:
|
|
754
754
|
|
|
@@ -860,14 +860,14 @@ Other useful environment variables include `AGENT_BROWSER_DEFAULT_TIMEOUT`, `AGE
|
|
|
860
860
|
<!-- agent-browser-capability-baseline:start capability-token-baseline -->
|
|
861
861
|
<!-- Generated from scripts/agent-browser-capability-baseline.mjs. Run `npm run docs -- command-reference write` to update. Do not edit manually. -->
|
|
862
862
|
<details>
|
|
863
|
-
<summary>Generated verifier capability baseline for agent-browser 0.27.
|
|
863
|
+
<summary>Generated verifier capability baseline for agent-browser 0.27.3</summary>
|
|
864
864
|
|
|
865
865
|
This generated block is review data for maintainers. The human-authored reference sections above remain the readable command guide.
|
|
866
866
|
|
|
867
867
|
#### Source evidence
|
|
868
868
|
- repository: `vercel-labs/agent-browser`
|
|
869
|
-
- upstream HEAD: `
|
|
870
|
-
- upstream package version: `0.27.
|
|
869
|
+
- upstream HEAD: `2c7991c9eccca1c9db6eee1a26a713414778de5a`
|
|
870
|
+
- upstream package version: `0.27.3`
|
|
871
871
|
- inspected: `agent-browser --version`
|
|
872
872
|
- inspected: `agent-browser --help`
|
|
873
873
|
- inspected: `selected agent-browser <command> --help output`
|
package/docs/RELEASE.md
CHANGED
|
@@ -178,7 +178,7 @@ Maintainer constraints for evolving scenarios and version bumps are summarized u
|
|
|
178
178
|
- `LICENSE` exists in the repo and the packed tarball
|
|
179
179
|
- canonical published docs are present
|
|
180
180
|
- `npm pack --json --dry-run` runs the `prepack` build and packs the compiled `dist/extensions/agent-browser/index.js` entrypoint
|
|
181
|
-
- GitHub/source installs run the package `prepare` build so Pi can load the ignored compiled `dist/extensions/agent-browser/index.js` entrypoint from a fresh clone
|
|
181
|
+
- GitHub/source installs run the package `prepare` build; when Pi installs with `npm install --omit=dev`, `scripts/prepare.mjs` installs source-build dev dependencies with lifecycle scripts disabled before building so Pi can load the ignored compiled `dist/extensions/agent-browser/index.js` entrypoint from a fresh clone
|
|
182
182
|
- the package-level doctor command and capability baseline are present
|
|
183
183
|
- compiled extension runtime files are present, including the split result-rendering modules required by the published facade
|
|
184
184
|
- source-only, agent-only, and superseded docs are absent from the tarball
|
|
@@ -232,7 +232,7 @@ Run the automated harness for deterministic configured-source lifecycle regressi
|
|
|
232
232
|
npm run verify -- lifecycle
|
|
233
233
|
```
|
|
234
234
|
|
|
235
|
-
The harness creates an isolated `PI_CODING_AGENT_DIR`, writes settings with exactly one temporary configured package source, runs `pi` in `tmux` with `--approve`, default model **`zai/glm-5.
|
|
235
|
+
The harness creates an isolated `PI_CODING_AGENT_DIR`, writes settings with exactly one temporary configured package source, runs `pi` in `tmux` with `--approve`, default model **`zai/glm-5.2`**, and a deterministic `--session-id`, puts a deterministic fake `agent-browser` first on `PATH`, drives `/reload`, closes Pi, and relaunches with the same exact session id instead of typing `/resume`. It also asserts the JSONL session header id, same-page managed-session continuity, compiled JS code pickup after full process relaunch, persisted spill reachability, and real Pi `tool_result` failure-patch semantics for a QA reclassification. Per-step tmux waits default to **180000 ms** (three minutes) in [`scripts/verify-lifecycle.mjs`](https://github.com/fitchmultz/pi-agent-browser-native/blob/main/scripts/verify-lifecycle.mjs) (`DEFAULT_TIMEOUT_MS`); override with `--timeout-ms <ms>` when slower models or cold starts need more headroom. Override the model when needed:
|
|
236
236
|
|
|
237
237
|
```bash
|
|
238
238
|
npm run verify -- lifecycle --model openai-codex/gpt-5.5:minimal
|
|
@@ -259,7 +259,7 @@ These show up often in cloud dev boxes and scripted smokes; they are maintainer
|
|
|
259
259
|
| **`pi -p` / print mode** | Non-interactive `pi -p` may hang or emit no stdout for long real-browser smokes without a TTY. | Use **tmux**-driven interactive `pi` for release evidence and checkout smokes; reserve `-p` for short, non-browser checks. |
|
|
260
260
|
| **Real-browser cleanup** | `real-upstream`, Sauce Demo, and live-site runs can leave defunct Chrome/`agent-browser` children if a session aborts mid-flow. | Close via `agent_browser` / `agent-browser` `close`, kill stray tmux sessions, and remove temp screenshots/HARs under `/tmp` or your chosen artifact dirs. |
|
|
261
261
|
| **Automated prompt driving** | Grepping tmux pane text for words that also appear in the **user** prompt (`PASS`, `FAIL`, `checkout overview`, `Smoke result:`) can false-complete before the agent finishes. | Wait for pane idle (no `Working…`), `agent_browser close` / `Artifact lifecycle`, or JSONL tool results—not instruction phrases copied from the prompt. |
|
|
262
|
-
| **Lifecycle verify flags** | `npm run verify -- lifecycle --model` or `--timeout-ms` without the next argv token fails fast with a usage error—the `project.mjs` facade validates passthrough the same way as `scripts/verify-lifecycle.mjs`. | Always pair flags with values (`--model openai-codex/gpt-5.5:minimal`, `--timeout-ms 600000`) or omit `--model` / `--timeout-ms` to keep the harness defaults (`zai/glm-5.
|
|
262
|
+
| **Lifecycle verify flags** | `npm run verify -- lifecycle --model` or `--timeout-ms` without the next argv token fails fast with a usage error—the `project.mjs` facade validates passthrough the same way as `scripts/verify-lifecycle.mjs`. | Always pair flags with values (`--model openai-codex/gpt-5.5:minimal`, `--timeout-ms 600000`) or omit `--model` / `--timeout-ms` to keep the harness defaults (`zai/glm-5.2`, **180000 ms** per-step waits). |
|
|
263
263
|
|
|
264
264
|
Manual validation remains useful for release confidence and installed-package checks:
|
|
265
265
|
|
|
@@ -300,7 +300,7 @@ The default unit suite also runs `agentBrowserExtension passes through core comm
|
|
|
300
300
|
- **Missing or extra `details` / `data` keys:** Update `test/fixtures/agent-browser-real-output-shapes.json` in the same change as the wrapper or presentation code that shifts those keys.
|
|
301
301
|
- **Timeouts:** A 120s bound covers the full matrix; repeated timeouts usually mean a hung browser, blocked loopback, or an environment preventing headful/headless launch—check upstream logs and local security tooling before loosening timeouts.
|
|
302
302
|
|
|
303
|
-
The current upstream `agent-browser 0.27.
|
|
303
|
+
The current upstream `agent-browser 0.27.3` `wait --download <path>` saveAs persistence limitation is tracked at [vercel-labs/agent-browser#1300](https://github.com/vercel-labs/agent-browser/issues/1300); until it is fixed, release validation must treat `details.savedFilePath` as upstream-reported metadata and use `details.artifacts[].exists` as the filesystem truth (the contract asserts the requested path is absent on disk while upstream still reports success). If the suite fails because JSON/detail keys drifted, update the wrapper behavior or refresh `test/fixtures/agent-browser-real-output-shapes.json` together with the presentation work that consumes those shapes.
|
|
304
304
|
|
|
305
305
|
Example smoke prompt:
|
|
306
306
|
|
package/docs/REQUIREMENTS.md
CHANGED
|
@@ -88,7 +88,7 @@ Define the product requirements and constraints for `pi-agent-browser-native`.
|
|
|
88
88
|
- The primary confidence path is a real `pi` session driven in `tmux`.
|
|
89
89
|
- For quick local checkout smoke validation, launch `pi --approve --no-extensions -e .` from the repository root so only the checkout copy loads; do not rely on Pi settings or `/reload` semantics in this isolated mode.
|
|
90
90
|
- For hot-reload validation, configure exactly one active source for this extension in Pi settings and launch plain `pi`; validate `/reload` there because it exercises auto-discovered/configured resources.
|
|
91
|
-
- Maintain a tmux-driven configured-source lifecycle harness (`npm run verify -- lifecycle`; required before release per `docs/RELEASE.md`) that isolates Pi settings, uses exactly one configured source, exercises `/reload`, full restart plus exact `--session-id` relaunch, and asserts managed-session continuity, persisted artifact survival, and real Pi `tool_result` failure-patch semantics. It remains outside the default `npm run verify` sequence, but it is embedded in `npm run verify -- release` so `prepublishOnly` enforces it before publish unless scripts are intentionally skipped. The harness defaults Pi to model `zai/glm-5.
|
|
91
|
+
- Maintain a tmux-driven configured-source lifecycle harness (`npm run verify -- lifecycle`; required before release per `docs/RELEASE.md`) that isolates Pi settings, uses exactly one configured source, exercises `/reload`, full restart plus exact `--session-id` relaunch, and asserts managed-session continuity, persisted artifact survival, and real Pi `tool_result` failure-patch semantics. It remains outside the default `npm run verify` sequence, but it is embedded in `npm run verify -- release` so `prepublishOnly` enforces it before publish unless scripts are intentionally skipped. The harness defaults Pi to model `zai/glm-5.2` (`scripts/verify-lifecycle.mjs`); pass `--model <id>` after `lifecycle` when a different model is required. Keep `docs/RELEASE.md` accurate about the harness behavior, cleanup, transcript retention, and limitations.
|
|
92
92
|
- Validate a full `pi` restart with exact `--session-id` relaunch or `/resume` when changes touch managed-session continuity, reload behavior, or persisted artifact paths. Validate branch-backed state changes with the focused `session_tree` harness tests.
|
|
93
93
|
- Prefer full `pi` restart over `/reload` when validating extension changes beyond a quick reload smoke check.
|
|
94
94
|
- Use `/resume` or an explicit session id/path when needed after restart.
|
package/docs/SUPPORT_MATRIX.md
CHANGED
|
@@ -26,10 +26,10 @@ When upstream ships a new `agent-browser` or the inventory changes:
|
|
|
26
26
|
|
|
27
27
|
## Audit result
|
|
28
28
|
|
|
29
|
-
- Target upstream: `agent-browser 0.27.
|
|
29
|
+
- Target upstream: `agent-browser 0.27.3` (must match `CAPABILITY_BASELINE.targetVersion` in [`scripts/agent-browser-capability-baseline.mjs`](../scripts/agent-browser-capability-baseline.mjs)).
|
|
30
30
|
- Source of truth: `CAPABILITY_BASELINE.inventorySections` in the same file (stable `id` keys: `skills`, `core-commands`, `state-tabs-frames-dialogs`, `network-storage-artifacts-diagnostics`, `batch-auth-setup-ai`, `options-and-env`).
|
|
31
31
|
- Status: supported for the current wrapper contract after the 2026-05-26 all-command audit.
|
|
32
|
-
- High-priority support gaps: 2026-05-26 audit found sessionless local commands and command-scoped value flags needed sharper wrapper handling; runtime/tests/docs now cover those paths. The 0.27.
|
|
32
|
+
- High-priority support gaps: 2026-05-26 audit found sessionless local commands and command-scoped value flags needed sharper wrapper handling; runtime/tests/docs now cover those paths. The 0.27.3 rebaseline is install-only: no CLI/help or browser-command semantics changed, so no new wrapper surface was added. The prior rebaseline preserves thin support for upstream click reliability, frame-scoped selectors/waits, form-command fixes, daemon retry improvements, and glibc-pinned release artifacts; wrapper wait planning forwards explicit long `wait <ms>` / `wait --timeout <ms>` calls instead of rejecting them before spawn. Remaining upstream-owned caveat: `agent-browser 0.27.3` help mentions `wait <selector> --state hidden`, but source parsing does not implement that distinct wait mode, so wrapper docs steer agents to `wait --fn` predicates.
|
|
33
33
|
- Post-`v0.2.29` review state: commits `eb55320` through `86abbfb` add browser guidance/smoke coverage plus `RQ-0086` click-probe reduction, `RQ-0087` same-snapshot form fill batching, `RQ-0088` current-ref fallback on locator misses, `RQ-0089` direct-upstream click mutation investigation, and `RQ-0090` stop-boundary/artifact-path guidance. Verification gates below were rerun on 2026-05-18 after those tasks landed. Constrained `job` (`RQ-0064`), the lightweight `qa` preset (`RQ-0065`), the experimental `sourceLookup` helper (`RQ-0066`), the experimental `networkSourceLookup` helper (`RQ-0067`), optional Exa/Brave-backed `agent_browser_web_search` with Pi-scoped package config (`RQ-0121`), and agent recovery for search/profile configuration failures (`RQ-0122`) are implemented; see [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#job), [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#qa), [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#sourcelookup), [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#networksourcelookup), and [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#optional-companion-web-search). Reusable browser recipes (`RQ-0068`) are intentionally not adopted as a runtime surface; see [`ARCHITECTURE.md`](ARCHITECTURE.md#no-reusable-recipe-layer-yet).
|
|
34
34
|
|
|
35
35
|
## Open UX/reliability follow-ups from 2026-05-29 agent feedback
|
|
@@ -43,24 +43,25 @@ Current summary:
|
|
|
43
43
|
| RQ-0110–RQ-0120 | Agent feedback triage resolved or documented; remaining unsupported areas are environment/upstream-owned. | [`docs/support-notes.md`](https://github.com/fitchmultz/pi-agent-browser-native/blob/main/docs/support-notes.md) |
|
|
44
44
|
| RQ-0123–RQ-0127 | Stress-report wrapper fixes shipped; prompt-derived business-action blocking remains intentionally out of scope. | [`docs/support-notes.md`](https://github.com/fitchmultz/pi-agent-browser-native/blob/main/docs/support-notes.md) |
|
|
45
45
|
| RQ-0101 | Upstream `agent-browser 0.27.2` rebaseline shipped. | [`docs/support-notes.md`](https://github.com/fitchmultz/pi-agent-browser-native/blob/main/docs/support-notes.md) |
|
|
46
|
+
| RQ-0128 | Upstream `agent-browser 0.27.3` install-only rebaseline shipped; no new wrapper capability adopted. | [`docs/support-notes.md`](https://github.com/fitchmultz/pi-agent-browser-native/blob/main/docs/support-notes.md) |
|
|
46
47
|
|
|
47
48
|
## Verification evidence
|
|
48
49
|
|
|
49
|
-
Re-run the gates below before each release; this table records what the closure audit exercised. Rows marked **Current for 0.27.
|
|
50
|
+
Re-run the gates below before each release; this table records what the closure audit exercised. Rows marked **Current for 0.27.3** were rerun after the `agent-browser 0.27.3` install-only rebaseline. Rows marked **Historical / pending refresh** are useful prior evidence but must not be treated as current release proof until rerun under the named condition.
|
|
50
51
|
|
|
51
52
|
| Gate | Evidence | Status |
|
|
52
53
|
| --- | --- | --- |
|
|
53
|
-
| Default local gate | `npm run verify` checks generated playbook drift, clean-builds generated `dist/`, runs `tsc --noEmit`, unit/fake tests, generated command-reference blocks, and live command-reference sampling. | **Current for 0.27.
|
|
54
|
+
| Default local gate | `npm run verify` checks generated playbook drift, clean-builds generated `dist/`, runs `tsc --noEmit`, unit/fake tests, generated command-reference blocks, and live command-reference sampling. | **Current for 0.27.3:** pass on 2026-06-13 (`npm run verify`; clean build, TypeScript, 571 passed, 1 skipped, generated docs check, and live command-reference sampling passed with `agent-browser 0.27.3` on `PATH`). |
|
|
54
55
|
| Pre-PR local gate | `npm run verify -- pre-pr` composes the default gate with package-content verification. Use before larger local handoffs or PR-ready claims when lifecycle/platform/live dogfood cost is not warranted. | Added 2026-06-10; orchestration is locked by `test/project-verify.test.ts` and does not change release mode. |
|
|
55
|
-
| Real upstream contract | `npm run verify -- real-upstream` runs the localhost fixture matrix against the real installed `agent-browser` matching the baseline. | **
|
|
56
|
-
| Packaged Pi smoke | `npm run verify -- package-pi` validates package contents, loads the packaged `agent_browser` tool without requiring optional Brave config, and executes fake-upstream `--version`. | **
|
|
56
|
+
| Real upstream contract | `npm run verify -- real-upstream` runs the localhost fixture matrix against the real installed `agent-browser` matching the baseline. | **Historical / pending refresh:** pass on 2026-06-11 (`npm run verify -- real-upstream`, `agent-browser 0.27.2` on `PATH`; includes 0.27.2 off-viewport click, frame-scoped selector/wait/click, form command, and wait-download artifact coverage). Not rerun for the 0.27.3 install-only rebaseline unless noted in release evidence. |
|
|
57
|
+
| Packaged Pi smoke | `npm run verify -- package-pi` validates package contents, loads the packaged `agent_browser` tool without requiring optional Brave config, and executes fake-upstream `--version`. | **Historical / pending refresh:** pass on 2026-06-11 as part of `npm run verify -- release` and rerun after the compiled-entrypoint change (`verify-package.mjs --smoke-pi`; packed 117 files, packaged `agent_browser --version` invocation passed). Not rerun for 0.27.3 unless noted in release evidence. |
|
|
57
58
|
| Startup profile | `npm run verify -- startup-profile --samples <n>` clean-builds generated `dist/`, records direct package entrypoint import/factory timing in fresh Node processes, and writes `.artifacts/startup-profile/latest.json`. It must not launch Pi, tmux, mise, npm, browsers, or `agent-browser`; full Pi TUI ready-prompt profiling is intentionally excluded after it proved too invasive for routine verification. Run this opt-in evidence when package layout, the compiled entrypoint, top-level imports, schema registration, or prompt/config startup logic changes. | **Current for compiled entrypoint:** pass on 2026-06-11 with direct compiled entrypoint import+factory median 47.136 ms in earlier samples, below the 250 ms direct-import guard and below the prior ~96 ms TypeScript-entrypoint baseline. Full-Pi startup numbers from the unsafe tmux profiler are not accepted as ongoing release evidence. |
|
|
58
|
-
| Deterministic dogfood smoke | `npm run verify -- dogfood` (`scripts/verify-agent-browser-dogfood.ts`) drives the native wrapper against a local file fixture through top-level `qa`, `semanticAction`, constrained `job`, screenshot artifact verification, and session close with the real `agent-browser` on `PATH`. | **Current for 0.27.
|
|
59
|
+
| Deterministic dogfood smoke | `npm run verify -- dogfood` (`scripts/verify-agent-browser-dogfood.ts`) drives the native wrapper against a local file fixture through top-level `qa`, `semanticAction`, constrained `job`, screenshot artifact verification, and session close with the real `agent-browser` on `PATH`. | **Current for 0.27.3:** pass on 2026-06-13 (`npm run verify -- dogfood`, `agent-browser 0.27.3`; `qa-url`, fresh/current opens, semantic click, job screenshot artifact verification, and close all passed). |
|
|
59
60
|
| Efficiency benchmark | `npm run verify -- benchmark` runs deterministic browser workflow accounting plus focused benchmark tests, including JSONL sampling fixtures and job/qa/sourceLookup/networkSourceLookup/Electron scenario coverage. | **Historical / pending refresh:** pass on 2026-05-29 (`npm run verify -- benchmark`). This deterministic gate is not upstream-version-specific, but rerun before claiming current benchmark evidence after benchmark or workflow-scenario edits. |
|
|
60
|
-
| Crabbox platform smoke | `npm run check:platform-smoke` syntax-checks the harness and cheap invariants. `npm run smoke:platform:ubuntu-image` builds the project-owned Linux image, `npm run smoke:platform:doctor` checks Crabbox 0.26.0+ and local target readiness, and `npm run smoke:platform:all` runs doctor first, then fast target-local `platform-build` (`npm run verify -- platform-target`, pack, clean Pi install) plus `browser-dogfood-smoke` on Crabbox `macos`, `ubuntu`, and `windows-native`; see [`platform-smoke.md`](platform-smoke.md). Target artifacts include Crabbox/provider/work-root metadata, and release review also checks provider-specific `crabbox list` commands for leftover leases/clones. | **
|
|
61
|
-
| `verify -- release` / `prepublishOnly` | `npm run verify -- release` chains the default gate with the configured-source lifecycle harness, packaged Pi smoke, and the release-blocking Crabbox platform matrix (`verifySteps` `release` in [`scripts/project.mjs`](https://github.com/fitchmultz/pi-agent-browser-native/blob/main/scripts/project.mjs)). `package.json` `prepublishOnly` runs that compose before `npm pack --dry-run` during `npm publish`. It intentionally omits standalone real-upstream, host-only dogfood, and benchmark modes—see [`RELEASE.md`](RELEASE.md#pre-release-checks). | **
|
|
62
|
-
| Configured-source lifecycle | `npm run verify -- lifecycle` (`scripts/verify-lifecycle.mjs`) drives `/reload`, closes and relaunches Pi with the same exact `--session-id`, checks the JSONL session header id, session continuity, slash-command sentinel tokens (`v1` before reload and `v2` after full relaunch because compiled JS package modules are process-cached), persisted spill reachability, and real Pi `tool_result` failure-patch semantics for a QA reclassification with a fake upstream on `PATH`. Default Pi model is `zai/glm-5.
|
|
63
|
-
| Quick isolated Pi smoke | `pi --approve --no-extensions --no-skills -e . --tools agent_browser` from trusted repo root; native `agent_browser` only. | **
|
|
61
|
+
| Crabbox platform smoke | `npm run check:platform-smoke` syntax-checks the harness and cheap invariants. `npm run smoke:platform:ubuntu-image` builds the project-owned Linux image, `npm run smoke:platform:doctor` checks Crabbox 0.26.0+ and local target readiness, and `npm run smoke:platform:all` runs doctor first, then fast target-local `platform-build` (`npm run verify -- platform-target`, pack, clean Pi install) plus `browser-dogfood-smoke` on Crabbox `macos`, `ubuntu`, and `windows-native`; see [`platform-smoke.md`](platform-smoke.md). Target artifacts include Crabbox/provider/work-root metadata, and release review also checks provider-specific `crabbox list` commands for leftover leases/clones. | **Historical / pending refresh:** pass on 2026-06-11 inside `npm run verify -- release`; rebuilt Ubuntu image `pi-agent-browser-native-platform:node24-agent-browser0.27.2`, refreshed the Windows `crabbox-ready` template snapshot to `agent-browser 0.27.2`, doctor passed, then Crabbox platform smoke passed for macOS, Ubuntu, and native Windows. Not rerun for 0.27.3 unless the release gate below records a fresh platform pass. |
|
|
62
|
+
| `verify -- release` / `prepublishOnly` | `npm run verify -- release` chains the default gate with the configured-source lifecycle harness, packaged Pi smoke, and the release-blocking Crabbox platform matrix (`verifySteps` `release` in [`scripts/project.mjs`](https://github.com/fitchmultz/pi-agent-browser-native/blob/main/scripts/project.mjs)). `package.json` `prepublishOnly` runs that compose before `npm pack --dry-run` during `npm publish`. It intentionally omits standalone real-upstream, host-only dogfood, and benchmark modes—see [`RELEASE.md`](RELEASE.md#pre-release-checks). | **Historical / pending refresh:** pass on 2026-06-11 (`npm run verify -- release`), including default unit/fake gate, generated docs checks, live command-reference sampling, lifecycle harness, packaged Pi smoke, and macOS/Ubuntu/native-Windows Crabbox platform smoke. Not rerun for 0.27.3 unless noted in release evidence. |
|
|
63
|
+
| Configured-source lifecycle | `npm run verify -- lifecycle` (`scripts/verify-lifecycle.mjs`) drives `/reload`, closes and relaunches Pi with the same exact `--session-id`, checks the JSONL session header id, session continuity, slash-command sentinel tokens (`v1` before reload and `v2` after full relaunch because compiled JS package modules are process-cached), persisted spill reachability, and real Pi `tool_result` failure-patch semantics for a QA reclassification with a fake upstream on `PATH`. Default Pi model is `zai/glm-5.2`; default per-step wait is **180000 ms** (`DEFAULT_TIMEOUT_MS`); override model with `--model <id>` and waits with `--timeout-ms <ms>`. Passthrough flags in [`scripts/project.mjs`](https://github.com/fitchmultz/pi-agent-browser-native/blob/main/scripts/project.mjs): `--keep-artifacts`, `--model`, `--verbose`, and `--timeout-ms` plus a value (for example `npm run verify -- lifecycle --model openai-codex/gpt-5.5:minimal --keep-artifacts --verbose --timeout-ms 600000`). | **Historical / pending refresh:** lifecycle-focused pass on 2026-06-11 after compiled-entrypoint update; managed browser session continuity and persisted full output verified before cleanup. Not rerun for 0.27.3 unless noted in release evidence. |
|
|
64
|
+
| Quick isolated Pi smoke | `pi --approve --no-extensions --no-skills -e . --tools agent_browser` from trusted repo root; native `agent_browser` only. | **Historical / pending refresh:** pass on 2026-06-11 via tmux with `pi --approve --no-extensions --no-skills -e .`; native `agent_browser` only. Covered `qa` with `sessionMode: "fresh"` against `https://example.com`, `open` and compact `snapshot -i` on `https://react.dev`, `semanticAction` link click to `https://react.dev/learn`, screenshot artifact verification at `/tmp/piab-release-smoke-react.png`, and `close`; explicit screenshot and temporary session artifacts were removed after evidence capture. Broader historical coverage also includes version/help/skills, eval stdin, batch stdin, explicit session, network requests, console/errors, diff snapshot, stream status/disable, dashboard start/stop, and chat credential-failure pass-through during RQ-0055. Not rerun for 0.27.3 unless noted in release evidence. |
|
|
64
65
|
|
|
65
66
|
Runtime floor note: package metadata keeps Pi core package peer ranges wildcard per installed Pi package docs, but `pi-agent-browser-doctor` / `npm run doctor` treats `pi --version` below 0.79.0 as a setup failure. This keeps package dependency shape aligned with Pi package loading while still making unsupported host Pi versions a release and first-run blocker.
|
|
66
67
|
|
package/docs/TOOL_CONTRACT.md
CHANGED
|
@@ -38,7 +38,7 @@ Agent-facing efficiency claims are measured with `npm run benchmark:agent-browse
|
|
|
38
38
|
|
|
39
39
|
`agent_browser_web_search` is a separate custom tool, not an `agent_browser` input mode. It is available when the extension can see at least one configured/resolvable Exa or Brave credential source from `~/.pi/config/pi-agent-browser-native/config.json`, `.pi/config/pi-agent-browser-native/config.json`, `PI_AGENT_BROWSER_CONFIG`, or the `EXA_API_KEY` / `BRAVE_API_KEY` environment fallbacks, and runtime execution still checks that the final available merged config has not set `webSearch.enabled` to `false`. Config layers merge global → project → `PI_AGENT_BROWSER_CONFIG` override; under Pi 0.79+, globally installed and CLI-loaded copies read `.pi/config/...` when Pi trust allows that project layer, and they skip the project layer when Pi reports the project is untrusted or when launched with `--no-approve`. Disable scope is explicit: a global disable is a normal user default, a project disable applies to one repo, and an override file with `webSearch.enabled: false` is the highest-priority hard disable for that run. Credential sources may be plaintext, `$ENV_VAR` / `${ENV_VAR}` interpolation, escaped literals, or command sources such as `"!op read 'op://Private/Exa/API Key'"` from any loaded config layer; they make the tool available without exposing the value in status text, and command values resolve when the tool executes. Browser profile/executable config uses the same paths and emits prompt guidance from the highest-priority loaded layer, including project config when that layer is loaded.
|
|
40
40
|
|
|
41
|
-
Use it when live/current external web information would help answer a task, find current docs/news, or discover candidate URLs. Use `agent_browser` when the task needs browser interaction, screenshots, authenticated/profile content, page inspection, or DOM work. The search tool is namespaced to avoid colliding with generic `web_search`, chooses Exa or Brave automatically from available credentials, defaults to Exa when both are available (unless `webSearch.preferredProvider` is set), and must not expose resolved API keys in content, details, errors, status output, docs examples, logs, or PR artifacts.
|
|
41
|
+
Use it when live/current external web information would help answer a task, find current docs/news, or discover candidate URLs. Prefer it over driving public search-engine forms such as Google through the browser: headless `job`/`type` flows may be redirected to anti-bot or CAPTCHA pages, and agents should use search API results, direct target URLs, or user-provided URLs instead of attempting CAPTCHA bypass. Use `agent_browser` when the task needs browser interaction, screenshots, authenticated/profile content, page inspection, or DOM work. The search tool is namespaced to avoid colliding with generic `web_search`, chooses Exa or Brave automatically from available credentials, defaults to Exa when both are available (unless `webSearch.preferredProvider` is set), and must not expose resolved API keys in content, details, errors, status output, docs examples, logs, or PR artifacts.
|
|
42
42
|
|
|
43
43
|
Config shape:
|
|
44
44
|
|
|
@@ -142,6 +142,7 @@ The extension always plans normal browser commands with `--json` prepended in `e
|
|
|
142
142
|
<!-- Generated from extensions/agent-browser/lib/playbook.ts. Run `npm run docs -- playbook write` to update. -->
|
|
143
143
|
- Standard workflow: open the page, snapshot -i, interact using current @refs from that snapshot, and re-snapshot after navigation, scrolling, rerendering, or other major DOM changes because refs are page-scoped; the wrapper fails mutation-prone stale/recycled refs before upstream can silently target a different current-page element. On dense pages, use wrapper-side snapshot -i --search <text> or snapshot -i --filter role=<role> to render matching refs while preserving the full ref map in details.refSnapshot, add snapshot --viewport when scroll position or above/below-fold context matters, and add snapshot --diff when a quick before/after ref-map delta would prevent reading a full spill file.
|
|
144
144
|
- For ordinary forms from one snapshot, batch multiple fill @refs before the submit/click step to avoid serial tool calls; if a fill may autosubmit, navigate, or rerender later fields, split the flow and refresh refs first.
|
|
145
|
+
- Do not use browser automation to drive public search-engine forms such as Google for discovery; headless jobs that type a query and press Enter can be redirected to anti-bot or CAPTCHA pages. Use agent_browser_web_search when configured, ask for/search from a direct target URL, or navigate to known result URLs. Do not attempt CAPTCHA bypass.
|
|
145
146
|
- Snapshot choice: prefer snapshot -i for routine clicks/fills (interactive @refs, main-content-first). Use snapshot --compact when you need a denser same-page tree without full spill; use full snapshot (no -i) only when you need the complete accessibility tree. Re-snapshot after navigation or major DOM changes. When snapshot -i compacts because the tree is oversized, scan visible output for Omitted high-value controls and optional details.data.highValueControlRefIds before opening the spill file: those list bounded searchboxes, textboxes, comboboxes, buttons, tabs, checkboxes, radios, options, and menuitems that did not fit the key/other ref previews.
|
|
146
147
|
- When a visible text or accessible-name target should survive ref churn, prefer find locators such as role, text, label, placeholder, alt, title, or testid with the intended action instead of guessing a CSS selector.
|
|
147
148
|
- For desktop or host-controlled rich inputs, if semanticAction fill misses, refresh refs and prefer a current editable @ref from details.richInputRecovery or the latest snapshot; focus or click that ref, then use keyboard inserttext or keyboard type with the intended text. Do not auto-submit with Enter or a submit button unless the user flow explicitly calls for it.
|
|
@@ -158,7 +159,7 @@ The extension always plans normal browser commands with `--json` prepended in `e
|
|
|
158
159
|
- For Electron desktop apps, prefer top-level electron for wrapper-owned discovery, isolated launch, status, compact probe, and cleanup: list first, treat likely-sensitive annotations as hints rather than enforcement, launch with the default snapshot handoff unless handoff: "tabs" is the safer diagnostic starting point, use electron.probe or snapshot -i/qa.attached for current-session state, and always cleanup the returned launchId when done. electron.launch uses an isolated temporary profile; it does not reuse the app's normal signed-in profile or attach to an already-running authenticated app. For signed-in local app state, host-launch the normal app with --remote-debugging-port when appropriate, then use raw args connect <port|url>; after connect, inspect tab list, select the stable tab id such as tab t2, then run a condition wait or snapshot -i before using refs. close commands (`close`, `quit`, or `exit`) only close the browser/CDP session; leave manually launched app shutdown, profile cleanup, and explicit artifacts to the host owner.
|
|
159
160
|
- For provider or specialized app workflows, load version-matched upstream guidance with skills get agentcore|electron|slack|dogfood|vercel-sandbox through the native tool; add --full when you need references/templates, and use skills get --all only for broad skill audits. Provider launches such as -p ios, --provider browserbase/kernel/browseruse/browserless/agentcore, and iOS --device are upstream-owned setup paths; use sessionMode fresh when switching providers and expect external credentials or local Appium/Xcode setup to be required.
|
|
160
161
|
- For dialogs and frames, use dialog status/accept/dismiss and frame <selector|main> through native args; dialog commands and eval snippets that look like alert/confirm/prompt/dialog triggers are shorter-bounded than normal browser calls, and timed-out dialog-like interactions may add inspect-dialog-after-timeout, dismiss-dialog-after-timeout, or recover-fresh-session-after-dialog-timeout nextActions. When --confirm-actions produces a pending confirmation, use details.nextActions or exact confirm <id> / deny <id> calls instead of inventing ids.
|
|
161
|
-
- If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <tab-id-or-label> / snapshot -i to recover state before retrying different URLs or fallback strategies. For headed demos, put --headed on the first launch with sessionMode=fresh and verify with screenshot/tab/get-url evidence because tool success cannot prove the OS window is visible to the user. For desktop readiness, prefer real conditions first: wait --text, wait --url, wait --fn, wait --load <state>, wait --download, or qa.attached; for disappearance checks in agent-browser 0.27.
|
|
162
|
+
- If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <tab-id-or-label> / snapshot -i to recover state before retrying different URLs or fallback strategies. For headed demos, put --headed on the first launch with sessionMode=fresh and verify with screenshot/tab/get-url evidence because tool success cannot prove the OS window is visible to the user. For desktop readiness, prefer real conditions first: wait --text, wait --url, wait --fn, wait --load <state>, wait --download, or qa.attached; for disappearance checks in agent-browser 0.27.3, use wait --fn predicates instead of stale upstream-help examples like wait <selector> --state hidden. Use electron.probe/status for wrapper-owned launch health or target mismatch. Fixed waits are a last resort: use explicit --timeout or top-level timeoutMs for legitimately slow waits, and treat a successful payload like "waited":"timeout" as elapsed time only—verify completion with an observed condition, fresh snapshot, or screenshot.
|
|
162
163
|
- For feed, timeline, or inbox reading tasks, focus on the main timeline/list region and read the first item there rather than unrelated composer or sidebar content.
|
|
163
164
|
- For read-only browsing tasks, prefer extracting the answer from the current snapshot, structured ref labels, or eval --stdin on the current page before navigating away. Only click into media viewers, detail routes, or new pages when the current view does not contain the needed information.
|
|
164
165
|
- For downloads, prefer download <selector> <path> when an element click should save a file; simple loopback anchor downloads are saved to the requested path when the wrapper can resolve an HTTP(S) href. Do not rely on click alone when you need the downloaded file on disk.
|
|
@@ -342,7 +343,7 @@ Because `job` still executes as upstream `batch` with generated stdin, the same
|
|
|
342
343
|
- type: object with either required `url` (normal URL-opening QA) or `attached: true` (current attached-session QA)
|
|
343
344
|
- optional; mutually exclusive with `args`, `semanticAction`, `job`, `sourceLookup`, `networkSourceLookup`, and `electron`
|
|
344
345
|
- lightweight preset built on the same batch compiler path as `job`, using `batch --bail` so missing readiness/text/selector assertions stop before slower diagnostics can burn the wrapper watchdog
|
|
345
|
-
- URL form: clears enabled diagnostic buffers first (`network requests --clear`, `console --clear`, `errors --clear`), then opens `url`, waits with `wait --load <state>` using the resolved `loadState`, optionally asserts `expectedText` (string or string array, compiled to bounded visible-text `wait --fn … --timeout 5000` predicates after load) and/or `expectedSelector` (each may be omitted for a load-plus-diagnostics-only smoke), then runs enabled diagnostics: `network requests`, `console`, and `errors` only if preceding batch steps pass
|
|
346
|
+
- URL form: clears enabled diagnostic buffers first (`network requests --clear`, `console --clear`, `errors --clear`), then opens `url`, waits with `wait --load <state>` using the resolved `loadState`, optionally asserts `expectedText` (string or string array, compiled to bounded visible-text `wait --fn … --timeout 5000` predicates after load) and/or `expectedSelector` (each may be omitted for a load-plus-diagnostics-only smoke), then runs enabled diagnostics: `network requests`, `console`, and `errors` only if preceding batch steps pass. Successful reset-step rows are labeled as reset output and ignored by QA failure analysis so stale pre-target rows do not fail URL QA; failed reset commands still fail the batch, and post-open diagnostic rows still count normally.
|
|
346
347
|
- attached form: `qa: { attached: true, expectedText?, expectedSelector?, screenshotPath?, checkNetwork?, checkConsole?, checkErrors?, loadState? }` runs the same waits, optional assertions, diagnostics, and screenshot against the current attached managed session without opening a URL. It rejects `url` and cannot be used with `sessionMode: "fresh"`; attach first with `electron.launch` or raw `args: ["connect", "<port-or-url>"]`, then run `qa.attached`. Before spawning the diagnostic batch, the wrapper preflights the attached session: `get url` must succeed and return an `http:` or `https:` page URL. Missing URLs, read failures, and non-http(s) surfaces fail fast with `failureCategory: "validation-error"`, `details.validationError`, and recovery `nextActions` such as `list-tabs-before-qa-attached` and `snapshot-before-qa-attached` instead of running the full QA batch. Attached QA does **not** run `network requests --clear`, `console --clear`, or `errors --clear`; `details.compiledQaPreset.checks.diagnosticsResetAtStart` is `false`. Visible text warns that existing diagnostic buffers were preserved only when `checkNetwork`, `checkConsole`, or `checkErrors` is enabled, and those diagnostics may include events from before the QA check.
|
|
347
348
|
- `loadState` is optional and must be `domcontentloaded`, `load`, or `networkidle`; it defaults to `domcontentloaded` so analytics-heavy or long-polling pages do not hang routine QA. Use `networkidle` only when the site is expected to go fully quiet.
|
|
348
349
|
- `checkNetwork`, `checkConsole`, and `checkErrors` default to `true` for URL-opening QA; for `qa.attached` they default to `false` because preserved upstream buffers may predate the current check. Set a field to `true` on `qa.attached` to opt into preserved-buffer diagnostics.
|
|
@@ -695,7 +696,7 @@ Real Pi custom tools only mark a tool result failed when the tool throws during
|
|
|
695
696
|
|
|
696
697
|
For `batch`, top-level `details` still carries `resultCategory` plus `successCategory` or `failureCategory` for the **aggregate** tool outcome: if any step fails, the overall result is a failure (`resultCategory: "failure"`) even when later steps succeed—inspect `batchSteps[]` for per-step outcomes. Each `batchSteps[]` entry includes its own `resultCategory` and either `successCategory` or `failureCategory` for that step. `batchFailure.failedStep` duplicates the first failing step’s details, including its `failureCategory` and any `nextActions`.
|
|
697
698
|
|
|
698
|
-
Top-level `details.data` on `batch` is a compact per-step roll-up (not a verbatim replay of raw upstream batch JSON): each element is `{ success, command, result? | error? }` where `command` is argv-redacted the same way as echoed invocation args (including `clipboard write` text, `cookies set` cookie values, `storage local|session set` values, and other sensitive flags/positionals), `result` is the presentation-layer data for that step after the same structured redaction as non-batch commands, and `error` is failure text with clipboard-write/cookie/storage/password literals stripped when those values appeared in argv. Prefer `batchSteps[]` for full per-step `details` (artifacts, categories, spill paths); use the roll-up when you only need a redacted matrix of what ran.
|
|
699
|
+
Top-level `details.data` on `batch` is a compact per-step roll-up (not a verbatim replay of raw upstream batch JSON): each element is `{ success, command, result? | error? }` where `command` is argv-redacted the same way as echoed invocation args (including `clipboard write` text, `cookies set` cookie values, `storage local|session set` values, and other sensitive flags/positionals), `result` is the presentation-layer data for that step after the same structured redaction as non-batch commands, and `error` is failure text with clipboard-write/cookie/storage/password literals stripped when those values appeared in argv. Prefer `batchSteps[]` for full per-step `details` (artifacts, categories, spill paths); use the roll-up when you only need a redacted matrix of what ran. If a large batch/job/qa result is compacted and spilled, the inline compacted text still includes bounded failed-step context (first failing step, failure category, failure detail, and any failed-step spill path) before the preview and top-level `Full output path:`.
|
|
699
700
|
|
|
700
701
|
`details.refSnapshot` may appear after successful `snapshot` calls and subsequent same-session calls. It records the latest page-scoped ref ids known to the wrapper, optional per-ref accessible `role`/`name` metadata from the same snapshot, and the page target they came from so mutation-prone `@e…` commands can fail fast instead of silently hitting recycled refs after navigation. For wrapper-tracked Electron sessions, `details.electronRefFreshness` may also appear after a successful `@e…` mutation as a softer same-URL rerender warning: run `snapshot -i` before reusing old refs even if the URL did not change.
|
|
701
702
|
|
|
@@ -824,7 +825,7 @@ Additional structured fields can appear when relevant:
|
|
|
824
825
|
|
|
825
826
|
When the tool echoes `args` or `effectiveArgs` back into Pi, sensitive values such as `--headers`, proxy credentials, and auth-bearing URL parameters should be redacted first.
|
|
826
827
|
|
|
827
|
-
For oversized snapshots and other oversized tool outputs, details should switch to a compact metadata object and include `fullOutputPath` pointing at a private spill file with the full redacted upstream payload. The model-facing tool text should print the actual spill-file path when one exists instead of only saying to inspect a details key. Persisted sessions should keep that spill file under a private session-scoped artifact directory so the path remains usable after reload/restart. The oldest persisted spill files are evicted as needed to stay within `PI_AGENT_BROWSER_SESSION_ARTIFACT_MAX_BYTES` (default 32 MiB), and those evictions are reported as `artifactManifest.entries[].retentionState: "evicted"` instead of silently disappearing from the session inventory. This persisted-spill byte budget is separate from the recent metadata window controlled by `PI_AGENT_BROWSER_SESSION_ARTIFACT_MANIFEST_MAX_ENTRIES`.
|
|
828
|
+
For oversized snapshots and other oversized tool outputs, details should switch to a compact metadata object and include `fullOutputPath` pointing at a private spill file with the full redacted upstream payload. The model-facing tool text should print the actual spill-file path when one exists instead of only saying to inspect a details key. Oversized batch/job/qa failures include bounded failed-step context inline before the preview so agents can see the failed assertion/error and failure category without opening the spill file. Persisted sessions should keep that spill file under a private session-scoped artifact directory so the path remains usable after reload/restart. The oldest persisted spill files are evicted as needed to stay within `PI_AGENT_BROWSER_SESSION_ARTIFACT_MAX_BYTES` (default 32 MiB), and those evictions are reported as `artifactManifest.entries[].retentionState: "evicted"` instead of silently disappearing from the session inventory. This persisted-spill byte budget is separate from the recent metadata window controlled by `PI_AGENT_BROWSER_SESSION_ARTIFACT_MANIFEST_MAX_ENTRIES`.
|
|
828
829
|
|
|
829
830
|
## High-value result rendering
|
|
830
831
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-agent-browser-native",
|
|
3
|
-
"version": "0.2.50",
|
|
3
|
+
"version": "0.2.52",
|
|
4
4
|
"description": "pi extension that exposes agent-browser as a native tool for browser automation",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"author": "Mitch Fultz (https://github.com/fitchmultz)",
|
|
@@ -35,6 +35,7 @@
|
|
|
35
35
|
"platform-smoke.config.mjs",
|
|
36
36
|
"scripts/config.mjs",
|
|
37
37
|
"scripts/doctor.mjs",
|
|
38
|
+
"scripts/prepare.mjs",
|
|
38
39
|
"scripts/agent-browser-capability-baseline.mjs",
|
|
39
40
|
"scripts/platform-smoke.mjs",
|
|
40
41
|
"scripts/platform-smoke",
|
|
@@ -62,10 +63,10 @@
|
|
|
62
63
|
"typebox": "*"
|
|
63
64
|
},
|
|
64
65
|
"devDependencies": {
|
|
65
|
-
"@earendil-works/pi-ai": "0.79.
|
|
66
|
-
"@earendil-works/pi-coding-agent": "0.79.
|
|
67
|
-
"@earendil-works/pi-tui": "0.79.
|
|
68
|
-
"@types/node": "^25.
|
|
66
|
+
"@earendil-works/pi-ai": "0.79.4",
|
|
67
|
+
"@earendil-works/pi-coding-agent": "0.79.4",
|
|
68
|
+
"@earendil-works/pi-tui": "0.79.4",
|
|
69
|
+
"@types/node": "^25.9.3",
|
|
69
70
|
"tsx": "^4.21.0",
|
|
70
71
|
"typebox": "^1.1.38",
|
|
71
72
|
"typescript": "^6.0.3"
|
|
@@ -86,13 +87,13 @@
|
|
|
86
87
|
"smoke:platform:windows-native": "node scripts/platform-smoke.mjs run --target windows-native",
|
|
87
88
|
"smoke:platform:all": "npm run smoke:platform:doctor && node scripts/platform-smoke.mjs run --target macos,ubuntu,windows-native",
|
|
88
89
|
"typecheck": "node ./scripts/project.mjs verify typecheck",
|
|
89
|
-
"test": "node ./scripts/build.mjs && tsx --test test/**/*.test.ts",
|
|
90
|
+
"test": "node ./scripts/build.mjs && tsx --test --test-concurrency=1 test/**/*.test.ts",
|
|
90
91
|
"verify": "node ./scripts/project.mjs verify",
|
|
91
92
|
"prepublishOnly": "npm run verify -- release && npm pack --dry-run",
|
|
92
93
|
"build": "node ./scripts/build.mjs",
|
|
93
94
|
"startup-profile": "node ./scripts/profile-startup.mjs",
|
|
94
95
|
"prepack": "npm run build",
|
|
95
|
-
"prepare": "
|
|
96
|
+
"prepare": "node ./scripts/prepare.mjs"
|
|
96
97
|
},
|
|
97
98
|
"packageManager": "npm@11.14.0"
|
|
98
99
|
}
|
|
@@ -14,8 +14,8 @@ export const COMMAND_REFERENCE_BASELINE_BLOCK_IDS = Object.freeze(["upstream-bas
|
|
|
14
14
|
|
|
15
15
|
const sourceEvidence = Object.freeze({
|
|
16
16
|
repository: "vercel-labs/agent-browser",
|
|
17
|
-
upstreamHead: "
|
|
18
|
-
upstreamPackageVersion: "0.27.
|
|
17
|
+
upstreamHead: "2c7991c9eccca1c9db6eee1a26a713414778de5a",
|
|
18
|
+
upstreamPackageVersion: "0.27.3",
|
|
19
19
|
inspectedSources: Object.freeze([
|
|
20
20
|
"agent-browser --version",
|
|
21
21
|
"agent-browser --help",
|
|
@@ -709,7 +709,7 @@ const inventorySections = Object.freeze([
|
|
|
709
709
|
]);
|
|
710
710
|
|
|
711
711
|
export const CAPABILITY_BASELINE = Object.freeze({
|
|
712
|
-
targetVersion: "0.27.
|
|
712
|
+
targetVersion: "0.27.3",
|
|
713
713
|
sourceEvidence,
|
|
714
714
|
helpCommands,
|
|
715
715
|
inventorySections,
|
|
@@ -62,7 +62,7 @@ Environment:
|
|
|
62
62
|
PLATFORM_SMOKE_MAC_USER macOS SSH user; default $USER
|
|
63
63
|
PLATFORM_SMOKE_MAC_WORK_ROOT macOS Crabbox work root
|
|
64
64
|
PLATFORM_SMOKE_MAC_PORT macOS SSH port; default 22
|
|
65
|
-
PLATFORM_SMOKE_UBUNTU_IMAGE Ubuntu local-container image; default pi-agent-browser-native-platform:node24-agent-browser0.27.
|
|
65
|
+
PLATFORM_SMOKE_UBUNTU_IMAGE Ubuntu local-container image; default pi-agent-browser-native-platform:node24-agent-browser0.27.3
|
|
66
66
|
PLATFORM_SMOKE_WINDOWS_VM Parallels Windows template VM
|
|
67
67
|
PLATFORM_SMOKE_WINDOWS_SNAPSHOT Parallels snapshot name
|
|
68
68
|
PLATFORM_SMOKE_WINDOWS_USER Windows SSH user
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Purpose: Build generated dist output for GitHub/source installs even when Pi invokes npm install --omit=dev.
|
|
4
|
+
* Responsibilities: Detect missing source-build dependencies, install dev dependencies with lifecycle scripts disabled, then run the canonical build.
|
|
5
|
+
* Scope: Package install lifecycle only; npm tarball contents and runtime behavior remain owned by scripts/build.mjs.
|
|
6
|
+
* Usage: package.json prepare script.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { execFile as execFileCallback } from "node:child_process";
|
|
10
|
+
import { createRequire } from "node:module";
|
|
11
|
+
import { join } from "node:path";
|
|
12
|
+
import process from "node:process";
|
|
13
|
+
import { promisify } from "node:util";
|
|
14
|
+
|
|
15
|
+
// CommonJS-style resolver anchored at this script, used only to probe whether
// a module can be resolved (nothing is actually loaded through it here).
const require = createRequire(import.meta.url);

// Promise-returning wrapper around child_process.execFile so callers can await it.
const execFile = promisify(execFileCallback);

// Modules that must be resolvable before the source build is attempted.
const REQUIRED_SOURCE_BUILD_MODULES = ["typescript", "typebox", "@earendil-works/pi-coding-agent", "@earendil-works/pi-tui"];
|
|
23
|
+
|
|
24
|
+
/**
 * Report whether every module listed in REQUIRED_SOURCE_BUILD_MODULES can be
 * resolved from this script's location.
 *
 * @returns {boolean} true when all listed modules resolve; false as soon as
 *   one of them fails to resolve.
 */
function canResolveBuildDependencies() {
  for (const dependencyName of REQUIRED_SOURCE_BUILD_MODULES) {
    try {
      require.resolve(dependencyName);
    } catch {
      // A resolution failure for any single module is treated as "not ready";
      // the remaining modules are not probed.
      return false;
    }
  }
  return true;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Install the package's dev dependencies with lifecycle scripts disabled
 * (`install --include=dev --ignore-scripts`) in the current working directory.
 *
 * When the `npm_execpath` environment variable is set, that script is run with
 * the current Node binary; otherwise a bare `npm` command is looked up on PATH.
 *
 * @returns {Promise<void>} resolves once the install process exits successfully.
 * @throws rejects with the execFile error (non-zero exit, spawn failure, or
 *   output exceeding maxBuffer).
 */
async function runNpmInstallDevDependencies() {
  const installArgs = ["install", "--include=dev", "--ignore-scripts"];
  const baseOptions = { cwd: process.cwd(), maxBuffer: 20 * 1024 * 1024 };
  const npmExecPath = process.env.npm_execpath;

  if (npmExecPath) {
    // Spawn node directly, never through a shell. With `shell: true` the
    // command and arguments are joined unquoted, so a Node or npm path that
    // contains spaces (e.g. under "C:\Program Files\nodejs") would be split
    // and the install would fail on Windows.
    await execFile(process.execPath, [npmExecPath, ...installArgs], baseOptions);
    return;
  }

  // Bare `npm` fallback: on Windows the command needs a shell to be resolved
  // (presumably because npm is installed as npm.cmd — confirm). The command
  // and arguments here are fixed literals without spaces, so joining them for
  // the shell is safe.
  const shellOptions = process.platform === "win32" ? { shell: true } : {};
  await execFile("npm", installArgs, { ...baseOptions, ...shellOptions });
}
|
|
52
|
+
|
|
53
|
+
/**
 * Entry point: install dev dependencies when the build modules cannot be
 * resolved, then run scripts/build.mjs from the current working directory
 * using the current Node binary.
 *
 * @returns {Promise<void>} resolves when the build process exits successfully;
 *   rejects with the execFile error otherwise.
 */
async function main() {
  const dependenciesReady = canResolveBuildDependencies();
  if (!dependenciesReady) {
    await runNpmInstallDevDependencies();
  }

  const projectRoot = process.cwd();
  const buildScript = join(projectRoot, "scripts", "build.mjs");
  const buildOptions = { cwd: projectRoot, maxBuffer: 20 * 1024 * 1024 };
  await execFile(process.execPath, [buildScript], buildOptions);
}
|
|
62
|
+
|
|
63
|
+
/**
 * Failure handler for main(): replay any captured child output carried on the
 * error, print the error message, and mark the process as failed.
 *
 * @param {unknown} error - rejection value from main().
 */
function reportFailure(error) {
  // Errors from the promisified execFile may carry the child's buffered
  // stdout/stderr; forward them so the underlying failure is visible.
  if (error?.stdout) {
    process.stdout.write(error.stdout);
  }
  if (error?.stderr) {
    process.stderr.write(error.stderr);
  }
  const message = error instanceof Error ? error.message : String(error);
  console.error(message);
  // Set the exit code instead of exiting immediately so pending output can flush.
  process.exitCode = 1;
}

main().catch(reportFailure);
|