pi-agent-browser-native 0.2.35 → 0.2.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,39 @@
2
2
 
3
3
  ## Unreleased
4
4
 
5
+ ## 0.2.37 - 2026-05-29
6
+
7
+ ### Added
8
+
9
+ - Added loopback navigation failure guidance for `localhost` / `127.0.0.1` errors such as `net::ERR_EMPTY_RESPONSE`, making clear that the browser host may not be able to reach the shell host's temporary server and pointing agents to host-reachable addresses or `file://` static-fixture fallbacks.
10
+ - Extended click-dispatch diagnostics to eligible `@e…` ref clicks using the latest snapshot role/name metadata, so ref- and `semanticAction`-resolved clicks that report upstream success without a trusted DOM event now fail loudly with `details.clickDispatch` and inspect/retry next actions.
11
+
12
+ ### Changed
13
+
14
+ - Documented programmatic `eval --stdin` `.click()` as a static-fixture/debugging workaround only: it can exercise app handlers when user-like click dispatch fails, but it emits an untrusted scripted event and must not be treated as proof of real click behavior or used to bypass explicit stop boundaries.
15
+ - Updated README, command reference, tool contract, playbook guidance, and support matrix for second- through fifth-round `agent_browser` UX feedback, including localhost, click dispatch, and port-lifecycle ownership boundaries.
16
+
17
+ ### Fixed
18
+
19
+ - Normalized malformed native-tool calls like `args: ["eval", "--stdin", "document.title"]` by moving trailing script tokens into stdin before spawning upstream, matching the canonical `eval --stdin` contract.
20
+
21
+ ## 0.2.36 - 2026-05-29
22
+
23
+ ### Added
24
+
25
+ - Added `details.evalResultWarning` and visible `Eval result warning` guidance for successful `eval --stdin` calls that return `null` on `file://` pages with non-trivial stdin, so agents treat those DOM checks as inconclusive without failing the tool.
26
+ - Enriched `get text <selector>` visibility diagnostics with bounded `visibleCandidates` entries (`querySelectorAll` index, tag, optional role, redacted text preview) so agents can resolve broad selector ambiguity without trusting hidden or first-match text.
27
+ - Added support-matrix tracking for the 2026-05-29 agent UX/reliability feedback batch (`RQ-0110`–`RQ-0117`) covering headed mode, local loopback, `file://` eval, click verification, selector ambiguity, host-owned fixture servers, fresh-session failures, and headed visibility limits.
28
+
29
+ ### Changed
30
+
31
+ - Made failed `sessionMode: "fresh"` managed-session recovery prose action-oriented: visible output now avoids generated session ids, distinguishes preserved/abandoned launch failures from post-launch `qa` reclassification failures, and points to safe next actions while keeping full transition details in `details.managedSessionOutcome`.
32
+ - Expanded README, command reference, tool contract, and generated playbook guidance for headed demos, browser-host localhost semantics, `file://` fixture limits, post-click verification, and host-owned temporary server cleanup.
33
+
34
+ ### Fixed
35
+
36
+ - Fresh-session failure next actions now verify or snapshot the current managed session for post-launch failures and avoid unconditional `doctor` guidance when the launch itself succeeded but a later diagnostic failed.
37
+
5
38
  ## 0.2.35 - 2026-05-28
6
39
 
7
40
  ### Changed
package/README.md CHANGED
@@ -64,7 +64,7 @@ The result is optimized for agent work:
64
64
  | Stateful cookies/storage/auth output bloats or leaks context | Presentation layer redacts `details.data` for cookies and storage (field-aware values) and recursively scrubs other structured upstream JSON (network, diff, trace/profiler, stream, dashboard, chat, auth, dialog, frame, state, and similar) using sensitive key names plus string heuristics; masks sensitive argv flags and positionals; scrubs secrets from failed batch step errors; and exposes a compact redacted `batch` matrix on top-level `details.data` | `extensions/agent-browser/lib/results/presentation.ts`, `extensions/agent-browser/lib/results/presentation/diagnostics.ts`, `extensions/agent-browser/lib/runtime.ts`, `test/agent-browser.presentation-diagnostics.test.ts` |
65
65
  | Stale `@eN` refs fail mysteriously | Records per-session `details.refSnapshot`, rejects mismatched URLs / unknown refs / unsafe `batch` stdin ordering before spawn, adds recovery guidance to rerun `snapshot -i` or use stable `find` locators | `extensions/agent-browser/index.ts`, `extensions/agent-browser/lib/session-page-state.ts`, `test/agent-browser.session-page-state.test.ts`, `test/agent-browser.results.test.ts`, `test/agent-browser.extension-ref-guards.test.ts`, `test/agent-browser.extension-semantic-recovery.test.ts` |
66
66
  | Agents need stable success/failure buckets | Exposes bounded `resultCategory`, `successCategory`, and `failureCategory` on tool `details` for branching without parsing prose; a `tool_result` hook also aligns real Pi `isError` semantics, naming `Pi tool isError: true` in prose output while preserving parseable caller-requested `--json` output | [`docs/TOOL_CONTRACT.md`](docs/TOOL_CONTRACT.md#details), `extensions/agent-browser/lib/results/categories.ts`, `extensions/agent-browser/lib/results/shared.ts` (re-export barrel), `extensions/agent-browser/index.ts`, `test/agent-browser.results.test.ts`, `test/agent-browser.extension-validation.test.ts`, `test/agent-browser.pi-pipeline.test.ts` |
67
- | Clicks can report success without the page receiving the event | Top-level non-Electron `click` on exact CSS/XPath selectors installs a bounded DOM-event probe; if upstream reports success but no trusted event reaches the target, the wrapper fails the tool, exposes `details.clickDispatch`, and suggests explicit retry/inspect next actions (no in-page replay; `@e…` refs skip the probe). Other click results still expose `details.pageChangeSummary`, unchanged-URL clicks can surface evidence-backed `details.overlayBlockers` candidates, and explicit user stop boundaries can best-effort block click-like actions plus `press`/`key` Enter/Return submits via `details.promptGuard`. | [`docs/TOOL_CONTRACT.md`](docs/TOOL_CONTRACT.md#details), `extensions/agent-browser/lib/orchestration/browser-run/click-dispatch.ts`, `extensions/agent-browser/lib/orchestration/browser-run/browser-action-model.ts`, `extensions/agent-browser/lib/orchestration/browser-run/prompt-guards.ts`, `extensions/agent-browser/lib/results/presentation/navigation.ts`, `test/agent-browser.presentation.test.ts`, `test/agent-browser.extension-errors-artifacts.test.ts` |
67
+ | Clicks can report success without the page receiving the event | Top-level non-Electron `click` on exact CSS/XPath selectors, and on `@e…` refs when the latest snapshot has role/name metadata the wrapper can resolve to a unique visible element, installs a bounded DOM-event probe; if upstream reports success but no trusted event reaches the target, the wrapper fails the tool, exposes `details.clickDispatch`, and suggests explicit retry/inspect next actions (no in-page replay). Other click results still expose `details.pageChangeSummary`, unchanged-URL clicks can surface evidence-backed `details.overlayBlockers` candidates, and explicit user stop boundaries can best-effort block click-like actions plus `press`/`key` Enter/Return submits via `details.promptGuard`. | [`docs/TOOL_CONTRACT.md`](docs/TOOL_CONTRACT.md#details), `extensions/agent-browser/lib/orchestration/browser-run/click-dispatch.ts`, `extensions/agent-browser/lib/orchestration/browser-run/browser-action-model.ts`, `extensions/agent-browser/lib/orchestration/browser-run/prompt-guards.ts`, `extensions/agent-browser/lib/results/presentation/navigation.ts`, `test/agent-browser.presentation.test.ts`, `test/agent-browser.extension-click-dispatch.test.ts` |
68
68
  | Dashboard scroll commands can look successful while nothing moves | Samples viewport and prominent scroll-container positions around top-level `scroll` calls; unchanged positions produce `details.scrollNoop`, visible recovery guidance, and exact `nextActions` for snapshot/screenshot verification | [`docs/TOOL_CONTRACT.md`](docs/TOOL_CONTRACT.md#details), [`docs/COMMAND_REFERENCE.md`](docs/COMMAND_REFERENCE.md#core-page-and-element-commands), `test/agent-browser.extension-validation.test.ts` |
69
69
  | Dropdown/combobox clicks can focus or hit native option box-model errors | Adds first-class `select <selector> <value...>` paths through raw `args`, `semanticAction`, and `job`; for custom combobox clicks, detects focused controls with explicit `aria-expanded` state but no visible options and returns `details.comboboxFocus` plus exact recovery `nextActions` | [`docs/TOOL_CONTRACT.md`](docs/TOOL_CONTRACT.md#details), [`docs/COMMAND_REFERENCE.md`](docs/COMMAND_REFERENCE.md#core-page-and-element-commands), `extensions/agent-browser/lib/input-modes/semantic-action.ts`, `test/agent-browser.extension-input-modes.test.ts`, `test/agent-browser.extension-validation.test.ts` |
70
70
  | Recording workflows fail late when `ffmpeg` is missing | After successful `record start` / `record restart`, warns when `ffmpeg` is not on `PATH` so agents can install or fix PATH before `record stop` | [`docs/TOOL_CONTRACT.md`](docs/TOOL_CONTRACT.md#details), [`docs/COMMAND_REFERENCE.md`](docs/COMMAND_REFERENCE.md#diff-debug-and-streaming), `test/agent-browser.extension-validation.test.ts` |
@@ -157,6 +157,13 @@ Open a page and inspect it (first-call recipe: open → snapshot -i → interact
157
157
  { "args": ["snapshot", "-i"] }
158
158
  ```
159
159
 
160
+ Watch a browser window during a demo or QA run by adding upstream's global `--headed` flag on the first launch. Use `sessionMode: "fresh"` if a managed session may already exist, because headed/headless state is launch-scoped. A successful tool call means upstream opened a browser context; it does **not** prove the OS window is visible on the user's display, especially under remote, container, or virtual-display setups.
161
+
162
+ ```json
163
+ { "args": ["--headed", "open", "https://example.com"], "sessionMode": "fresh" }
164
+ { "args": ["screenshot", "/tmp/agent-browser-headed-check.png"] }
165
+ ```
166
+
160
167
  On `https://example.com/`, the main link label is **Learn more**—use exact visible text from your snapshot, not guessed copy such as `More information...`.
161
168
 
162
169
  Click a visible ref, then refresh refs after navigation or a DOM update:
@@ -174,7 +181,7 @@ Run a multi-step flow in one tool call:
174
181
 
175
182
  If the same `batch` stdin later uses `@e…` on interaction commands after a step that can navigate or mutate the page (`open`, `click`, `reload`, and similar), insert a `snapshot` step whose first argv token is `snapshot` (for example `["snapshot","-i"]`) between those phases. Multiple same-snapshot `fill @e…` steps may be batched before a click/submit step; dynamic or autosubmit forms should still use stable locators or split with a fresh snapshot. The wrapper rejects unsafe ordering with `failureCategory: "stale-ref"` before upstream runs; full rules are under `refSnapshot` in [`docs/TOOL_CONTRACT.md`](docs/TOOL_CONTRACT.md#details).
176
183
 
177
- Evaluate page JavaScript through stdin. Return the value you want as an expression; `eval --stdin` may warn with `details.evalStdinHint` when a function-shaped snippet serializes to `{}` instead of being invoked:
184
+ Evaluate page JavaScript through stdin. Put the script in the top-level `stdin` field, not as an extra `args` token after `--stdin`. Return the value you want as an expression; `eval --stdin` may warn with `details.evalStdinHint` when a function-shaped snippet serializes to `{}` instead of being invoked:
178
185
 
179
186
  ```json
180
187
  { "args": ["eval", "--stdin"], "stdin": "document.title" }
@@ -222,7 +229,7 @@ Typical pitfalls:
222
229
  - Do not reuse `@e…` refs across navigation. The wrapper records the latest snapshot refs per session and fails mutation-prone stale/recycled refs before upstream can silently hit a different current-page element; use the session-aware `refresh-interactive-refs` next action.
223
230
  - If upstream classifies the failure as `stale-ref` and `details.compiledSemanticAction` is present for a compiled `find` action, `details.nextActions` may list `retry-semantic-action-after-stale-ref` after `refresh-interactive-refs`, carrying the same compiled `find` argv so you can retry the locator-stable target once it is safe to do so. `select` calls that used stale `@refs` only get refresh guidance; use a fresh snapshot or stable selector before retrying (contract in [`docs/TOOL_CONTRACT.md#semanticaction`](docs/TOOL_CONTRACT.md#semanticaction)).
224
231
  - If the failure is `selector-not-found`, the wrapper may take one fresh snapshot and add `Current snapshot ref fallback` when that snapshot has exact visible role/name matches for the failed `find` / `semanticAction` target. Non-fill targets can include direct `try-current-visible-ref*` next actions, and semantic click misses can still add bounded `Agent-browser candidate fallbacks` such as `button`/`link` role retries for `text` clicks. For semantic `fill` misses on desktop or host-controlled rich inputs, prefer `details.richInputRecovery`: refresh refs, choose the current editable `@ref`, focus or click it, then use `keyboard inserttext` or `keyboard type` with the intended text. Those recovery nextActions do not copy the fill text and do not press `Enter` or submit; only submit when the user flow explicitly calls for it (same contract link).
225
- - A successful upstream `click` is not proof that the web app handled the event or changed state. For top-level non-Electron clicks, the wrapper may fail the tool with `details.clickDispatch` and a `Click dispatch diagnostic` line when upstream reported success but no trusted DOM event reached the target; use the suggested `inspect-click-dispatch-miss` / `retry-click-after-dispatch-miss` next actions instead of assuming the click mutated the page. When the task depends on a mutation, follow `inspect-after-mutation` / `pageChangeSummary` evidence with a wait, URL/text check, or fresh snapshot before trusting the result; if the target still did not change, retry with a current visible ref or stable selector and report the workflow issue instead of silently continuing. Preserve explicit user stop boundaries: if the user says to stop before order/post/purchase/submit, gather evidence on that page and do not click the final action. The wrapper now blocks likely final order/submit clicks under such prompts and reports `details.promptGuard` rather than trusting the model to self-police.
232
+ - A successful upstream `click` is not proof that the web app handled the event or changed state. For top-level non-Electron clicks, the wrapper may fail the tool with `details.clickDispatch` and a `Click dispatch diagnostic` line when upstream reported success but no trusted DOM event reached the target; use the suggested `inspect-click-dispatch-miss` / `retry-click-after-dispatch-miss` next actions instead of assuming the click mutated the page. When the task depends on a mutation, follow `inspect-after-mutation` / `pageChangeSummary` evidence with a wait, URL/text check, or fresh snapshot before trusting the result; if the target still did not change, retry with a current visible ref or stable selector and report the workflow issue instead of silently continuing. For static local fixtures where the user only needs to exercise app code, an explicit `eval --stdin` programmatic click such as `document.querySelector("#demo").click()` can be a diagnostic workaround, but treat it as an untrusted scripted activation rather than proof a real user click works, and never use it to bypass explicit stop-before-submit/order/purchase boundaries. Preserve explicit user stop boundaries: if the user says to stop before order/post/purchase/submit, gather evidence on that page and do not click the final action. The wrapper now blocks likely final order/submit clicks under such prompts and reports `details.promptGuard` rather than trusting the model to self-police.
226
233
  - If a **top-level** `click` succeeds (unified command `click`, not a `batch` step), upstream reports `data.clicked`, and the tab URL is unchanged under the same normalization as ref preflight (fragment-insensitive), the wrapper may take one extra `snapshot -i` and add `Possible overlay blockers` with `details.overlayBlockers` (`candidates`, `summary`, optional `snapshot` refresh for refs) plus session-aware `inspect-overlay-state` / bounded `try-overlay-blocker-candidate-*` next actions when that snapshot shows strong modal context (`dialog` / `alertdialog`) and close/dismiss-like controls. Page-wide words like privacy, sign in, or banner alone do not trigger this diagnostic. The unchanged-URL check uses `details.navigationSummary`, which is populated with one read-only `eval` summary when the click JSON omits **both** string `data.url` and `data.title`; if upstream already includes either, overlay diagnostics are skipped here. Also skipped when tab correction or about-blank recovery already ran on that result.
227
234
  - If `get text <selector>` reads a non-ref CSS selector with multiple matches or a hidden first match while visible matches exist, including successful `batch` steps, the wrapper may add `Selector text visibility warning`, `details.selectorTextVisibility` (plus `selectorTextVisibilityAll` for multiple batched warnings), and `inspect-visible-text-candidates` next actions; the warning names the matching `details.nextActions` id. Prefer a visible `@ref`, a scoped selector, or a targeted `eval --stdin` over hidden tab content.
228
235
  - In attached Electron sessions, broad selectors such as `body`, `html`, `main`, or `[role=application]` may read the whole app shell. The wrapper may add `Broad Electron get text selector warning`, `details.electronGetTextScopeWarning`, and `snapshot-for-electron-text-scope`; prefer `snapshot -i`, a current `@ref`, or a narrower panel selector.
@@ -495,7 +502,10 @@ The upstream browser engine remains [`agent-browser`](https://agent-browser.dev/
495
502
  - Published pre-1.0 package.
496
503
  - Targets the current locally installed upstream `agent-browser` version only.
497
504
  - Does not bundle `agent-browser`; users install it separately.
498
- - Does not provide a human browser UI inside Pi; the primary UX is agent-invoked tool calls.
505
+ - Does not provide a human browser UI inside Pi; the primary UX is agent-invoked tool calls. `--headed` asks upstream to show a browser window, but the wrapper cannot yet prove that the window is visible on the user's desktop.
506
+ - `localhost` means the browser host's loopback interface, which is not necessarily the shell/Pi host's. If `http://localhost:<port>` or `http://127.0.0.1:<port>` fails with errors such as `net::ERR_EMPTY_RESPONSE`, use a host-reachable address when available or a `file://` URL for static fixtures, then verify with `snapshot -i` or an explicit screenshot.
507
+ - `file://` pages are useful as a static fallback, but they can behave differently from HTTP pages for MIME types, CORS, storage, and script/debugger behavior. If `eval --stdin` returns `null` or otherwise cannot verify a `file://` page, treat that as inconclusive and use screenshot/snapshot evidence or move the fixture to reachable HTTP.
508
+ - A successful upstream `click` is not proof that the app handled the event. For state-changing flows, verify with a fresh snapshot, text/URL assertion, screenshot, or `pageChangeSummary` before reporting success.
499
509
  - Real authenticated profile use is powerful but sensitive. Treat profile and cookie access as user-approved, task-specific behavior.
500
510
  - Wrapper tab/session recovery is best effort around observed upstream behavior, not a replacement for explicit profile/session design.
501
511
 
@@ -91,6 +91,21 @@ Keep routine browser work simple: open a page, inspect it with `snapshot -i`, in
91
91
  { "args": ["snapshot", "-i"] }
92
92
  ```
93
93
 
94
+ ### Headed demo and local-page checks
95
+
96
+ Use upstream's global `--headed` flag on the first launch when the user needs to watch the browser. Because headed/headless state belongs to the browser launch, use `sessionMode: "fresh"` when a managed session may already exist or when changing from a previous headless run.
97
+
98
+ ```json
99
+ { "args": ["--headed", "open", "https://example.com"], "sessionMode": "fresh" }
100
+ { "args": ["screenshot", "/tmp/agent-browser-headed-check.png"] }
101
+ ```
102
+
103
+ Treat headed success as browser-context success, not proof that a window is visible on the user's display. Remote shells, containers, virtual framebuffers, or upstream/provider-owned browser hosts can still put the visible window somewhere the user cannot see. If a user reports no window, gather evidence with `screenshot`, `tab list`, `get url`, or `snapshot -i`; then relaunch with the right display/profile/provider setup rather than assuming the user missed it.
104
+
105
+ For local fixtures, remember that `localhost` and `127.0.0.1` are resolved from the browser host, which may differ from the shell that started a temporary HTTP server. `net::ERR_EMPTY_RESPONSE` on `http://localhost:<port>` usually means the browser could not reach that server, not that the page itself rendered blank; the wrapper appends a local fixture hint for common loopback navigation failures. Prefer a host-reachable address when your environment provides one; otherwise use `file://` only for static fixtures and note its limits. `file://` does not provide HTTP headers and may change MIME/CORS/storage/debugger behavior. If `eval --stdin` on a `file://` page returns `null` for even simple DOM expressions, first make sure the JavaScript is in the native tool `stdin` field rather than trailing after `--stdin` in `args`; then treat the result as inconclusive and verify with `snapshot -i`, `get text` on current refs, or screenshots until the fixture can run over reachable HTTP.
106
+
107
+ Temporary HTTP servers and their port/process lifecycle stay outside the native tool. Extension maintainers running real-upstream contract tests can reuse `startAgentBrowserContractFixtureServer()` in [`test/helpers/agent-browser-harness.ts`](../test/helpers/agent-browser-harness.ts) instead of ad-hoc `python3 -m http.server` processes.
108
+
94
109
  ### React, SPA, and Web Vitals flows
95
110
 
96
111
  React introspection requires the React DevTools init hook to be installed before the page's first JavaScript runs. Launch or relaunch that browser session with `--enable react-devtools`; if the implicit session is already active, use `sessionMode: "fresh"`.
@@ -153,7 +168,7 @@ Do not assume Playwright selector dialects such as `text=Close` or `button:has-t
153
168
 
154
169
  Treat `@e…` refs as page-scoped. After a successful `snapshot`, the wrapper records the latest refs and page target for that session; mutation-prone ref commands such as `click @e4`, `select @e5 chocolate`, or batch steps with old refs fail with `failureCategory: "stale-ref"` when the page target changed or the ref is absent from the latest same-page snapshot. If a session `snapshot -i` fails with `No active page`, the wrapper invalidates prior refs for that session; later mutation-prone `@e…` calls fail before upstream until a successful fresh `snapshot -i` records refs again. Inside `batch` stdin JSON, the wrapper also walks steps in order before spawn: steps whose first token can navigate or mutate set a latch; a later step whose first token is `snapshot` clears that latch for following rows; guarded steps that still mention `@e…` after an uncleared latch fail with the same `stale-ref` bucket without launching upstream. Same-snapshot form fills are allowed before a click or submit step, so a login-style `fill`, `fill`, `click` batch can run from one snapshot; split dynamic or autosubmit forms with a fresh snapshot if a fill itself rerenders the targets. Follow the `refresh-interactive-refs` next action (it includes `--session <name>` when needed) and prefer stable `find` or `semanticAction` locators when navigation or rerendering is likely. Contract detail: [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#details) (`refSnapshot`, `refSnapshotInvalidation`).
155
170
 
156
- A successful `click` result means upstream reported a target, not that the app definitely handled the event. For top-level non-Electron clicks, the wrapper installs a bounded DOM-event probe; when upstream reports success but no trusted event reaches the target, it fails the tool and exposes `details.clickDispatch` plus a `Click dispatch diagnostic` line with explicit retry/inspect next actions (no in-page click replay). When the workflow depends on a mutation, use `details.pageChangeSummary`, a wait, URL/text extraction, or a fresh `snapshot -i` before trusting the state; if nothing changed, retry with a current visible ref or stable selector and report the workflow issue. Preserve explicit user stop boundaries: if the user says to stop before a final order, post, purchase, or submit action, gather evidence from that page and do not click the final action. The wrapper also blocks likely final order/submit click targets under those prompts and returns `details.promptGuard` with `failureCategory: "policy-blocked"`.
171
+ A successful `click` result means upstream reported a target, not that the app definitely handled the event. For top-level non-Electron clicks, the wrapper installs a bounded DOM-event probe; when upstream reports success but no trusted event reaches the target, it fails the tool and exposes `details.clickDispatch` plus a `Click dispatch diagnostic` line with explicit retry/inspect next actions (no in-page click replay). When the workflow depends on a mutation, use `details.pageChangeSummary`, a wait, URL/text extraction, or a fresh `snapshot -i` before trusting the state; if nothing changed, retry with a current visible ref or stable selector and report the workflow issue. For static local fixtures or debugging where the user explicitly accepts scripted activation, `eval --stdin` can call `document.querySelector(...).click()` to exercise inline handlers and app code; treat that as an untrusted programmatic event, not as evidence that CDP/user-like clicking works. Preserve explicit user stop boundaries: if the user says to stop before a final order, post, purchase, or submit action, gather evidence from that page and do not click the final action or use scripted activation to bypass the stop. The wrapper also blocks likely final order/submit click targets under those prompts and returns `details.promptGuard` with `failureCategory: "policy-blocked"`.
157
172
 
158
173
  When a **top-level** `click` succeeds (not a `click` hidden inside a `batch`/`job` tool call—the unified command must be `click`), the upstream payload includes `data.clicked`, no `details.clickDispatch` diagnostic fired for the same result, and the wrapper sees the active tab URL unchanged after the same normalization it uses for ref guards (**`#fragment` ignored**), it may run one extra `snapshot -i` and surface `Possible overlay blockers` plus `details.overlayBlockers` (`candidates`, `summary`, and a `snapshot` map that can refresh `refSnapshot`) when that snapshot shows strong modal context (`dialog` / `alertdialog`) **and** up to three close/dismiss-like controls; page-wide words such as privacy, sign in, or banner alone do not trigger it. The URL check compares the session’s prior pinned tab target to `details.navigationSummary.url` after the click; that summary is gathered with one read-only `eval` when the click JSON omits **both** string `data.url` and `data.title`—if upstream already echoes either field, overlay diagnostics are skipped on this path. The diagnostic is skipped if the wrapper already applied tab-focus correction or about-blank recovery on that result. Appended `inspect-overlay-state` / `try-overlay-blocker-candidate-*` entries in `details.nextActions` include `--session <name>` when the session is named, same as other session-scoped follow-ups. Treat `inspect-overlay-state` as the safe first follow-up; only use a `try-overlay-blocker-candidate-*` next action when the candidate is clearly the control you intend to close.
159
174
 
@@ -174,9 +189,9 @@ When you already know several visible refs or selectors, extract them in one `ba
174
189
 
175
190
  Prefer `get` and scoped `eval --stdin` for read-only extraction. Getter names are grouped under `get`: use `get title`, `get url`, or `get text <selector>`, not shortcut commands such as `title` or `url`. When upstream reports an unknown command, unknown subcommand, or unrecognized command for a single-token shortcut (`attr`, `count`, `html`, `text`, `title`, `url`, or `value`), the wrapper adds a visible grouped-`get` hint; only `title` and `url` also get exact read-only `details.nextActions` (`use-get-title` / `use-get-url`, with `--session` preserved when the failed call named a session). If another `Agent-browser hint:` (selector dialect or stale-ref recovery) was already appended to the same error text, the getter hint is omitted.
176
191
 
177
- Return the intended JavaScript value from `eval --stdin` instead of relying on `console.log`. For object-shaped extraction, pass a plain expression such as `({ title: document.title, url: location.href })`; if you send a function-shaped snippet, invoke it explicitly, for example `(() => ({ title: document.title }))()`. When upstream serializes a function result to `{}`, the wrapper can append `Eval stdin hint` and `details.evalStdinHint`.
192
+ Return the intended JavaScript value from `eval --stdin` instead of relying on `console.log`. In the native pi tool, the JavaScript belongs in the top-level `stdin` field; do **not** write it as a third `args` item such as `{ "args": ["eval", "--stdin", "document.title"] }`. The wrapper tolerates that common misplaced form by moving the trailing token to stdin before spawn, but the explicit `stdin` field is the documented form and avoids ambiguity for multiline snippets. For object-shaped extraction, pass a plain expression such as `({ title: document.title, url: location.href })`; if you send a function-shaped snippet, invoke it explicitly, for example `(() => ({ title: document.title }))()`. When upstream serializes a function result to `{}`, the wrapper can append `Eval stdin hint` and `details.evalStdinHint`.
178
193
 
179
- On tabbed or hidden-DOM pages, `get text <selector>` reads the upstream-selected match, which may be hidden even when a later match is visible. For non-`@ref` CSS selectors with multiple matches, including successful `batch` steps, the wrapper may add `Selector text visibility warning`, `details.selectorTextVisibility` (and `details.selectorTextVisibilityAll` for multiple batched warnings), and `inspect-visible-text-candidates` next actions. The warning names the matching `details.nextActions` id so agents know to use a fresher `snapshot -i`, a visible `@ref`, or a more specific selector instead of trusting hidden tab content.
194
+ On tabbed or hidden-DOM pages, `get text <selector>` reads the upstream-selected match, which may be hidden even when a later match is visible. For non-`@ref` CSS selectors with multiple matches, including successful `batch` steps, the wrapper may add `Selector text visibility warning`, `details.selectorTextVisibility` (and `details.selectorTextVisibilityAll` for multiple batched warnings), and `inspect-visible-text-candidates` next actions. The warning names the matching `details.nextActions` id so agents know to use a fresher `snapshot -i`, a visible `@ref`, or a more specific selector instead of trusting hidden tab content. If the probe still leaves multiple visible candidates, do not keep reading the broad selector; switch to a current visible `@ref`, add a narrower selector such as a known panel/container id, or use a targeted `eval --stdin` expression that filters for visible elements and returns the intended index/text.
180
195
 
181
196
  ### Run a multi-step flow in one browser invocation
182
197
 
@@ -671,7 +686,7 @@ When these commands are invoked through the native `agent_browser` tool, structu
671
686
  - `--proxy-bypass <hosts>`: proxy bypass hosts. Environments: `AGENT_BROWSER_PROXY_BYPASS`, `NO_PROXY`.
672
687
  - `--ignore-https-errors`: ignore HTTPS certificate errors. Environment: `AGENT_BROWSER_IGNORE_HTTPS_ERRORS`.
673
688
  - `--allow-file-access`: allow `file://` URLs to access local files. Environment: `AGENT_BROWSER_ALLOW_FILE_ACCESS`.
674
- - `--headed`: show the browser window. Environment: `AGENT_BROWSER_HEADED`.
689
+ - `--headed`: ask upstream to show the browser window. Environment: `AGENT_BROWSER_HEADED`. Pass it on the first launch of a session; if a managed session already exists, add `sessionMode: "fresh"` so the launch-scoped headed choice is not ignored. Verify visibility with screenshot/tab evidence because the wrapper cannot yet prove the OS window is visible to the user.
675
690
  - `--cdp <port>`: connect through Chrome DevTools Protocol.
676
691
  - `--color-scheme <scheme>`: `dark`, `light`, or `no-preference`. Environment: `AGENT_BROWSER_COLOR_SCHEME`.
677
692
  - `--download-path <path>`: default browser download directory. Environment: `AGENT_BROWSER_DOWNLOAD_PATH`.
@@ -31,20 +31,40 @@ When upstream ships a new `agent-browser` or the inventory changes:
31
31
  - High-priority support gaps: 2026-05-26 audit found sessionless local commands and command-scoped value flags needed sharper wrapper handling; runtime/tests/docs now cover those paths. Remaining upstream-owned caveat: `agent-browser 0.27.0` help mentions `wait <selector> --state hidden`, but source parsing does not implement that distinct wait mode, so wrapper docs steer agents to `wait --fn` predicates.
32
32
  - Post-`v0.2.29` review state: commits `eb55320` through `86abbfb` add browser guidance/smoke coverage plus `RQ-0086` click-probe reduction, `RQ-0087` same-snapshot form fill batching, `RQ-0088` current-ref fallback on locator misses, `RQ-0089` direct-upstream click mutation investigation, and `RQ-0090` stop-boundary/artifact-path guidance. Verification gates below were rerun on 2026-05-18 after those tasks landed. Constrained `job` (`RQ-0064`), the lightweight `qa` preset (`RQ-0065`), the experimental `sourceLookup` helper (`RQ-0066`), and the experimental `networkSourceLookup` helper (`RQ-0067`) are implemented; see [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#job), [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#qa), [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#sourcelookup), and [`TOOL_CONTRACT.md`](TOOL_CONTRACT.md#networksourcelookup). Reusable browser recipes (`RQ-0068`) are intentionally not adopted as a runtime surface; see [`ARCHITECTURE.md`](ARCHITECTURE.md#no-reusable-recipe-layer-yet).
33
33
 
34
+ ## Open UX/reliability follow-ups from 2026-05-29 agent feedback
35
+
36
+ Phase 1 triage (2026-05-29): IDs **RQ-0110–RQ-0117** track the first feedback batch. Second/third-round follow-up adds **RQ-0118–RQ-0120**. **Do not reuse RQ-0101** here—that id is already shipped for compact-snapshot high-value controls (see closure section below).
37
+
38
+ These rows track this feedback batch. Some rows are docs-only or environment-owned; rows marked shipped have code/tests in this change but still need release-gate evidence before being treated as release closure.
39
+
40
+ | ID | Feedback | Owner | Phase 1 classification | Evidence (2026-05-29, `agent-browser 0.27.0` on maintainer macOS unless noted) | Next implementation action | Likely files / tests |
41
+ | --- | --- | --- | --- | --- | --- | --- |
42
+ | RQ-0110 | Headed demos are hard to discover and hard to verify. | Wrapper + upstream (`--headed`) | **docs/playbook-mitigated** (`README`, `TOOL_CONTRACT`, `COMMAND_REFERENCE`, generated playbook guidance); visibility proof **out-of-scope/host-owned** until upstream exposes a portable signal. | `--headed open https://example.com` succeeds with JSON success; no upstream field proves an OS window is visible. Docs/playbook now document `sessionMode: "fresh"` and screenshot/tab/get-url evidence. | No further wrapper action planned for this batch without an upstream/OS portable visibility signal. | `extensions/agent-browser/lib/playbook.ts` (`npm run docs -- playbook write`), README, `docs/COMMAND_REFERENCE.md`, `docs/TOOL_CONTRACT.md`. |
43
+ | RQ-0111 | Local `localhost` / `127.0.0.1` fixture servers can fail with `ERR_EMPTY_RESPONSE` from the browser host. | Environment + upstream (navigation) | **docs-mitigated** (loopback host mismatch + `ERR_EMPTY_RESPONSE` meaning); browser-host reachability remains **environment-owned**. | Reproduced loopback navigation failures on 2026-05-29 maintainer macOS: accept-then-close without HTTP can surface as `net::ERR_EMPTY_RESPONSE` or `net::ERR_SOCKET_NOT_CONNECTED`; nothing listening yields `net::ERR_CONNECTION_REFUSED`. Same-machine `python3 -m http.server` (or harness `SimpleHTTPRequestHandler`) + `open http://127.0.0.1:<port>/fixture.html` succeeds. `npm run verify -- real-upstream` already uses localhost fixtures successfully on this host. | No wrapper server manager or classifier in this batch: failures are not specific enough to prove browser-host loopback mismatch. Keep guidance on host-reachable addresses, `file://` static fallback, and harness-owned servers. | README, `docs/COMMAND_REFERENCE.md`, `docs/TOOL_CONTRACT.md`, `test/helpers/agent-browser-harness.ts`. |
44
+ | RQ-0112 | `eval --stdin` can silently return `null` on `file://` pages, blocking DOM verification. | Upstream (eval channel) + wrapper (warning UX) | **docs-mitigated** (treat `file://` null as inconclusive); **wrapper-owned shipped** (`details.evalResultWarning` + visible `Eval result warning` on `file:` + `result === null`; upstream null channel remains environment/upstream-owned). | Reproduced on `file://` fixture: expressions `null`, `undefined`, `(() => null)()`, and missing-element queries return `"success":true` with `"result":null`; `JSON.stringify(null)` returns the string `"null"`. Simple DOM reads (`document.getElementById(...).textContent`) return real values on the same page. Focused fake coverage asserts the warning without failing the tool. | No further wrapper action planned unless real upstream exposes a richer error. Keep release validation focused on the non-failing warning and redaction-safe visible copy. | `extensions/agent-browser/lib/orchestration/browser-run/diagnostics.ts`, `process-output.ts`, `final-result.ts`, `types.ts`, `docs/TOOL_CONTRACT.md`, `test/agent-browser.extension-errors-artifacts.test.ts`. |
45
+ | RQ-0113 | A successful click may not lead to the expected DOM mutation. | Wrapper + upstream (click semantics) | **docs-mitigated / existing-runtime-mitigated** by `RQ-0089` click-dispatch, `RQ-0073` overlay diagnostics, and stronger verification guidance; arbitrary app no-op handlers remain **app/upstream semantics**, not proof of wrapper failure. | Direct upstream can correctly report `click #noop` success while app state stays unchanged; `click #mutate` updates DOM. This shows click success is target activation evidence, not expected-state proof. Wrapper already probes missing trusted DOM events and overlay blockers, but cannot infer arbitrary expected mutations without task-specific assertions. | No additional generic post-click probe in this batch to avoid false positives. Use task-specific verification (`snapshot`, `wait --text`, `assertText`, screenshot, `pageChangeSummary`) after state-changing clicks. | Existing `clickDispatch`/overlay tests plus README / `docs/COMMAND_REFERENCE.md` verification guidance. |
46
+ | RQ-0114 | `get text` selector ambiguity remains hard to resolve when several matches are visible. | Wrapper + upstream (first-match `get text`) | **wrapper-owned shipped** (`visibleCandidates` on selector probe + visible previews); first-match behavior remains upstream semantics. `RQ-0074` warning path already shipped. | Upstream CLI: `get text ".item"` with two visible matches returns only `Alpha`. Wrapper `RQ-0074` already warns when `matchCount > 1` (including all-visible cases) and now exposes bounded visible candidate previews/indexes for safer narrowing. | No further wrapper action planned for this batch. Future improvement: derive safe selector suggestions only if redaction rules can keep them non-sensitive. | `extensions/agent-browser/lib/orchestration/browser-run/diagnostics.ts`, `test/agent-browser.extension-errors-artifacts.test.ts`, `docs/TOOL_CONTRACT.md`. |
47
+ | RQ-0115 | Temporary local HTTP server port management is manual and leaked processes block later runs. | Environment (host/process lifecycle) | **out-of-scope/host-owned** (no fixture-server runtime per architecture); **docs-mitigated** (harness pointer in `COMMAND_REFERENCE`). | By design outside `agent_browser` per architecture no-recipe policy. Repo test harness already exposes `startAgentBrowserContractFixtureServer()` for deterministic localhost pages; leaked `python3 -m http.server` / Node listeners are operator or CI cleanup. Phase 1 added a maintainer pointer from [`COMMAND_REFERENCE.md`](COMMAND_REFERENCE.md#headed-demo-and-local-page-checks) to the harness. | Phase 2: **no wrapper server manager** without new design evidence. Parent decision if a separate npm script (`verify` helper) is wanted—out of scope for thin integration. | `test/helpers/agent-browser-harness.ts`, `docs/COMMAND_REFERENCE.md`, `docs/ARCHITECTURE.md` (if explicit anti-scope note is needed). |
48
+ | RQ-0116 | Fresh-session failure prose is opaque and exposes internal generated session ids without clear recovery. | Wrapper | **wrapper-owned shipped** (action-oriented visible recovery + `nextActions`; `attemptedSessionName` remains in `details`). Struct + visible line already exist (`RQ-0077`). | `buildManagedSessionOutcome` still keeps full generated-session transition details in `details.managedSessionOutcome`, while visible failure prose now summarizes preserved/abandoned/replaced outcomes without repeating generated ids. Focused fake coverage covers preserved, missing-binary, abandoned, and QA-reclassification paths. | No further wrapper action planned for this batch unless reviewer finds recovery actions unsafe or insufficient. | `extensions/agent-browser/lib/orchestration/browser-run/session-state.ts`, `final-result.ts`, `docs/TOOL_CONTRACT.md`, `test/agent-browser.extension-errors-artifacts.test.ts`, `test/agent-browser.extension-input-modes.test.ts`. |
49
+ | RQ-0117 | There is no machine-readable confirmation that headed mode is visible to the user. | Wrapper gap + environment (display) | **documented unsupported** for this batch; true OS visibility is **out-of-scope/host-owned** until upstream exposes a portable signal. Pairs with RQ-0110. | Same root cause as RQ-0110: no portable upstream/wrapper field observed. Headed launch success is not visibility proof, and adding a constant `details.headedVisibility: "unsupported"` would add noise without a decision signal. | No runtime field in this batch. Keep the explicit contract limitation and independent screenshot/tab/get-url evidence guidance. | README, `docs/TOOL_CONTRACT.md`, `docs/COMMAND_REFERENCE.md`, generated playbook guidance. |
50
+ | RQ-0118 | Second-round report says every `eval --stdin` expression on `file://` returned `null`, including `1 + 1` and `document.title`. | Wrapper UX + caller-shape recovery | **wrapper-owned shipped in follow-up branch**: direct upstream and native-tool checks on maintainer macOS show `eval --stdin` works on `file://` when the script is supplied through the top-level native tool `stdin` field. The reported all-null behavior is reproduced by the malformed native-tool shape `args: ["eval", "--stdin", "document.title"]` with no top-level `stdin`, which upstream treats as empty stdin and returns `null`. | Direct probes: `agent-browser --session ... open file:///tmp/page.html` then `printf '1+1' \| agent-browser --session ... eval --stdin` returns `2`; native tool `{ args: ["eval", "--stdin"], stdin: "document.title" }` returns the fixture title; native tool `{ args: ["eval", "--stdin", "1+1"] }` reproduced `result: null` before normalization. | Normalize the common malformed native-tool call by moving trailing args after `--stdin` into process stdin before launch; keep docs/playbook explicit that top-level `stdin` is canonical. | `extensions/agent-browser/lib/orchestration/input-plan.ts`, `test/agent-browser.extension-errors-artifacts.test.ts`, README, `docs/COMMAND_REFERENCE.md`, `docs/TOOL_CONTRACT.md`, generated playbook guidance. |
51
+ | RQ-0119 | Second-round localhost failures still show `ERR_EMPTY_RESPONSE` even when shell `curl` succeeds. | Environment + wrapper diagnostics | **diagnostic-mitigated**: direct maintainer repro shows localhost HTTP succeeds with a normal same-host Python server, so the wrapper still cannot prove or bridge an environment-specific browser-host namespace/proxy mismatch. Add error presentation guidance specifically for loopback navigation failures so agents do not misread `ERR_EMPTY_RESPONSE` as blank page content. | Direct probe: `python3 -m http.server --bind 127.0.0.1 8766` + `agent-browser open http://127.0.0.1:8766/page.html` succeeds; previous first-batch evidence still shows accept-then-close servers can produce `ERR_EMPTY_RESPONSE`. | Append a local fixture hint on loopback `open`/navigation failures with `net::ERR_EMPTY_RESPONSE`, `ERR_CONNECTION_REFUSED`, `ERR_ADDRESS_UNREACHABLE`, `ERR_TIMED_OUT`, or `ERR_CONNECTION_RESET`; do not add server lifecycle management in the native browser tool. | `extensions/agent-browser/lib/results/presentation/errors.ts`, `test/agent-browser.presentation-skills-recovery.test.ts`, `docs/COMMAND_REFERENCE.md`, `docs/TOOL_CONTRACT.md`. |
52
+ | RQ-0120 | Third-round report says ref/semantic clicks can report success while inline `onclick="…"` handlers do not run, though programmatic `.click()` does. | Wrapper diagnostics + upstream/browser hit testing | **diagnostic-mitigated in follow-up branch**: simple direct upstream probes show inline `onclick` handlers fire for selector and `@ref` clicks on file pages, so the reported case is likely a hit-target/overlay/ref-resolution miss rather than inline attributes generally. Extend the click-dispatch probe to `@e…` refs using the latest snapshot role/name metadata so ref or semanticAction→ref clicks that never deliver a trusted event to the intended element fail with `details.clickDispatch` instead of silently reporting success. Fourth-round external testing confirmed this diagnostic now catches the failure. | Direct probe: minimal `<button onclick="showGraph('rps')">` fixture updates DOM via selector click, `@e1` click, and programmatic `.click()`. Existing wrapper probe covered CSS/XPath only; semantic visible-ref resolution and raw `@e…` clicks skipped dispatch diagnostics. Follow-up tester confirmed programmatic `.click()` remains a useful static-fixture workaround when CDP/user-like click dispatch fails. | Probe standalone `click @e…` when the latest snapshot maps that ref to a unique visible role/name DOM candidate; keep the no-in-page-replay policy. Document programmatic `eval --stdin` `.click()` as an explicit debugging/static-fixture workaround only, not proof of real user click behavior and not a way around stop boundaries. | `extensions/agent-browser/lib/orchestration/browser-run/click-dispatch.ts`, `types.ts`, `prepare.ts`, `test/agent-browser.extension-click-dispatch.test.ts`, README, `docs/COMMAND_REFERENCE.md`, `docs/TOOL_CONTRACT.md`. |
53
+
34
54
  ## Verification evidence
35
55
 
36
56
  Re-run the gates below before each release; this table records what the closure audit exercised.
37
57
 
38
58
  | Gate | Evidence | Status |
39
59
  | --- | --- | --- |
40
- | Default local gate | `npm run verify` checks generated playbook drift, `tsc --noEmit`, unit/fake tests, generated command-reference blocks, and live command-reference sampling. | Pass on 2026-05-27 (`npm run verify`, `agent-browser 0.27.0` on `PATH`). |
41
- | Real upstream contract | `npm run verify -- real-upstream` runs the localhost fixture matrix against the real installed `agent-browser` matching the baseline. | Pass on 2026-05-27 (`npm run verify -- real-upstream`, `agent-browser 0.27.0` on `PATH`). |
42
- | Packaged Pi smoke | `npm run verify -- package-pi` validates package contents, loads exactly one packaged `agent_browser` tool, and executes fake-upstream `--version`. | Pass on 2026-05-27 (`npm run verify -- package-pi`). |
43
- | Deterministic dogfood smoke | `npm run verify -- dogfood` (`scripts/verify-agent-browser-dogfood.ts`) drives the native wrapper against public `example.com` through top-level `qa`, `semanticAction`, `qa.attached`, constrained `job`, screenshot artifact verification, and session close with the real `agent-browser` on `PATH`. | Pass on 2026-05-24 (`npm run verify -- dogfood --artifact-dir /tmp/pi-agent-browser-release-dogfood --json`; artifacts removed). |
44
- | Efficiency benchmark | `npm run verify -- benchmark` runs deterministic browser workflow accounting plus focused benchmark tests, including JSONL sampling fixtures and job/qa/sourceLookup/networkSourceLookup/Electron scenario coverage. | Pass on 2026-05-24 (`npm run verify -- benchmark`). |
45
- | `verify -- release` / `prepublishOnly` | `npm run verify -- release` chains the default gate with packaged Pi smoke (`verifySteps` `release` in [`scripts/project.mjs`](../scripts/project.mjs)). `package.json` `prepublishOnly` runs that compose before `npm pack --dry-run` during `npm publish`. It intentionally omits lifecycle, real-upstream, dogfood, and benchmark modes—see [`RELEASE.md`](RELEASE.md#pre-release-checks). | Pass on 2026-05-24 (`npm run verify -- release`). `prepublishOnly` still needs a fresh run during actual publish. |
46
- | Configured-source lifecycle | `npm run verify -- lifecycle` (`scripts/verify-lifecycle.mjs`) drives `/reload`, closes and relaunches Pi with the same exact `--session-id`, checks the JSONL session header id, session continuity, slash-command sentinel tokens (`v1` then `v2` after rewriting the packaged extension to simulate pickup), persisted spill reachability, and real Pi `tool_result` failure-patch semantics for a QA reclassification with a fake upstream on `PATH`. Default Pi model is `zai/glm-5.1`; default per-step wait is **180000 ms** (`DEFAULT_TIMEOUT_MS`); override model with `--model <id>` and waits with `--timeout-ms <ms>`. Passthrough flags in [`scripts/project.mjs`](../scripts/project.mjs): `--keep-artifacts`, `--model`, `--verbose`, and `--timeout-ms` plus a value (for example `npm run verify -- lifecycle --model openai-codex/gpt-5.5:minimal --keep-artifacts --verbose --timeout-ms 600000`). | Pass on 2026-05-27 (`npm run verify -- lifecycle --timeout-ms 300000`). Treat any future unexplained red lifecycle gate as a release blocker. |
47
- | Quick isolated Pi smoke | `pi --no-extensions --no-skills -e . --tools agent_browser` from repo root; native `agent_browser` only. | Pass on 2026-05-27 for an interactive tmux checkout smoke (`pi --no-extensions --no-skills -e . --session-dir <temp> --model zai/glm-5.1`): opened `https://example.com` with `sessionMode: "fresh"`, ran `snapshot -i`, verified `Example Domain`, closed the browser session, reported PASS, and removed the temp session dir/tmux session. Broader historical coverage also includes version/help/skills, open/snapshot/click, eval stdin, batch stdin, screenshot, explicit session, `sessionMode: "fresh"`, network requests, console/errors, diff snapshot, stream status/disable, dashboard start/stop, and chat credential-failure pass-through during RQ-0055. |
60
+ | Default local gate | `npm run verify` checks generated playbook drift, `tsc --noEmit`, unit/fake tests, generated command-reference blocks, and live command-reference sampling. | Pass on 2026-05-29 (`npm run verify`, `agent-browser 0.27.0` on `PATH`). |
61
+ | Real upstream contract | `npm run verify -- real-upstream` runs the localhost fixture matrix against the real installed `agent-browser` matching the baseline. | Pass on 2026-05-29 (`npm run verify -- real-upstream`, `agent-browser 0.27.0` on `PATH`). |
62
+ | Packaged Pi smoke | `npm run verify -- package-pi` validates package contents, loads exactly one packaged `agent_browser` tool, and executes fake-upstream `--version`. | Pass on 2026-05-29 (`npm run verify -- package-pi`). |
63
+ | Deterministic dogfood smoke | `npm run verify -- dogfood` (`scripts/verify-agent-browser-dogfood.ts`) drives the native wrapper against public `example.com` through top-level `qa`, `semanticAction`, `qa.attached`, constrained `job`, screenshot artifact verification, and session close with the real `agent-browser` on `PATH`. | Pass on 2026-05-29 (`npm run verify -- dogfood`; artifacts cleaned by the harness). |
64
+ | Efficiency benchmark | `npm run verify -- benchmark` runs deterministic browser workflow accounting plus focused benchmark tests, including JSONL sampling fixtures and job/qa/sourceLookup/networkSourceLookup/Electron scenario coverage. | Pass on 2026-05-29 (`npm run verify -- benchmark`). |
65
+ | `verify -- release` / `prepublishOnly` | `npm run verify -- release` chains the default gate with packaged Pi smoke (`verifySteps` `release` in [`scripts/project.mjs`](../scripts/project.mjs)). `package.json` `prepublishOnly` runs that compose before `npm pack --dry-run` during `npm publish`. It intentionally omits lifecycle, real-upstream, dogfood, and benchmark modes—see [`RELEASE.md`](RELEASE.md#pre-release-checks). | Pass on 2026-05-29 (`npm run verify -- release`). `prepublishOnly` will rerun this during `npm publish`. |
66
+ | Configured-source lifecycle | `npm run verify -- lifecycle` (`scripts/verify-lifecycle.mjs`) drives `/reload`, closes and relaunches Pi with the same exact `--session-id`, checks the JSONL session header id, session continuity, slash-command sentinel tokens (`v1` then `v2` after rewriting the packaged extension to simulate pickup), persisted spill reachability, and real Pi `tool_result` failure-patch semantics for a QA reclassification with a fake upstream on `PATH`. Default Pi model is `zai/glm-5.1`; default per-step wait is **180000 ms** (`DEFAULT_TIMEOUT_MS`); override model with `--model <id>` and waits with `--timeout-ms <ms>`. Passthrough flags in [`scripts/project.mjs`](../scripts/project.mjs): `--keep-artifacts`, `--model`, `--verbose`, and `--timeout-ms` plus a value (for example `npm run verify -- lifecycle --model openai-codex/gpt-5.5:minimal --keep-artifacts --verbose --timeout-ms 600000`). | Pass on 2026-05-29 (`npm run verify -- lifecycle`). Treat any future unexplained red lifecycle gate as a release blocker. |
67
+ | Quick isolated Pi smoke | `pi --no-extensions --no-skills -e . --tools agent_browser` from repo root; native `agent_browser` only. | Pass on 2026-05-29 for an interactive tmux checkout smoke (`pi --no-extensions --no-skills -e . --session-dir <temp> --model zai/glm-5.1`): prompted native `agent_browser --version`, verified `agent-browser 0.27.0`, reported PASS, and removed the temp session dir/tmux session. Broader historical coverage also includes version/help/skills, open/snapshot/click, eval stdin, batch stdin, screenshot, explicit session, `sessionMode: "fresh"`, network requests, console/errors, diff snapshot, stream status/disable, dashboard start/stop, and chat credential-failure pass-through during RQ-0055. |
48
68
 
49
69
  ## Baseline checklist by inventory section
50
70
 
@@ -60,6 +60,14 @@ For link and button text, use the **exact** visible label from the latest `snaps
60
60
 
61
61
  The extension always plans normal browser commands with `--json` prepended in `effectiveArgs` so upstream returns structured JSON for presentation and `details`. **Do not** include `--json` in caller `args`; it is unnecessary and can confuse planning or transcript hooks that treat caller-requested JSON differently. Plain-text inspection (`--help`, `--version`) keeps its own output shape. Read-only skills and local/setup commands such as `skills list` / `skills get` / `skills path`, local auth profile management (`auth save/list/show/delete/remove`), `profiles`, `dashboard`, `device list`, `doctor`, `install`, `upgrade`, `session list`, and targeted/all local saved-state maintenance including `state clear --all`, `state clear -a`, and named `state clear <session-name>` skip implicit session injection as documented under `sessionMode`.
62
62
 
63
+ ## Headed and local fixture limits
64
+
65
+ - `--headed` is an upstream global flag passed through `args` (for example `{ "args": ["--headed", "open", "https://example.com"], "sessionMode": "fresh" }`). Use it on the first launch for demos or human-observed QA. If a managed browser session already exists, use `sessionMode: "fresh"` so the launch-scoped headed/headless choice is not ignored.
66
+ - A successful headed call proves only that upstream accepted and ran the browser command. The wrapper currently has no portable contract field that proves the OS window is visible on the user's desktop. When visibility matters, collect independent evidence such as `screenshot`, `tab list`, `get url`, or `snapshot -i`, and treat “user cannot see the browser” as a display/provider/session setup issue until proven otherwise.
67
+ - `localhost` / `127.0.0.1` URLs are resolved by the browser host, which may differ from the shell or Pi process that started a temporary server. Errors such as `net::ERR_EMPTY_RESPONSE` on local ports are not reliable page-render evidence; they can mean the browser cannot reach the host loopback. Use an environment-specific host-reachable address when available, or fall back to `file://` only for static fixtures.
68
+ - `file://` pages do not provide HTTP headers and can differ from HTTP pages for MIME handling, CORS, storage, and debugger/script behavior. If `eval --stdin` returns `null` or otherwise fails to prove DOM state on a `file://` page, first confirm the script was passed through the native tool `stdin` field (not as a third `args` item after `--stdin`), then treat that verification as inconclusive and use `snapshot -i`, `get text` from current refs, screenshots, or a reachable HTTP fixture instead.
69
+ - Temporary HTTP servers launched outside the tool are host-owned. The native tool does not allocate ports, track background server PIDs, or clean them up; use a harness or shell cleanup for those processes.
70
+
63
71
  <!-- agent-browser-playbook:start shared-guidelines -->
64
72
  <!-- Generated from extensions/agent-browser/lib/playbook.ts. Run `npm run docs -- playbook write` to update. -->
65
73
  - Standard workflow: open the page, snapshot -i, interact using current @refs from that snapshot, and re-snapshot after navigation, scrolling, rerendering, or other major DOM changes because refs are page-scoped; the wrapper fails mutation-prone stale/recycled refs before upstream can silently target a different current-page element.
@@ -80,14 +88,14 @@ The extension always plans normal browser commands with `--json` prepended in `e
80
88
  - For Electron desktop apps, prefer top-level electron for wrapper-owned discovery, isolated launch, status, compact probe, and cleanup: list first, treat likely-sensitive annotations as hints rather than enforcement, launch with the default snapshot handoff unless handoff: "tabs" is the safer diagnostic starting point, use electron.probe or snapshot -i/qa.attached for current-session state, and always cleanup the returned launchId when done. electron.launch uses an isolated temporary profile; it does not reuse the app's normal signed-in profile or attach to an already-running authenticated app. For signed-in local app state, host-launch the normal app with --remote-debugging-port when appropriate, then use raw args connect <port|url>; after connect, inspect tab list, select the stable tab id such as tab t2, then run a condition wait or snapshot -i before using refs. close commands (`close`, `quit`, or `exit`) only close the browser/CDP session; leave manually launched app shutdown, profile cleanup, and explicit artifacts to the host owner.
81
89
  - For provider or specialized app workflows, load version-matched upstream guidance with skills get agentcore|electron|slack|dogfood|vercel-sandbox through the native tool; add --full when you need references/templates, and use skills get --all only for broad skill audits. Provider launches such as -p ios, --provider browserbase/kernel/browseruse/browserless/agentcore, and iOS --device are upstream-owned setup paths; use sessionMode fresh when switching providers and expect external credentials or local Appium/Xcode setup to be required.
82
90
  - For dialogs and frames, use dialog status/accept/dismiss and frame <selector|main> through native args; when --confirm-actions produces a pending confirmation, use details.nextActions or exact confirm <id> / deny <id> calls instead of inventing ids.
83
- - If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <tab-id-or-label> / snapshot -i to recover state before retrying different URLs or fallback strategies. For desktop readiness, prefer real conditions first: wait --text, wait --url, wait --fn, wait --load <state>, wait --download, or qa.attached; for disappearance checks in agent-browser 0.27.0, use wait --fn predicates instead of stale upstream-help examples like wait <selector> --state hidden. Use electron.probe/status for wrapper-owned launch health or target mismatch. Fixed waits are a last resort, must stay below the wrapper IPC budget (wait 30000 is intentionally blocked), and a successful payload like "waited":"timeout" means elapsed time only—verify completion with an observed condition, fresh snapshot, or screenshot.
91
+ - If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <tab-id-or-label> / snapshot -i to recover state before retrying different URLs or fallback strategies. For headed demos, put --headed on the first launch with sessionMode=fresh and verify with screenshot/tab/get-url evidence because tool success cannot prove the OS window is visible to the user. For desktop readiness, prefer real conditions first: wait --text, wait --url, wait --fn, wait --load <state>, wait --download, or qa.attached; for disappearance checks in agent-browser 0.27.0, use wait --fn predicates instead of stale upstream-help examples like wait <selector> --state hidden. Use electron.probe/status for wrapper-owned launch health or target mismatch. Fixed waits are a last resort, must stay below the wrapper IPC budget (wait 30000 is intentionally blocked), and a successful payload like "waited":"timeout" means elapsed time only—verify completion with an observed condition, fresh snapshot, or screenshot.
84
92
  - For feed, timeline, or inbox reading tasks, focus on the main timeline/list region and read the first item there rather than unrelated composer or sidebar content.
85
93
  - For read-only browsing tasks, prefer extracting the answer from the current snapshot, structured ref labels, or eval --stdin on the current page before navigating away. Only click into media viewers, detail routes, or new pages when the current view does not contain the needed information.
86
94
  - For downloads, prefer download <selector> <path> when an element click should save a file. Do not rely on click alone when you need the downloaded file on disk.
87
95
  - On dashboards with nested scroll containers, verify scroll with a screenshot or fresh snapshot -i; if the viewport did not move, prefer scrollintoview <@ref> or target the actual scrollable region. For native selects, use select <selector> <value...> (or semanticAction/job select) instead of clicking option refs; for custom comboboxes, a click/semanticAction may only focus the field, so re-snapshot and fall back to type, press Enter/arrow keys, or visible option refs.
88
96
  - When using eval --stdin, scope checks and actions to the target element or route whenever possible instead of relying on broad page-wide text heuristics.
89
- - When using eval --stdin for extraction, return the value you want instead of relying on console.log as the primary result channel. Prefer plain expressions like ({ title: document.title }) or explicitly invoked functions like (() => ({ title: document.title }))(); if a function-shaped snippet returns {}, details.evalStdinHint may warn that the function was serialized instead of called. If get text on a CSS selector surfaces details.selectorTextVisibility or selectorTextVisibilityAll, prefer a visible @ref, a more specific selector, or the inspect-visible-text-candidates nextAction over hidden tab content.
90
- - When details.pageChangeSummary is present, use changeType and summary as a compact signal for navigation, DOM mutation, confirmations, or artifacts; when nextActionIds is set, match those ids to entries in details.nextActions (or per-step nextActions inside batch) for concrete follow-up payloads instead of inferring from prose alone. If a no-navigation click surfaces details.overlayBlockers, inspect the fresh snapshot evidence before using a close/dismiss candidate nextAction; ordinary page chrome without dialog/alertdialog evidence should not trigger this diagnostic.
97
+ - When using eval --stdin for extraction, pass the JavaScript through the native tool stdin field, not as an extra args token after --stdin, and return the value you want instead of relying on console.log as the primary result channel. Prefer plain expressions like ({ title: document.title }) or explicitly invoked functions like (() => ({ title: document.title }))(); if a function-shaped snippet returns {}, details.evalStdinHint may warn that the function was serialized instead of called. On file:// pages, when upstream JSON returns result: null for non-trivial stdin, details.evalResultWarning may append Eval result warning without failing the tool—treat that as inconclusive DOM verification. If get text on a CSS selector surfaces details.selectorTextVisibility or selectorTextVisibilityAll, prefer a visible @ref, a more specific selector, or the inspect-visible-text-candidates nextAction over hidden tab content.
98
+ - When details.pageChangeSummary is present, use changeType and summary as a compact signal for navigation, DOM mutation, confirmations, or artifacts; when nextActionIds is set, match those ids to entries in details.nextActions (or per-step nextActions inside batch) for concrete follow-up payloads instead of inferring from prose alone. If details.clickDispatch reports no trusted DOM event, refresh/inspect/retry the real click first; for static local fixtures only, an explicit eval --stdin programmatic .click() can exercise app handlers, but treat it as an untrusted scripted workaround and never use it to bypass stop-before-submit/order/purchase boundaries. If a no-navigation click surfaces details.overlayBlockers, inspect the fresh snapshot evidence before using a close/dismiss candidate nextAction; ordinary page chrome without dialog/alertdialog evidence should not trigger this diagnostic.
91
99
  - When commands save or spill files (screenshots, downloads, PDFs, traces, recordings, HAR, large snapshot spills), use the user's exact requested paths when given and treat paths as provisional until details.artifactVerification shows every row verified: branch on missingCount, pendingCount, unverifiedCount, per-entry state, and optional limitation before downstream file use or PASS/FAIL reporting.
92
100
  - For evidence-only screenshots, QA captures, or other audit artifacts, save to an explicit path and branch on details.artifactVerification plus details.artifacts before reporting PASS/FAIL; do not require vision review of inline image attachments unless the user asked for visual inspection.
93
101
  - Respect explicit user stop boundaries: if the user says to stop before order/post/purchase/submit, do not click that final action. If the wrapper returns details.promptGuard.reason=explicit-user-stop-boundary, gather evidence on the current page instead of retrying the blocked final action.
@@ -469,6 +477,8 @@ Examples:
469
477
  { "args": ["eval", "--stdin"], "stdin": "document.title" }
470
478
  ```
471
479
 
480
+ For `eval --stdin`, put the script in the top-level `stdin` field. The wrapper normalizes the common mistaken shape `{ "args": ["eval", "--stdin", "document.title"] }` by moving trailing tokens after `--stdin` into stdin before launching upstream, but that recovery is only for simple one-line mistakes; use `stdin` explicitly for multiline or quote-sensitive snippets.
481
+
472
482
  ```json
473
483
  { "args": ["batch"], "stdin": "[[\"open\",\"https://example.com\"],[\"snapshot\",\"-i\"]]" }
474
484
  ```
@@ -595,7 +605,7 @@ For `batch`, each `batchSteps[]` entry can carry its own `nextActions` for that
595
605
 
596
606
  `pageChangeSummary` is an optional compact summary for mutation-prone and artifact-producing commands. It includes `changeType` (`"navigation"`, `"mutation"`, `"artifact"`, or `"confirmation"`), `command`, a readable `summary`, optional `title`/`url`, optional `artifactCount` or `savedFilePath`, and `nextActionIds` that link the observed change to `nextActions` without repeating full payloads. The wrapper maintains an explicit `eligibleForPageChangeSummary` command capability through `isPageChangeSummaryCommand` in `extensions/agent-browser/lib/command-taxonomy.ts`: those commands still emit a `mutation`-typed summary when upstream JSON lacks navigation metadata, as long as no stronger signal (artifact, saved path, navigation fields, or pending confirmation) applies. That capability is independent from `invalidatesBatchRefs` and `triggersPostMutationSnapshot`, so artifact summaries like `download` / `screenshot` and guarded-but-non-invalidating `fill` are documented directly in the capability table instead of implied by broad set spreading. Commands outside that set omit `pageChangeSummary` unless the parsed payload shows navigation, a confirmation prompt, saved files, or artifacts—including read-only inspection commands, which normally have no summary unless one of those signals appears. For `batch`, the top-level summary favors artifact rollups when any step produced artifacts; otherwise it may synthesize a `mutation` summary from steps that carried their own `pageChangeSummary`. Treat mutation summaries as "upstream attempted the action" evidence, not proof the application handled it; agents should verify URL/text/state for important mutations before continuing.
597
607
 
598
- `clickDispatch` may appear after a **top-level non-Electron** `click` when the wrapper installed a pre-click DOM-event probe for an exact CSS or XPath selector, upstream reported success, and the post-click probe found no trusted DOM event reached that target. The wrapper does **not** replay clicks in-page and does **not** install this probe for `@e…` refs because wrapper-side target resolution would diverge from upstream ref identity. On a miss it marks the tool failed, appends `Click dispatch diagnostic: …`, and sets `clickDispatch.status` to `"no-native-event-observed"` with `reason: "native-click-produced-no-target-dom-event"`, `nativeEventCount`, and a redacted `target` descriptor (`kind: "selector" | "xpath"`, plus the selector string). `details.nextActions` gains `inspect-click-dispatch-miss` (`snapshot -i`) and `retry-click-after-dispatch-miss` (same upstream `click` argv, session-prefixed when applicable). This diagnostic is only for standalone `click`; `batch`/`job`/`qa` click steps remain upstream-owned batch behavior.
608
+ `clickDispatch` may appear after a **top-level non-Electron** `click` when the wrapper installed a pre-click DOM-event probe, upstream reported success, and the post-click probe found no trusted DOM event reached that target. The wrapper probes exact CSS selectors, XPath selectors, and `@e…` refs when the latest wrapper-tracked snapshot has role/name metadata that resolves to one visible DOM candidate; it does **not** take a fresh pre-click snapshot because that could recycle upstream refs before the intended click. The wrapper does **not** replay clicks in-page. On a miss it marks the tool failed, appends `Click dispatch diagnostic: …`, and sets `clickDispatch.status` to `"no-native-event-observed"` with `reason: "native-click-produced-no-target-dom-event"`, `nativeEventCount`, and a redacted `target` descriptor (`kind: "selector" | "xpath"` plus `selector`, or `kind: "accessible"` plus `refId`, `role`, and redacted `name`). `details.nextActions` gains `inspect-click-dispatch-miss` (`snapshot -i`) and `retry-click-after-dispatch-miss` (same upstream `click` argv, session-prefixed when applicable). If a local static fixture must be exercised despite this diagnostic, a caller may explicitly run a programmatic activation via `eval --stdin` such as `document.querySelector(...).click()`, but that emits an untrusted scripted event and is only a debugging/workaround path; it must not be used as proof that real user-like clicking works or to bypass prompt stop boundaries. This diagnostic is only for standalone `click`; `batch`/`job`/`qa` click steps remain upstream-owned batch behavior.
599
609
 
600
610
  `promptGuard` may appear on wrapper-blocked calls when the latest user prompt contains machine-recognizable safety or evidence requirements. `reason: "explicit-user-stop-boundary"` is a **best-effort click-like and Enter/Return keypress guard**, not a complete stop-boundary enforcer: it blocks likely final order/payment/submit targets (for example `Finish`, `place order`, `submit payment`, or matching selectors/`@ref` metadata) on standalone `click`/`dblclick`/`tap`, `find … click`, and matching `batch` steps, and it blocks standalone or batch `press <key>` / `key <key>` when the key is Enter or Return (keypress submits do not require a final-action label match). It does **not** block `eval`, generic `fill`/`type`/`select`, `keyboard type`/`keyboard inserttext`, non-Enter keypresses, or other scripted activation. Implementation: `extensions/agent-browser/lib/orchestration/browser-run/browser-action-model.ts` plus `prompt-guards.ts`. `reason: "requested-artifacts-missing-before-close"` blocks `close` / `quit` / `exit` when the prompt named exact required screenshot paths and the session artifact manifest has not verified those paths; optional recording paths are only required when recording appears available. Both guards return `failureCategory: "policy-blocked"` and `validationError` text instead of invoking upstream.
601
611
 
@@ -677,11 +687,12 @@ Additional structured fields can appear when relevant:
677
687
  - `scrollNoop` after a successful **top-level** `scroll` when wrapper-side read-only probes before and after the command show no change in `window.scrollX` / `window.scrollY` and no change in the sampled prominent scrollable containers. To avoid pre-launching a session without caller startup state, this probe is skipped when the invocation includes startup-scoped flags such as `--profile`, `--state`, `--session-name`, `--cdp`, providers, init scripts, or similar launch settings. Shape: `{ reason: "no-observed-scroll-position-change", message, before, after, recommendations }`; `before` / `after` include viewport dimensions, document scroll dimensions, and up to ten sampled container descriptors plus scroll offsets. Container descriptors use only sample index, tag name, and ARIA role; DOM ids/classes are intentionally not stored. This diagnostic is conservative evidence that the page-level scroll likely missed a nested pane, not proof that every app-specific region is unchanged. Visible text appends `Scroll diagnostic: no observed scroll movement`, and `details.nextActions` gains `inspect-after-noop-scroll` (`snapshot -i`) plus `verify-noop-scroll-visually` (`screenshot`), session-prefixed when applicable.
678
688
  - `comboboxFocus` after a successful explicit combobox-targeted `click` / `fill` / `find … click|fill` (for example `semanticAction` with role `combobox`, including when that semantic action resolves through a current visible `@ref` before execution) when a read-only probe sees the active element is combobox-like, `aria-expanded` is explicitly present (`false` or `true`), and no visible `listbox` / `option` / menu option elements are open. Shape: `{ reason: "focused-combobox-without-visible-options", message, activeElement, visibleListboxCount, visibleOptionCount, recommendations }`; `activeElement` includes bounded role/tag/expanded/hasPopup/name metadata with normal text redaction. Visible text appends `Combobox diagnostic: focused combobox did not expose visible options`, and `details.nextActions` gains `inspect-focused-combobox` (`snapshot -i`), `try-open-combobox-with-arrow` (`press ArrowDown`), and `try-open-combobox-with-enter` (`press Enter`), session-prefixed when applicable. The diagnostic is deliberately gated to explicit combobox-targeted calls to avoid extra probes or false positives on ordinary clicks/textboxes.
679
689
  - `recordingDependencyWarning` after a successful `record start` or `record restart` when the wrapper cannot find an executable `ffmpeg` on the Pi process `PATH`. Shape: `{ reason: "ffmpeg-missing-for-recording", dependency: "ffmpeg", command, message, recommendations }`. Visible text appends `Recording dependency warning: ffmpeg not found on PATH`. This is a non-blocking preflight warning: upstream may start recording, but `record stop` needs `ffmpeg` to encode the WebM.
680
- - `selectorTextVisibility` after a **successful** upstream `get text <selector>` (standalone or inside a successful `batch`) when the wrapper’s follow-up probe finds a hazard: more than one DOM match (upstream reads the first `querySelectorAll` hit, which may be the wrong tab/panel), or the first match is hidden while at least one other match is visible (requires multiple DOM nodes so a visible peer exists; a lone hidden match is not flagged). The probe is a read-only `eval --stdin` script (`buildVisibleTextProbeScript` in `extensions/agent-browser/index.ts`) that counts matches, applies a small visibility heuristic (`display`/`visibility`/`opacity` plus non-zero client rects), and may include a redacted `firstVisibleTextPreview`. It is **not** run for page-scoped `@e…` selectors or when the selector string is withheld because `selectorMayExposeSensitiveLiteral` would risk echoing secrets in probe output. `details.selectorTextVisibility` mirrors the primary diagnostic (first sorted entry); when several selectors in one `batch` qualify, `selectorTextVisibilityAll` lists every diagnostic sorted so hidden-first cases precede generic multi-match ambiguity. Appended visible warning text names the matching `details.nextActions` id so model-facing transcripts can recover without guessing. Appended `details.nextActions` use ids `inspect-visible-text-candidates` and `inspect-visible-text-candidates-2`, … with the probe replayed via `eval --stdin` for each hazardous selector.
690
+ - `selectorTextVisibility` after a **successful** upstream `get text <selector>` (standalone or inside a successful `batch`) when the wrapper’s follow-up probe finds a hazard: more than one DOM match (upstream reads the first `querySelectorAll` hit, which may be the wrong tab/panel), or the first match is hidden while at least one other match is visible (requires multiple DOM nodes so a visible peer exists; a lone hidden match is not flagged). The probe is a read-only `eval --stdin` script (`buildVisibleTextProbeScript` in `extensions/agent-browser/lib/orchestration/browser-run/diagnostics.ts`) that counts matches, applies a small visibility heuristic (`display`/`visibility`/`opacity` plus non-zero client rects), may include a redacted `firstVisibleTextPreview`, and may include up to eight `visibleCandidates` entries (`index` in `querySelectorAll`, `tagName`, optional `role`, optional redacted `textPreview`). It is **not** run for page-scoped `@e…` selectors or when the selector string is withheld because `selectorMayExposeSensitiveLiteral` would risk echoing secrets in probe output. `details.selectorTextVisibility` mirrors the primary diagnostic (first sorted entry); when several selectors in one `batch` qualify, `selectorTextVisibilityAll` lists every diagnostic sorted so hidden-first cases precede generic multi-match ambiguity. Appended visible warning text names the matching `details.nextActions` id and may list visible candidate previews. Appended `details.nextActions` use ids `inspect-visible-text-candidates` and `inspect-visible-text-candidates-2`, … with the probe replayed via `eval --stdin` for each hazardous selector. If the probe still leaves more than one visible candidate, it is only ambiguity evidence; agents should narrow the selector, use a current visible `@ref`, or run a targeted visible-element `eval --stdin` rather than trusting the broad selector.
681
691
  - `electronGetTextScopeWarning` after a successful attached Electron `get text <selector>` (standalone or successful `batch`) when a broad non-ref CSS selector such as `body`, `html`, `main`, `div`, or `[role=application]` may read the whole app shell. Shape: `{ selector, summary, electronContext: { launchId?, sessionName?, url? } }`; multiple batched diagnostics use `electronGetTextScopeWarnings`. Visible text appends `Broad Electron get text selector warning`, and next actions use `snapshot-for-electron-text-scope` ids with session-scoped `snapshot -i` payloads.
682
- - `evalStdinHint` after a successful `eval --stdin` when caller stdin (trimmed) looks function-shaped to the wrapper’s lightweight detector (`looksLikeFunctionEvalStdin` in `extensions/agent-browser/index.ts`: leading `function` / `async function`, parenthesized arrow `(…) =>`, or a concise `name =>` / `async name =>` form) **and** upstream JSON `data` is an object whose `result` field is a plain empty object (`{}`). Arrays such as `[]` do not qualify. It includes `reason` and `suggestion`; visible output appends `Eval stdin hint` with the same guidance. This is a heuristic for the common mistake of returning a function object instead of invoking it or passing a plain expression, not a JavaScript parser or proof that the page returned no useful data.
692
+ - `evalStdinHint` after a successful `eval --stdin` when caller stdin (trimmed) looks function-shaped to the wrapper’s lightweight detector (in `extensions/agent-browser/lib/orchestration/browser-run/diagnostics.ts`: leading `function` / `async function`, parenthesized arrow `(…) =>`, or a concise `name =>` / `async name =>` form) **and** upstream JSON `data` is an object whose `result` field is a plain empty object (`{}`). Arrays such as `[]` do not qualify. It includes `reason` and `suggestion`; visible output appends `Eval stdin hint` with the same guidance. This is a heuristic for the common mistake of returning a function object instead of invoking it or passing a plain expression, not a JavaScript parser or proof that the page returned no useful data. Before this diagnostic path runs, the wrapper also recovers the common malformed native-tool call `args: ["eval", "--stdin", "..."]` with no top-level `stdin` by moving trailing `args` tokens after `--stdin` into the process stdin stream.
693
+ - `evalResultWarning` after a successful `eval --stdin` when the current or prior page URL is `file:` (from navigation summary, session tab target, or persisted session page state), upstream JSON `data.result` is strictly `null`, and stdin is non-empty and not a trivial literal `null`/`undefined`. Fields: `reason`, `suggestion`. Visible output appends `Eval result warning` without failing the tool. Use `snapshot -i`, ref-based getters, screenshots, or http(s) fixtures when `file://` null results are inconclusive.
683
694
  - `timeoutPartialProgress` after `runAgentBrowserProcess` reports `timedOut` (wrapper child-process watchdog) when best-effort recovery finds useful context. `summary` is a short sentence counting how many declared artifact paths exist on disk versus how many were scanned, and whether page context came from live session reads or only from a planned URL (when nothing in the plan declares an artifact path, the fraction may read `0/0` while `currentPage` can still carry session or planned URL context). `steps` lists planned argv from the compiled `job` or `qa` batch plan (`compiledJob` in `extensions/agent-browser/index.ts`, which is only populated for those top-level modes) or, when that object is absent, from the same JSON-array `batch` stdin the tool sends upstream—whether caller-authored or wrapper-generated for `sourceLookup` / `networkSourceLookup` (1-based indices; only JSON-array stdin whose elements are string[] argv arrays is parsed); timeouts on other argv shapes may still emit `currentPage` / summary evidence without `steps`. `currentPage` comes from session-scoped `get url` / `get title` when the session answers, otherwise a fallback URL may be inferred from the last `open` / `navigate` / `pushstate` step in the plan. `artifacts` covers declared output paths on `screenshot`, `pdf`, `download`, and `wait --download` steps (absolute path, existence, optional `sizeBytes`, `stepIndex`). Visible text repeats the same block under `Timeout partial progress`, applying URL and path-segment redaction; the prose `Planned steps` list shows at most six steps, then an omitted-count line when the plan is longer. This is recovery evidence only; missing entries do not prove the upstream step never ran or that no other side effects occurred.
684
- - `managedSessionOutcome` after a managed-session plan reaches process execution (`buildManagedSessionOutcome` / `formatManagedSessionOutcomeText` in `extensions/agent-browser/index.ts`). Populated when `buildExecutionPlan` injects an extension-managed implicit or fresh `--session`, and also when a successful explicit `--session <current-wrapper-managed-session> close` closes the current managed session. It remains omitted for unrelated explicit user-managed sessions and for sessionless inspection/local paths that skip injection. Fields: `status` (`created`, `replaced`, `unchanged`, `closed`, `preserved`, or `abandoned`), `sessionMode`, `attemptedSessionName`, `previousSessionName`, `currentSessionName`, optional `replacedSessionName`, `activeBefore`, `activeAfter`, `succeeded`, and `summary`. Model-visible echo: only when `sessionMode` is `"fresh"` **and** `succeeded` is false, the wrapper appends a line of the form `Managed session outcome: ${summary}` after the primary presentation (including missing-binary failures on a fresh plan, where it follows the missing-binary message and no other diagnostic tail runs). When other trailing diagnostic prose is also emitted in the same result, that line is concatenated **after** semantic-action candidate lines, overlay/selector-visibility tails, and `Timeout partial progress` (see `rawAppendedDiagnosticText` in `extensions/agent-browser/index.ts`). For `"auto"` failures the same struct may appear on `details` without that extra line. When post-upstream analysis (for example **`qa`** preset failure) flips the overall tool result after a successful batch, the implementation only realigns `managedSessionOutcome.succeeded` to the final outcome; `status`/`summary` may still describe the managed-session transition (for example `replaced` while `failureCategory` is `qa-failure`), so read `failureCategory` / `qaPreset` / `batchFailure` alongside this object.
695
+ - `managedSessionOutcome` after a managed-session plan reaches process execution (`buildManagedSessionOutcome` / `formatManagedSessionOutcomeText` in `extensions/agent-browser/lib/orchestration/browser-run/session-state.ts`). Populated when `buildExecutionPlan` injects an extension-managed implicit or fresh `--session`, and also when a successful explicit `--session <current-wrapper-managed-session> close` closes the current managed session. It remains omitted for unrelated explicit user-managed sessions and for sessionless inspection/local paths that skip injection. Fields: `status` (`created`, `replaced`, `unchanged`, `closed`, `preserved`, or `abandoned`), `sessionMode`, `attemptedSessionName`, `previousSessionName`, `currentSessionName`, optional `replacedSessionName`, `activeBefore`, `activeAfter`, `succeeded`, and `summary` (machine-oriented; may include generated session names). Model-visible echo: only when `sessionMode` is `"fresh"` **and** `succeeded` is false, the wrapper appends action-oriented `Managed session outcome` and `Recovery` lines without repeating generated session ids in visible prose; session names remain in `details.managedSessionOutcome`. Failed fresh launches may also append `details.nextActions` such as `run-agent-browser-doctor`, `verify-current-managed-session`, `snapshot-current-managed-session`, or `retry-fresh-managed-session`. When other trailing diagnostic prose is also emitted in the same result, that block is concatenated **after** semantic-action candidate lines, overlay/selector-visibility tails, eval hints/warnings, and `Timeout partial progress` (see `rawAppendedDiagnosticText` in `extensions/agent-browser/lib/orchestration/browser-run/final-result.ts`). For `"auto"` failures the same struct may appear on `details` without those extra lines. When post-upstream analysis (for example **`qa`** preset failure) flips the overall tool result after a successful batch, the implementation only realigns `managedSessionOutcome.succeeded` to the final outcome; `status`/`summary` may still describe the managed-session transition (for example `replaced` while `failureCategory` is `qa-failure`). In that case the visible recovery says the fresh launch became current and points to `failureCategory` / `qaPreset` / `batchFailure` for the post-launch failure.
685
696
  - `imagePath` / `imagePaths` for Pi inline image attachments from the **`screenshot`** command (including batched screenshot steps). **`diff screenshot`** still records the diff output as an `image`-kind entry in `details.artifacts`, but it does **not** populate `imagePath` / `imagePaths` or attach an inline image: only plain `screenshot` is treated as a trusted live-capture path for automatic inlining (`isTrustedScreenshotOutput` in `extensions/agent-browser/lib/results/presentation/artifacts.ts`).
686
697
  - `artifacts` for upstream saved files such as screenshots, `state save` outputs, `diff screenshot` diff images, PDFs, downloads, `wait --download` files, traces, CPU profiles, completed WebM recordings, path-bearing HAR captures, and future recording output paths reported by `record start`. Each artifact includes the original saved or requested `path`, resolved `absolutePath`, `kind`/`artifactType`, optional `mediaType`, optional `extension`, best-effort disk metadata such as `exists` and `sizeBytes`, plus `requestedPath`, `status`, `cwd`, `session`, and `tempPath` when applicable.
687
698
  - `savedFilePath` / `savedFile` for direct `download`, `pdf`, and `wait --download` saved-file workflows; batch results preserve the same fields on the relevant `batchSteps` entry.
@@ -1,11 +1,13 @@
1
1
  import { isRecord } from "../../parsing.js";
2
2
  import { redactSensitiveText } from "../../runtime.js";
3
3
  import { withOptionalSessionArgs, type AgentBrowserNextAction } from "../../results/next-actions.js";
4
+ import type { SessionRefSnapshot } from "../../session-page-state.js";
4
5
  import { runSessionCommandData } from "./session-state.js";
5
6
  import type { ClickDispatchDiagnostic, ClickDispatchProbe, ClickDispatchProbeTarget } from "./types.js";
6
7
 
7
8
  const CLICK_DISPATCH_MARKER_PREFIX = "__piAgentBrowserClickDispatchProbe_";
8
9
  const CLICK_DISPATCH_CLEANUP_TIMEOUT_MS = 2_000;
10
+ const ACCESSIBLE_REF_CLICK_DISPATCH_ROLES = new Set(["button", "checkbox", "menuitem", "radio", "switch", "tab"]);
9
11
 
10
12
  function parseClickRefId(selector: string): string | undefined {
11
13
  const trimmed = selector.trim();
@@ -13,11 +15,16 @@ function parseClickRefId(selector: string): string | undefined {
13
15
  return /^e\d+$/.test(candidate) ? candidate : undefined;
14
16
  }
15
17
 
16
- function getClickDispatchSelectorTarget(commandTokens: string[]): ClickDispatchProbeTarget | undefined {
18
+ function getClickDispatchProbeTarget(commandTokens: string[], refSnapshot?: SessionRefSnapshot): ClickDispatchProbeTarget | undefined {
17
19
  if (commandTokens[0] !== "click" || commandTokens.includes("--new-tab")) return undefined;
18
20
  const selector = commandTokens[1];
19
21
  if (!selector || selector.startsWith("-")) return undefined;
20
- if (parseClickRefId(selector)) return undefined;
22
+ const refId = parseClickRefId(selector);
23
+ if (refId) {
24
+ const ref = refSnapshot?.refs?.[refId];
25
+ if (!ref || !ACCESSIBLE_REF_CLICK_DISPATCH_ROLES.has(ref.role)) return undefined;
26
+ return { kind: "accessible", name: ref.name, refId, role: ref.role };
27
+ }
21
28
  if (selector.startsWith("xpath=")) return { kind: "xpath", selector: selector.slice("xpath=".length) };
22
29
  return { kind: "selector", selector };
23
30
  }
@@ -30,7 +37,36 @@ function buildClickDispatchProbeInstallScript(probe: ClickDispatchProbe): string
30
37
  const target = probe.target;
31
38
  const resolveTarget = target.kind === "selector"
32
39
  ? `(() => { try { return document.querySelector(${JSON.stringify(target.selector)}); } catch { return null; } })()`
33
- : `(() => { try { return document.evaluate(${JSON.stringify(target.selector)}, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; } catch { return null; } })()`;
40
+ : target.kind === "xpath"
41
+ ? `(() => { try { return document.evaluate(${JSON.stringify(target.selector)}, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; } catch { return null; } })()`
42
+ : `(() => {
43
+ const normalize = (value) => String(value ?? "").replace(/\\s+/g, " ").trim();
44
+ const expectedRole = ${JSON.stringify(target.role)};
45
+ const expectedName = normalize(${JSON.stringify(target.name)});
46
+ const inferRole = (element) => {
47
+ const explicit = element.getAttribute("role");
48
+ if (explicit) return explicit;
49
+ const tagName = element.tagName.toLowerCase();
50
+ if (tagName === "button" || tagName === "select" || tagName === "textarea") return tagName;
51
+ if (tagName === "a" && element.hasAttribute("href")) return "link";
52
+ if (tagName === "input") {
53
+ const type = (element.getAttribute("type") || "text").toLowerCase();
54
+ if (["button", "submit", "reset", "image"].includes(type)) return "button";
55
+ if (type === "checkbox") return "checkbox";
56
+ if (type === "radio") return "radio";
57
+ return "textbox";
58
+ }
59
+ return "";
60
+ };
61
+ const inferName = (element) => normalize(element.getAttribute("aria-label") || element.getAttribute("title") || element.value || element.textContent || "");
62
+ const isVisible = (element) => {
63
+ const style = window.getComputedStyle(element);
64
+ if (style.display === "none" || style.visibility === "hidden" || Number(style.opacity) === 0) return false;
65
+ return element.getClientRects().length > 0;
66
+ };
67
+ const candidates = Array.from(document.querySelectorAll("button,a[href],input,select,textarea,summary,[role],[onclick],[tabindex]")).filter((element) => inferRole(element) === expectedRole && inferName(element) === expectedName && isVisible(element));
68
+ return candidates.length === 1 ? candidates[0] : null;
69
+ })()`;
34
70
  return `(() => {
35
71
  const marker = ${JSON.stringify(probe.marker)};
36
72
  const element = ${resolveTarget};
@@ -80,9 +116,10 @@ return { status: "cleaned-up" };
80
116
  }
81
117
 
82
118
  function redactClickDispatchTarget(target: ClickDispatchProbeTarget): ClickDispatchProbeTarget {
83
- return target.kind === "selector" || target.kind === "xpath"
84
- ? { ...target, selector: redactSensitiveText(target.selector) }
85
- : target;
119
+ if (target.kind === "selector" || target.kind === "xpath") {
120
+ return { ...target, selector: redactSensitiveText(target.selector) };
121
+ }
122
+ return { ...target, name: redactSensitiveText(target.name) };
86
123
  }
87
124
 
88
125
  export function formatClickDispatchDiagnosticText(diagnostic: ClickDispatchDiagnostic): string {
@@ -109,9 +146,9 @@ export function buildClickDispatchNextActions(options: { commandTokens: string[]
109
146
  ];
110
147
  }
111
148
 
112
- export async function prepareClickDispatchProbe(options: { commandTokens: string[]; cwd: string; sessionName?: string; signal?: AbortSignal }): Promise<ClickDispatchProbe | undefined> {
149
+ export async function prepareClickDispatchProbe(options: { commandTokens: string[]; cwd: string; refSnapshot?: SessionRefSnapshot; sessionName?: string; signal?: AbortSignal }): Promise<ClickDispatchProbe | undefined> {
113
150
  if (!options.sessionName || options.commandTokens[0] !== "click" || options.commandTokens.includes("--new-tab")) return undefined;
114
- const target = getClickDispatchSelectorTarget(options.commandTokens);
151
+ const target = getClickDispatchProbeTarget(options.commandTokens, options.refSnapshot);
115
152
  if (!target) return undefined;
116
153
  const probe: ClickDispatchProbe = { marker: `${CLICK_DISPATCH_MARKER_PREFIX}${Date.now().toString(36)}_${Math.random().toString(36).slice(2)}`, target };
117
154
  const installData = await runSessionCommandData({ args: ["eval", "--stdin"], cwd: options.cwd, sessionName: options.sessionName, signal: options.signal, stdin: buildClickDispatchProbeInstallScript(probe) });
@@ -300,8 +300,21 @@ export async function collectOverlayBlockerDiagnostic(options: { command?: strin
300
300
  return { candidates, snapshot, summary: `Click completed but the page stayed on ${currentUrl}; a fresh snapshot contains likely overlay close/dismiss controls.` };
301
301
  }
302
302
 
303
/** Maximum number of visible matches the probe script describes individually. */
const SELECTOR_TEXT_VISIBILITY_CANDIDATE_LIMIT = 8;

/**
 * Builds the page-side script (a self-invoking arrow function, as source text)
 * that inspects every element matching `selector`.
 *
 * The script returns a JSON string. On an invalid selector it reports
 * `{ selector, error }`; otherwise it reports the match count, the visible
 * count, whether the first match is visible, trimmed text previews, and up to
 * `SELECTOR_TEXT_VISIBILITY_CANDIDATE_LIMIT` visible candidates, each tagged
 * with its index in the `querySelectorAll` result.
 *
 * @param selector CSS selector, embedded into the script via `JSON.stringify`.
 * @returns The script source text.
 */
function buildVisibleTextProbeScript(selector: string): string {
  const selectorLiteral = JSON.stringify(selector);
  const scriptLines = [
    "(() => {",
    ` const selector = ${selectorLiteral};`,
    " const isVisible = (element) => {",
    " const style = window.getComputedStyle(element);",
    " if (!style || style.display === 'none' || style.visibility === 'hidden' || style.visibility === 'collapse' || Number(style.opacity) === 0) return false;",
    " return Array.from(element.getClientRects()).some((rect) => rect.width > 0 && rect.height > 0);",
    " };",
    " let matches = [];",
    " try {",
    " matches = Array.from(document.querySelectorAll(selector));",
    " } catch (error) {",
    " return JSON.stringify({ selector, error: error instanceof Error ? error.message : String(error) });",
    " }",
    " const visible = matches.filter(isVisible);",
    " const trim = (value) => typeof value === 'string' ? value.trim().replace(/\\s+/g, ' ').slice(0, 200) : undefined;",
    " const describeCandidate = (element) => {",
    " const index = matches.indexOf(element);",
    " const role = element.getAttribute('role');",
    " const candidate = { index, tagName: element.tagName.toLowerCase(), textPreview: trim(element.textContent) };",
    " if (role) candidate.role = role;",
    " return candidate;",
    " };",
    ` const visibleCandidates = visible.slice(0, ${SELECTOR_TEXT_VISIBILITY_CANDIDATE_LIMIT}).map(describeCandidate);`,
    " return JSON.stringify({ selector, matchCount: matches.length, visibleCount: visible.length, firstMatchVisible: matches[0] ? isVisible(matches[0]) : undefined, firstTextPreview: trim(matches[0]?.textContent), firstVisibleTextPreview: trim(visible[0]?.textContent), visibleCandidates });",
    "})()",
  ];
  return scriptLines.join("\n");
}
308
+
309
/**
 * Validates the `visibleCandidates` array reported by the visibility probe.
 *
 * Entries that are not records, or that lack a numeric `index` or a string
 * `tagName`, are dropped. `role` is kept only when it is a non-empty string.
 * `textPreview` is kept only when it is a non-empty string, and is passed
 * through `redactSensitiveText` before being stored.
 *
 * @param value Untrusted value taken from the parsed probe output.
 * @returns The accepted candidates, or `undefined` when `value` is not an
 *   array or no entry survives validation.
 */
function parseSelectorTextVisibilityCandidates(value: unknown): SelectorTextVisibilityDiagnostic["visibleCandidates"] {
  if (!Array.isArray(value)) return undefined;
  const accepted: NonNullable<SelectorTextVisibilityDiagnostic["visibleCandidates"]> = [];
  for (const entry of value) {
    if (!isRecord(entry)) continue;
    if (typeof entry.index !== "number" || typeof entry.tagName !== "string") continue;
    const hasRole = typeof entry.role === "string" && entry.role.length > 0;
    const roleLabel = hasRole ? (entry.role as string) : undefined;
    const hasPreview = typeof entry.textPreview === "string" && entry.textPreview.length > 0;
    const preview = hasPreview ? redactSensitiveText(entry.textPreview as string) : undefined;
    accepted.push({
      index: entry.index,
      tagName: entry.tagName,
      ...(roleLabel ? { role: roleLabel } : {}),
      ...(preview ? { textPreview: preview } : {}),
    });
  }
  return accepted.length === 0 ? undefined : accepted;
}
306
319
 
307
320
  function parseSelectorTextVisibilityProbe(data: unknown, selector: string): Omit<SelectorTextVisibilityDiagnostic, "summary"> | undefined {
@@ -313,7 +326,14 @@ function parseSelectorTextVisibilityProbe(data: unknown, selector: string): Omit
313
326
  const matchCount = typeof parsed.matchCount === "number" ? parsed.matchCount : undefined;
314
327
  const visibleCount = typeof parsed.visibleCount === "number" ? parsed.visibleCount : undefined;
315
328
  if (matchCount === undefined || visibleCount === undefined) return undefined;
316
- return { firstMatchVisible: typeof parsed.firstMatchVisible === "boolean" ? parsed.firstMatchVisible : undefined, firstVisibleTextPreview: typeof parsed.firstVisibleTextPreview === "string" && parsed.firstVisibleTextPreview.length > 0 ? redactSensitiveText(parsed.firstVisibleTextPreview) : undefined, matchCount, selector, visibleCount };
329
+ return {
330
+ firstMatchVisible: typeof parsed.firstMatchVisible === "boolean" ? parsed.firstMatchVisible : undefined,
331
+ firstVisibleTextPreview: typeof parsed.firstVisibleTextPreview === "string" && parsed.firstVisibleTextPreview.length > 0 ? redactSensitiveText(parsed.firstVisibleTextPreview) : undefined,
332
+ matchCount,
333
+ selector,
334
+ visibleCandidates: parseSelectorTextVisibilityCandidates(parsed.visibleCandidates),
335
+ visibleCount,
336
+ };
317
337
  }
318
338
 
319
339
  function selectorMayExposeSensitiveLiteral(selector: string): boolean {
@@ -366,6 +386,14 @@ export function formatSelectorTextVisibilityText(diagnostics: SelectorTextVisibi
366
386
  const actionId = index === 0 ? "inspect-visible-text-candidates" : `inspect-visible-text-candidates-${index + 1}`;
367
387
  const lines = [`Selector text visibility warning: ${diagnostic.summary}`];
368
388
  if (diagnostic.firstVisibleTextPreview) lines.push(`First visible text preview: ${JSON.stringify(diagnostic.firstVisibleTextPreview)}`);
389
+ if (diagnostic.visibleCandidates && diagnostic.visibleCandidates.length > 0) {
390
+ lines.push(`Visible candidates (${diagnostic.visibleCandidates.length} shown, querySelectorAll index):`);
391
+ for (const candidate of diagnostic.visibleCandidates) {
392
+ const rolePart = candidate.role ? ` role=${candidate.role}` : "";
393
+ const previewPart = candidate.textPreview ? `: ${JSON.stringify(candidate.textPreview)}` : "";
394
+ lines.push(`- [${candidate.index}] ${candidate.tagName}${rolePart}${previewPart}`);
395
+ }
396
+ }
369
397
  lines.push(`Next action: use details.nextActions ${actionId} before trusting this selector text.`);
370
398
  return lines;
371
399
  }).join("\n");
@@ -451,6 +479,22 @@ export function formatEvalStdinHintText(hint: ReturnType<typeof getEvalStdinHint
451
479
  return hint ? `Eval stdin hint: ${hint.reason} ${hint.suggestion}` : undefined;
452
480
  }
453
481
 
482
/**
 * Decides whether a successful `eval` call should carry an "inconclusive
 * result" warning.
 *
 * A warning is produced only when all of the following hold:
 * - `command` is `"eval"` and `stdin` is non-blank;
 * - `data` is a record whose `result` is exactly `null`;
 * - the resolved page URL starts with `file:` (case-insensitive);
 * - the trimmed stdin is not literally `null` or `undefined`, where a null
 *   result would be the expected answer.
 *
 * The page URL is the first non-blank value among `pageUrl`,
 * `navigationSummary.url`, and the URL extracted from `data`.
 *
 * @returns `{ reason, suggestion }` when the warning applies, otherwise `undefined`.
 */
export function getEvalResultWarning(options: { command?: string; data: unknown; navigationSummary?: { url?: string }; pageUrl?: string; stdin?: string }) {
  if (options.command !== "eval" || !options.stdin?.trim() || !isRecord(options.data) || options.data.result !== null) return undefined;
  // Use `||`, not `??`: a blank or whitespace-only candidate trims to "" and
  // must fall through to the next source instead of ending the lookup.
  const pageUrl =
    options.pageUrl?.trim() ||
    options.navigationSummary?.url?.trim() ||
    extractNavigationSummaryFromData(options.data)?.url?.trim();
  if (!pageUrl || !/^file:/i.test(pageUrl)) return undefined;
  const trimmed = options.stdin.trim();
  if (/^(?:null|undefined)$/i.test(trimmed)) return undefined;
  return {
    reason: "eval --stdin returned null on a file:// page; upstream may not expose full DOM semantics for local fixtures.",
    suggestion: "Treat this as inconclusive verification. Use snapshot -i, get text on current @refs, screenshot evidence, or a reachable http(s) fixture before concluding DOM state.",
  };
}
493
+
494
/**
 * Renders an eval result warning as a single line of text.
 *
 * @param warning Value returned by `getEvalResultWarning`.
 * @returns `"Eval result warning: <reason> <suggestion>"`, or `undefined`
 *   when there is no warning.
 */
export function formatEvalResultWarningText(warning: ReturnType<typeof getEvalResultWarning>): string | undefined {
  if (!warning) return undefined;
  const { reason, suggestion } = warning;
  return ["Eval result warning:", reason, suggestion].join(" ");
}
497
+
454
498
  export async function getArtifactCleanupGuidance(options: { command?: string; cwd: string; manifest?: SessionArtifactManifest; succeeded: boolean }): Promise<ArtifactCleanupGuidance | undefined> {
455
499
  if (!options.succeeded || !isCloseCommand(options.command) || !options.manifest || options.manifest.entries.length === 0) return undefined;
456
500
  const explicitEntries = options.manifest.entries.filter((entry) => entry.storageScope === "explicit-path");
@@ -56,6 +56,7 @@ import {
56
56
  formatArtifactCleanupGuidanceText,
57
57
  formatComboboxFocusDiagnosticText,
58
58
  formatElectronBroadGetTextScopeText,
59
+ formatEvalResultWarningText,
59
60
  formatEvalStdinHintText,
60
61
  formatFillVerificationText,
61
62
  formatOverlayBlockerText,
@@ -69,6 +70,7 @@ import {
69
70
  buildElectronLifecycleNextActions,
70
71
  buildElectronMismatchNextActions,
71
72
  buildElectronRefFreshnessNextActions,
73
+ buildManagedSessionFreshFailureNextActions,
72
74
  buildManagedSessionOutcome,
73
75
  buildSessionDetailFields,
74
76
  formatElectronPostCommandHealthText,
@@ -323,6 +325,7 @@ function buildResultNextActions(options: FinalResultInput): AgentBrowserNextActi
323
325
  if (options.clickDispatchDiagnostic) nextActionCollector.append(buildClickDispatchNextActions({ commandTokens: options.commandTokens, sessionName: options.executionPlan.sessionName }));
324
326
  if (options.scrollNoopDiagnostic) nextActionCollector.append(buildScrollNoopNextActions(options.executionPlan.sessionName));
325
327
  if (options.comboboxFocusDiagnostic) nextActionCollector.append(buildComboboxFocusNextActions(options.executionPlan.sessionName));
328
+ if (options.managedSessionOutcome) nextActionCollector.appendUnique(buildManagedSessionFreshFailureNextActions(options.managedSessionOutcome));
326
329
  if (options.categoryDetails.failureCategory === "stale-ref" && options.redactedCompiledSemanticAction && isCompiledSemanticActionFindCommand(options.compiledSemanticAction)) nextActionCollector.append([{ id: "retry-semantic-action-after-stale-ref", params: { args: options.redactedCompiledSemanticAction.args }, reason: "Retry the same semantic target via its compiled find command after the upstream stale-ref failure proves the prior action did not execute.", safety: "Use only for the same intended target; direct stale @refs still require a fresh snapshot or stable locator before retrying.", tool: "agent_browser" as const }]);
327
330
  if (options.electronLaunchRecord) nextActionCollector.append(buildAgentBrowserNextActions({ electron: { launchId: options.electronLaunchRecord.launchId, sessionName: options.electronLaunchRecord.sessionName, status: options.electronLaunchRecord.cleanupState }, failureCategory: options.categoryDetails.failureCategory, resultCategory: options.categoryDetails.resultCategory, successCategory: options.categoryDetails.successCategory }));
328
331
  return nextActionCollector.toArray();
@@ -386,6 +389,7 @@ function buildAgentBrowserResultDetails(options: FinalResultInput, nextActions:
386
389
  selectorTextVisibility: options.selectorTextVisibilityDiagnostics[0],
387
390
  selectorTextVisibilityAll: options.selectorTextVisibilityDiagnostics.length > 1 ? options.selectorTextVisibilityDiagnostics : undefined,
388
391
  evalStdinHint: options.evalStdinHint,
392
+ evalResultWarning: options.evalResultWarning,
389
393
  timeoutPartialProgress: options.timeoutPartialProgress,
390
394
  parseError: options.plainTextInspection ? undefined : options.parseError,
391
395
  savedFile: options.presentation.savedFile,
@@ -424,10 +428,11 @@ export function buildFinalAgentBrowserToolResult(options: FinalResultInput): Age
424
428
  const comboboxFocusDiagnosticText = formatComboboxFocusDiagnosticText(options.comboboxFocusDiagnostic);
425
429
  const recordingDependencyWarningText = formatRecordingDependencyWarningText(options.recordingDependencyWarning);
426
430
  const evalStdinHintText = formatEvalStdinHintText(options.evalStdinHint);
431
+ const evalResultWarningText = formatEvalResultWarningText(options.evalResultWarning);
427
432
  const artifactCleanupText = formatArtifactCleanupGuidanceText(options.artifactCleanup);
428
433
  const timeoutPartialProgressText = options.timeoutPartialProgress ? formatTimeoutPartialProgressText(options.timeoutPartialProgress) : undefined;
429
434
  const managedSessionOutcomeText = formatManagedSessionOutcomeText(options.managedSessionOutcome);
430
- const rawAppendedDiagnosticText = [visibleRefFallbackText, richInputRecoveryText, semanticActionCandidateText, clickDispatchText, overlayBlockerText, fillVerificationText, electronRefFreshnessText, selectorTextVisibilityText, electronBroadGetTextScopeText, scrollNoopDiagnosticText, comboboxFocusDiagnosticText, recordingDependencyWarningText, evalStdinHintText, artifactCleanupText, timeoutPartialProgressText, managedSessionOutcomeText].filter((item): item is string => item !== undefined).join("\n\n");
435
+ const rawAppendedDiagnosticText = [visibleRefFallbackText, richInputRecoveryText, semanticActionCandidateText, clickDispatchText, overlayBlockerText, fillVerificationText, electronRefFreshnessText, selectorTextVisibilityText, electronBroadGetTextScopeText, scrollNoopDiagnosticText, comboboxFocusDiagnosticText, recordingDependencyWarningText, evalStdinHintText, evalResultWarningText, artifactCleanupText, timeoutPartialProgressText, managedSessionOutcomeText].filter((item): item is string => item !== undefined).join("\n\n");
431
436
  const appendedDiagnosticText = redactSensitiveText(redactExactSensitiveText(rawAppendedDiagnosticText, options.exactSensitiveValues));
432
437
  const shouldAppendDiagnosticText = appendedDiagnosticText.length > 0 && (!options.userRequestedJson || options.plainTextInspection);
433
438
  let content = shouldAppendDiagnosticText && options.redactedContent[0]?.type === "text" ? [{ ...options.redactedContent[0], text: `${options.redactedContent[0].text}\n\n${appendedDiagnosticText}` }, ...options.redactedContent.slice(1)] : options.redactedContent;
@@ -443,6 +448,7 @@ export async function buildMissingBinaryFailureResult(options: { compatibilityWo
443
448
  const errorText = buildMissingBinaryMessage();
444
449
  const managedSessionOutcome = buildManagedSessionOutcome({ activeAfter: options.managedSessionActive, activeBefore: options.managedSessionActive, attemptedSessionName: options.executionPlan.managedSessionName, command: options.executionPlan.commandInfo.command, currentSessionName: options.managedSessionName, previousSessionName: options.managedSessionName, sessionMode: options.sessionMode, succeeded: false });
445
450
  const managedSessionOutcomeText = formatManagedSessionOutcomeText(managedSessionOutcome);
451
+ const managedSessionRecoveryNextActions = buildManagedSessionFreshFailureNextActions(managedSessionOutcome);
446
452
  let missingBinaryElectronCleanup: ElectronCleanupResult | undefined;
447
453
  let missingBinaryElectronRecord: ElectronLaunchRecord | undefined;
448
454
  if (options.electronLaunch) {
@@ -450,5 +456,5 @@ export async function buildMissingBinaryFailureResult(options: { compatibilityWo
450
456
  missingBinaryElectronRecord = missingBinaryElectronCleanup.record;
451
457
  }
452
458
  const textParts = [errorText, managedSessionOutcomeText, missingBinaryElectronCleanup ? `Electron cleanup after failed attach: ${missingBinaryElectronCleanup.summary}` : undefined].filter((part): part is string => part !== undefined && part.length > 0);
453
- return { content: [{ type: "text", text: textParts.join("\n\n") }], details: { args: options.redactedArgs, compatibilityWorkaround: options.compatibilityWorkaround, effectiveArgs: options.redactedProcessArgs, electron: missingBinaryElectronRecord ? { action: "launch" as const, cleanup: missingBinaryElectronCleanup, launch: missingBinaryElectronRecord, status: "failed" as const, targets: options.electronLaunch?.targets, version: options.electronLaunch?.version } : undefined, managedSessionOutcome, sessionMode: options.sessionMode, sessionTabCorrection: options.sessionTabCorrection, ...buildAgentBrowserResultCategoryDetails({ args: options.redactedProcessArgs, command: options.executionPlan.commandInfo.command, errorText, failureCategory: "missing-binary", spawnError: options.processResult.spawnError.message, succeeded: false }), spawnError: options.processResult.spawnError.message }, isError: true };
459
+ return { content: [{ type: "text", text: textParts.join("\n\n") }], details: { args: options.redactedArgs, compatibilityWorkaround: options.compatibilityWorkaround, effectiveArgs: options.redactedProcessArgs, electron: missingBinaryElectronRecord ? { action: "launch" as const, cleanup: missingBinaryElectronCleanup, launch: missingBinaryElectronRecord, status: "failed" as const, targets: options.electronLaunch?.targets, version: options.electronLaunch?.version } : undefined, managedSessionOutcome, nextActions: managedSessionRecoveryNextActions.length > 0 ? managedSessionRecoveryNextActions : undefined, sessionMode: options.sessionMode, sessionTabCorrection: options.sessionTabCorrection, ...buildAgentBrowserResultCategoryDetails({ args: options.redactedProcessArgs, command: options.executionPlan.commandInfo.command, errorText, failureCategory: "missing-binary", spawnError: options.processResult.spawnError.message, succeeded: false }), spawnError: options.processResult.spawnError.message }, isError: true };
454
460
  }
@@ -699,7 +699,7 @@ export async function prepareBrowserRun(options: BrowserRunOptions): Promise<Pre
699
699
  }
700
700
  }
701
701
  const clickDispatchProbe = pinnedBatchUnwrapMode === undefined && compiledElectron === undefined
702
- ? await prepareClickDispatchProbe({ commandTokens, cwd, sessionName: executionPlan.sessionName, signal })
702
+ ? await prepareClickDispatchProbe({ commandTokens, cwd, refSnapshot: promptRefSnapshot, sessionName: executionPlan.sessionName, signal })
703
703
  : undefined;
704
704
  const redactedProcessArgs = redactInvocationArgs(processArgs);
705
705
  const shouldProbeScrollNoop = executionPlan.commandInfo.command === "scroll" && executionPlan.startupScopedFlags.length === 0;
@@ -51,6 +51,7 @@ import {
51
51
  closeManagedSession,
52
52
  collectOpenResultTabCorrection,
53
53
  collectSessionTabSelection,
54
+ extractNavigationSummaryFromData,
54
55
  extractStringResultField,
55
56
  findElectronLaunchRecordForSession,
56
57
  formatElectronPostCommandHealthText,
@@ -79,6 +80,7 @@ import {
79
80
  collectTimeoutPartialProgress,
80
81
  formatQaAttachedTargetText,
81
82
  getArtifactCleanupGuidance,
83
+ getEvalResultWarning,
82
84
  getEvalStdinHint,
83
85
  getSourceLookupElectronContext,
84
86
  sleepMs,
@@ -419,7 +421,11 @@ export async function processBrowserOutput(input: ProcessBrowserOutputInput): Pr
419
421
  if (!skipAttachedTargetBanner && qaAttachedTargetText && presentation.content[0]?.type === "text") presentation.content[0] = { ...presentation.content[0], text: `${qaAttachedTargetText}\n\n${presentation.content[0].text}` };
420
422
  else if (!skipAttachedTargetBanner && qaAttachedTargetText) presentation.content.unshift({ type: "text", text: qaAttachedTargetText });
421
423
  if (managedSessionOutcome && managedSessionOutcome.succeeded !== succeeded) managedSessionOutcome = { ...managedSessionOutcome, succeeded };
424
+ const evalNavigationSummary = navigationSummary ?? extractNavigationSummaryFromData(presentationEnvelope?.data);
425
+ const evalSessionTabUrl = prepared.executionPlan.sessionName ? sessionPageState.get(prepared.executionPlan.sessionName).tabTarget?.url : undefined;
426
+ const evalPageUrl = evalNavigationSummary?.url ?? currentSessionTabTarget?.url ?? prepared.priorSessionTabTarget?.url ?? evalSessionTabUrl;
422
427
  const evalStdinHint = getEvalStdinHint({ command: prepared.executionPlan.commandInfo.command, data: presentationEnvelope?.data, stdin: prepared.runtimeToolStdin });
428
+ const evalResultWarning = getEvalResultWarning({ command: prepared.executionPlan.commandInfo.command, data: presentationEnvelope?.data, navigationSummary: evalNavigationSummary, pageUrl: evalPageUrl, stdin: prepared.runtimeToolStdin });
423
429
  const resultArtifactManifest = presentation.artifactManifest ?? artifactManifest;
424
430
  const artifactCleanup = await getArtifactCleanupGuidance({ command: prepared.executionPlan.commandInfo.command, cwd, manifest: resultArtifactManifest, succeeded });
425
431
  const warningText = electronPostCommandHealth ? formatElectronPostCommandHealthText(electronPostCommandHealth) : electronSessionMismatch ? formatElectronSessionMismatchText(electronSessionMismatch) : aboutBlankSessionMismatch ? buildAboutBlankWarning(aboutBlankSessionMismatch) : undefined;
@@ -427,7 +433,7 @@ export async function processBrowserOutput(input: ProcessBrowserOutputInput): Pr
427
433
  const finalRecoveryState = await prepareFinalResultRecoveryState({ aboutBlankSessionMismatch, batchRefSnapshotState, commandTokens: prepared.commandTokens, compiledSemanticAction: prepared.compiledSemanticAction, currentRefSnapshot, currentRefSnapshotInvalidation, currentSessionTabTarget, cwd, electronPostCommandHealth, errorText, executionPlan: prepared.executionPlan, parseError, plainTextInspection, presentation, processResult, redactedProcessArgs: prepared.redactedProcessArgs, runtimeToolArgs: prepared.runtimeToolArgs, sessionPageState, sessionPageStateUpdate, sessionTabCorrection, signal, succeeded });
428
434
  currentRefSnapshot = finalRecoveryState.currentRefSnapshot;
429
435
  currentRefSnapshotInvalidation = finalRecoveryState.currentRefSnapshotInvalidation;
430
- const result = buildFinalAgentBrowserToolResult({ aboutBlankSessionMismatch, artifactCleanup, categoryDetails: finalRecoveryState.categoryDetails, clickDispatchDiagnostic, commandTokens: prepared.commandTokens, comboboxFocusDiagnostic, compiledNetworkSourceLookup: prepared.compiledNetworkSourceLookup, compiledSemanticAction: prepared.compiledSemanticAction, compatibilityWorkaround: prepared.compatibilityWorkaround, currentRefSnapshot, currentRefSnapshotInvalidation, currentSessionTabTarget, electronBroadGetTextScopeDiagnostics, electronFailedConnectCleanup, electronHandoff, electronLaunch: prepared.electronLaunch, electronLaunchRecord, electronLaunchRecords, electronPostCommandHealth, electronProfileIsolationDetails: input.electronProfileIsolationDetails, electronRefFreshnessDiagnostic, electronSessionMismatch, errorText, evalStdinHint, exactSensitiveValues: prepared.exactSensitiveValues, executionPlan: prepared.executionPlan, fillVerificationDiagnostic, inspectionText, managedSessionOutcome, navigationSummary, networkSourceLookup, noActivePageSnapshotFailure: finalRecoveryState.noActivePageSnapshotFailure, openResultTabCorrection, overlayBlockerDiagnostic, parseError, parseFailureOutput, parseSucceeded, plainTextInspection, presentation, presentationEnvelope, priorSessionTabTarget: prepared.priorSessionTabTarget, processResult, qaAttachedTarget, qaPreset, recordingDependencyWarning, redactedArgs: prepared.redactedArgs, redactedCompiledElectron: prepared.redactedCompiledElectron, redactedCompiledJob: prepared.redactedCompiledJob, redactedCompiledNetworkSourceLookup: prepared.redactedCompiledNetworkSourceLookup, redactedCompiledQaPreset: prepared.redactedCompiledQaPreset, redactedCompiledSemanticAction: prepared.redactedCompiledSemanticAction, redactedCompiledSourceLookup: prepared.redactedCompiledSourceLookup, redactedContent, redactedProcessArgs: prepared.redactedProcessArgs, redactedRecoveryHint: prepared.redactedRecoveryHint, resultArtifactManifest, 
richInputRecoveryDiagnostic: finalRecoveryState.richInputRecoveryDiagnostic, scrollNoopDiagnostic, selectorTextVisibilityDiagnostics, sessionMode: prepared.sessionMode, sessionTabCorrection, sourceLookup, succeeded, timeoutPartialProgress, userRequestedJson: prepared.userRequestedJson, visibleRefFallbackDiagnostic: finalRecoveryState.visibleRefFallbackDiagnostic, visibleRefFallbackSessionName: finalRecoveryState.visibleRefFallbackSessionName });
436
+ const result = buildFinalAgentBrowserToolResult({ aboutBlankSessionMismatch, artifactCleanup, categoryDetails: finalRecoveryState.categoryDetails, clickDispatchDiagnostic, commandTokens: prepared.commandTokens, comboboxFocusDiagnostic, compiledNetworkSourceLookup: prepared.compiledNetworkSourceLookup, compiledSemanticAction: prepared.compiledSemanticAction, compatibilityWorkaround: prepared.compatibilityWorkaround, currentRefSnapshot, currentRefSnapshotInvalidation, currentSessionTabTarget, electronBroadGetTextScopeDiagnostics, electronFailedConnectCleanup, electronHandoff, electronLaunch: prepared.electronLaunch, electronLaunchRecord, electronLaunchRecords, electronPostCommandHealth, electronProfileIsolationDetails: input.electronProfileIsolationDetails, electronRefFreshnessDiagnostic, electronSessionMismatch, errorText, evalResultWarning, evalStdinHint, exactSensitiveValues: prepared.exactSensitiveValues, executionPlan: prepared.executionPlan, fillVerificationDiagnostic, inspectionText, managedSessionOutcome, navigationSummary, networkSourceLookup, noActivePageSnapshotFailure: finalRecoveryState.noActivePageSnapshotFailure, openResultTabCorrection, overlayBlockerDiagnostic, parseError, parseFailureOutput, parseSucceeded, plainTextInspection, presentation, presentationEnvelope, priorSessionTabTarget: prepared.priorSessionTabTarget, processResult, qaAttachedTarget, qaPreset, recordingDependencyWarning, redactedArgs: prepared.redactedArgs, redactedCompiledElectron: prepared.redactedCompiledElectron, redactedCompiledJob: prepared.redactedCompiledJob, redactedCompiledNetworkSourceLookup: prepared.redactedCompiledNetworkSourceLookup, redactedCompiledQaPreset: prepared.redactedCompiledQaPreset, redactedCompiledSemanticAction: prepared.redactedCompiledSemanticAction, redactedCompiledSourceLookup: prepared.redactedCompiledSourceLookup, redactedContent, redactedProcessArgs: prepared.redactedProcessArgs, redactedRecoveryHint: prepared.redactedRecoveryHint, 
resultArtifactManifest, richInputRecoveryDiagnostic: finalRecoveryState.richInputRecoveryDiagnostic, scrollNoopDiagnostic, selectorTextVisibilityDiagnostics, sessionMode: prepared.sessionMode, sessionTabCorrection, sourceLookup, succeeded, timeoutPartialProgress, userRequestedJson: prepared.userRequestedJson, visibleRefFallbackDiagnostic: finalRecoveryState.visibleRefFallbackDiagnostic, visibleRefFallbackSessionName: finalRecoveryState.visibleRefFallbackSessionName });
431
437
  const statePatch: BrowserRunStatePatch = { artifactManifest, freshSessionOrdinal, managedSessionActive, managedSessionCwd, managedSessionName };
432
438
  return { result, statePatch };
433
439
  } finally {
@@ -4,6 +4,7 @@ import type { ElectronLaunchStatus } from "../../electron/cleanup.js";
4
4
  import type { ElectronCdpTarget, ElectronLaunchRecord } from "../../electron/launch.js";
5
5
  import { runAgentBrowserProcess } from "../../process.js";
6
6
  import { buildAgentBrowserNextActions, getAgentBrowserErrorText, parseAgentBrowserEnvelope, type AgentBrowserBatchResult, type AgentBrowserEnvelope, type AgentBrowserNextAction } from "../../results.js";
7
+ import { buildNextToolAction, withOptionalSessionArgs } from "../../results/next-actions.js";
7
8
  import {
8
9
  extractRefSnapshotFromData,
9
10
  isAboutBlankUrl,
@@ -118,8 +119,85 @@ export function buildManagedSessionOutcome(options: {
118
119
  };
119
120
  }
120
121
 
122
/**
 * Reports whether a failed fresh-mode call still left a current managed session.
 *
 * True only when the outcome did not succeed, ran with `sessionMode` "fresh",
 * is active afterwards with a non-empty `currentSessionName`, and has a status
 * of "created", "replaced", or "unchanged".
 */
function isFreshPostLaunchFailure(outcome: ManagedSessionOutcome): boolean {
  if (outcome.succeeded || outcome.sessionMode !== "fresh") return false;
  if (!outcome.activeAfter || !outcome.currentSessionName) return false;
  switch (outcome.status) {
    case "created":
    case "replaced":
    case "unchanged":
      return true;
    default:
      return false;
  }
}
125
+
126
/**
 * Picks the one-line headline for a managed-session outcome.
 *
 * "preserved" and "abandoned" statuses get fixed messages. Any other status
 * gets the post-launch-failure message when `isFreshPostLaunchFailure` holds,
 * and otherwise falls back to the outcome's own `summary`.
 */
function formatManagedSessionOutcomeHeadline(outcome: ManagedSessionOutcome): string {
  switch (outcome.status) {
    case "preserved":
      return "Managed session outcome: Fresh launch failed; your previous browser session is still active.";
    case "abandoned":
      return "Managed session outcome: Fresh launch failed; no managed browser session is current.";
    default:
      return isFreshPostLaunchFailure(outcome)
        ? "Managed session outcome: Fresh launch became current, but this tool call failed after launch."
        : `Managed session outcome: ${outcome.summary}`;
  }
}
138
+
139
/**
 * Builds the multi-line "Recovery:" section for a managed-session outcome.
 *
 * The bullets depend on the outcome: "preserved", "abandoned", a fresh
 * post-launch failure, or anything else. Every variant ends with a pointer to
 * `details.managedSessionOutcome`.
 *
 * @returns The section as newline-joined text, starting with "Recovery:".
 */
function formatManagedSessionOutcomeRecoveryGuidance(outcome: ManagedSessionOutcome): string {
  const pickBullets = (): string[] => {
    if (outcome.status === "preserved") {
      return [
        '- Continue with sessionMode "auto" on the current session, or retry the intended launch with sessionMode "fresh".',
        "- Run doctor to verify agent-browser install and environment when failures persist.",
      ];
    }
    if (outcome.status === "abandoned") {
      return [
        '- Retry with sessionMode "fresh" (for example args: ["open", "<url>"]) after verifying agent-browser is on PATH.',
        "- Run doctor when install or environment issues are suspected.",
      ];
    }
    if (isFreshPostLaunchFailure(outcome)) {
      return [
        '- Continue with sessionMode "auto" on the current session, or inspect failureCategory / qaPreset to fix the post-launch failure.',
        "- Run doctor only if later browser commands also fail.",
      ];
    }
    return ['- Retry with sessionMode "fresh" when launch-scoped flags must apply, or run doctor to verify the environment.'];
  };
  const sections = [
    "Recovery:",
    ...pickBullets(),
    "- Full session names and transition details remain in details.managedSessionOutcome.",
  ];
  return sections.join("\n");
}
156
+
121
157
/**
 * Renders the model-facing text for a failed fresh-session launch.
 *
 * @param outcome Managed session outcome, if one was recorded.
 * @returns Headline plus recovery guidance separated by a newline, or
 *   `undefined` when there is no outcome, it succeeded, or the session mode
 *   was not "fresh".
 */
export function formatManagedSessionOutcomeText(outcome: ManagedSessionOutcome | undefined): string | undefined {
  if (outcome && !outcome.succeeded && outcome.sessionMode === "fresh") {
    const headline = formatManagedSessionOutcomeHeadline(outcome);
    const recovery = formatManagedSessionOutcomeRecoveryGuidance(outcome);
    return `${headline}\n${recovery}`;
  }
  return undefined;
}
161
+
162
/**
 * Suggests follow-up tool calls after a fresh-session launch did not succeed.
 *
 * - Unless the failure happened after the launch became current, a `doctor`
 *   diagnostic is suggested first.
 * - When a usable session remains (preserved, or current after a post-launch
 *   failure) and it is active with a known name, read-only `get url` and
 *   `snapshot -i` checks against that session are suggested.
 * - Otherwise a fresh `open about:blank` retry is suggested.
 *
 * @param outcome Managed session outcome, if one was recorded.
 * @returns Ordered next actions; empty when the outcome is missing, succeeded,
 *   or was not a "fresh" session request.
 */
export function buildManagedSessionFreshFailureNextActions(outcome: ManagedSessionOutcome | undefined): AgentBrowserNextAction[] {
  if (!outcome || outcome.succeeded || outcome.sessionMode !== "fresh") return [];

  const failedAfterLaunch = isFreshPostLaunchFailure(outcome);
  const suggestions: AgentBrowserNextAction[] = [];

  if (!failedAfterLaunch) {
    suggestions.push(
      buildNextToolAction({
        args: ["doctor"],
        id: "run-agent-browser-doctor",
        reason: "Verify agent-browser install, PATH, and environment after a failed fresh launch.",
        safety: "Read-only local diagnostics; does not mutate browser state.",
      }),
    );
  }

  const sessionName = outcome.currentSessionName;
  const sessionMayBeReused = outcome.status === "preserved" || failedAfterLaunch;
  if (!(sessionMayBeReused && outcome.activeAfter && sessionName)) {
    // No usable session remains: the only sensible follow-up is a new fresh launch.
    suggestions.push(
      buildNextToolAction({
        args: ["open", "about:blank"],
        id: "retry-fresh-managed-session",
        reason: "Start a new managed browser session after the failed fresh launch.",
        safety: "Replace about:blank with the intended URL from your workflow.",
        sessionMode: "fresh",
      }),
    );
    return suggestions;
  }

  const sessionLabel = failedAfterLaunch ? "current managed session" : "preserved managed session";
  const verifyUrlAction = buildNextToolAction({
    args: withOptionalSessionArgs(sessionName, ["get", "url"]),
    id: "verify-current-managed-session",
    reason: `Confirm the ${sessionLabel} before continuing with sessionMode auto.`,
    safety: `Read-only URL check on the ${sessionLabel}.`,
  });
  const refreshSnapshotAction = buildNextToolAction({
    args: withOptionalSessionArgs(sessionName, ["snapshot", "-i"]),
    id: "snapshot-current-managed-session",
    reason: `Refresh interactive refs on the ${sessionLabel} before retrying the workflow.`,
    safety: "Read-only snapshot; no navigation.",
  });
  suggestions.push(verifyUrlAction, refreshSnapshotAction);
  return suggestions;
}
124
202
 
125
203
  function getTraceOwner(command: string | undefined): TraceOwner | undefined {
@@ -124,10 +124,21 @@ export interface OverlayBlockerDiagnostic {
124
124
  summary: string;
125
125
  }
126
126
 
127
/**
 * Element targeted by a click-dispatch probe, discriminated by `kind`.
 *
 * Narrow on `kind` before reading variant-specific fields: only the
 * "selector" and "xpath" variants carry `selector`, and only the
 * "accessible" variant carries `name`, `refId`, and `role`.
 */
export type ClickDispatchProbeTarget =
  | {
      kind: "selector";
      selector: string;
    }
  | {
      // Uses the same `selector` field name; presumably holds the XPath expression — confirm at the producer.
      kind: "xpath";
      selector: string;
    }
  | {
      // NOTE(review): presumably populated from snapshot role/name metadata for an @e… ref click — confirm at the producer.
      kind: "accessible";
      name: string;
      refId: string;
      role: string;
    };
131
142
 
132
143
  export interface ClickDispatchProbe {
133
144
  marker: string;
@@ -142,12 +153,20 @@ export interface ClickDispatchDiagnostic {
142
153
  target: ClickDispatchProbeTarget;
143
154
  }
144
155
 
156
/**
 * One entry of `SelectorTextVisibilityDiagnostic.visibleCandidates`.
 *
 * NOTE(review): field meanings below are inferred from their names; confirm
 * against the code that produces the diagnostic.
 */
export interface SelectorTextVisibilityCandidate {
  // presumably the position of this element among the selector's matches — TODO confirm (0- or 1-based)
  index: number;
  // Optional; presumably the element's role when one is available.
  role?: string;
  tagName: string;
  // Optional; presumably a shortened preview of the element's text.
  textPreview?: string;
}
162
+
145
163
/**
 * Diagnostic describing how many elements matched a selector and how many of
 * those were visible, with optional previews of visible matches.
 *
 * NOTE(review): per-field meanings are inferred from the names; confirm
 * against the producer before relying on them.
 */
export interface SelectorTextVisibilityDiagnostic {
  // Optional; presumably whether the first matching element is visible.
  firstMatchVisible?: boolean;
  // Optional; presumably a text preview of the first visible match.
  firstVisibleTextPreview?: string;
  matchCount: number;
  selector: string;
  summary: string;
  // Optional list of visible matches; see SelectorTextVisibilityCandidate.
  visibleCandidates?: SelectorTextVisibilityCandidate[];
  visibleCount: number;
}
153
172
 
@@ -196,6 +215,11 @@ export interface EvalStdinHint {
196
215
  suggestion: string;
197
216
  }
198
217
 
218
/**
 * Non-fatal warning attached to an eval result (surfaced as
 * `FinalResultInput.evalResultWarning`).
 *
 * NOTE(review): per the package changelog this is emitted when a successful
 * `eval --stdin` returns `null` on a `file://` page with non-trivial stdin —
 * confirm against the producer.
 */
export interface EvalResultWarning {
  // Why the result should be treated with caution.
  reason: string;
  // What the caller should do instead or next.
  suggestion: string;
}
222
+
199
223
  export interface ArtifactCleanupGuidance {
200
224
  explicitArtifactPaths: string[];
201
225
  note: string;
@@ -461,6 +485,7 @@ export interface FinalResultInput {
461
485
  electronSessionMismatch?: ElectronSessionMismatch;
462
486
  errorText?: string;
463
487
  evalStdinHint?: EvalStdinHint;
488
+ evalResultWarning?: EvalResultWarning;
464
489
  exactSensitiveValues: string[];
465
490
  executionPlan: AgentBrowserExecutionPlan;
466
491
  fillVerificationDiagnostic?: FillVerificationDiagnostic;
@@ -1,3 +1,4 @@
1
+ import { parseArgvDescriptor } from "../argv-descriptor.js";
1
2
  import { validateToolArgs, redactInvocationArgs, redactSensitiveText } from "../runtime.js";
2
3
  import { buildAgentBrowserResultCategoryDetails } from "../results/categories.js";
3
4
  import {
@@ -150,6 +151,29 @@ function redactCompiledNetworkSourceLookup(compiled: CompiledAgentBrowserNetwork
150
151
  : undefined;
151
152
  }
152
153
 
154
/**
 * Repairs `eval --stdin <script…>` calls where the script was passed as extra
 * argv tokens instead of through stdin.
 *
 * When no stdin was supplied, the parsed command is `eval`, and tokens follow
 * `--stdin`, those trailing tokens are joined with single spaces and returned
 * as stdin, and the args are truncated right after `--stdin`. In every other
 * case the inputs are returned unchanged.
 *
 * @param args Raw tool argv.
 * @param stdin Explicit stdin, if the caller supplied one.
 * @returns The (possibly rewritten) args and stdin.
 */
function normalizeExplicitEvalStdinArgs(args: string[], stdin: string | undefined): { args: string[]; stdin?: string } {
  const unchanged = { args, stdin };
  // An explicit stdin always wins; never overwrite it with argv tokens.
  if (stdin !== undefined) return unchanged;

  const { commandInfo, commandTokens } = parseArgvDescriptor(args);
  if (commandInfo.command !== "eval") return unchanged;

  const flagPosition = commandTokens.indexOf("--stdin");
  const scriptTokens = flagPosition < 0 ? [] : commandTokens.slice(flagPosition + 1);
  if (scriptTokens.length === 0) return unchanged;

  // Everything in args before the command tokens is kept as-is.
  const leadingCount = args.length - commandTokens.length;
  const keptCommandTokens = commandTokens.slice(0, flagPosition + 1);
  return {
    args: args.slice(0, leadingCount).concat(keptCommandTokens),
    stdin: scriptTokens.join(" "),
  };
}
176
+
153
177
  export function resolveAgentBrowserInput(options: {
154
178
  getBatchAnnotateValidationError: (args: string[], stdin: string | undefined) => string | undefined;
155
179
  managedSessionActive: boolean;
@@ -184,8 +208,9 @@ export function resolveAgentBrowserInput(options: {
184
208
  const compiledElectron = electronResult.compiled;
185
209
  const compiledJob = jobResult.compiled ?? compiledQaPreset;
186
210
  const compiledGeneratedBatch = compiledNetworkSourceLookup ?? compiledSourceLookup ?? compiledJob;
187
- const toolArgs = compiledElectron ? [] : compiledSemanticAction?.args ?? compiledGeneratedBatch?.args ?? params.args ?? [];
188
- const toolStdin = compiledGeneratedBatch?.stdin ?? params.stdin;
211
+ const normalizedExplicitArgs = normalizeExplicitEvalStdinArgs(params.args ?? [], params.stdin);
212
+ const toolArgs = compiledElectron ? [] : compiledSemanticAction?.args ?? compiledGeneratedBatch?.args ?? normalizedExplicitArgs.args;
213
+ const toolStdin = compiledGeneratedBatch?.stdin ?? normalizedExplicitArgs.stdin;
189
214
  const redactedArgs = redactInvocationArgs(toolArgs);
190
215
  const generatedStdinError = params.stdin !== undefined
191
216
  ? compiledGeneratedBatch
@@ -51,14 +51,14 @@ export const SHARED_BROWSER_PLAYBOOK_GUIDELINES = [
51
51
  "For Electron desktop apps, prefer top-level electron for wrapper-owned discovery, isolated launch, status, compact probe, and cleanup: list first, treat likely-sensitive annotations as hints rather than enforcement, launch with the default snapshot handoff unless handoff: \"tabs\" is the safer diagnostic starting point, use electron.probe or snapshot -i/qa.attached for current-session state, and always cleanup the returned launchId when done. electron.launch uses an isolated temporary profile; it does not reuse the app's normal signed-in profile or attach to an already-running authenticated app. For signed-in local app state, host-launch the normal app with --remote-debugging-port when appropriate, then use raw args connect <port|url>; after connect, inspect tab list, select the stable tab id such as tab t2, then run a condition wait or snapshot -i before using refs. close commands (`close`, `quit`, or `exit`) only close the browser/CDP session; leave manually launched app shutdown, profile cleanup, and explicit artifacts to the host owner.",
52
52
  "For provider or specialized app workflows, load version-matched upstream guidance with skills get agentcore|electron|slack|dogfood|vercel-sandbox through the native tool; add --full when you need references/templates, and use skills get --all only for broad skill audits. Provider launches such as -p ios, --provider browserbase/kernel/browseruse/browserless/agentcore, and iOS --device are upstream-owned setup paths; use sessionMode fresh when switching providers and expect external credentials or local Appium/Xcode setup to be required.",
53
53
  "For dialogs and frames, use dialog status/accept/dismiss and frame <selector|main> through native args; when --confirm-actions produces a pending confirmation, use details.nextActions or exact confirm <id> / deny <id> calls instead of inventing ids.",
54
- "If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <tab-id-or-label> / snapshot -i to recover state before retrying different URLs or fallback strategies. For desktop readiness, prefer real conditions first: wait --text, wait --url, wait --fn, wait --load <state>, wait --download, or qa.attached; for disappearance checks in agent-browser 0.27.0, use wait --fn predicates instead of stale upstream-help examples like wait <selector> --state hidden. Use electron.probe/status for wrapper-owned launch health or target mismatch. Fixed waits are a last resort, must stay below the wrapper IPC budget (wait 30000 is intentionally blocked), and a successful payload like \"waited\":\"timeout\" means elapsed time only—verify completion with an observed condition, fresh snapshot, or screenshot.",
54
+ "If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <tab-id-or-label> / snapshot -i to recover state before retrying different URLs or fallback strategies. For headed demos, put --headed on the first launch with sessionMode=fresh and verify with screenshot/tab/get-url evidence because tool success cannot prove the OS window is visible to the user. For desktop readiness, prefer real conditions first: wait --text, wait --url, wait --fn, wait --load <state>, wait --download, or qa.attached; for disappearance checks in agent-browser 0.27.0, use wait --fn predicates instead of stale upstream-help examples like wait <selector> --state hidden. Use electron.probe/status for wrapper-owned launch health or target mismatch. Fixed waits are a last resort, must stay below the wrapper IPC budget (wait 30000 is intentionally blocked), and a successful payload like \"waited\":\"timeout\" means elapsed time only—verify completion with an observed condition, fresh snapshot, or screenshot.",
55
55
  "For feed, timeline, or inbox reading tasks, focus on the main timeline/list region and read the first item there rather than unrelated composer or sidebar content.",
56
56
  "For read-only browsing tasks, prefer extracting the answer from the current snapshot, structured ref labels, or eval --stdin on the current page before navigating away. Only click into media viewers, detail routes, or new pages when the current view does not contain the needed information.",
57
57
  "For downloads, prefer download <selector> <path> when an element click should save a file. Do not rely on click alone when you need the downloaded file on disk.",
58
58
  "On dashboards with nested scroll containers, verify scroll with a screenshot or fresh snapshot -i; if the viewport did not move, prefer scrollintoview <@ref> or target the actual scrollable region. For native selects, use select <selector> <value...> (or semanticAction/job select) instead of clicking option refs; for custom comboboxes, a click/semanticAction may only focus the field, so re-snapshot and fall back to type, press Enter/arrow keys, or visible option refs.",
59
59
  "When using eval --stdin, scope checks and actions to the target element or route whenever possible instead of relying on broad page-wide text heuristics.",
60
- "When using eval --stdin for extraction, return the value you want instead of relying on console.log as the primary result channel. Prefer plain expressions like ({ title: document.title }) or explicitly invoked functions like (() => ({ title: document.title }))(); if a function-shaped snippet returns {}, details.evalStdinHint may warn that the function was serialized instead of called. If get text on a CSS selector surfaces details.selectorTextVisibility or selectorTextVisibilityAll, prefer a visible @ref, a more specific selector, or the inspect-visible-text-candidates nextAction over hidden tab content.",
61
- "When details.pageChangeSummary is present, use changeType and summary as a compact signal for navigation, DOM mutation, confirmations, or artifacts; when nextActionIds is set, match those ids to entries in details.nextActions (or per-step nextActions inside batch) for concrete follow-up payloads instead of inferring from prose alone. If a no-navigation click surfaces details.overlayBlockers, inspect the fresh snapshot evidence before using a close/dismiss candidate nextAction; ordinary page chrome without dialog/alertdialog evidence should not trigger this diagnostic.",
60
+ "When using eval --stdin for extraction, pass the JavaScript through the native tool stdin field, not as an extra args token after --stdin, and return the value you want instead of relying on console.log as the primary result channel. Prefer plain expressions like ({ title: document.title }) or explicitly invoked functions like (() => ({ title: document.title }))(); if a function-shaped snippet returns {}, details.evalStdinHint may warn that the function was serialized instead of called. On file:// pages, when upstream JSON returns result: null for non-trivial stdin, details.evalResultWarning may append Eval result warning without failing the tool—treat that as inconclusive DOM verification. If get text on a CSS selector surfaces details.selectorTextVisibility or selectorTextVisibilityAll, prefer a visible @ref, a more specific selector, or the inspect-visible-text-candidates nextAction over hidden tab content.",
61
+ "When details.pageChangeSummary is present, use changeType and summary as a compact signal for navigation, DOM mutation, confirmations, or artifacts; when nextActionIds is set, match those ids to entries in details.nextActions (or per-step nextActions inside batch) for concrete follow-up payloads instead of inferring from prose alone. If details.clickDispatch reports no trusted DOM event, refresh/inspect/retry the real click first; for static local fixtures only, an explicit eval --stdin programmatic .click() can exercise app handlers, but treat it as an untrusted scripted workaround and never use it to bypass stop-before-submit/order/purchase boundaries. If a no-navigation click surfaces details.overlayBlockers, inspect the fresh snapshot evidence before using a close/dismiss candidate nextAction; ordinary page chrome without dialog/alertdialog evidence should not trigger this diagnostic.",
62
62
  "When commands save or spill files (screenshots, downloads, PDFs, traces, recordings, HAR, large snapshot spills), use the user's exact requested paths when given and treat paths as provisional until details.artifactVerification shows every row verified: branch on missingCount, pendingCount, unverifiedCount, per-entry state, and optional limitation before downstream file use or PASS/FAIL reporting.",
63
63
  "For evidence-only screenshots, QA captures, or other audit artifacts, save to an explicit path and branch on details.artifactVerification plus details.artifacts before reporting PASS/FAIL; do not require vision review of inline image attachments unless the user asked for visual inspection.",
64
64
  "Respect explicit user stop boundaries: if the user says to stop before order/post/purchase/submit, do not click that final action. If the wrapper returns details.promptGuard.reason=explicit-user-stop-boundary, gather evidence on the current page instead of retrying the blocked final action.",
@@ -1,3 +1,4 @@
1
+ import { isOpenNavigationCommand } from "../../command-taxonomy.js";
1
2
  import type { CommandInfo } from "../../runtime.js";
2
3
  import { redactModelFacingText } from "./common.js";
3
4
  import { buildAgentBrowserNextActions } from "../action-recommendations.js";
@@ -81,6 +82,26 @@ function buildUnknownCommandSuggestionActions(suggestions: CommandSuggestion[],
81
82
  return actions.length > 0 ? actions : undefined;
82
83
  }
83
84
 
85
/**
 * Reports whether a URL hostname refers to the loopback interface.
 *
 * Accepts `localhost` and any `*.localhost` name (with or without a trailing
 * root dot), the IPv6 loopback `::1` in bare or bracketed form, and every
 * dotted-quad address in 127.0.0.0/8.
 */
function isLoopbackNavigationHostname(hostname: string): boolean {
  // Drop a single trailing root dot so "localhost." is treated like "localhost".
  const host = hostname.toLowerCase().replace(/\.$/, "");
  if (host === "localhost" || host.endsWith(".localhost")) return true;
  // URL.hostname serializes IPv6 literals with brackets; the bare form is accepted defensively.
  if (host === "[::1]" || host === "::1") return true;
  const dottedQuad = /^127\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
  return dottedQuad !== null && dottedQuad.slice(1).every((octet) => Number(octet) <= 255);
}

/**
 * Produces extra guidance when an open/navigation command to a loopback URL
 * fails with a connection-level network error.
 *
 * @param commandInfo Parsed command; `subcommand` is read as the target URL.
 * @param errorText Error text to scan for a `net::ERR_*` code.
 * @returns The hint text, or `undefined` when the command is not an open
 *   navigation, the error is not one of the recognized network errors, the
 *   target is not a parseable URL, or the host is not a loopback host.
 */
function getLocalhostNavigationHint(commandInfo: CommandInfo, errorText: string): string | undefined {
  if (!commandInfo.command || !isOpenNavigationCommand(commandInfo.command) || !commandInfo.subcommand) return undefined;
  if (!/\bnet::ERR_(?:EMPTY_RESPONSE|CONNECTION_REFUSED|ADDRESS_UNREACHABLE|TIMED_OUT|CONNECTION_RESET)\b/i.test(errorText)) return undefined;

  let targetUrl: URL;
  try {
    targetUrl = new URL(commandInfo.subcommand);
  } catch {
    // Not an absolute URL (for example a bare path or search phrase): no loopback hint applies.
    return undefined;
  }

  if (!isLoopbackNavigationHostname(targetUrl.hostname)) return undefined;

  return [
    "Agent-browser local fixture hint: the browser process could not read a loopback URL from its own network namespace or browser host.",
    "Verify the server is still running and bound to an address the browser host can reach; if curl works from the shell but browser navigation fails, try the other loopback alias, add a proxy bypass for localhost/127.0.0.1 if a proxy is configured, or use a browser-host-reachable URL.",
    "Use file:// only for static fallback fixtures and clean up any temporary server process outside agent_browser when the check is done.",
  ].join(" ");
}
104
+
84
105
  export function appendSelectorRecoveryHint(errorText: string): string {
85
106
  const hint = getSelectorRecoveryHint(errorText);
86
107
  if (!hint || errorText.includes("Agent-browser hint:")) return errorText;
@@ -98,9 +119,13 @@ export function buildErrorPresentation(options: {
98
119
  const selectorHintedErrorText = appendSelectorRecoveryHint(safeErrorText);
99
120
  const unknownCommandSuggestions = getUnknownCommandSuggestions(commandInfo.command, safeErrorText);
100
121
  const unknownCommandSuggestionText = formatUnknownCommandSuggestionText(unknownCommandSuggestions);
101
- const hintedErrorText = unknownCommandSuggestionText && !selectorHintedErrorText.includes("Agent-browser hint:")
102
- ? `${selectorHintedErrorText}\n\n${unknownCommandSuggestionText}`
103
- : selectorHintedErrorText;
122
+ const localhostNavigationHint = getLocalhostNavigationHint(commandInfo, safeErrorText);
123
+ const hintedErrorParts = [
124
+ selectorHintedErrorText,
125
+ unknownCommandSuggestionText && !selectorHintedErrorText.includes("Agent-browser hint:") ? unknownCommandSuggestionText : undefined,
126
+ localhostNavigationHint,
127
+ ].filter((part): part is string => Boolean(part));
128
+ const hintedErrorText = hintedErrorParts.join("\n\n");
104
129
  const categoryDetails = buildAgentBrowserResultCategoryDetails({
105
130
  args: [commandInfo.command, commandInfo.subcommand].filter((item): item is string => item !== undefined),
106
131
  command: commandInfo.command,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-agent-browser-native",
3
- "version": "0.2.35",
3
+ "version": "0.2.37",
4
4
  "description": "pi extension that exposes agent-browser as a native tool for browser automation",
5
5
  "type": "module",
6
6
  "author": "Mitch Fultz (https://github.com/fitchmultz)",