pi-agent-browser-native 0.2.39 → 0.2.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/README.md +98 -3
- package/docs/ARCHITECTURE.md +27 -8
- package/docs/COMMAND_REFERENCE.md +84 -6
- package/docs/RELEASE.md +3 -2
- package/docs/SUPPORT_MATRIX.md +15 -11
- package/docs/TOOL_CONTRACT.md +78 -8
- package/extensions/agent-browser/index.ts +14 -3
- package/extensions/agent-browser/lib/config-policy.js +679 -0
- package/extensions/agent-browser/lib/config.ts +198 -0
- package/extensions/agent-browser/lib/input-modes/params.ts +1 -1
- package/extensions/agent-browser/lib/launch-scoped-flags.ts +67 -0
- package/extensions/agent-browser/lib/playbook.ts +42 -17
- package/extensions/agent-browser/lib/results/presentation/browser-profile-recovery.ts +67 -0
- package/extensions/agent-browser/lib/results/presentation/errors.ts +4 -0
- package/extensions/agent-browser/lib/runtime.ts +6 -73
- package/extensions/agent-browser/lib/web-search.ts +703 -0
- package/package.json +3 -1
- package/scripts/config.mjs +381 -0
package/docs/TOOL_CONTRACT.md
CHANGED
|
@@ -10,9 +10,10 @@ Related docs:
|
|
|
10
10
|
|
|
11
11
|
## V1 tool
|
|
12
12
|
|
|
13
|
-
V1
|
|
13
|
+
V1 exposes one primary native browser tool and one optional companion search tool:
|
|
14
14
|
|
|
15
15
|
- `agent_browser`
|
|
16
|
+
- `agent_browser_web_search`, registered only when an Exa or Brave Search credential source is configured or resolvable and `webSearch.enabled` is not `false`
|
|
16
17
|
|
|
17
18
|
## Why this tool shape
|
|
18
19
|
|
|
@@ -33,6 +34,73 @@ The native command reference in `docs/COMMAND_REFERENCE.md` is driven by the sam
|
|
|
33
34
|
|
|
34
35
|
Agent-facing efficiency claims are measured with `npm run benchmark:agent-browser` or `npm run verify -- benchmark`. The benchmark is deterministic and does not launch a browser; it tracks representative workflow success, tool calls, model-visible output size, stale-ref failures and recoveries, artifact success, failure-category coverage, and elapsed-time estimates so future abstractions can prove they reduce agent work before replacing raw tool use.
|
|
35
36
|
|
|
37
|
+
## Optional companion web search
|
|
38
|
+
|
|
39
|
+
`agent_browser_web_search` is a separate custom tool, not an `agent_browser` input mode. It is registered only when the extension can see at least one configured/resolvable Exa or Brave credential source from `~/.pi/config/pi-agent-browser-native/config.json`, `.pi/config/pi-agent-browser-native/config.json`, `PI_AGENT_BROWSER_CONFIG`, or the `EXA_API_KEY` / `BRAVE_API_KEY` environment fallbacks, and only when the final merged config does not set `webSearch.enabled` to `false`. Config layers merge global → project → `PI_AGENT_BROWSER_CONFIG` override, so disable scope is explicit: a global disable is a normal user default, a project disable applies to one repo, and an override file with `webSearch.enabled: false` is the highest-priority hard disable for that run. Command credential sources such as `"!op read 'op://Private/Exa/API Key'"` are allowed only from trusted global or explicit-override config; they make the tool available without running the command at startup, and the key is resolved when the tool executes. Project-local config may use only matching provider env refs (`$EXA_API_KEY` / `${EXA_API_KEY}` for Exa and `$BRAVE_API_KEY` / `${BRAVE_API_KEY}` for Brave); custom env aliases, interpolation literals, and malformed `$` values are rejected. Browser profile/executable config is read from the same config paths, but only trusted global or explicit-override values are emitted as host launch prompt guidance; project-local browser config is not trusted to steer local profiles or executable paths.
|
|
40
|
+
|
|
41
|
+
Use it when live/current external web information would help answer a task, find current docs/news, or discover candidate URLs. Use `agent_browser` when the task needs browser interaction, screenshots, authenticated/profile content, page inspection, or DOM work. The search tool is namespaced to avoid colliding with generic `web_search`, chooses Exa or Brave automatically from available credentials, defaults to Exa when both are available (unless `webSearch.preferredProvider` is set), and must not expose resolved API keys in content, details, errors, status output, docs examples, logs, or PR artifacts.
|
|
42
|
+
|
|
43
|
+
Config shape:
|
|
44
|
+
|
|
45
|
+
```json
|
|
46
|
+
{
|
|
47
|
+
"webSearch": {
|
|
48
|
+
"enabled": true,
|
|
49
|
+
"preferredProvider": "exa",
|
|
50
|
+
"exaApiKey": "$EXA_API_KEY",
|
|
51
|
+
"braveApiKey": "$BRAVE_API_KEY"
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Schema:
|
|
57
|
+
|
|
58
|
+
```json
|
|
59
|
+
{
|
|
60
|
+
"query": "search text",
|
|
61
|
+
"provider": "auto",
|
|
62
|
+
"searchType": "auto",
|
|
63
|
+
"count": 5,
|
|
64
|
+
"offset": 0,
|
|
65
|
+
"country": "US",
|
|
66
|
+
"searchLang": "en-US",
|
|
67
|
+
"safesearch": "moderate",
|
|
68
|
+
"freshness": "pw"
|
|
69
|
+
}
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Provider notes:
|
|
73
|
+
- `provider` is optional; `auto` picks among the providers that have an available key, honoring `webSearch.preferredProvider` and otherwise defaulting to Exa when both are available.
|
|
74
|
+
- `searchType` applies to Exa only and supports `auto`, `fast`, `instant`, `deep-lite`, `deep`, and `deep-reasoning`. The default is `auto`; deep modes are slower and should be used only for harder research.
|
|
75
|
+
- Exa requests use `/search` with `contents.highlights: true` for compact excerpts. The wrapper intentionally does not expose Exa structured-output schemas yet, to keep the tool small.
|
|
76
|
+
- Brave-specific `searchLang` is ignored by Exa; Exa maps `country` to `userLocation`, `safesearch` moderate/strict to `moderation: true`, and `freshness` to `startPublishedDate`.
|
|
77
|
+
|
|
78
|
+
Result details:
|
|
79
|
+
|
|
80
|
+
```json
|
|
81
|
+
{
|
|
82
|
+
"provider": "exa",
|
|
83
|
+
"query": "search text",
|
|
84
|
+
"returnedQuery": "search text",
|
|
85
|
+
"count": 5,
|
|
86
|
+
"offset": 0,
|
|
87
|
+
"searchType": "auto",
|
|
88
|
+
"requestId": "request-id-when-provider-returns-one",
|
|
89
|
+
"fetchedAt": "2026-06-02T00:00:00.000Z",
|
|
90
|
+
"results": [
|
|
91
|
+
{
|
|
92
|
+
"title": "Result title",
|
|
93
|
+
"url": "https://example.com/",
|
|
94
|
+
"description": "Compact summary or first highlight",
|
|
95
|
+
"highlights": ["Relevant excerpt"],
|
|
96
|
+
"source": "Example",
|
|
97
|
+
"age": "2026-06-02",
|
|
98
|
+
"language": "en"
|
|
99
|
+
}
|
|
100
|
+
]
|
|
101
|
+
}
|
|
102
|
+
```
|
|
103
|
+
|
|
36
104
|
## Input mode chooser
|
|
37
105
|
|
|
38
106
|
Use exactly one top-level input per call:
|
|
@@ -63,6 +131,8 @@ The extension always plans normal browser commands with `--json` prepended in `e
|
|
|
63
131
|
## Headed and local fixture limits
|
|
64
132
|
|
|
65
133
|
- `--headed` is an upstream global flag passed through `args` (for example `{ "args": ["--headed", "open", "https://example.com"], "sessionMode": "fresh" }`). Use it on the first launch for demos or human-observed QA. If a managed browser session already exists, use `sessionMode: "fresh"` so the launch-scoped headed/headless choice is not ignored.
|
|
134
|
+
- `--profile <name|path>` selects an upstream Chrome profile. `profiles` lists Chrome profile directory names from Chrome's user data directory; `Default` is common but not guaranteed. On profile/user-data-dir failures, use `details.nextActions` or run `profiles` / `doctor`, then tell the user which profile name/path to configure before retrying.
|
|
135
|
+
- `--executable-path <path>` selects a custom Chromium-compatible browser executable when upstream can launch it. Use it with `sessionMode: "fresh"` when switching from an already-active implicit session. For non-Chrome Chromium login state, use a full profile/user-data directory path only when upstream accepts it, or attach to a debug-enabled running browser with `--auto-connect` / `connect` when appropriate.
|
|
66
136
|
- A successful headed call proves only that upstream accepted and ran the browser command. The wrapper currently has no portable contract field that proves the OS window is visible on the user's desktop. When visibility matters, collect independent evidence such as `screenshot`, `tab list`, `get url`, or `snapshot -i`, and treat “user cannot see the browser” as a display/provider/session setup issue until proven otherwise.
|
|
67
137
|
- `localhost` / `127.0.0.1` URLs are resolved by the browser host, which may differ from the shell or Pi process that started a temporary server. Errors such as `net::ERR_EMPTY_RESPONSE` on local ports are not reliable page-render evidence; they can mean the browser cannot reach the host loopback. Use an environment-specific host-reachable address when available, or fall back to `file://` only for static fixtures.
|
|
68
138
|
- `file://` pages do not provide HTTP headers and can differ from HTTP pages for MIME handling, CORS, storage, and debugger/script behavior. If `eval --stdin` returns `null` or otherwise fails to prove DOM state on a `file://` page, first confirm the script was passed through the native tool `stdin` field (not as a third `args` item after `--stdin`), then treat that verification as inconclusive and use `snapshot -i`, `get text` from current refs, screenshots, or a reachable HTTP fixture instead.
|
|
@@ -76,10 +146,10 @@ The extension always plans normal browser commands with `--json` prepended in `e
|
|
|
76
146
|
- When a visible text or accessible-name target should survive ref churn, prefer find locators such as role, text, label, placeholder, alt, title, or testid with the intended action instead of guessing a CSS selector.
|
|
77
147
|
- For desktop or host-controlled rich inputs, if semanticAction fill misses, refresh refs and prefer a current editable @ref from details.richInputRecovery or the latest snapshot; focus or click that ref, then use keyboard inserttext or keyboard type with the intended text. Do not auto-submit with Enter or a submit button unless the user flow explicitly calls for it.
|
|
78
148
|
- Do not assume Playwright selector dialects such as text=Close or button:has-text('Close') are supported wrapper syntax unless current upstream agent-browser behavior has been verified.
|
|
79
|
-
- For authenticated or user-specific content explicitly requested by the user, such as feeds, inboxes, account pages, or private dashboards,
|
|
149
|
+
- For authenticated or user-specific content explicitly requested by the user, such as feeds, inboxes, account pages, or private dashboards, use a real profile only when the user/config asks for it or profiles have been inspected; do not assume --profile Default exists on every machine. Do not use a real profile for public pages just because they are dashboards. Treat visible page content from real profiles as model-visible transcript data; use --auto-connect only if profile-based reuse is unavailable or the task is specifically about attaching to a running debug-enabled browser. If profile/user-data-dir resolution fails, stop retrying opens, run profiles and/or doctor through agent_browser, then report what the user needs to configure.
|
|
80
150
|
- Do not invent fixed explicit session names for routine tasks. Use the implicit session unless you truly need multiple isolated browser sessions in the same conversation.
|
|
81
|
-
- When using
|
|
82
|
-
- If you already used the implicit session and now need launch-scoped flags
|
|
151
|
+
- When using launch-scoped flags (--auto-connect, --cdp, --enable, --executable-path, --init-script, --device, --profile, --provider, -p, --session-name, --state), put them on the first command for that session. If you intentionally use an explicit --session, keep using that same explicit session for follow-ups.
|
|
152
|
+
- If you already used the implicit session and now need launch-scoped flags (--auto-connect, --cdp, --enable, --executable-path, --init-script, --device, --profile, --provider, -p, --session-name, --state), retry with top-level sessionMode set to fresh or pass an explicit --session for the new launch; never pass --session-mode inside args. After a successful unnamed fresh launch, later auto calls follow that new session.
|
|
83
153
|
- For React introspection, launch the page with --enable react-devtools before first navigation, then use react tree, react inspect <fiberId>, sourceLookup candidates for local UI source hints, react renders start/stop, or react suspense; sourceLookup is experimental and reports confidence/evidence instead of guaranteed DOM-to-file mappings. For failed fetches and APIs, networkSourceLookup (experimental) correlates failed network requests with initiator metadata and bounded workspace URL literals—candidates only, not definitive blame. Use vitals [url] for Core Web Vitals and hydration timing, and pushstate <url> for client-side SPA navigation.
|
|
84
154
|
- For first-navigation setup, use open without a URL plus network route --resource-type <csv>, cookies set --curl <file>, or --init-script/--enable before navigate/opening the target page.
|
|
85
155
|
- For stateful browser context work, prefer purpose-specific page actions before dumping browser data: use auth save --password-stdin with the tool stdin field for credentials, auth list/show/delete/remove for local auth-profile maintenance, auth login when you need the browser to fill a saved profile, state save/load for portable test state, state list/show/rename/clear/clear -a/clean for saved-state lifecycle cleanup, cookies get/set/clear and storage local|session only when the task needs those values, and expect cookie/storage/auth/state summaries to redact credential-like fields.
|
|
@@ -496,12 +566,12 @@ For `eval --stdin`, put the script in the top-level `stdin` field. The wrapper n
|
|
|
496
566
|
Behavior:
|
|
497
567
|
- if `args` already include `--session` (including argv compiled from optional `semanticAction.session`), upstream session choice wins
|
|
498
568
|
- `"auto"` prepends the current extension-managed active session when appropriate
|
|
499
|
-
- `"fresh"` rotates that managed session to a fresh upstream launch so startup-scoped flags like `--profile`, `--session-name`, `--cdp`, `--state`, `--auto-connect`, `--init-script`, `--enable`, `-p` / `--provider`, or iOS `--device` apply and later default calls follow the new browser
|
|
569
|
+
- `"fresh"` rotates that managed session to a fresh upstream launch so startup-scoped flags like `--profile`, `--executable-path`, `--session-name`, `--cdp`, `--state`, `--auto-connect`, `--init-script`, `--enable`, `-p` / `--provider`, or iOS `--device` apply and later default calls follow the new browser
|
|
500
570
|
- sessionless paths skip that injection even under `"auto"`: plain-text `--help` / `-h` / `--version` / `-V` (see the generated inspection playbook fragment below), read-only `skills list`, `skills get …`, and `skills path …`, local auth profile management (`auth save/list/show/delete/remove`), local/setup commands (`profiles`, `dashboard start/stop`, `device list`, `doctor`, `install`, `upgrade`, `session list`), and targeted/all local saved-state maintenance (`state list/show`, `state clear --all`, `state clear -a`, `state clear <session-name>`, `state clean --older-than <days>`, `state rename`) keep `effectiveArgs` free of the implicit managed `--session` unless the caller supplied `--session` explicitly; successful results therefore omit `usedImplicitSession` and the extension-managed `sessionName` for those calls, while root `session`, untargeted `state clear`, bare `state clean`, browser-backed `auth login`, and `state save/load` keep normal managed-session injection (`extensions/agent-browser/lib/command-policy.ts`, `needsManagedSession`; `extensions/agent-browser/lib/runtime.ts`, `buildExecutionPlan`)
|
|
501
571
|
|
|
502
572
|
Recommended use:
|
|
503
573
|
- use `"auto"` for the common browse/snapshot/click flow inside one `pi` session
|
|
504
|
-
- use `"fresh"` when switching from an already-active implicit session to a new profile/debug/auth/provider launch without inventing a fixed explicit session name
|
|
574
|
+
- use `"fresh"` when switching from an already-active implicit session to a new profile/browser executable/debug/auth/provider launch without inventing a fixed explicit session name
|
|
505
575
|
- when a fresh launch fails or times out before becoming current, check `details.managedSessionOutcome`: it states whether the prior managed session was preserved or whether the attempted fresh session was abandoned because no prior managed session existed; when `sessionMode` is `"fresh"` and the tool ultimately fails, the model-visible result also appends `Managed session outcome: …` (see `#details` below). Failures under `sessionMode: "auto"` still expose the struct on `details` when the extension injects a managed `--session`, but they do not add that extra prose line.
|
|
506
576
|
|
|
507
577
|
## Wrapper behavior
|
|
@@ -595,7 +665,7 @@ Ref preflight details (command taxonomy in `extensions/agent-browser/lib/command
|
|
|
595
665
|
|
|
596
666
|
**Presentation redaction (implementation map):** Successful non-`batch` tool calls and each successful `batchSteps[]` row run upstream `data` through `redactPresentationData` in `extensions/agent-browser/lib/results/presentation/diagnostics.ts`: `cookies` and `storage` walk objects/arrays and replace case-insensitive `value` keys with `"[REDACTED]"` (diagnostic formatters still describe rows without expanding secrets); every other command’s payload is recursively scrubbed with `redactStructuredPresentationValue`, which redacts known sensitive key names and applies string-level sensitivity heuristics so network, diff, trace/profiler, stream, dashboard, chat, and other structured results do not echo bearer tokens, proxy credentials, or similar fields verbatim into `details.data`. Echoed `command` arrays in `details` and in batch roll-ups use `redactInvocationArgs` from `extensions/agent-browser/lib/runtime.ts` to mask trailing values for sensitive global flags (including `--body`, `--headers`, `--password`, and `--proxy`), preserve the special positional rules for `cookies set`, `storage local|session set`, and `set credentials`, and scrub other argv tokens for URLs and inline secrets. Failed batch steps additionally run `redactExactValues` on structured step errors so literals taken from that step’s argv (cookie value, storage set value, `--password` / `--password=` tokens) cannot reappear inside formatted error blobs.
|
|
597
667
|
|
|
598
|
-
`nextActions` is an optional machine-readable list of exact native `agent_browser` follow-ups. Each entry includes `tool: "agent_browser"`, an `id`, a short `reason`, optional `safety`, and either `params` (`args`, optional `stdin`, optional `sessionMode`, optional `networkSourceLookup`, optional `electron`) or an `artifactPath` for saved-file workflows. Agents should prefer these payloads over prose when present. Tab/session recovery id strings are centralized in `AGENT_BROWSER_RECOVERY_NEXT_ACTION_IDS`, while rich-input focus/click recovery ids are centralized in `AGENT_BROWSER_RICH_INPUT_RECOVERY_NEXT_ACTION_IDS` plus `getAgentBrowserRichInputRecoveryNextActionId(s)` in `extensions/agent-browser/lib/results/recovery-actions.ts` (both registries are also re-exported from `shared.ts`); docs and tests mirror those registries/helpers rather than inventing recovery ids in prose. Current recommendations include: raw `connect` success → session-scoped `list-connected-session-tabs` only, then the agent should inspect/select a stable `tab t<N>` target and run `snapshot -i` explicitly; `snapshot` failures whose upstream error says `No active page` and whose wrapper result has a known session → `list-tabs-after-no-active-page` only, because this path has no wrapper-observed safe tab id to select atomically; Electron launches → wrapper-tracked `electron.status` / `electron.probe` / `electron.cleanup` actions plus session-scoped tab/snapshot inspection when attached; Electron status/probe mismatch diagnostics → `reattach-electron-launch` plus fresh tab/snapshot inspection; Electron post-command health failures → status/probe/cleanup for the same `launchId`; Electron fill verification mismatches → `inspect-after-fill-verification` and `verify-filled-value`; Electron same-URL ref freshness warnings → `refresh-electron-refs-after-rerender`; packaged-Electron `sourceLookup` no-candidate diagnostics → session snapshot, launch probe, and tab list; Electron cleanup partial failures 
→ status plus retry-cleanup for the same wrapper-owned `launchId`; `open` success → `snapshot -i`; mutating/navigation commands (see `buildAgentBrowserNextActions` in source for the exact command set) → `snapshot -i`; stale refs and selector failures → `snapshot -i` via `refresh-interactive-refs` (prefixed with `--session <name>` when the failed call ran in a named or managed session); selector misses with exact current snapshot role/name matches → direct ref retries via `try-current-visible-ref` or bounded `try-current-visible-ref-N` for non-fill targets; semantic `fill` selector misses with exact current editable refs → `focus-current-editable-ref` / `click-current-editable-ref` or numbered variants that do not include fill text or submit; unknown getter shortcuts such as `title` / `url` → exact read-only retries like `get title` / `get url` with ids `use-get-title` / `use-get-url`; compact `network requests` results with safe request IDs → bounded read-only request detail, `networkSourceLookup`, path filter, or HAR-capture follow-ups; semantic `selector-not-found` failures that compiled from `semanticAction` may append `try-button-name-candidate` or `try-link-name-candidate` after presentation `nextActions` only for the bounded click pair enumerated under `semanticAction`; semantic `stale-ref` failures that compiled from `semanticAction` `find` argv may also include `retry-semantic-action-after-stale-ref` after that snapshot step; qualifying same-URL non-Electron top-level clicks (see `overlayBlockers` below) with fresh snapshot evidence of likely overlay/banner/dialog close controls may append `inspect-overlay-state` and bounded `try-overlay-blocker-candidate-*` entries; successful top-level `scroll` calls whose pre/post viewport and sampled scroll-container positions do not change may append `inspect-after-noop-scroll` and `verify-noop-scroll-visually`; explicit combobox-targeted actions that focus a combobox without visible options may append 
`inspect-focused-combobox`, `try-open-combobox-with-arrow`, and `try-open-combobox-with-enter`; `get text <selector>` calls with hidden/multiple CSS matches may append `inspect-visible-text-candidates` with a read-only `eval --stdin` probe (each prefixed with `--session <name>` when `details.sessionName` is set, same `sessionPrefixArgs` rule as other session-scoped follow-ups); confirmations → exact `confirm <id>` and `deny <id>` choices; generic tab drift → `list-tabs-for-recovery` with `tab list` first, then select or confirm the stable target before running `snapshot -i`; about:blank or tab-drift recovery with a wrapper-known target → `list-tabs-for-about-blank-recovery` or `list-tabs-for-tab-drift-recovery`, plus `select-intended-tab-after-drift` and `snapshot-after-tab-recovery` when the wrapper already observed the stable `t<N>` tab id; `wait --text` assertion failures → `inspect-after-text-assertion-failure` with a read-only snapshot; download verification failures or missing successful download artifacts → `wait --download [path]`; saved artifacts → the artifact path to inspect/consume after checking `artifactVerification`/metadata; missing non-download artifacts → `verify-artifact-path` so agents do not trust an absent file. When nothing applies, the field is omitted.
|
|
668
|
+
`nextActions` is an optional machine-readable list of exact native `agent_browser` follow-ups. Each entry includes `tool: "agent_browser"`, an `id`, a short `reason`, optional `safety`, and either `params` (`args`, optional `stdin`, optional `sessionMode`, optional `networkSourceLookup`, optional `electron`) or an `artifactPath` for saved-file workflows. Agents should prefer these payloads over prose when present. Tab/session recovery id strings are centralized in `AGENT_BROWSER_RECOVERY_NEXT_ACTION_IDS`, while rich-input focus/click recovery ids are centralized in `AGENT_BROWSER_RICH_INPUT_RECOVERY_NEXT_ACTION_IDS` plus `getAgentBrowserRichInputRecoveryNextActionId(s)` in `extensions/agent-browser/lib/results/recovery-actions.ts` (both registries are also re-exported from `shared.ts`); docs and tests mirror those registries/helpers rather than inventing recovery ids in prose. Current recommendations include: raw `connect` success → session-scoped `list-connected-session-tabs` only, then the agent should inspect/select a stable `tab t<N>` target and run `snapshot -i` explicitly; `snapshot` failures whose upstream error says `No active page` and whose wrapper result has a known session → `list-tabs-after-no-active-page` only, because this path has no wrapper-observed safe tab id to select atomically; browser profile/user-data-dir resolution failures → `inspect-browser-profiles` (`profiles`) and `run-agent-browser-doctor` (`doctor`) before retrying opens; Electron launches → wrapper-tracked `electron.status` / `electron.probe` / `electron.cleanup` actions plus session-scoped tab/snapshot inspection when attached; Electron status/probe mismatch diagnostics → `reattach-electron-launch` plus fresh tab/snapshot inspection; Electron post-command health failures → status/probe/cleanup for the same `launchId`; Electron fill verification mismatches → `inspect-after-fill-verification` and `verify-filled-value`; Electron same-URL ref freshness warnings → 
`refresh-electron-refs-after-rerender`; packaged-Electron `sourceLookup` no-candidate diagnostics → session snapshot, launch probe, and tab list; Electron cleanup partial failures → status plus retry-cleanup for the same wrapper-owned `launchId`; `open` success → `snapshot -i`; mutating/navigation commands (see `buildAgentBrowserNextActions` in source for the exact command set) → `snapshot -i`; stale refs and selector failures → `snapshot -i` via `refresh-interactive-refs` (prefixed with `--session <name>` when the failed call ran in a named or managed session); selector misses with exact current snapshot role/name matches → direct ref retries via `try-current-visible-ref` or bounded `try-current-visible-ref-N` for non-fill targets; semantic `fill` selector misses with exact current editable refs → `focus-current-editable-ref` / `click-current-editable-ref` or numbered variants that do not include fill text or submit; unknown getter shortcuts such as `title` / `url` → exact read-only retries like `get title` / `get url` with ids `use-get-title` / `use-get-url`; compact `network requests` results with safe request IDs → bounded read-only request detail, `networkSourceLookup`, path filter, or HAR-capture follow-ups; semantic `selector-not-found` failures that compiled from `semanticAction` may append `try-button-name-candidate` or `try-link-name-candidate` after presentation `nextActions` only for the bounded click pair enumerated under `semanticAction`; semantic `stale-ref` failures that compiled from `semanticAction` `find` argv may also include `retry-semantic-action-after-stale-ref` after that snapshot step; qualifying same-URL non-Electron top-level clicks (see `overlayBlockers` below) with fresh snapshot evidence of likely overlay/banner/dialog close controls may append `inspect-overlay-state` and bounded `try-overlay-blocker-candidate-*` entries; successful top-level `scroll` calls whose pre/post viewport and sampled scroll-container positions do not change 
may append `inspect-after-noop-scroll` and `verify-noop-scroll-visually`; explicit combobox-targeted actions that focus a combobox without visible options may append `inspect-focused-combobox`, `try-open-combobox-with-arrow`, and `try-open-combobox-with-enter`; `get text <selector>` calls with hidden/multiple CSS matches may append `inspect-visible-text-candidates` with a read-only `eval --stdin` probe (each prefixed with `--session <name>` when `details.sessionName` is set, same `sessionPrefixArgs` rule as other session-scoped follow-ups); confirmations → exact `confirm <id>` and `deny <id>` choices; generic tab drift → `list-tabs-for-recovery` with `tab list` first, then select or confirm the stable target before running `snapshot -i`; about:blank or tab-drift recovery with a wrapper-known target → `list-tabs-for-about-blank-recovery` or `list-tabs-for-tab-drift-recovery`, plus `select-intended-tab-after-drift` and `snapshot-after-tab-recovery` when the wrapper already observed the stable `t<N>` tab id; `wait --text` assertion failures → `inspect-after-text-assertion-failure` with a read-only snapshot; download verification failures or missing successful download artifacts → `wait --download [path]`; saved artifacts → the artifact path to inspect/consume after checking `artifactVerification`/metadata; missing non-download artifacts → `verify-artifact-path` so agents do not trust an absent file. When nothing applies, the field is omitted.
|
|
599
669
|
|
|
600
670
|
**Unknown-command getter hints (failure presentation):** `buildErrorPresentation` in `extensions/agent-browser/lib/results/presentation/errors.ts` only runs this path when upstream error text (after model-facing redaction) matches `unknown command`, `unknown subcommand`, or `unrecognized command` (case-insensitive) **and** the failed invocation’s primary command token is one of `attr`, `count`, `html`, `text`, `title`, `url`, or `value`. Visible text then includes a grouped-`get` hint line plus per-token guidance (`get text <selector>`, `get html …`, `get attr …`, `get count …`, `get value …`, `get title`, `get url`). Machine `nextActions` with ids `use-get-title` / `use-get-url` are emitted only for `title` / `url`, with `params.args` optionally prefixed by `--session <name>` when the failed call targeted a named session. If the error string already contains `Agent-browser hint:` from selector recovery (stale-ref or unsupported selector dialect appendages), the getter block is skipped so two stacked `Agent-browser hint:` headers are not emitted.
|
|
601
671
|
|
|
@@ -769,7 +839,7 @@ If `agent-browser` is not on `PATH`, fail with a message that:
|
|
|
769
839
|
- keep wrapper-spawned commands below the upstream CLI IPC read-timeout budget by clamping `AGENT_BROWSER_DEFAULT_TIMEOUT` to 25 seconds and stopping a stuck child process before the upstream 30-second retry path begins (`PI_AGENT_BROWSER_PROCESS_TIMEOUT_MS` configures the watchdog); timed-out compiled `job` / `qa` or caller `batch` calls may add `details.timeoutPartialProgress` and visible `Timeout partial progress` evidence with planned steps, current page title/URL, and declared artifact path checks
|
|
770
840
|
- interactive or long-running upstream families such as `chat` without a prompt, `dashboard start`, `stream enable`, `trace start`, `profiler start`, `record start`, `inspect`, `install`, `upgrade`, `doctor --fix`, and `confirm-interactive` are passed through thinly but remain bounded by the same wrapper timeout/session planning rules; prefer explicit arguments, single-shot `chat <message>`, non-interactive flags like `doctor --offline --quick` or `doctor --json`, and cleanup pairs such as `dashboard stop`, `stream disable`, `trace stop`, `profiler stop`, and `record stop`
|
|
771
841
|
- treat successful plain-text inspection commands like `--help` and `--version` as stateless: do not inject the implicit managed session and do not let those calls claim the managed-session slot
|
|
772
|
-
- if startup-scoped flags like `--profile`, `--session-name`, `--cdp`, `--state`, `--auto-connect`, `--init-script`, `--enable`, `-p` / `--provider`, or iOS `--device` are supplied after the implicit session is already active while `sessionMode` is `"auto"`, return a validation error with a structured recovery hint that recommends `sessionMode: "fresh"`
|
|
842
|
+
- if startup-scoped flags like `--profile`, `--executable-path`, `--session-name`, `--cdp`, `--state`, `--auto-connect`, `--init-script`, `--enable`, `-p` / `--provider`, or iOS `--device` are supplied after the implicit session is already active while `sessionMode` is `"auto"`, return a validation error with a structured recovery hint that recommends `sessionMode: "fresh"`
|
|
773
843
|
- for direct headless local Chrome launches to `chat.com` / `chatgpt.com` / `chat.openai.com`, allow a narrow compatibility fallback that injects a normal Chrome `--user-agent` only when the caller did not explicitly provide one and did not choose `--headed`, `--cdp`, `--auto-connect`, or a provider-backed launch
|
|
774
844
|
|
|
775
845
|
## Non-goals
|
|
@@ -41,7 +41,6 @@ import {
|
|
|
41
41
|
getImplicitSessionCloseTimeoutMs,
|
|
42
42
|
getImplicitSessionIdleTimeoutMs,
|
|
43
43
|
hasLaunchScopedTabCorrectionFlag,
|
|
44
|
-
hasUsableBraveApiKey,
|
|
45
44
|
extractExplicitSessionName,
|
|
46
45
|
redactInvocationArgs,
|
|
47
46
|
restoreManagedSessionStateFromBranch,
|
|
@@ -118,6 +117,8 @@ import {
|
|
|
118
117
|
type VisibleRefFallbackDiagnostic,
|
|
119
118
|
} from "./lib/results/selector-recovery.js";
|
|
120
119
|
import { withOptionalSessionArgs } from "./lib/results/next-actions.js";
|
|
120
|
+
import { canRegisterWebSearchTool, loadAgentBrowserConfigSync } from "./lib/config.js";
|
|
121
|
+
import { createAgentBrowserWebSearchTool } from "./lib/web-search.js";
|
|
121
122
|
|
|
122
123
|
const DEFAULT_SESSION_MODE = "auto" as const;
|
|
123
124
|
const DIRECT_AGENT_BROWSER_BASH_BYPASS_ENV = "PI_AGENT_BROWSER_ALLOW_DIRECT_BASH";
|
|
@@ -947,8 +948,14 @@ function getInstalledDocsPaths(): { readmePath: string; commandReferencePath: st
|
|
|
947
948
|
|
|
948
949
|
export default function agentBrowserExtension(pi: ExtensionAPI) {
|
|
949
950
|
const ephemeralSessionSeed = createEphemeralSessionSeed();
|
|
950
|
-
const
|
|
951
|
-
const
|
|
951
|
+
const agentBrowserConfig = loadAgentBrowserConfigSync({ cwd: process.cwd() });
|
|
952
|
+
const webSearchToolAvailable = canRegisterWebSearchTool(agentBrowserConfig);
|
|
953
|
+
const toolPromptGuidelines = buildToolPromptGuidelines({
|
|
954
|
+
browserDefaultProfile: agentBrowserConfig.trustedBrowserDefaultProfile,
|
|
955
|
+
browserExecutablePath: agentBrowserConfig.trustedBrowserExecutablePath,
|
|
956
|
+
includeWebSearch: webSearchToolAvailable,
|
|
957
|
+
docs: getInstalledDocsPaths(),
|
|
958
|
+
});
|
|
952
959
|
const implicitSessionIdleTimeoutMs = String(getImplicitSessionIdleTimeoutMs());
|
|
953
960
|
const implicitSessionCloseTimeoutMs = getImplicitSessionCloseTimeoutMs();
|
|
954
961
|
let managedSessionActive = false;
|
|
@@ -1267,4 +1274,8 @@ export default function agentBrowserExtension(pi: ExtensionAPI) {
|
|
|
1267
1274
|
: runBrowserCommand();
|
|
1268
1275
|
},
|
|
1269
1276
|
});
|
|
1277
|
+
|
|
1278
|
+
if (webSearchToolAvailable) {
|
|
1279
|
+
pi.registerTool(createAgentBrowserWebSearchTool(agentBrowserConfig));
|
|
1280
|
+
}
|
|
1270
1281
|
}
|