pi-agent-browser-native 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +36 -0
- package/LICENSE +21 -0
- package/README.md +139 -0
- package/docs/ARCHITECTURE.md +158 -0
- package/docs/RELEASE.md +86 -0
- package/docs/REQUIREMENTS.md +101 -0
- package/docs/TOOL_CONTRACT.md +168 -0
- package/extensions/agent-browser/index.ts +297 -0
- package/extensions/agent-browser/lib/process.ts +148 -0
- package/extensions/agent-browser/lib/results.ts +937 -0
- package/extensions/agent-browser/lib/runtime.ts +236 -0
- package/extensions/agent-browser/lib/temp.ts +93 -0
- package/package.json +61 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# Tool contract
|
|
2
|
+
|
|
3
|
+
Related docs:
|
|
4
|
+
- [`../README.md`](../README.md)
|
|
5
|
+
- [`REQUIREMENTS.md`](REQUIREMENTS.md)
|
|
6
|
+
- [`ARCHITECTURE.md`](ARCHITECTURE.md)
|
|
7
|
+
|
|
8
|
+
## V1 tool
|
|
9
|
+
|
|
10
|
+
V1 should expose one primary native tool:
|
|
11
|
+
|
|
12
|
+
- `agent_browser`
|
|
13
|
+
|
|
14
|
+
## Why this tool shape
|
|
15
|
+
|
|
16
|
+
This keeps the integration:
|
|
17
|
+
- thin
|
|
18
|
+
- powerful
|
|
19
|
+
- low-drift
|
|
20
|
+
- low-maintenance
|
|
21
|
+
- close to upstream `agent-browser`
|
|
22
|
+
|
|
23
|
+
It also keeps the main UX where it belongs: the agent invokes the tool directly instead of relying on bash or a large manual command surface.
|
|
24
|
+
|
|
25
|
+
The tool guidance should be written for task discovery first, not wrapper implementation first. That means the description should emphasize browser use cases like web research, reading live docs, clicking, filling, screenshots, extraction, and authenticated/profile-based workflows. Low-level wrapper details like `stdin` and exact CLI args belong in the schema and guidelines, not the lead description.
|
|
26
|
+
|
|
27
|
+
The tool also needs an operating playbook, not just a capability list. The model should not have to rediscover basics each session. Guidance should explicitly encode the normal browser workflow (`open` -> `snapshot -i` -> interact -> re-snapshot), the authenticated-content workflow (prefer `--profile Default` on the first browser call and let the implicit session carry continuity; use `--auto-connect` as a fallback when profile reuse is unavailable), and the preferred recovery path when a session opens on the wrong tab, an action changes origin unexpectedly, or an `open` call returns blocked/blank/unexpected results (`tab list` / `tab <n>` / `snapshot -i` before retrying different URLs or fallback strategies). It should also discourage inventing fixed explicit session names for routine tasks, because those names leak stale browser state across otherwise unrelated `pi` sessions. For read-only browsing tasks, guidance should prefer answering from the current page state first: use the current snapshot, structured ref labels, or `eval --stdin` on the current page before navigating into media viewers, detail routes, or other new pages unless the current view lacks the needed information. When using `eval --stdin`, scope checks and actions to the target element or route whenever possible instead of relying on broad page-wide text heuristics. When using `eval --stdin` for extraction, return the intended value instead of relying on `console.log` as the primary result channel.
|
|
28
|
+
|
|
29
|
+
## Parameters
|
|
30
|
+
|
|
31
|
+
```json
|
|
32
|
+
{
|
|
33
|
+
"args": ["open", "https://example.com"],
|
|
34
|
+
"stdin": "optional raw stdin content",
|
|
35
|
+
"useActiveSession": true
|
|
36
|
+
}
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### `args`
|
|
40
|
+
|
|
41
|
+
- type: `string[]`
|
|
42
|
+
- required
|
|
43
|
+
- exact CLI args passed after `agent-browser`
|
|
44
|
+
- no shell operators
|
|
45
|
+
- do not include the binary name
|
|
46
|
+
|
|
47
|
+
Examples:
|
|
48
|
+
|
|
49
|
+
```json
|
|
50
|
+
{ "args": ["open", "https://example.com"] }
|
|
51
|
+
{ "args": ["snapshot", "-i"] }
|
|
52
|
+
{ "args": ["click", "@e2"] }
|
|
53
|
+
{ "args": ["tab", "list"] }
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### `stdin`
|
|
57
|
+
|
|
58
|
+
- type: `string`
|
|
59
|
+
- optional
|
|
60
|
+
- raw stdin for commands like `eval --stdin` and `batch`
|
|
61
|
+
|
|
62
|
+
Examples:
|
|
63
|
+
|
|
64
|
+
```json
|
|
65
|
+
{ "args": ["eval", "--stdin"], "stdin": "document.title" }
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
```json
|
|
69
|
+
{ "args": ["batch"], "stdin": "[[\"open\",\"https://example.com\"],[\"snapshot\",\"-i\"]]" }
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### `useActiveSession`
|
|
73
|
+
|
|
74
|
+
- type: `boolean`
|
|
75
|
+
- optional
|
|
76
|
+
- default: `true`
|
|
77
|
+
|
|
78
|
+
Behavior:
|
|
79
|
+
- if `args` already include `--session`, upstream session choice wins
|
|
80
|
+
- otherwise the extension prepends its implicit active session when `useActiveSession` is `true`
|
|
81
|
+
|
|
82
|
+
## Wrapper behavior
|
|
83
|
+
|
|
84
|
+
The extension should:
|
|
85
|
+
- inject `--json`
|
|
86
|
+
- invoke `agent-browser` directly, not through a shell
|
|
87
|
+
- parse JSON output into tool details
|
|
88
|
+
- handle observed JSON result shapes, including the array returned by `batch --json`
|
|
89
|
+
- allow plain-text fallback for inspection commands like `--help` and `--version`
|
|
90
|
+
- discourage exploratory inspection calls unless the user explicitly asks or debugging requires them
|
|
91
|
+
- deflect normal-task `--help` inspection back into the standard browser workflow instead of letting the model relearn the tool from scratch each session
|
|
92
|
+
- surface stderr and non-zero exits clearly
|
|
93
|
+
- attach images when the result points to a screenshot-like artifact
|
|
94
|
+
|
|
95
|
+
## Result shape
|
|
96
|
+
|
|
97
|
+
### Content
|
|
98
|
+
|
|
99
|
+
Primary content should be:
|
|
100
|
+
- useful result text for the model, not just a status line
|
|
101
|
+
- an image attachment when relevant
|
|
102
|
+
- browser-aware compacting for oversized snapshots so the model gets a concise actionable view before raw page noise
|
|
103
|
+
- compact snapshots should be main-content-first: prefer the primary content block and nearby sections over top-of-page chrome, ads, or unrelated sidebars when those can be distinguished from the snapshot tree
|
|
104
|
+
|
|
105
|
+
Examples:
|
|
106
|
+
- small `snapshot` results should include the actual snapshot text
|
|
107
|
+
- oversized `snapshot` results should switch to a compact view that preserves the primary content, nearby sections, high-value refs, and a path to the spilled full raw snapshot
|
|
108
|
+
- `tab list` should include a readable tab summary
|
|
109
|
+
- `screenshot` should include the saved-path summary plus the inline image attachment when available
|
|
110
|
+
|
|
111
|
+
### Details
|
|
112
|
+
|
|
113
|
+
Recommended details:
|
|
114
|
+
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"args": ["snapshot", "-i"],
|
|
118
|
+
"effectiveArgs": ["--session", "pi-abc123", "--json", "snapshot", "-i"],
|
|
119
|
+
"sessionName": "pi-abc123",
|
|
120
|
+
"usedImplicitSession": true,
|
|
121
|
+
"data": {
|
|
122
|
+
"origin": "https://example.com/",
|
|
123
|
+
"refs": {
|
|
124
|
+
"e1": { "name": "Example Domain", "role": "heading" }
|
|
125
|
+
},
|
|
126
|
+
"snapshot": "- heading \"Example Domain\" [level=1, ref=e1]"
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
For oversized snapshots, details should switch to a compact metadata object and include `fullOutputPath` pointing at a private temp JSON spill file with the full upstream snapshot payload.
|
|
132
|
+
|
|
133
|
+
## High-value result rendering
|
|
134
|
+
|
|
135
|
+
"Rendering" here means how results appear inside `pi`, not embedding a browser UI.
|
|
136
|
+
|
|
137
|
+
Worth doing in v1:
|
|
138
|
+
- screenshots → inline image attachment
|
|
139
|
+
- snapshots → origin + ref count + main-content-first compact preview, with full raw snapshot spill files when the inline result would otherwise be too large
|
|
140
|
+
- tab lists → compact summary/table
|
|
141
|
+
- stream status → enabled/connected/port summary
|
|
142
|
+
|
|
143
|
+
## Missing binary behavior
|
|
144
|
+
|
|
145
|
+
If `agent-browser` is not on `PATH`, fail with a message that:
|
|
146
|
+
- says `agent-browser` is required
|
|
147
|
+
- says this project does not bundle it
|
|
148
|
+
- points to upstream install/docs
|
|
149
|
+
|
|
150
|
+
## Session behavior
|
|
151
|
+
|
|
152
|
+
- maintain one implicit active session per `pi` session for the common path
|
|
153
|
+
- derive that implicit session from the official `pi` session id
|
|
154
|
+
- respect explicit upstream `--session` with minimal interference
|
|
155
|
+
- treat the implicit session as extension-managed convenience state
|
|
156
|
+
- on normal `pi` shutdown, best-effort close the implicit session
|
|
157
|
+
- set an idle timeout on implicit sessions so abandoned daemons eventually self-clean
|
|
158
|
+
- clean up private temp spill artifacts owned by the implicit session on shutdown
|
|
159
|
+
- treat explicit upstream session choices like `--session`, `--profile`, `--session-name`, and `--cdp` as user-managed
|
|
160
|
+
- pass explicit `--profile` straight through to upstream `agent-browser`; no profile-cloning or isolation layer is added in v1
|
|
161
|
+
- if startup-scoped flags like `--profile`, `--session-name`, or `--cdp` are supplied after the implicit session is already active, return a validation error instead of silently relying on upstream to ignore them
|
|
162
|
+
|
|
163
|
+
## Non-goals
|
|
164
|
+
|
|
165
|
+
- no giant action enum mirroring the whole upstream CLI
|
|
166
|
+
- no support for older `agent-browser` versions
|
|
167
|
+
- no compatibility shims
|
|
168
|
+
- no embedded browser UI inside `pi`
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Purpose: Register the native agent_browser tool for pi so agents can invoke agent-browser without going through bash.
|
|
3
|
+
* Responsibilities: Define the tool schema, inject thin wrapper behavior around the upstream CLI, manage implicit session convenience, and return pi-friendly content/details.
|
|
4
|
+
* Scope: Native tool registration and orchestration only; the wrapper intentionally stays close to the upstream agent-browser CLI.
|
|
5
|
+
* Usage: Loaded by pi through the package manifest or the local `.pi/extensions/agent-browser.ts` development entrypoint.
|
|
6
|
+
* Invariants/Assumptions: agent-browser is installed separately on PATH, the wrapper targets the current locally installed upstream version only, and no backward-compatibility shims are provided.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { rm } from "node:fs/promises";
|
|
10
|
+
|
|
11
|
+
import { isToolCallEventType, type ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
12
|
+
import { Type } from "@sinclair/typebox";
|
|
13
|
+
|
|
14
|
+
import { runAgentBrowserProcess } from "./lib/process.js";
|
|
15
|
+
import { buildToolPresentation, getAgentBrowserErrorText, parseAgentBrowserEnvelope } from "./lib/results.js";
|
|
16
|
+
import {
|
|
17
|
+
buildExecutionPlan,
|
|
18
|
+
buildPromptPolicy,
|
|
19
|
+
createEphemeralSessionSeed,
|
|
20
|
+
createImplicitSessionName,
|
|
21
|
+
getLatestUserPrompt,
|
|
22
|
+
validateToolArgs,
|
|
23
|
+
} from "./lib/runtime.js";
|
|
24
|
+
import { cleanupSecureTempArtifacts } from "./lib/temp.js";
|
|
25
|
+
|
|
26
|
+
// Idle timeout (900000 ms = 15 minutes) handed to agent-browser through the AGENT_BROWSER_IDLE_TIMEOUT_MS
// environment variable whenever the implicit session is used. Kept as a string because it is passed as an env value.
const IMPLICIT_SESSION_IDLE_TIMEOUT_MS = "900000";
|
|
27
|
+
// Maximum time (ms) the session_shutdown handler waits for the best-effort `close` call before aborting it.
const IMPLICIT_SESSION_CLOSE_TIMEOUT_MS = 5_000;
|
|
28
|
+
|
|
29
|
+
/**
 * Parameter schema for the `agent_browser` tool.
 *
 * - `args`: required, non-empty list of CLI arguments forwarded to agent-browser.
 * - `stdin`: optional raw text piped to the child process.
 * - `useActiveSession`: optional flag (schema default `true`) controlling implicit session injection.
 */
const AGENT_BROWSER_PARAMS = Type.Object({
	args: Type.Array(
		Type.String({ description: "Exact agent-browser CLI arguments, excluding the binary name." }),
		{
			description: "Exact agent-browser CLI arguments, excluding the binary name and any shell operators.",
			minItems: 1,
		},
	),
	stdin: Type.Optional(
		Type.String({ description: "Optional raw stdin content for commands like eval --stdin or batch." }),
	),
	useActiveSession: Type.Optional(
		Type.Boolean({
			description: "When true and no explicit --session is present, inject the implicit session for this pi session.",
			default: true,
		}),
	),
});
|
|
42
|
+
|
|
43
|
+
/**
 * Compose the error text shown when the `agent-browser` executable cannot be found.
 *
 * @returns A newline-separated message stating the requirement, that the binary is not bundled,
 * and where the upstream install documentation lives.
 */
function buildMissingBinaryMessage(): string {
	const upstreamLinks = ["https://agent-browser.dev/", "https://github.com/vercel-labs/agent-browser"];
	const lines = [
		"agent-browser is required but was not found on PATH.",
		"This project does not bundle agent-browser.",
		"Install it using the upstream docs:",
	];
	for (const link of upstreamLinks) {
		lines.push(`- ${link}`);
	}
	return lines.join("\n");
}
|
|
52
|
+
|
|
53
|
+
/**
 * Render the effective CLI arguments as a short, single-line preview for progress updates.
 *
 * @param effectiveArgs Arguments that will be passed to agent-browser.
 * @returns The space-joined arguments, truncated to 117 characters plus "..." when longer than 120.
 */
function buildInvocationPreview(effectiveArgs: string[]): string {
	const maxLength = 120;
	const ellipsis = "...";
	const joined = effectiveArgs.join(" ");
	if (joined.length <= maxLength) {
		return joined;
	}
	return joined.slice(0, maxLength - ellipsis.length) + ellipsis;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Heuristically detect a bash command that invokes `agent-browser` directly (optionally via `npx`).
 *
 * The binary name must start the command or follow whitespace or one of `;`, `&`, `|`,
 * and must be followed by whitespace or the end of the command.
 *
 * @param command Raw bash command text.
 * @returns `true` when the command appears to call agent-browser.
 */
function looksLikeDirectAgentBrowserBash(command: string): boolean {
	const directInvocation = /(^|[\s;&|])(npx\s+)?agent-browser(\s|$)/;
	return directInvocation.exec(command) !== null;
}
|
|
61
|
+
|
|
62
|
+
/**
 * Decide whether a bash command only inspects the agent-browser installation
 * (`which agent-browser`, `command -v agent-browser`, `agent-browser --help`, `agent-browser --version`).
 *
 * The command is split on shell separators (`;`, `&`, `|`, newline) and every segment is judged on its own.
 * Previously a single inspection call anywhere in the command marked the whole command harmless, so
 * `agent-browser --version && agent-browser open <url>` slipped past the bash guard.
 *
 * @param command Raw bash command text.
 * @returns `true` only when at least one segment is an inspection call and no segment performs a
 * non-inspection agent-browser invocation.
 */
function isHarmlessAgentBrowserInspectionCommand(command: string): boolean {
	const lookupPattern = /(command\s+-v|which)\s+agent-browser\b/;
	const helpOrVersionPattern = /(^|\s)agent-browser\s+--(help|version)\b/;
	const invocationPattern = /(^|\s)(npx\s+)?agent-browser(\s|$)/;

	let sawInspection = false;
	for (const segment of command.split(/[;&|\n]+/)) {
		if (lookupPattern.test(segment) || helpOrVersionPattern.test(segment)) {
			sawInspection = true;
			continue;
		}
		// Any other agent-browser invocation in the same command line makes the whole command non-harmless.
		if (invocationPattern.test(segment)) {
			return false;
		}
	}
	return sawInspection;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Check whether the tool arguments request plain-text inspection output (help or version).
 *
 * @param args CLI arguments supplied to the tool.
 * @returns `true` when any argument is exactly `--help`, `-h`, `--version`, or `-V`.
 */
function isPlainTextInspectionArgs(args: string[]): boolean {
	const inspectionFlags = ["--help", "-h", "--version", "-V"];
	return args.some((arg) => inspectionFlags.includes(arg));
}
|
|
69
|
+
|
|
70
|
+
/**
 * Compose the error text returned when a help/version inspection call is deflected
 * back to the standard browser workflow.
 *
 * @returns A newline-separated message with the numbered workflow steps and the authenticated-content hint.
 */
function buildInspectionDeflectionMessage(): string {
	const workflowSteps = [
		"open the target URL",
		"snapshot -i",
		"interact using refs and re-snapshot after navigation or major DOM changes",
	];
	const numberedSteps = workflowSteps.map((step, index) => `${index + 1}. ${step}`);
	const authenticatedHint =
		"For authenticated or user-specific content like feeds, inboxes, dashboards, or accounts, start with an authenticated strategy such as --profile Default on the first browser call and let the implicit session carry continuity. Use --auto-connect only if profile-based reuse is unavailable.";

	return [
		"Do not inspect agent_browser help for a normal browser task.",
		"Use the workflow directly:",
		...numberedSteps,
		authenticatedHint,
	].join("\n");
}
|
|
80
|
+
|
|
81
|
+
/**
 * Report whether a process-level failure means the `agent-browser` executable itself could not be spawned.
 *
 * `runAgentBrowserProcess` also surfaces stdout spill-file failures through `spawnError`, so matching the bare
 * substring "ENOENT" would mislabel a missing temp path as a missing binary. Errno metadata is checked first;
 * the message is only consulted when that metadata is absent.
 */
function isMissingBinarySpawnError(error: Error): boolean {
	const { code, syscall } = error as NodeJS.ErrnoException;
	if (code === "ENOENT" && typeof syscall === "string") {
		return syscall.startsWith("spawn");
	}
	return /^spawn\b.*\bENOENT\b/.test(error.message);
}

/**
 * Extension entry point: wires session lifecycle hooks, the system-prompt playbook, the bash guard,
 * and the native `agent_browser` tool into pi.
 *
 * State kept per extension instance:
 * - `implicitSessionName` / `implicitSessionCwd`: recomputed on every `session_start`.
 * - `implicitSessionActive`: set after a run that used the implicit session; cleared by `close`,
 *   `session_start`, and `session_shutdown`.
 *
 * @param pi Extension API supplied by pi.
 */
export default function agentBrowserExtension(pi: ExtensionAPI) {
	const ephemeralSessionSeed = createEphemeralSessionSeed();
	let implicitSessionActive = false;
	let implicitSessionName = createImplicitSessionName(undefined, process.cwd(), ephemeralSessionSeed);
	let implicitSessionCwd = process.cwd();

	// Re-derive the implicit session identity from the real pi session id once it is known.
	pi.on("session_start", async (_event, ctx) => {
		implicitSessionActive = false;
		implicitSessionName = createImplicitSessionName(ctx.sessionManager.getSessionId(), ctx.cwd, ephemeralSessionSeed);
		implicitSessionCwd = ctx.cwd;
	});

	// Best-effort close of the implicit session (bounded by a timeout), then temp artifact cleanup.
	pi.on("session_shutdown", async () => {
		implicitSessionActive = false;
		const controller = new AbortController();
		const timer = setTimeout(() => controller.abort(), IMPLICIT_SESSION_CLOSE_TIMEOUT_MS);
		try {
			await runAgentBrowserProcess({
				args: ["--session", implicitSessionName, "close"],
				cwd: implicitSessionCwd,
				signal: controller.signal,
			});
		} catch {
			// Best-effort cleanup only.
		} finally {
			clearTimeout(timer);
			await cleanupSecureTempArtifacts();
		}
	});

	// Append the browser operating playbook to the system prompt.
	pi.on("before_agent_start", async (event) => {
		return {
			systemPrompt:
				event.systemPrompt +
				"\n\nProject rule: when browser automation is needed, prefer the native `agent_browser` tool. Do not run direct `agent-browser` bash commands unless the user explicitly asks for a bash-oriented workflow or browser-integration debugging.\n\nBrowser operating playbook:\n- Standard workflow: open the page, then snapshot -i, then interact via refs, then re-snapshot after navigation or major DOM changes.\n- For user-specific or authenticated content like feeds, inboxes, dashboards, and accounts, start with an authenticated browser strategy instead of public browsing. Prefer `--profile Default` on the first browser call and let the current implicit session carry continuity. Use `--auto-connect` only if profile-based reuse is unavailable or the task is specifically about attaching to a running debug-enabled browser.\n- Do not invent fixed explicit session names for routine tasks. Use the implicit session unless you truly need multiple isolated browser sessions in the same conversation.\n- When using startup-scoped flags like `--profile`, `--session-name`, or `--cdp`, put them on the first command for that session. If you intentionally use an explicit `--session`, keep using that same explicit session for follow-ups.\n- If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an `open` call returns blocked, blank, or otherwise unexpected results, use `tab list`, `tab <n>`, and `snapshot -i` to recover state before retrying different URLs or fallback strategies. Only use `wait` with an explicit argument like milliseconds, `--load`, `--url`, `--fn`, or `--text`.\n- For feed, timeline, or inbox reading tasks, focus on the main timeline/list region and read the first item there rather than unrelated composer or sidebar content.\n- For read-only browsing tasks, prefer extracting the answer from the current snapshot, structured ref labels, or `eval --stdin` on the current page before navigating away. Only click into media viewers, detail routes, or new pages when the current view does not contain the needed information.\n- When using `eval --stdin`, scope checks and actions to the target element or route whenever possible instead of relying on broad page-wide text heuristics.\n- When using `eval --stdin` for extraction, return the value you want instead of relying on `console.log` as the primary result channel.\n- Do not use `agent_browser --help` for normal browsing tasks.",
		};
	});

	// Block direct `agent-browser` bash usage unless the latest user prompt allows it.
	pi.on("tool_call", async (event, ctx) => {
		// Run the cheap syntactic checks first so unrelated tool calls never pay for the prompt-policy lookup.
		if (!isToolCallEventType("bash", event)) {
			return undefined;
		}
		const command = event.input.command;
		if (!looksLikeDirectAgentBrowserBash(command) || isHarmlessAgentBrowserInspectionCommand(command)) {
			return undefined;
		}

		const promptPolicy = buildPromptPolicy(getLatestUserPrompt(ctx.sessionManager.getBranch()));
		if (promptPolicy.allowLegacyAgentBrowserBash) {
			return undefined;
		}

		return {
			block: true,
			reason: "Use the native agent_browser tool instead of bash for agent-browser in this environment.",
		};
	});

	pi.registerTool({
		name: "agent_browser",
		label: "Agent Browser",
		description:
			"Browse and interact with websites using agent-browser. Use this for web research, reading live docs, opening pages, taking snapshots or screenshots, clicking links, filling forms, extracting page content, and authenticated/profile-based browser work.",
		promptSnippet:
			"Browse websites, read live docs, click and fill pages, extract browser content, take screenshots, and automate real web workflows.",
		promptGuidelines: [
			"Use this tool whenever the task requires a real browser or live web content.",
			"Standard workflow: open the page, snapshot -i, interact using refs, and re-snapshot after navigation or major DOM changes.",
			"For authenticated or user-specific content like feeds, inboxes, dashboards, and accounts, prefer --profile Default on the first browser call and let the implicit session carry continuity. Use --auto-connect only if profile-based reuse is unavailable or the task is specifically about attaching to a running debug-enabled browser.",
			"Do not invent fixed explicit session names for routine tasks. Use the implicit session unless you truly need multiple isolated browser sessions in the same conversation.",
			"When using --profile, --session-name, or --cdp, put them on the first command for that session. If you intentionally use an explicit --session, keep using that same explicit session for follow-ups.",
			"If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <n> / snapshot -i to recover state before retrying different URLs or fallback strategies. Only use wait with an explicit argument like milliseconds, --load, --url, --fn, or --text.",
			"For feed, timeline, or inbox reading tasks, focus on the main timeline/list region and read the first item there rather than unrelated composer or sidebar content.",
			"For read-only browsing tasks, prefer extracting the answer from the current snapshot, structured ref labels, or eval --stdin on the current page before navigating away. Only click into media viewers, detail routes, or new pages when the current view does not contain the needed information.",
			"When using eval --stdin, scope checks and actions to the target element or route whenever possible instead of relying on broad page-wide text heuristics.",
			"When using eval --stdin for extraction, return the value you want instead of relying on console.log as the primary result channel.",
			"Prefer this tool over bash for opening sites, reading docs on the web, clicking, filling, screenshots, eval, and batch workflows.",
			"Do not call --help or other exploratory inspection commands unless the user explicitly asks for them or debugging the browser integration is necessary.",
			"Do not fall back to osascript, AppleScript, or generic browser-driving bash commands when this tool can do the job.",
			"Pass exact agent-browser CLI arguments in args, excluding the binary name.",
			"Use stdin for commands like eval --stdin and batch instead of shell heredocs.",
			"Let the implicit session handle the common path unless you explicitly need upstream flags like --session, --profile, or --cdp.",
		],
		parameters: AGENT_BROWSER_PARAMS,
		async execute(_toolCallId, params, signal, onUpdate, ctx) {
			// 1. Deflect help/version inspection unless the latest user prompt permits it.
			const promptPolicy = buildPromptPolicy(getLatestUserPrompt(ctx.sessionManager.getBranch()));
			if (!promptPolicy.allowAgentBrowserInspection && isPlainTextInspectionArgs(params.args)) {
				const errorText = buildInspectionDeflectionMessage();
				return {
					content: [{ type: "text", text: errorText }],
					details: { args: params.args, inspectionBlocked: true },
					isError: true,
				};
			}

			// 2. Reject invalid argument lists before planning.
			const validationError = validateToolArgs(params.args);
			if (validationError) {
				return {
					content: [{ type: "text", text: validationError }],
					details: { args: params.args, validationError },
					isError: true,
				};
			}

			// 3. Plan the effective invocation (session injection and related flags).
			const executionPlan = buildExecutionPlan(params.args, {
				implicitSessionActive,
				implicitSessionName,
				useActiveSession: params.useActiveSession ?? true,
			});

			if (executionPlan.validationError) {
				return {
					content: [{ type: "text", text: executionPlan.validationError }],
					details: {
						args: params.args,
						startupScopedFlags: executionPlan.startupScopedFlags,
						validationError: executionPlan.validationError,
					},
					isError: true,
				};
			}

			onUpdate?.({
				content: [{ type: "text", text: `Running agent-browser ${buildInvocationPreview(executionPlan.effectiveArgs)}` }],
				details: {
					effectiveArgs: executionPlan.effectiveArgs,
					sessionName: executionPlan.sessionName,
					usedImplicitSession: executionPlan.usedImplicitSession,
				},
			});

			// 4. Run the binary; the idle-timeout env var is only set for the implicit session.
			const processResult = await runAgentBrowserProcess({
				args: executionPlan.effectiveArgs,
				cwd: ctx.cwd,
				env: executionPlan.usedImplicitSession
					? { AGENT_BROWSER_IDLE_TIMEOUT_MS: IMPLICIT_SESSION_IDLE_TIMEOUT_MS }
					: undefined,
				signal,
				stdin: params.stdin,
			});

			// A completed run on the implicit session marks it active unless the command was `close`.
			if (executionPlan.usedImplicitSession && !processResult.aborted && !processResult.spawnError) {
				implicitSessionActive = executionPlan.commandInfo.command !== "close";
			}

			// 5. Missing binary: only when the spawn itself failed with ENOENT (not e.g. a spill-file error).
			const spawnFailure = processResult.spawnError;
			if (spawnFailure && isMissingBinarySpawnError(spawnFailure)) {
				const errorText = buildMissingBinaryMessage();
				return {
					content: [{ type: "text", text: errorText }],
					details: {
						args: params.args,
						effectiveArgs: executionPlan.effectiveArgs,
						spawnError: spawnFailure.message,
					},
					isError: true,
				};
			}

			try {
				// 6. Parse the JSON envelope and derive overall success.
				const parsed = await parseAgentBrowserEnvelope({
					stdout: processResult.stdout,
					stdoutPath: processResult.stdoutSpillPath,
				});
				const processSucceeded = !processResult.aborted && !processResult.spawnError && processResult.exitCode === 0;
				// Help/version output is plain text, so a parse failure there is expected rather than an error.
				const plainTextInspection = isPlainTextInspectionArgs(params.args) && processSucceeded && parsed.parseError !== undefined;
				const envelopeSuccess = plainTextInspection ? true : parsed.envelope?.success !== false;
				const parseSucceeded = plainTextInspection || parsed.parseError === undefined;
				const succeeded = processSucceeded && parseSucceeded && envelopeSuccess;

				const errorText = getAgentBrowserErrorText({
					aborted: processResult.aborted,
					envelope: parsed.envelope,
					exitCode: processResult.exitCode,
					parseError: parsed.parseError,
					plainTextInspection,
					spawnError: processResult.spawnError,
					stderr: processResult.stderr,
				});

				// 7. Build the model-facing content.
				const presentation = plainTextInspection
					? {
							content: [{ type: "text" as const, text: processResult.stdout.trim() }],
							imagePath: undefined,
							summary: `${params.args.join(" ")} completed`,
						}
					: await buildToolPresentation({
							commandInfo: executionPlan.commandInfo,
							cwd: ctx.cwd,
							envelope: parsed.envelope,
							errorText,
						});

				return {
					content: presentation.content,
					details: {
						args: params.args,
						command: executionPlan.commandInfo.command,
						subcommand: executionPlan.commandInfo.subcommand,
						data: presentation.data,
						error: parsed.envelope?.error,
						effectiveArgs: executionPlan.effectiveArgs,
						exitCode: processResult.exitCode,
						fullOutputPath: presentation.fullOutputPath,
						imagePath: presentation.imagePath,
						parseError: parsed.parseError,
						sessionName: executionPlan.sessionName,
						startupScopedFlags: executionPlan.startupScopedFlags,
						stderr: processResult.stderr || undefined,
						// Raw stdout is only attached when it could not be parsed.
						stdout: parseSucceeded ? undefined : processResult.stdout,
						summary: presentation.summary,
						usedImplicitSession: executionPlan.usedImplicitSession,
					},
					isError: !succeeded,
				};
			} finally {
				// The process-level stdout spill file is only needed while parsing; always remove it.
				if (processResult.stdoutSpillPath) {
					await rm(processResult.stdoutSpillPath, { force: true }).catch(() => undefined);
				}
			}
		},
	});
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Purpose: Execute the upstream agent-browser binary for the pi-agent-browser extension.
|
|
3
|
+
* Responsibilities: Spawn the agent-browser subprocess without a shell, stream optional stdin, bound in-memory output buffering, spill oversized stdout safely to a private temp file, and honor abort signals.
|
|
4
|
+
* Scope: Process execution only; argument planning, output formatting, and pi tool registration live elsewhere.
|
|
5
|
+
* Usage: Called by the extension tool after argument validation and session planning are complete.
|
|
6
|
+
* Invariants/Assumptions: The binary name is always `agent-browser`, the wrapper never shells out, and callers handle semantic success/error interpretation.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { spawn } from "node:child_process";
|
|
10
|
+
import { env as processEnv } from "node:process";
|
|
11
|
+
|
|
12
|
+
import { openSecureTempFile } from "./temp.js";
|
|
13
|
+
|
|
14
|
+
// Maximum number of stdout bytes (512 KiB) kept in memory before further output is spilled to a temp file.
const MAX_BUFFERED_STDOUT_BYTES = 512 * 1_024;
|
|
15
|
+
// Only the last MAX_BUFFERED_STDERR_CHARS characters of stderr are retained.
const MAX_BUFFERED_STDERR_CHARS = 32_000;
|
|
16
|
+
// Size of the rolling stdout tail; the tail is what gets returned as `stdout` when output was spilled to disk.
const MAX_BUFFERED_STDOUT_TAIL_CHARS = 32_000;
|
|
17
|
+
// File-name prefix passed to openSecureTempFile when creating the stdout spill file.
const PROCESS_STDOUT_SPILL_FILE_PREFIX = "process-stdout";
|
|
18
|
+
|
|
19
|
+
/** Outcome of a single agent-browser subprocess run, as resolved by `runAgentBrowserProcess`. */
export interface ProcessRunResult {
	/** `true` when the supplied abort signal fired and the child was asked to terminate. */
	aborted: boolean;
	/** Exit code reported when the child closed; 127 when spawning failed. */
	exitCode: number;
	/** Error emitted by the child process, or the stdout spill failure when no process error occurred. */
	spawnError?: Error;
	/** Captured stdout: the full buffered output, or only the retained tail when output was spilled. */
	stdout: string;
	/** Path of the temp file holding the complete stdout when it exceeded the in-memory limit. */
	stdoutSpillPath?: string;
	/** Retained tail of the child's stderr. */
	stderr: string;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Append `addition` to `text` while keeping at most the last `maxChars` characters.
 *
 * @param text Previously retained text.
 * @param addition Newly received text.
 * @param maxChars Maximum number of trailing characters to keep.
 * @returns The concatenation, trimmed from the front when it exceeds `maxChars`.
 */
function appendTail(text: string, addition: string, maxChars: number): string {
	const joined = `${text}${addition}`;
	const overflow = joined.length - maxChars;
	if (overflow <= 0) {
		return joined;
	}
	return joined.slice(overflow);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Spawn `agent-browser` (never through a shell), feed it optional stdin, and collect bounded output.
 *
 * Behavior:
 * - stdout is buffered in memory up to `MAX_BUFFERED_STDOUT_BYTES`; once exceeded, everything buffered so far
 *   plus all later chunks are written, in arrival order, to a private temp file and only a tail is returned.
 * - stderr is retained as a bounded tail.
 * - When `signal` aborts, the child receives SIGTERM and, two seconds later, SIGKILL.
 * - The promise never rejects; failures are reported through `spawnError` / `exitCode`.
 *
 * @param options.args Arguments passed to the binary.
 * @param options.cwd Working directory for the child.
 * @param options.env Extra environment variables layered over the current process environment.
 * @param options.signal Optional abort signal.
 * @param options.stdin Optional text written to the child's stdin before it is closed.
 * @returns The run outcome. `exitCode` is 127 when spawning failed and 1 when the child was terminated by a
 * signal without reporting a numeric exit code.
 */
export async function runAgentBrowserProcess(options: {
	args: string[];
	cwd: string;
	env?: NodeJS.ProcessEnv;
	signal?: AbortSignal;
	stdin?: string;
}): Promise<ProcessRunResult> {
	const { args, cwd, env, signal, stdin } = options;

	return await new Promise<ProcessRunResult>((resolve) => {
		let aborted = false;
		let settled = false;
		let spawnError: Error | undefined;
		let stderr = "";
		let stdoutBuffers: Buffer[] = [];
		let stdoutBufferedBytes = 0;
		let stdoutTail = "";
		// Latched synchronously on the first overflow so later chunks cannot bypass the spill queue.
		let stdoutSpilling = false;
		let stdoutSpillHandle: Awaited<ReturnType<typeof openSecureTempFile>>["fileHandle"] | undefined;
		let stdoutSpillPath: string | undefined;
		let pendingStdoutWrite = Promise.resolve();
		let stdoutSpillError: Error | undefined;
		let killTimer: NodeJS.Timeout | undefined;
		// Streaming decoders keep multi-byte UTF-8 sequences intact when they straddle chunk boundaries.
		const stdoutTailDecoder = new TextDecoder("utf-8", { ignoreBOM: true });
		const stderrDecoder = new TextDecoder("utf-8", { ignoreBOM: true });

		const queueStdoutChunk = (buffer: Buffer) => {
			stdoutTail = appendTail(
				stdoutTail,
				stdoutTailDecoder.decode(buffer, { stream: true }),
				MAX_BUFFERED_STDOUT_TAIL_CHARS,
			);
			if (!stdoutSpilling && stdoutBufferedBytes + buffer.length <= MAX_BUFFERED_STDOUT_BYTES) {
				stdoutBuffers.push(buffer);
				stdoutBufferedBytes += buffer.length;
				return;
			}

			// The temp file opens asynchronously. Without this flag a small chunk arriving before the file is open
			// would be appended to the in-memory buffer and flushed ahead of the earlier overflow chunk.
			stdoutSpilling = true;
			pendingStdoutWrite = pendingStdoutWrite
				.then(async () => {
					if (!stdoutSpillHandle) {
						const tempFile = await openSecureTempFile(PROCESS_STDOUT_SPILL_FILE_PREFIX, ".json");
						stdoutSpillHandle = tempFile.fileHandle;
						stdoutSpillPath = tempFile.path;
						if (stdoutBuffers.length > 0) {
							await stdoutSpillHandle.writeFile(Buffer.concat(stdoutBuffers));
							stdoutBuffers = [];
							stdoutBufferedBytes = 0;
						}
					}
					await stdoutSpillHandle.writeFile(buffer);
				})
				.catch((error) => {
					stdoutSpillError = error instanceof Error ? error : new Error(String(error));
				});
		};

		const finish = (exitCode: number) => {
			if (settled) return;
			settled = true;
			// Do not leave a listener behind on a caller-owned signal that may outlive this run.
			signal?.removeEventListener("abort", abortChild);
			void pendingStdoutWrite.finally(async () => {
				if (killTimer) {
					clearTimeout(killTimer);
				}
				if (stdoutSpillHandle) {
					await stdoutSpillHandle.close().catch(() => undefined);
				}
				// A spill failure is reported through spawnError unless a process error already occupies it.
				if (!spawnError && stdoutSpillError) {
					spawnError = stdoutSpillError;
				}
				resolve({
					aborted,
					exitCode,
					spawnError,
					stderr,
					stdout: stdoutSpillPath ? stdoutTail : Buffer.concat(stdoutBuffers).toString("utf8"),
					stdoutSpillPath,
				});
			});
		};

		const child = spawn("agent-browser", args, {
			cwd,
			env: { ...processEnv, ...env },
			stdio: ["pipe", "pipe", "pipe"],
		});

		const abortChild = () => {
			aborted = true;
			child.kill("SIGTERM");
			// Escalate if the child ignores SIGTERM.
			killTimer = setTimeout(() => {
				child.kill("SIGKILL");
			}, 2_000);
		};

		child.once("error", (error) => {
			spawnError = error instanceof Error ? error : new Error(String(error));
			finish(127);
		});
		child.once("close", (code, terminationSignal) => {
			if (code !== null) {
				finish(code);
				return;
			}
			// No numeric exit code: either spawning failed or the child was terminated by a signal.
			// Neither case may be reported as a clean exit.
			if (spawnError) {
				finish(127);
			} else {
				finish(terminationSignal ? 1 : 0);
			}
		});
		child.stdout.on("data", (chunk: Buffer | string) => {
			queueStdoutChunk(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
		});
		child.stderr.on("data", (chunk: Buffer | string) => {
			const text = typeof chunk === "string" ? chunk : stderrDecoder.decode(chunk, { stream: true });
			stderr = appendTail(stderr, text, MAX_BUFFERED_STDERR_CHARS);
		});
		// The child may exit (or fail to spawn) before consuming stdin; swallow the resulting stream error
		// (e.g. EPIPE) so it cannot surface as an unhandled 'error' event. The outcome is still reported
		// through the 'error'/'close' handlers above.
		child.stdin.on("error", () => undefined);

		if (signal) {
			if (signal.aborted) {
				abortChild();
			} else {
				signal.addEventListener("abort", abortChild, { once: true });
			}
		}

		if (stdin) {
			child.stdin.write(stdin);
		}
		child.stdin.end();
	});
}
|