npm - pi-agent-browser-native - Versions diffs - 0.2.11 → 0.2.13 - Mend

pi-agent-browser-native 0.2.11 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/CHANGELOG.md +14 -3
package/README.md +87 -27
package/docs/ARCHITECTURE.md +9 -3
package/docs/COMMAND_REFERENCE.md +383 -151
package/docs/RELEASE.md +81 -26
package/docs/REQUIREMENTS.md +10 -4
package/docs/TOOL_CONTRACT.md +51 -11
package/extensions/agent-browser/index.ts +845 -343
package/extensions/agent-browser/lib/parsing.ts +20 -0
package/extensions/agent-browser/lib/playbook.ts +79 -0
package/extensions/agent-browser/lib/process.ts +56 -8
package/extensions/agent-browser/lib/results/confirmation.ts +76 -0
package/extensions/agent-browser/lib/results/envelope.ts +42 -5
package/extensions/agent-browser/lib/results/presentation.ts +907 -50
package/extensions/agent-browser/lib/results/shared.ts +166 -15
package/extensions/agent-browser/lib/results/snapshot.ts +69 -7
package/extensions/agent-browser/lib/results.ts +7 -1
package/extensions/agent-browser/lib/runtime.ts +204 -15
package/extensions/agent-browser/lib/temp.ts +131 -23
package/package.json +11 -8
package/scripts/agent-browser-capability-baseline.mjs +104 -0
package/scripts/doctor.mjs +420 -0

package/extensions/agent-browser/lib/parsing.ts ADDED Viewed

@@ -0,0 +1,20 @@
+/**
+ * Purpose: Centralize low-level boundary parsing helpers shared by runtime planning, temp-artifact lifecycle, and result rendering.
+ * Responsibilities: Identify non-null object records and normalize positive-integer string configuration values.
+ * Scope: Tiny generic parsing predicates only; module-specific validation and error handling stay with their owning modules.
+ * Usage: Imported by agent-browser wrapper modules that parse untyped JSON, persisted state, or environment variables.
+ * Invariants/Assumptions: Arrays intentionally count as records to preserve existing object-boundary semantics, and positive integers must be safe base-10 integer strings greater than zero.
+ */
+/** Type guard for any non-null value whose `typeof` is "object"; arrays pass as well. */
+export function isRecord(value: unknown): value is Record<string, unknown> {
+	if (value === null) return false;
+	return typeof value === "object";
+}
+/**
+ * Parses a configuration string as a strictly positive safe integer.
+ *
+ * @param rawValue Raw string (surrounding whitespace is ignored) or undefined.
+ * @returns The parsed number, or undefined when the input is not a string, contains
+ * anything other than digits after trimming, is zero, or exceeds the safe-integer range.
+ */
+export function parsePositiveInteger(rawValue: string | undefined): number | undefined {
+	if (typeof rawValue !== "string") return undefined;
+	const digits = rawValue.trim();
+	const isDigitsOnly = /^\d+$/.test(digits);
+	if (!isDigitsOnly) return undefined;
+	const candidate = Number(digits);
+	return Number.isSafeInteger(candidate) && candidate > 0 ? candidate : undefined;
+}

package/extensions/agent-browser/lib/playbook.ts ADDED Viewed

@@ -0,0 +1,79 @@
+/**
+ * Purpose: Provide the canonical agent_browser operating playbook shared by runtime prompt metadata and generated documentation fragments.
+ * Responsibilities: Define stable guidance bullets, native tool-call examples, and wrapper-behavior notes without importing runtime/browser process code.
+ * Scope: Agent-facing documentation and prompt-guidance text only; command execution and wrapper state behavior live in runtime modules.
+ * Usage: Imported by the extension entrypoint for promptGuidelines and by the documentation drift-check script for generated Markdown blocks.
+ * Invariants/Assumptions: The native pi tool receives args after the agent-browser binary, stdin is only for batch/eval --stdin, and wrapper behavior documented here must match implemented behavior.
+ */
+/** Prompt text stating the project rule: prefer the native `agent_browser` tool over direct `agent-browser` bash commands. */
+export const PROJECT_RULE_PROMPT =
+	"Project rule: when browser automation is needed, prefer the native `agent_browser` tool. Do not run direct `agent-browser` bash commands unless the user explicitly asks for a bash-oriented workflow or browser-integration debugging.";
+/** Opening entries of the guideline list assembled by buildToolPromptGuidelines. */
+export const TOOL_PROMPT_GUIDELINES_PREFIX = [
+	"Use agent_browser whenever the task requires a real browser or live web content.",
+] as const;
+/**
+ * Quick-start bullets: the args/stdin/sessionMode mental model, common first and advanced calls, and a short command reference.
+ * buildToolPromptGuidelines places these directly after TOOL_PROMPT_GUIDELINES_PREFIX.
+ */
+export const QUICK_START_GUIDELINES = [
+	"Quick start mental model: args are the exact agent-browser CLI args after the binary; stdin is only for batch and eval --stdin, and other command/stdin combinations are rejected before launch; sessionMode=fresh switches the extension-managed pi-scoped session to a fresh upstream launch when you need new --profile, --session-name, --cdp, --state, or --auto-connect state.",
+	"Common first calls: { args: [\"open\", \"https://example.com\"] } then { args: [\"snapshot\", \"-i\"] }; after navigation, use { args: [\"click\", \"@e2\"] } then { args: [\"snapshot\", \"-i\"] }.",
+	"Common advanced calls: { args: [\"batch\"], stdin: \"[[\\\"open\\\",\\\"https://example.com\\\"],[\\\"snapshot\\\",\\\"-i\\\"]]\" }, { args: [\"eval\", \"--stdin\"], stdin: \"document.title\" }, and { args: [\"--profile\", \"Default\", \"open\", \"https://example.com/account\"], sessionMode: \"fresh\" }.",
+	"High-value command reference: download <selector> <path> saves a file triggered by a click; get title/url/text/html/value/attr/count reads page state; screenshot [path] captures an image; pdf <path> saves a PDF; tab list and tab <tab-id-or-label> inspect or recover the active tab.",
+] as const;
+/** Optional guideline; buildSharedBrowserPlaybookGuidelines inserts it after the first shared guideline only when `includeBraveSearch` is true. */
+export const BRAVE_SEARCH_PROMPT_GUIDELINE =
+	"When a non-empty BRAVE_API_KEY is available in the current environment, prefer the Brave Search API via bash/curl to discover specific destination URLs, then open the chosen URL with agent_browser instead of browsing a search engine results page just to find the target.";
+/**
+ * Shared browser playbook bullets.
+ * Order matters: buildSharedBrowserPlaybookGuidelines emits index 0 first, then the optional Brave Search guideline, then the remaining entries.
+ */
+export const SHARED_BROWSER_PLAYBOOK_GUIDELINES = [
+	"Standard workflow: open the page, snapshot -i, interact using current @refs from that snapshot, and re-snapshot after navigation, scrolling, rerendering, or other major DOM changes because refs can become stale.",
+	"When a visible text or accessible-name target should survive ref churn, prefer find locators such as role, text, label, placeholder, alt, title, or testid with the intended action instead of guessing a CSS selector.",
+	"Do not assume Playwright selector dialects such as text=Close or button:has-text('Close') are supported wrapper syntax unless current upstream agent-browser behavior has been verified.",
+	"For authenticated or user-specific content like feeds, inboxes, dashboards, and accounts, prefer --profile Default on the first browser call and let the implicit session carry continuity. Use --auto-connect only if profile-based reuse is unavailable or the task is specifically about attaching to a running debug-enabled browser.",
+	"Do not invent fixed explicit session names for routine tasks. Use the implicit session unless you truly need multiple isolated browser sessions in the same conversation.",
+	"When using --profile, --session-name, --cdp, --state, or --auto-connect, put them on the first command for that session. If you intentionally use an explicit --session, keep using that same explicit session for follow-ups.",
+	"If you already used the implicit session and now need launch-scoped flags like --profile, --session-name, --cdp, --state, or --auto-connect, retry with sessionMode set to fresh or pass an explicit --session for the new launch. After a successful unnamed fresh launch, later auto calls follow that new session.",
+	"If a session lands on the wrong page or tab, an interaction changes origin unexpectedly, or an open call returns blocked, blank, or otherwise unexpected results, use tab list / tab <tab-id-or-label> / snapshot -i to recover state before retrying different URLs or fallback strategies. Only use wait with an explicit argument like milliseconds, --load <state>, --url <matcher>, --fn <js>, or --text <matcher>.",
+	"For feed, timeline, or inbox reading tasks, focus on the main timeline/list region and read the first item there rather than unrelated composer or sidebar content.",
+	"For read-only browsing tasks, prefer extracting the answer from the current snapshot, structured ref labels, or eval --stdin on the current page before navigating away. Only click into media viewers, detail routes, or new pages when the current view does not contain the needed information.",
+	"For downloads, prefer download <selector> <path> when an element click should save a file. Do not rely on click alone when you need the downloaded file on disk.",
+	"When using eval --stdin, scope checks and actions to the target element or route whenever possible instead of relying on broad page-wide text heuristics.",
+	"When using eval --stdin for extraction, return the value you want instead of relying on console.log as the primary result channel.",
+	"Do not call --help or other exploratory inspection commands unless the user explicitly asks for them or debugging the browser integration is necessary.",
+] as const;
+/** Closing entries of the guideline list assembled by buildToolPromptGuidelines. */
+export const TOOL_PROMPT_GUIDELINES_SUFFIX = [
+	"Prefer agent_browser over bash for opening sites, reading docs on the web, clicking, filling, screenshots, eval, and batch workflows.",
+	"Do not fall back to osascript, AppleScript, or generic browser-driving bash commands when agent_browser can do the job.",
+	"Pass exact agent-browser CLI arguments in args, excluding the binary name.",
+	"Use stdin only for eval --stdin and batch instead of shell heredocs; other command/stdin combinations are rejected before launch.",
+	"Let the extension-managed session handle the common path unless you explicitly need a fresh launch for upstream flags like --profile, --session-name, --cdp, --state, or --auto-connect.",
+	"Use sessionMode=fresh when switching from an existing implicit session to a new profile/debug launch without inventing a fixed explicit session name; later auto calls will follow that new session.",
+] as const;
+/** JSON-formatted example tool-call payloads for the `--help` and `--version` inspection commands. */
+export const INSPECTION_TOOL_CALL_EXAMPLES = [
+	'{ "args": ["--help"] }',
+	'{ "args": ["--version"] }',
+] as const;
+/** Notes describing the wrapper's best-effort tab re-selection, pinning, and about:blank recovery behavior. */
+export const WRAPPER_TAB_RECOVERY_BEHAVIOR = [
+	"After launch-scoped open/goto/navigate calls that can restore existing tabs (for example --profile, --session-name, or --state), agent_browser best-effort re-selects the tab whose URL matches the returned page when restored tabs steal focus during launch.",
+	"After a target tab is known for a session, later active-tab commands best-effort pin that tab inside the same upstream invocation when reconnect drift would otherwise move the command to a restored/background tab.",
+	"After a successful command on a known target tab, agent_browser also best-effort restores that intended tab if a restored/background tab steals focus after the command completes.",
+	"If a known session target unexpectedly reports about:blank, agent_browser preserves the prior intended target, best-effort re-selects it when it still exists, and reports exact recovery guidance when it cannot be re-selected.",
+] as const;
+/**
+ * Builds the shared playbook list: the first shared guideline, then the Brave Search
+ * guideline when requested, then every remaining shared guideline in order.
+ *
+ * @param options.includeBraveSearch Whether to insert BRAVE_SEARCH_PROMPT_GUIDELINE as the second entry.
+ * @returns A new mutable array of guideline strings.
+ */
+export function buildSharedBrowserPlaybookGuidelines(options: { includeBraveSearch: boolean }): string[] {
+	const [leadGuideline, ...remainingGuidelines] = SHARED_BROWSER_PLAYBOOK_GUIDELINES;
+	const guidelines: string[] = [leadGuideline];
+	if (options.includeBraveSearch) {
+		guidelines.push(BRAVE_SEARCH_PROMPT_GUIDELINE);
+	}
+	guidelines.push(...remainingGuidelines);
+	return guidelines;
+}
+/**
+ * Assembles the full tool prompt guideline list in fixed order: prefix, quick start,
+ * shared playbook (with or without the Brave Search guideline), then suffix.
+ *
+ * @param options.includeBraveSearch Forwarded to buildSharedBrowserPlaybookGuidelines.
+ * @returns A new mutable array of guideline strings.
+ */
+export function buildToolPromptGuidelines(options: { includeBraveSearch: boolean }): string[] {
+	const guidelines: string[] = [...TOOL_PROMPT_GUIDELINES_PREFIX];
+	guidelines.push(...QUICK_START_GUIDELINES);
+	guidelines.push(...buildSharedBrowserPlaybookGuidelines(options));
+	guidelines.push(...TOOL_PROMPT_GUIDELINES_SUFFIX);
+	return guidelines;
+}

package/extensions/agent-browser/lib/process.ts CHANGED Viewed

@@ -63,12 +63,27 @@ const INHERITED_ENV_NAMES = new Set([
 	"USERPROFILE",
 	"WAYLAND_DISPLAY",
 	"XAUTHORITY",
+	"AWS_ACCESS_KEY_ID",
+	"AWS_SECRET_ACCESS_KEY",
+	"AWS_SESSION_TOKEN",
+	"AWS_PROFILE",
+	"AWS_REGION",
+	"AWS_DEFAULT_REGION",
 	httpProxyEnvName,
 	httpsProxyEnvName,
 	allProxyEnvName,
 	noProxyEnvName,
 ]);
-const INHERITED_ENV_PREFIXES = ["AI_GATEWAY_", "XDG_"] as const;
+const INHERITED_ENV_PREFIXES = [
+	"AGENT_BROWSER_",
+	"AGENTCORE_",
+	"AI_GATEWAY_",
+	"BROWSERBASE_",
+	"BROWSERLESS_",
+	"BROWSER_USE_",
+	"KERNEL_",
+	"XDG_",
+] as const;
 export interface ProcessRunResult {
 	aborted: boolean;
@@ -140,8 +155,9 @@ export async function runAgentBrowserProcess(options: {
 	stdin?: string;
 }): Promise<ProcessRunResult> {
 	const { args, cwd, env, signal, stdin } = options;
-	let effectiveEnv = env;
-	const requestedSocketDir = env?.[AGENT_BROWSER_SOCKET_DIR_ENV] ?? getAgentBrowserSocketDir();
+	const explicitSocketDir = env?.[AGENT_BROWSER_SOCKET_DIR_ENV];
+	let effectiveEnv = explicitSocketDir === undefined ? { ...env, [AGENT_BROWSER_SOCKET_DIR_ENV]: undefined } : env;
+	const requestedSocketDir = explicitSocketDir ?? getAgentBrowserSocketDir();
 	if (requestedSocketDir && (await ensureAgentBrowserSocketDir(requestedSocketDir))) {
 		effectiveEnv = { ...env, [AGENT_BROWSER_SOCKET_DIR_ENV]: requestedSocketDir };
 	}
@@ -159,6 +175,7 @@ export async function runAgentBrowserProcess(options: {
 		let pendingStdoutWrite = Promise.resolve();
 		let stdoutSpillError: Error | undefined;
 		let killTimer: NodeJS.Timeout | undefined;
+		let abortListener: (() => void) | undefined;
 		const queueStdoutChunk = (buffer: Buffer) => {
 			stdoutTail = appendTail(stdoutTail, buffer.toString("utf8"), MAX_BUFFERED_STDOUT_TAIL_CHARS);
@@ -193,10 +210,17 @@ export async function runAgentBrowserProcess(options: {
 				});
 		};
+		// Detach the "abort" handler stored in `abortListener` from `signal`; does nothing when either is missing.
+		const removeAbortListener = () => {
+			if (!signal || !abortListener) return;
+			signal.removeEventListener("abort", abortListener);
+			// Drop the reference so later calls return early.
+			abortListener = undefined;
+		};
 		const finish = (exitCode: number) => {
 			if (settled) return;
 			settled = true;
 			void pendingStdoutWrite.finally(async () => {
+				removeAbortListener();
 				if (killTimer) {
 					clearTimeout(killTimer);
 				}
@@ -230,7 +254,33 @@ export async function runAgentBrowserProcess(options: {
 				child.kill("SIGKILL");
 			}, 2_000);
 		};
+		// Handles an error from writing to the child's stdin: wraps non-Error values, ignores
+		// EPIPE / ERR_STREAM_DESTROYED, and otherwise stores the error in `spawnError` unless one is already set.
+		const recordStdinError = (error: unknown) => {
+			const stdinError = error instanceof Error ? error : new Error(String(error));
+			const errorCode = (stdinError as NodeJS.ErrnoException).code;
+			// These two codes are deliberately not reported.
+			if (errorCode === "EPIPE" || errorCode === "ERR_STREAM_DESTROYED") {
+				return;
+			}
+			// Keep the first recorded error; never overwrite an existing one.
+			if (!spawnError) {
+				spawnError = stdinError;
+			}
+		};
+		// Sends the optional `stdin` payload to the child and ends its stdin stream.
+		// If the run is already aborted, the stream is destroyed without writing anything.
+		const writeChildStdin = () => {
+			if (aborted || signal?.aborted) {
+				child.stdin.destroy();
+				return;
+			}
+			try {
+				// An empty or missing payload skips the write but still ends the stream.
+				if (stdin) {
+					child.stdin.write(stdin);
+				}
+				child.stdin.end();
+			} catch (error) {
+				// Synchronous write/end failures go through the same filter as stream "error" events.
+				recordStdinError(error);
+				child.stdin.destroy();
+			}
+		};
+		child.stdin.on("error", recordStdinError);
 		child.once("error", (error) => {
 			spawnError = error instanceof Error ? error : new Error(String(error));
 			finish(127);
@@ -249,13 +299,11 @@ export async function runAgentBrowserProcess(options: {
 			if (signal.aborted) {
 				abortChild();
 			} else {
-				signal.addEventListener("abort", abortChild, { once: true });
+				abortListener = abortChild;
+				signal.addEventListener("abort", abortListener, { once: true });
 			}
 		}
-		if (stdin) {
-			child.stdin.write(stdin);
-		}
-		child.stdin.end();
+		writeChildStdin();
 	});
 }

package/extensions/agent-browser/lib/results/confirmation.ts ADDED Viewed

@@ -0,0 +1,76 @@
+/**
+ * Purpose: Detect upstream guarded-action confirmation-needed result shapes without creating wrapper-owned confirmation state.
+ * Responsibilities: Recognize confirmation-required markers, extract the pending upstream confirmation id, and optionally surface a short upstream action label.
+ * Scope: Pure result-shape detection shared by presentation and error derivation; command execution, approval state, and redaction stay in their existing modules.
+ * Usage: Imported by result presentation to render recovery commands and by envelope error handling to avoid hiding actionable confirmation payloads behind generic failure text.
+ * Invariants/Assumptions: Detection must be conservative: a confirmation marker and a non-empty upstream id are both required before a result is treated as actionable.
+ */
+import { isRecord } from "../parsing.js";
+/** Data extracted by detectConfirmationRequired from a result that carries a confirmation-required marker. */
+export interface ConfirmationRequiredPresentation {
+	/** Trimmed, non-empty confirmation id read from the result. */
+	id: string;
+	/** Trimmed action label taken from the same record as the id, when a non-blank one exists. */
+	actionText?: string;
+}
+// Flag fields: a value of exactly `true` on any of them marks a record as confirmation-required.
+const CONFIRMATION_REQUIRED_FIELD_NAMES = [
+	"confirmation_required",
+	"confirmationRequired",
+	"requires_confirmation",
+	"requiresConfirmation",
+] as const;
+// Fields that may hold a nested confirmation record; the first one containing a non-null object is used.
+const CONFIRMATION_REQUIRED_RECORD_FIELD_NAMES = ["confirmation", "pendingConfirmation", "pending_confirmation"] as const;
+// Fields checked, in this order, for the confirmation id.
+const CONFIRMATION_ID_FIELD_NAMES = ["confirmation_id", "confirmationId", "id"] as const;
+// Fields checked, in this order, for the optional action label.
+const CONFIRMATION_ACTION_TEXT_FIELD_NAMES = ["action", "description", "message", "summary"] as const;
+// String value that marks a record as confirmation-required when found in its `type`, `status`, or `kind` field.
+const CONFIRMATION_REQUIRED_MARKER = "confirmation_required";
+/**
+ * Looks up the listed fields in order and returns the first non-blank string, trimmed.
+ *
+ * @param data Record to read from.
+ * @param fieldNames Field names to try, in priority order.
+ * @returns The trimmed string, or undefined when no listed field holds a non-blank string.
+ */
+function getTrimmedStringField(data: Record<string, unknown>, fieldNames: readonly string[]): string | undefined {
+	for (const fieldName of fieldNames) {
+		const candidate = data[fieldName];
+		if (typeof candidate !== "string") continue;
+		const trimmed = candidate.trim();
+		if (trimmed.length > 0) return trimmed;
+	}
+	return undefined;
+}
+/**
+ * Reports whether a record carries a confirmation-required marker: either one of the
+ * flag fields is exactly `true`, or `type`, `status`, or `kind` equals the marker string.
+ */
+function hasConfirmationRequiredMarker(data: Record<string, unknown>): boolean {
+	for (const fieldName of CONFIRMATION_REQUIRED_FIELD_NAMES) {
+		if (data[fieldName] === true) return true;
+	}
+	if (data.type === CONFIRMATION_REQUIRED_MARKER) return true;
+	if (data.status === CONFIRMATION_REQUIRED_MARKER) return true;
+	return data.kind === CONFIRMATION_REQUIRED_MARKER;
+}
+/** Returns the first nested confirmation field whose value passes isRecord, or undefined when none does. */
+function getNestedConfirmationRecord(data: Record<string, unknown>): Record<string, unknown> | undefined {
+	for (const fieldName of CONFIRMATION_REQUIRED_RECORD_FIELD_NAMES) {
+		const nested = data[fieldName];
+		if (!isRecord(nested)) continue;
+		return nested;
+	}
+	return undefined;
+}
+/**
+ * Detects a confirmation-required result and extracts its id and optional action label.
+ *
+ * The top-level record and, when present, one nested confirmation record are both considered.
+ * A marker on either is enough, but a non-blank id must also be found; the top-level record
+ * is searched for the id first, and the action label is read from the record that supplied the id.
+ *
+ * @param data Untyped result payload.
+ * @returns The extracted id and action label, or undefined when the payload is not a record,
+ * carries no marker, or has no non-blank id.
+ */
+export function detectConfirmationRequired(data: unknown): ConfirmationRequiredPresentation | undefined {
+	if (!isRecord(data)) return undefined;
+	const nested = getNestedConfirmationRecord(data);
+	const candidates = nested === undefined ? [data] : [data, nested];
+	const markerFound = candidates.some((candidate) => hasConfirmationRequiredMarker(candidate));
+	if (!markerFound) return undefined;
+	for (const candidate of candidates) {
+		const id = getTrimmedStringField(candidate, CONFIRMATION_ID_FIELD_NAMES);
+		if (id !== undefined) {
+			const actionText = getTrimmedStringField(candidate, CONFIRMATION_ACTION_TEXT_FIELD_NAMES);
+			return { actionText, id };
+		}
+	}
+	return undefined;
+}

package/extensions/agent-browser/lib/results/envelope.ts CHANGED Viewed

@@ -8,7 +8,9 @@
 import { readFile } from "node:fs/promises";
-import { type AgentBrowserBatchResult, type AgentBrowserEnvelope, isRecord, stringifyUnknown } from "./shared.js";
+import { isRecord } from "../parsing.js";
+import { detectConfirmationRequired } from "./confirmation.js";
+import { type AgentBrowserBatchResult, type AgentBrowserEnvelope, stringifyUnknown } from "./shared.js";
 function hasStructuredBatchStepFailure(data: unknown): data is AgentBrowserBatchResult[] {
 	return Array.isArray(data) && data.some((item) => isRecord(item) && item.success === false);
@@ -75,21 +77,56 @@ export async function parseAgentBrowserEnvelope(options: string | { stdout: stri
 		if (!isRecord(parsed)) {
 			return { parseError: "agent-browser returned JSON, but it was not an object envelope." };
 		}
-		return { envelope: parsed };
+		if (!("success" in parsed)) {
+			return { parseError: "agent-browser returned an invalid JSON envelope: missing boolean success field." };
+		}
+		if (typeof parsed.success !== "boolean") {
+			return { parseError: "agent-browser returned an invalid JSON envelope: success field must be boolean." };
+		}
+		return { envelope: parsed as AgentBrowserEnvelope };
 	} catch (error) {
 		const message = error instanceof Error ? error.message : String(error);
 		return { parseError: `agent-browser returned invalid JSON: ${message}` };
 	}
 }
+/**
+ * Builds a human-readable label for the invocation, used in fallback error messages.
+ *
+ * Prefers the space-joined effective args; falls back to the trimmed command name, and
+ * finally to the bare binary name when neither is usable.
+ */
+function buildInvocationLabel(options: { command?: string; effectiveArgs?: string[] }): string {
+	const { command, effectiveArgs } = options;
+	if (effectiveArgs && effectiveArgs.length > 0) {
+		return ["agent-browser", ...effectiveArgs].join(" ");
+	}
+	const trimmedCommand = command ? command.trim() : "";
+	return trimmedCommand.length > 0 ? `agent-browser ${trimmedCommand}` : "agent-browser";
+}
+/** Appends the trimmed recovery hint on a new line; returns the message unchanged when the hint is missing or blank. */
+function appendWrapperRecoveryHint(message: string, wrapperRecoveryHint?: string): string {
+	const trimmedHint = (wrapperRecoveryHint ?? "").trim();
+	if (trimmedHint.length === 0) {
+		return message;
+	}
+	return [message, trimmedHint].join("\n");
+}
+/**
+ * Fallback text for a reported failure: "<invocation> reported failure", plus the exit code
+ * when it is non-zero, a trailing period, and the optional recovery hint on its own line.
+ */
+function buildFailureFallback(options: { command?: string; effectiveArgs?: string[]; exitCode: number; wrapperRecoveryHint?: string }): string {
+	const { exitCode, wrapperRecoveryHint } = options;
+	let message = `${buildInvocationLabel(options)} reported failure`;
+	if (exitCode !== 0) {
+		message += ` (exit code ${exitCode})`;
+	}
+	return appendWrapperRecoveryHint(`${message}.`, wrapperRecoveryHint);
+}
+/** Fallback text for an exit code with no other error text: "<invocation> exited with code N." plus the optional recovery hint. */
+function buildExitCodeFallback(options: { command?: string; effectiveArgs?: string[]; exitCode: number; wrapperRecoveryHint?: string }): string {
+	const label = buildInvocationLabel(options);
+	const message = label + " exited with code " + String(options.exitCode) + ".";
+	return appendWrapperRecoveryHint(message, options.wrapperRecoveryHint);
+}
 export function getAgentBrowserErrorText(options: {
 	aborted: boolean;
+	command?: string;
+	effectiveArgs?: string[];
 	envelope?: AgentBrowserEnvelope;
 	exitCode: number;
 	parseError?: string;
 	plainTextInspection: boolean;
 	spawnError?: Error;
 	stderr: string;
+	wrapperRecoveryHint?: string;
 }): string | undefined {
 	const { aborted, envelope, exitCode, parseError, plainTextInspection, spawnError, stderr } = options;
 	if (plainTextInspection) return undefined;
@@ -97,13 +134,13 @@ export function getAgentBrowserErrorText(options: {
 	if (spawnError) return spawnError.message;
 	if (parseError) return parseError;
 	if (envelope?.success === false) {
-		if (hasStructuredBatchStepFailure(envelope.data) && envelope.error === undefined) {
+		if ((hasStructuredBatchStepFailure(envelope.data) || detectConfirmationRequired(envelope.data)) && envelope.error === undefined) {
 			return undefined;
 		}
-		return extractEnvelopeErrorText(envelope.error) ?? (stderr.trim() || `agent-browser reported failure${exitCode !== 0 ? ` (exit code ${exitCode})` : "."}`);
+		return extractEnvelopeErrorText(envelope.error) ?? (stderr.trim() || buildFailureFallback(options));
 	}
 	if (exitCode !== 0) {
-		return stderr.trim() || `agent-browser exited with code ${exitCode}.`;
+		return stderr.trim() || buildExitCodeFallback(options);
 	}
 	return undefined;
 }