pi-web-toolkit 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,165 @@
1
+ /**
2
+ * Firecrawl Search Extension — web search via firecrawl-cli (keyless)
3
+ *
4
+ * Provides a `firecrawl_search` tool that searches the web through the
5
+ * official Firecrawl CLI in keyless mode (no API key, no signup). It exposes
6
+ * Firecrawl-specific capabilities the local SearXNG tool does not: sources
7
+ * (web/images/news), categories (github/research/pdf), and domain filters.
8
+ *
9
+ * Requires: `npm install -g firecrawl-cli` (optional; degrades gracefully).
10
+ * Privacy: the query is sent to Firecrawl's cloud.
11
+ */
12
+
13
+ import {
14
+ defineTool,
15
+ type ExtensionAPI,
16
+ formatSize,
17
+ DEFAULT_MAX_BYTES,
18
+ DEFAULT_MAX_LINES,
19
+ } from "@earendil-works/pi-coding-agent";
20
+ import { Text } from "@earendil-works/pi-tui";
21
+ import { Type, type Static } from "typebox";
22
+ import { StringEnum } from "@earendil-works/pi-ai";
23
+ import { searchKeyless, buildSearchQuery } from "./utils/firecrawl";
24
+ import { writeWithFallback } from "./utils/output-sink";
25
+ import { abbreviateUrl, getDomain, getErrorText, normalizeWhitespace } from "./utils/render-helpers";
26
+
/** Builds an optional free-text string parameter carrying the given description. */
const optionalText = (description: string) => Type.Optional(Type.String({ description }));

/** Builds an optional list-of-hostnames parameter carrying the given description. */
const optionalHostList = (description: string) =>
  Type.Optional(Type.Array(Type.String(), { description }));

/**
 * Parameter schema for the `firecrawl_search` tool. Only `query` is
 * required; every other field is optional.
 */
export const FirecrawlSearchParamsSchema = Type.Object({
  query: Type.String({ description: "Search query" }),
  limit: Type.Optional(
    Type.Integer({
      description: "Max results (1-100). Default: 10",
      minimum: 1,
      maximum: 100,
    }),
  ),
  sources: Type.Optional(
    Type.Array(StringEnum(["web", "images", "news"] as const), {
      description: "Sources to search. Default: web",
    }),
  ),
  categories: Type.Optional(
    Type.Array(StringEnum(["github", "research", "pdf"] as const), {
      description: "Filter by GitHub / research papers / PDFs",
    }),
  ),
  country: optionalText("ISO country code for geo-targeting (e.g. US, DE)"),
  tbs: optionalText("Time filter: qdr:h (hour), qdr:d (day), qdr:w (week), qdr:m (month), qdr:y (year)"),
  location: optionalText("Geo-targeting location (e.g. 'Berlin,Germany')"),
  includeDomains: optionalHostList("Restrict results to these domains (hostnames)"),
  excludeDomains: optionalHostList("Exclude results from these domains (hostnames)"),
});

/** Static TypeScript type derived from {@link FirecrawlSearchParamsSchema}. */
export type FirecrawlSearchInput = Static<typeof FirecrawlSearchParamsSchema>;
40
+
/**
 * Definition of the `firecrawl_search` tool.
 *
 * - `execute` folds the domain filters into the query, runs the keyless
 *   Firecrawl search, formats the hits as a numbered text list and hands that
 *   text to `writeWithFallback`. It throws when the search reports a failure.
 * - `renderCall` / `renderResult` produce the TUI summary lines.
 */
const firecrawlSearchTool = defineTool({
  name: "firecrawl_search",
  label: "Firecrawl Search",
  description: [
    "Search the web via Firecrawl (keyless — no API key, no signup).",
    "Supports sources (web/images/news) and categories (github/research/pdf) that",
    "SearXNG does not. Use as an escape hatch or when web_search returns nothing.",
    "Privacy: the query is sent to Firecrawl's cloud.",
    `Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
  ].join(" "),
  promptSnippet: "Search the web via Firecrawl keyless (categories, sources, domain filters)",
  promptGuidelines: [
    "Prefer web_search first; reach for firecrawl_search when web_search fails or returns nothing.",
    "Use categories=[\"github\"], [\"research\"], or [\"pdf\"] for source-type-specific discovery.",
    "Use includeDomains/excludeDomains to scope results to specific sites.",
  ],
  parameters: FirecrawlSearchParamsSchema,

  /**
   * Run the search and return the formatted result text plus structured details.
   * @throws Error when the search output is not ok.
   */
  async execute(_toolCallId, params, signal) {
    const query = buildSearchQuery(params.query, params.includeDomains, params.excludeDomains);
    const out = await searchKeyless(query, {
      limit: params.limit,
      sources: params.sources,
      categories: params.categories,
      country: params.country,
      tbs: params.tbs,
      location: params.location,
    }, signal);

    if (!out.ok) {
      // `failure` is optional on the output type; fall back to concrete text
      // so the message never reads "(undefined)".
      const kind = out.failure?.kind ?? "hard-error";
      const reason = out.failure?.reason ?? "unknown error";
      throw new Error(`Firecrawl search failed (${kind}): ${reason}`);
    }

    const credits = out.creditsUsed !== undefined ? `, ${out.creditsUsed} credits` : "";
    const lines: string[] = [`Results for "${params.query}" (via Firecrawl keyless${credits}):`, ""];
    out.results.forEach((r, i) => {
      lines.push(`${i + 1}. ${r.title ?? "(untitled)"}`);
      lines.push(`   URL: ${r.url}`);
      if (r.description) lines.push(`   ${r.description.replace(/\s+/g, " ").trim()}`);
      if (r.category) lines.push(`   [category: ${r.category}]`);
      lines.push("");
    });
    // Say so explicitly instead of returning a bare header with nothing under it.
    if (out.results.length === 0) lines.push("No results found.");

    const rawText = lines.join("\n");
    const sink = await writeWithFallback(rawText, { tmpPrefix: "pi-firecrawl-search-", alwaysWriteFile: true });

    return {
      content: [{ type: "text", text: sink.text }],
      details: {
        query: params.query,
        totalResults: out.results.length,
        results: out.results,
        creditsUsed: out.creditsUsed,
        fullOutputPath: sink.fullOutputPath,
        viaFirecrawl: true,
      },
    };
  },

  /** One-line rendering of the call: tool name, query, and any categories. */
  renderCall(args, theme) {
    let text = theme.fg("toolTitle", theme.bold("firecrawl_search "));
    text += theme.fg("muted", args.query);
    // An empty array is truthy; check the length so we never print " []".
    if (args.categories?.length) text += theme.fg("dim", ` [${args.categories.join(",")}]`);
    return new Text(text, 0, 0);
  },

  /** Render the in-progress, error, or success view of a search result. */
  renderResult(result, { expanded, isPartial }, theme, context) {
    const isError = context?.isError ?? false;

    const details = result.details as {
      query?: string;
      totalResults?: number;
      results?: Array<{ title?: string; url?: string; description?: string; category?: string }>;
      creditsUsed?: number;
      fullOutputPath?: string;
    } | undefined;

    if (isPartial) {
      const label = details?.query ? `Searching "${details.query}" via Firecrawl...` : "Searching via Firecrawl...";
      return new Text(theme.fg("warning", label), 0, 0);
    }

    if (isError) {
      const errText = getErrorText(result);
      let text = theme.fg("error", "✗ Firecrawl search failed");
      if (details?.query) text += ` ${theme.fg("dim", details.query)}`;
      text += `\n\n  ${theme.fg("toolOutput", errText)}`;
      return new Text(text, 0, 0);
    }

    const allResults = details?.results ?? [];
    const showing = allResults.length;
    let text = theme.fg("success", `✓ ${showing} results`);
    text += theme.fg("accent", " [Firecrawl keyless]");
    if (details?.creditsUsed !== undefined) {
      text += theme.fg("muted", ` ${details.creditsUsed} credits`);
    }

    // Collapsed view lists 3 hits, expanded view lists up to 10.
    const top = allResults.slice(0, expanded ? 10 : 3);
    top.forEach((r, i) => {
      const domain = r.url ? theme.fg("dim", ` ${getDomain(r.url)}`) : "";
      text += `\n  [${i + 1}] ${theme.fg("toolTitle", r.title ?? "(untitled)")}${domain}`;
      if (r.description) {
        const snippet = normalizeWhitespace(r.description);
        // Cut at 90 chars, then drop the trailing partial word before the ellipsis.
        const short = snippet.length > 90 ? snippet.slice(0, 90).replace(/\s+\S*$/, "") + "..." : snippet;
        text += `\n      ${theme.fg("muted", short)}`;
      }
    });
    if (showing > top.length) {
      text += `\n  ${theme.fg("muted", `... and ${showing - top.length} more${expanded ? "" : " (Ctrl+O for full list)"}`)}`;
    }

    if (expanded && details?.fullOutputPath) {
      text += `\n${theme.fg("accent", `Full output: ${details.fullOutputPath}`)}`;
    }

    return new Text(text, 0, 0);
  },
});
162
+
/** Extension entry point: registers the `firecrawl_search` tool on the given API. */
export default function registerFirecrawlSearch(pi: ExtensionAPI): void {
  pi.registerTool(firecrawlSearchTool);
}
@@ -13,10 +13,16 @@ import registerWebSearch from "./web_search";
13
13
  import registerWebFetch from "./web_fetch";
14
14
  import registerWebBrowse from "./web_browse";
15
15
  import registerWebBatchFetch from "./web_batch_fetch";
16
+ import registerFirecrawlScrape from "./firecrawl_scrape";
17
+ import registerFirecrawlSearch from "./firecrawl_search";
18
+ import registerFirecrawlInteract from "./firecrawl_interact";
16
19
 
/**
 * Package entry point: calls each imported `register*` function with the
 * extension API, local web_* tools first and the Firecrawl tools after them.
 */
export default function (pi: ExtensionAPI) {
  const registrars = [
    registerWebSearch,
    registerWebFetch,
    registerWebBrowse,
    registerWebBatchFetch,
    registerFirecrawlScrape,
    registerFirecrawlSearch,
    registerFirecrawlInteract,
  ];
  for (const register of registrars) {
    register(pi);
  }
}
@@ -17,6 +17,8 @@ export interface CLIRunOptions {
17
17
  timeout?: number;
18
18
  /** AbortSignal for cancellation. */
19
19
  signal?: AbortSignal;
20
+ /** Optional environment override for the child process. */
21
+ env?: NodeJS.ProcessEnv;
20
22
  }
21
23
 
22
24
  export interface CLIRunResult {
@@ -44,6 +46,7 @@ export function runCLI(options: CLIRunOptions): Promise<CLIRunResult> {
44
46
  const proc = spawn(options.command, options.args, {
45
47
  shell: false,
46
48
  stdio: stdio as any,
49
+ env: options.env,
47
50
  }) as ChildProcess;
48
51
 
49
52
  let stdout = "";
@@ -0,0 +1,484 @@
1
+ /**
2
+ * Firecrawl CLI wrapper
3
+ *
4
+ * Drives the `firecrawl-cli` npm package (an official Firecrawl client) in
5
+ * KEYLESS-ONLY mode: the cloud only grants the free keyless tier when a
6
+ * request comes from an official client with NO Authorization header, so we
7
+ * shell out to the CLI rather than hand-rolling REST, and we isolate the
8
+ * child process under a temporary HOME with no key env so stored credentials
9
+ * / API keys can never be picked up.
10
+ *
11
+ * This module is split so that the decision-rich, deterministic logic lives
12
+ * in pure, network-free functions (argument builders, output parsers, failure
13
+ * classification, keyless-eligibility, fallback decisions) which are unit
14
+ * tested at this boundary — mirroring the agent-browser wrapper. The
15
+ * side-effectful CLI spawning is a thin layer on top.
16
+ */
17
+
18
+ import { mkdtemp, rm } from "node:fs/promises";
19
+ import * as os from "node:os";
20
+ import * as path from "node:path";
21
+ import { runCLI } from "./cli-runner";
22
+
23
+ // ---------------------------------------------------------------------------
24
+ // Shared types
25
+ // ---------------------------------------------------------------------------
26
+
/** Category of a Firecrawl attempt that produced no result. */
export type FirecrawlFailureKind =
  | "graceful-skip"
  | "hard-error";
29
+
/**
 * Reports whether the Firecrawl keyless fallback may be used.
 *
 * Reads `PI_WEB_FIRECRAWL_FALLBACK`; after trimming and lower-casing, the
 * values `0`, `false`, `no` and `off` turn the feature off. Anything else,
 * including an unset or empty variable, leaves it on.
 */
export function isFirecrawlEnabled(): boolean {
  const setting = (process.env.PI_WEB_FIRECRAWL_FALLBACK ?? "").trim().toLowerCase();
  switch (setting) {
    case "0":
    case "false":
    case "no":
    case "off":
      return false;
    default:
      return true;
  }
}
39
+
/** Failure details attached to an unsuccessful Firecrawl result. */
export interface FirecrawlFailure {
  /** Which failure category this is. */
  kind: FirecrawlFailureKind;
  /** Explanation intended for a human reader. */
  reason: string;
  /** The unprocessed error text, if any was captured. */
  raw?: string;
}
47
+
48
+ // ---------------------------------------------------------------------------
49
+ // Scrape
50
+ // ---------------------------------------------------------------------------
51
+
/** Optional knobs for a single-URL scrape. */
export interface FirecrawlScrapeOptions {
  /** Tag names to keep. This is Firecrawl's tag filter, not a CSS selector. */
  includeTags?: Array<string>;
  /** Tag names to drop. */
  excludeTags?: Array<string>;
  /** Milliseconds to wait before scraping, for pages rendered by JavaScript. */
  waitFor?: number;
  /** Keep only the main content (no nav/footer/etc). Treated as true unless set to false. */
  onlyMainContent?: boolean;
}
62
+
/** Normalized result of a scrape attempt, successful or not. */
export interface FirecrawlScrapeOutput {
  /** True when content was obtained. */
  ok: boolean;
  /** The page content as markdown; empty on failure. */
  content: string;
  /** The URL that was requested. */
  url: string;
  /** Page title taken from the scrape metadata, when present. */
  title?: string;
  /** Scrape identifier taken from the scrape metadata, when present. */
  scrapeId?: string;
  /** Size of `content`; 0 on failure. */
  bytes: number;
  /** Credits consumed, if reported. Scrape responses usually omit this. */
  creditsUsed?: number;
  /** Set when `ok` is false. */
  failure?: FirecrawlFailure;
}
75
+
/**
 * Assemble the argument vector for `firecrawl scrape <url>`.
 *
 * `--format markdown --json` is always present so the CLI emits the full
 * data object (including `metadata.scrapeId`, which interact needs) in a
 * parseable form. Optional flags follow in a fixed order.
 *
 * @param url     Page to scrape.
 * @param options Optional scrape settings; empty tag lists are ignored.
 * @returns Arguments to pass to the firecrawl CLI.
 */
export function buildScrapeArgs(url: string, options: FirecrawlScrapeOptions): string[] {
  const { onlyMainContent, waitFor, includeTags, excludeTags } = options;
  const optional: string[] = [];

  // Main-content extraction is on unless explicitly switched off.
  if (onlyMainContent !== false) optional.push("--only-main-content");
  if (waitFor !== undefined) optional.push("--wait-for", `${waitFor}`);
  if (includeTags?.length) optional.push("--include-tags", includeTags.join(","));
  if (excludeTags?.length) optional.push("--exclude-tags", excludeTags.join(","));

  return ["scrape", url, "--format", "markdown", "--json", ...optional];
}
93
+
/**
 * Classify a Firecrawl failure as either a clean, non-fatal skip (the caller
 * falls through to the original local-tool error) or a hard error worth
 * surfacing.
 *
 * Graceful-skip covers the known keyless reality: the CLI is absent, the IP
 * is flagged as suspicious, or the keyless quota / rate limit is hit.
 *
 * @param errorText Error text from the CLI or from a thrown error. Tolerates
 *                  null/undefined/non-string values at runtime.
 * @param exitCode  Process exit code, when known.
 * @returns A failure object; `reason` is the trimmed text or a generic
 *          fallback when the text is empty.
 */
export function classifyFirecrawlFailure(errorText: string, exitCode?: number): FirecrawlFailure {
  // Normalize once so every later use (matching, trimming, `raw`) is safe even
  // when a caller hands over a non-string despite the declared type.
  const raw = typeof errorText === "string" ? errorText : String(errorText ?? "");
  const haystack = raw.toLowerCase();

  const gracefulMarkers = [
    "is not installed",
    // NOTE(review): a raw spawn failure for a missing binary reads
    // "spawn firecrawl ENOENT"; treated as "CLI absent".
    "enoent",
    "looks suspicious", // also covers "ip address looks suspicious"
    "rate limit",
    "too many requests",
  ];

  const isGraceful =
    gracefulMarkers.some((marker) => haystack.includes(marker)) ||
    // A process exit code cannot be 429, so also look for the HTTP status as
    // a standalone number in the text.
    /\b429\b/.test(haystack) ||
    exitCode === 429;

  const reason = raw.trim();
  if (isGraceful) {
    return { kind: "graceful-skip", reason: reason || "firecrawl unavailable", raw };
  }
  return { kind: "hard-error", reason: reason || "firecrawl request failed", raw };
}
116
+
/**
 * Parse the stdout of `firecrawl scrape --json` into a normalized result.
 *
 * The CLI prints the scrape `data` object as JSON (which includes `markdown`
 * and `metadata.scrapeId`). Output that is not JSON at all is treated as raw
 * markdown so something useful is still returned. Empty output, JSON that is
 * not an object, and JSON without a non-empty `markdown` string are hard
 * errors.
 *
 * @param stdout Raw stdout of the CLI.
 * @param url    The URL that was scraped; echoed back in the result.
 * @returns The normalized scrape output. `bytes` is the UTF-8 size of `content`.
 */
export function parseScrapeOutput(stdout: string, url: string): FirecrawlScrapeOutput {
  const fail = (reason: string): FirecrawlScrapeOutput => ({
    ok: false,
    content: "",
    url,
    bytes: 0,
    failure: { kind: "hard-error", reason },
  });
  // `String.length` counts UTF-16 code units; measure the encoded size instead
  // so non-ASCII content is not under-reported.
  const utf8Size = (text: string): number => new TextEncoder().encode(text).length;
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  const trimmed = stdout.trim();
  if (!trimmed) {
    return fail("Empty output from firecrawl scrape");
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    // Not JSON — assume raw markdown.
    return { ok: true, content: trimmed, url, bytes: utf8Size(trimmed) };
  }

  // Valid JSON that is not an object (e.g. `null`, a number) carries no page.
  const markdown = isRecord(parsed) && typeof parsed.markdown === "string" ? parsed.markdown : "";
  if (!isRecord(parsed) || !markdown) {
    return fail("firecrawl scrape returned no markdown");
  }

  const metadata = isRecord(parsed.metadata) ? parsed.metadata : {};
  return {
    ok: true,
    content: markdown,
    url,
    title: typeof metadata.title === "string" ? metadata.title : undefined,
    scrapeId: typeof metadata.scrapeId === "string" ? metadata.scrapeId : undefined,
    bytes: utf8Size(markdown),
  };
}
151
+
152
+ // ---------------------------------------------------------------------------
153
+ // Side-effectful CLI runner (keyless-only) — not unit tested
154
+ // ---------------------------------------------------------------------------
155
+
/** Captured outcome of one firecrawl CLI invocation. */
export interface FirecrawlCliResult {
  /** Text the process wrote to standard output. */
  stdout: string;
  /** Text the process wrote to standard error. */
  stderr: string;
  /** Exit status of the process. */
  exitCode: number;
}
161
+
/**
 * Run the firecrawl CLI under an isolated temporary home directory with no
 * key env, so it can only ever operate in keyless mode (no stored
 * credentials, no FIRECRAWL_API_KEY). The temp directory is removed after the
 * process finishes, whether or not it succeeded.
 *
 * @param args    Arguments passed to the `firecrawl` command.
 * @param signal  Optional abort signal forwarded to the runner.
 * @param timeout Optional timeout forwarded to the runner.
 * @returns The CLI's stdout, stderr and exit code.
 */
export async function runFirecrawlCli(
  args: string[],
  signal?: AbortSignal,
  timeout?: number,
): Promise<FirecrawlCliResult> {
  const home = await mkdtemp(path.join(os.tmpdir(), "pi-firecrawl-"));
  try {
    const env: NodeJS.ProcessEnv = { ...process.env };
    // Force keyless: strip any credential the CLI would otherwise honor.
    delete env.FIRECRAWL_API_KEY;
    delete env.FIRECRAWL_API_URL;
    delete env.FIRECRAWL_OAUTH_TOKEN;
    env.HOME = home;
    env.XDG_CONFIG_HOME = path.join(home, ".config");
    if (process.platform === "win32") {
      // On Windows the home directory is resolved from USERPROFILE, not HOME,
      // so redirect it (and the per-user app-data roots) into the temp dir too.
      env.USERPROFILE = home;
      env.APPDATA = path.join(home, "AppData", "Roaming");
      env.LOCALAPPDATA = path.join(home, "AppData", "Local");
    }
    return await runCLI({ command: "firecrawl", args, env, signal, timeout });
  } finally {
    await rm(home, { recursive: true, force: true }).catch(() => { /* best-effort */ });
  }
}
186
+
/** Builds the graceful-skip failure returned when the Firecrawl fallback is turned off. */
function disabledFailure(reason = "Firecrawl fallback disabled"): FirecrawlFailure {
  const failure: FirecrawlFailure = { kind: "graceful-skip", reason };
  return failure;
}
190
+
191
+ // ---------------------------------------------------------------------------
192
+ // Search
193
+ // ---------------------------------------------------------------------------
194
+
/** Optional settings forwarded to `firecrawl search` as CLI flags. */
export interface FirecrawlSearchOptions {
  /** Value for `--limit`. */
  limit?: number;
  /** Values for `--sources`. */
  sources?: ("web" | "images" | "news")[];
  /** Values for `--categories`. */
  categories?: ("github" | "research" | "pdf")[];
  /** Value for `--country`. */
  country?: string;
  /** Value for `--tbs`. */
  tbs?: string;
  /** Value for `--location`. */
  location?: string;
}
203
+
/** One normalized search hit. Only `url` is guaranteed to be present. */
export interface FirecrawlSearchResultItem {
  /** Result title, when the response carried one. */
  title?: string;
  /** Result URL. */
  url: string;
  /** Result description, when the response carried one. */
  description?: string;
  /** Result category, when the response carried one. */
  category?: string;
  /** Markdown attached to the result, when the response carried it. */
  markdown?: string;
}
211
+
/** Normalized result of a search attempt, successful or not. */
export interface FirecrawlSearchOutput {
  /** True when the response was parsed successfully. */
  ok: boolean;
  /** Parsed hits; empty on failure. */
  results: FirecrawlSearchResultItem[];
  /** `creditsUsed` from the response envelope, when numeric. */
  creditsUsed?: number;
  /** `id` from the response envelope, when a string. */
  searchId?: string;
  /** `warning` from the response envelope, when a string. */
  warning?: string;
  /** Set when `ok` is false. */
  failure?: FirecrawlFailure;
}
220
+
/**
 * Fold domain filters into a search query using search operators, since the
 * firecrawl search CLI does not expose include/exclude domain flags directly.
 *
 * Include and exclude lists are both applied when both are given: includes
 * become a parenthesized `site:a OR site:b` clause, excludes become
 * `-site:x -site:y`. Blank entries are ignored.
 *
 * @param query          The user's search text.
 * @param includeDomains Hostnames to restrict results to.
 * @param excludeDomains Hostnames to remove from results.
 * @returns The query with any domain clauses appended, or the query unchanged
 *          when no usable domains were supplied.
 */
export function buildSearchQuery(
  query: string,
  includeDomains?: string[],
  excludeDomains?: string[],
): string {
  // Drop empty/whitespace-only hostnames so we never emit a bare "site:".
  const usable = (domains?: string[]): string[] =>
    (domains ?? []).map((d) => d.trim()).filter((d) => d.length > 0);

  const included = usable(includeDomains);
  const excluded = usable(excludeDomains);

  const parts = [query];
  if (included.length > 0) {
    parts.push(`(${included.map((d) => `site:${d}`).join(" OR ")})`);
  }
  if (excluded.length > 0) {
    parts.push(excluded.map((d) => `-site:${d}`).join(" "));
  }
  return parts.join(" ");
}
240
+
/**
 * Assemble the argument vector for `firecrawl search <query>`.
 *
 * `--json` is always present so the full response envelope (results, id,
 * creditsUsed) is preserved. Optional flags are appended in a fixed order
 * and skipped when unset or empty.
 *
 * @param query   Search text, passed as a single argument.
 * @param options Optional search settings.
 * @returns Arguments to pass to the firecrawl CLI.
 */
export function buildSearchArgs(query: string, options: FirecrawlSearchOptions): string[] {
  const { limit, sources, categories, country, tbs, location } = options;
  const argv: string[] = ["search", query, "--json"];

  if (limit !== undefined) argv.push("--limit", String(limit));

  // List-valued flags are comma-joined.
  const listFlags: Array<[string, readonly string[] | undefined]> = [
    ["--sources", sources],
    ["--categories", categories],
  ];
  for (const [flag, values] of listFlags) {
    if (values && values.length > 0) argv.push(flag, values.join(","));
  }

  // Plain string flags.
  const textFlags: Array<[string, string | undefined]> = [
    ["--country", country],
    ["--tbs", tbs],
    ["--location", location],
  ];
  for (const [flag, value] of textFlags) {
    if (value) argv.push(flag, value);
  }

  return argv;
}
255
+
/**
 * Parse the stdout of `firecrawl search --json` (the response envelope).
 *
 * Hits are collected from every source bucket under `data` — `web`, then
 * `news`, then `images` — so searches restricted to news or images are not
 * reported as empty. Entries without a string `url` are dropped. An envelope
 * with `success: false` is reported as a failure.
 *
 * NOTE(review): the bucket names and the `snippet` field used for news
 * descriptions follow the Firecrawl search response shape — confirm against
 * the CLI's actual JSON output.
 *
 * @param stdout Raw stdout of the CLI.
 * @returns The normalized search output; `ok: false` with a hard-error
 *          failure for empty, unparseable, non-object, or unsuccessful output.
 */
export function parseSearchOutput(stdout: string): FirecrawlSearchOutput {
  const fail = (reason: string): FirecrawlSearchOutput => ({
    ok: false,
    results: [],
    failure: { kind: "hard-error", reason },
  });
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);
  const text = (value: unknown): string | undefined => (typeof value === "string" ? value : undefined);

  const trimmed = stdout.trim();
  if (!trimmed) {
    return fail("Empty output from firecrawl search");
  }

  let envelope: unknown;
  try {
    envelope = JSON.parse(trimmed);
  } catch {
    return fail("Unparseable firecrawl search output");
  }
  if (!isRecord(envelope)) {
    return fail("Unparseable firecrawl search output");
  }
  if (envelope.success === false) {
    return fail(text(envelope.error) ?? "firecrawl search failed");
  }

  const data = envelope.data;
  // NOTE(review): a flat array under `data` is accepted as a plain hit list —
  // presumably an older response shape; verify it can still occur.
  const buckets: unknown[] = Array.isArray(data)
    ? [data]
    : isRecord(data)
      ? [data.web, data.news, data.images]
      : [];

  const results: FirecrawlSearchResultItem[] = [];
  for (const bucket of buckets) {
    if (!Array.isArray(bucket)) continue;
    for (const entry of bucket) {
      if (!isRecord(entry)) continue;
      const url = text(entry.url);
      if (!url) continue; // a hit without a URL is unusable
      results.push({
        title: text(entry.title),
        url,
        description: text(entry.description) ?? text(entry.snippet),
        category: text(entry.category),
        markdown: text(entry.markdown),
      });
    }
  }

  return {
    ok: true,
    results,
    creditsUsed: typeof envelope.creditsUsed === "number" ? envelope.creditsUsed : undefined,
    searchId: text(envelope.id),
    warning: text(envelope.warning),
  };
}
284
+
/**
 * Decides whether web_search should hand over to Firecrawl.
 *
 * @param localOk     Whether the local search completed without error.
 * @param resultCount Number of results the local search returned.
 * @returns True when the local search errored or found nothing; false as
 *          soon as there is at least one local result.
 */
export function shouldFallbackSearch(localOk: boolean, resultCount: number): boolean {
  if (!localOk) {
    return true;
  }
  return resultCount === 0;
}
292
+
/**
 * Scrape a single URL via the firecrawl CLI in keyless mode. Never throws —
 * returns a normalized output whose `failure` field distinguishes graceful
 * skips (CLI absent / IP flagged / rate-limited / disabled) from hard errors.
 *
 * @param url     Page to scrape.
 * @param options Scrape settings forwarded to the argument builder.
 * @param signal  Optional abort signal forwarded to the CLI runner.
 * @returns The parsed scrape output, or an `ok: false` output with `failure` set.
 */
export async function scrapeKeyless(
  url: string,
  options: FirecrawlScrapeOptions,
  signal?: AbortSignal,
): Promise<FirecrawlScrapeOutput> {
  const fail = (failure: FirecrawlFailure): FirecrawlScrapeOutput => ({
    ok: false,
    content: "",
    url,
    bytes: 0,
    failure,
  });

  if (!isFirecrawlEnabled()) {
    return fail(disabledFailure());
  }

  let result: FirecrawlCliResult;
  try {
    result = await runFirecrawlCli(buildScrapeArgs(url, options), signal, 90_000);
  } catch (err: unknown) {
    // Anything can be thrown; only trust `message` when it really is a
    // string, so the classifier always receives text.
    const message = (err as { message?: unknown } | null | undefined)?.message;
    return fail(classifyFirecrawlFailure(typeof message === "string" ? message : String(err)));
  }

  if (result.exitCode !== 0) {
    // Prefer stderr; fall back to stdout when stderr is empty.
    return fail(classifyFirecrawlFailure(result.stderr || result.stdout, result.exitCode));
  }
  return parseScrapeOutput(result.stdout, url);
}
322
+
/**
 * Search via the firecrawl CLI in keyless mode. Never throws — every failure
 * is reported through the returned output's `failure` field.
 *
 * @param query   Search text (already including any domain operators).
 * @param options Search settings forwarded to the argument builder.
 * @param signal  Optional abort signal forwarded to the CLI runner.
 * @returns The parsed search output, or an `ok: false` output with `failure` set.
 */
export async function searchKeyless(
  query: string,
  options: FirecrawlSearchOptions,
  signal?: AbortSignal,
): Promise<FirecrawlSearchOutput> {
  const fail = (failure: FirecrawlFailure): FirecrawlSearchOutput => ({
    ok: false,
    results: [],
    failure,
  });

  if (!isFirecrawlEnabled()) {
    return fail(disabledFailure());
  }

  let result: FirecrawlCliResult;
  try {
    result = await runFirecrawlCli(buildSearchArgs(query, options), signal, 90_000);
  } catch (err: unknown) {
    // Anything can be thrown; only trust `message` when it really is a
    // string, so the classifier always receives text.
    const message = (err as { message?: unknown } | null | undefined)?.message;
    return fail(classifyFirecrawlFailure(typeof message === "string" ? message : String(err)));
  }

  if (result.exitCode !== 0) {
    // Prefer stderr; fall back to stdout when stderr is empty.
    return fail(classifyFirecrawlFailure(result.stderr || result.stdout, result.exitCode));
  }
  return parseSearchOutput(result.stdout);
}
347
+
348
+ // ---------------------------------------------------------------------------
349
+ // Interact
350
+ // ---------------------------------------------------------------------------
351
+
/** Settings for one interact call. Supply `prompt`, `code`, or both. */
export interface FirecrawlInteractOptions {
  /** Task described in natural language. Needed when `code` is absent. */
  prompt?: string;
  /** Code to run in the browser sandbox. Needed when `prompt` is absent. */
  code?: string;
  /** Language of `code`; has no effect without `code`. */
  language?: "node" | "python" | "bash";
  /** Timeout in seconds (1-300). */
  timeout?: number;
}
362
+
/** Normalized result of an interact attempt, successful or not. */
export interface FirecrawlInteractOutput {
  /** True when the interact response was parsed as a success. */
  ok: boolean;
  /** The agent's answer (for a prompt) or the code result/stdout. */
  output: string;
  /** The page URL the interaction was for. */
  url: string;
  /** Identifier of the scrape the session was bound to, when known. */
  scrapeId?: string;
  /** `liveViewUrl` from the response, when a string. */
  liveViewUrl?: string;
  /** `creditsUsed` from the response, when numeric. */
  creditsUsed?: number;
  /** Set when `ok` is false. */
  failure?: FirecrawlFailure;
}
373
+
/**
 * Assemble the argument vector for `firecrawl interact`, bound to a scrapeId.
 *
 * Order: `-p`/`-c` (when set), `-s <scrapeId>`, a language flag for python or
 * bash (none for node or unset), `--timeout` (when set), then `--json`.
 */
export function buildInteractArgs(scrapeId: string, options: FirecrawlInteractOptions): string[] {
  const { prompt, code, language, timeout } = options;

  let languageFlag: string[] = [];
  if (language === "python") languageFlag = ["--python"];
  else if (language === "bash") languageFlag = ["--bash"];

  return [
    "interact",
    ...(prompt !== undefined ? ["-p", prompt] : []),
    ...(code !== undefined ? ["-c", code] : []),
    "-s",
    scrapeId,
    ...languageFlag,
    ...(timeout !== undefined ? ["--timeout", String(timeout)] : []),
    "--json",
  ];
}
386
+
/** Assemble the argument vector for `firecrawl interact stop <scrapeId>` with JSON output. */
export function buildInteractStopArgs(scrapeId: string): string[] {
  const argv: string[] = ["interact", "stop"];
  argv.push(scrapeId, "--json");
  return argv;
}
391
+
/**
 * Parse the stdout of `firecrawl interact --json` (the full response).
 *
 * The output text is the first non-empty string among the response's
 * `output`, `result` and `stdout` fields, or "" when none qualifies. `url` is
 * always returned empty here. A response with `success: false`, empty
 * stdout, or unparseable stdout yields `ok: false` with a hard-error failure.
 */
export function parseInteractOutput(stdout: string): FirecrawlInteractOutput {
  const failed = (reason: string): FirecrawlInteractOutput => ({
    ok: false,
    output: "",
    url: "",
    failure: { kind: "hard-error", reason },
  });

  const body = stdout.trim();
  if (body.length === 0) {
    return failed("Empty output from firecrawl interact");
  }

  try {
    const payload = JSON.parse(body) as Record<string, unknown>;
    if (payload.success === false) {
      return failed(typeof payload.error === "string" ? payload.error : "firecrawl interact failed");
    }

    // Take the first candidate field that holds a non-empty string.
    let output = "";
    for (const field of ["output", "result", "stdout"]) {
      const candidate = payload[field];
      if (typeof candidate === "string" && candidate) {
        output = candidate;
        break;
      }
    }

    const { liveViewUrl, creditsUsed } = payload;
    return {
      ok: true,
      output,
      url: "",
      liveViewUrl: typeof liveViewUrl === "string" ? liveViewUrl : undefined,
      creditsUsed: typeof creditsUsed === "number" ? creditsUsed : undefined,
    };
  } catch {
    // Reached for invalid JSON and for a JSON `null` payload alike.
    return failed("Unparseable firecrawl interact output");
  }
}
420
+
/**
 * Decides whether web_browse should hand over to Firecrawl interact.
 *
 * Errors whose message marks a purely local validation problem (malformed
 * caller-provided actions) do not fall back, since the same input would just
 * fail again on the cloud. Every other error — e.g. CLI missing or batch
 * execution failure — does.
 *
 * @param error The error raised by the local browse attempt.
 * @returns False for local validation errors, true otherwise.
 */
export function shouldFallbackBrowse(error: Error): boolean {
  const localValidationMarkers = [
    "requires non-empty",
    "requires non-negative",
    "unsupported browser action",
  ];
  const message = (error.message ?? "").toLowerCase();
  const isLocalValidationError = localValidationMarkers.some((marker) => message.includes(marker));
  return !isLocalValidationError;
}
440
+
/**
 * Drive a page via Firecrawl interact in keyless mode: scrape to start a
 * session, run one interact call (prompt or code), and ALWAYS stop the session
 * (even on error/abort) so no billable session is left open. Never throws.
 *
 * @param url     Page to interact with.
 * @param options Interact settings; at least one of `prompt` / `code` must be set.
 * @param signal  Optional abort signal for the scrape and interact calls. The
 *                final stop call deliberately runs without it.
 * @returns The parsed interact output (with `url` and `scrapeId` filled in),
 *          or an `ok: false` output with `failure` set.
 */
export async function interactKeyless(
  url: string,
  options: FirecrawlInteractOptions,
  signal?: AbortSignal,
): Promise<FirecrawlInteractOutput> {
  if (!isFirecrawlEnabled()) {
    return { ok: false, output: "", url, failure: disabledFailure() };
  }

  // Reject an empty request before any session is started: without a prompt
  // or code there is nothing for the interact call to run.
  if (options.prompt === undefined && options.code === undefined) {
    return {
      ok: false,
      output: "",
      url,
      failure: { kind: "hard-error", reason: "firecrawl interact requires a prompt or code" },
    };
  }

  // 1. Scrape to obtain a scrapeId that an interact session binds to.
  const scrape = await scrapeKeyless(url, {}, signal);
  if (!scrape.ok || !scrape.scrapeId) {
    return {
      ok: false,
      output: "",
      url,
      failure: scrape.failure ?? { kind: "hard-error", reason: "firecrawl scrape returned no scrapeId" },
    };
  }
  const scrapeId = scrape.scrapeId;

  // 2. Interact, then always stop (best-effort, independent of the user signal).
  try {
    let result: FirecrawlCliResult;
    try {
      result = await runFirecrawlCli(buildInteractArgs(scrapeId, options), signal, 120_000);
    } catch (err: unknown) {
      // Anything can be thrown; only trust `message` when it really is a
      // string, so the classifier always receives text.
      const message = (err as { message?: unknown } | null | undefined)?.message;
      const failure = classifyFirecrawlFailure(typeof message === "string" ? message : String(err));
      return { ok: false, output: "", url, scrapeId, failure };
    }
    if (result.exitCode !== 0) {
      // Prefer stderr; fall back to stdout when stderr is empty.
      const failure = classifyFirecrawlFailure(result.stderr || result.stdout, result.exitCode);
      return { ok: false, output: "", url, scrapeId, failure };
    }
    const parsed = parseInteractOutput(result.stdout);
    return { ...parsed, url, scrapeId };
  } finally {
    // Runs on every path out of the try block, including the early returns.
    await runFirecrawlCli(buildInteractStopArgs(scrapeId), undefined, 30_000).catch(() => { /* best-effort */ });
  }
}