pi-web-toolkit 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +143 -18
- package/docs/adr/0001-firecrawl-keyless-cloud-fallback.md +5 -0
- package/docs/agents/issue-tracker.md +2 -2
- package/docs/guide.md +32 -7
- package/docs/tools.md +79 -10
- package/extensions/firecrawl_interact.ts +147 -0
- package/extensions/firecrawl_scrape.ts +154 -0
- package/extensions/firecrawl_search.ts +165 -0
- package/extensions/index.ts +7 -1
- package/extensions/utils/cli-runner.ts +4 -1
- package/extensions/utils/firecrawl.ts +484 -0
- package/extensions/web_batch_fetch.ts +1 -2
- package/extensions/web_browse.ts +61 -4
- package/extensions/web_fetch.ts +32 -10
- package/extensions/web_search.ts +85 -35
- package/package.json +9 -7
|
@@ -0,0 +1,484 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Firecrawl CLI wrapper
|
|
3
|
+
*
|
|
4
|
+
* Drives the `firecrawl-cli` npm package (an official Firecrawl client) in
|
|
5
|
+
* KEYLESS-ONLY mode: the cloud only grants the free keyless tier when a
|
|
6
|
+
* request comes from an official client with NO Authorization header, so we
|
|
7
|
+
* shell out to the CLI rather than hand-rolling REST, and we isolate the
|
|
8
|
+
* child process under a temporary HOME with no key env so stored credentials
|
|
9
|
+
* / API keys can never be picked up.
|
|
10
|
+
*
|
|
11
|
+
* This module is split so that the decision-rich, deterministic logic lives
|
|
12
|
+
* in pure, network-free functions (argument builders, output parsers, failure
|
|
13
|
+
* classification, keyless-eligibility, fallback decisions) which are unit
|
|
14
|
+
* tested at this boundary — mirroring the agent-browser wrapper. The
|
|
15
|
+
* side-effectful CLI spawning is a thin layer on top.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { mkdtemp, rm } from "node:fs/promises";
|
|
19
|
+
import * as os from "node:os";
|
|
20
|
+
import * as path from "node:path";
|
|
21
|
+
import { runCLI } from "./cli-runner";
|
|
22
|
+
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
// Shared types
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
/**
 * Category of a Firecrawl attempt that produced no result: either a quiet
 * skip or an error that should be reported.
 */
export type FirecrawlFailureKind =
  | "hard-error"
  | "graceful-skip";
|
|
29
|
+
|
|
30
|
+
/**
 * Reports whether the keyless Firecrawl fallback may be used.
 *
 * The fallback is on unless `PI_WEB_FIRECRAWL_FALLBACK` holds one of the
 * opt-out values `0`, `false`, `no` or `off` (compared case-insensitively,
 * ignoring surrounding whitespace). An unset or empty variable leaves it on.
 * This variable is the single switch for a strict local-only /
 * no-cloud-egress policy.
 *
 * @returns `true` when the fallback is allowed, `false` when opted out.
 */
export function isFirecrawlEnabled(): boolean {
  const rawSetting = process.env.PI_WEB_FIRECRAWL_FALLBACK ?? "";
  const normalized = rawSetting.trim().toLowerCase();
  const optOutValues = ["0", "false", "no", "off"];
  return !optOutValues.includes(normalized);
}
|
|
39
|
+
|
|
40
|
+
/** Describes why a Firecrawl attempt did not yield a usable result. */
export interface FirecrawlFailure {
  /** Whether the failure is a quiet skip or an error worth surfacing. */
  kind: FirecrawlFailureKind;
  /** Explanation intended for humans. */
  reason: string;
  /** Unprocessed error text, if any was captured. */
  raw?: string;
}
|
|
47
|
+
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
// Scrape
|
|
50
|
+
// ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
/** Caller-tunable knobs for a single-URL Firecrawl scrape. */
export interface FirecrawlScrapeOptions {
  /** When not `false`, only the main content is extracted (nav/footer/etc. dropped). Defaults to true. */
  onlyMainContent?: boolean;
  /** Delay in milliseconds before scraping, to let JS-rendered content appear. */
  waitFor?: number;
  /** HTML tags to keep. This is Firecrawl's tag filter, not a CSS selector. */
  includeTags?: string[];
  /** HTML tags to drop. */
  excludeTags?: string[];
}
|
|
62
|
+
|
|
63
|
+
/** Normalized result of a Firecrawl scrape, successful or not. */
export interface FirecrawlScrapeOutput {
  /** `true` when content was obtained; `false` when `failure` explains why not. */
  ok: boolean;
  /** The URL that was scraped. */
  url: string;
  /** Clean markdown content (empty on failure). */
  content: string;
  /** Size of `content` (0 on failure). */
  bytes: number;
  /** Page title, when the scrape metadata carries one. */
  title?: string;
  /** Identifier of the scrape, when the scrape metadata carries one. */
  scrapeId?: string;
  /** Credits consumed, when the source reports it; scrape responses usually do not. */
  creditsUsed?: number;
  /** Present when the attempt did not produce content. */
  failure?: FirecrawlFailure;
}
|
|
75
|
+
|
|
76
|
+
/**
 * Assemble the argument vector for `firecrawl scrape` on one URL.
 *
 * `--json` is always included so that the complete data object (notably
 * `metadata.scrapeId`, which interact needs) can be parsed from stdout.
 *
 * @param url     Page to scrape; passed as the positional argument.
 * @param options Optional filters and timing (see `FirecrawlScrapeOptions`).
 * @returns The CLI arguments, starting with the `scrape` subcommand.
 */
export function buildScrapeArgs(url: string, options: FirecrawlScrapeOptions): string[] {
  const { onlyMainContent, waitFor, includeTags, excludeTags } = options;
  const argv: string[] = ["scrape", url, "--format", "markdown", "--json"];

  // Main-content extraction is the default; only an explicit `false` disables it.
  if (onlyMainContent !== false) {
    argv.push("--only-main-content");
  }
  if (waitFor !== undefined) {
    argv.push("--wait-for", String(waitFor));
  }

  // Tag filters are sent as comma-separated lists and skipped when empty.
  const tagFilters: Array<[string, string[] | undefined]> = [
    ["--include-tags", includeTags],
    ["--exclude-tags", excludeTags],
  ];
  for (const [flag, tags] of tagFilters) {
    if (tags && tags.length > 0) {
      argv.push(flag, tags.join(","));
    }
  }
  return argv;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Classify a Firecrawl failure as either a clean, non-fatal skip (the caller
 * falls through to the original local-tool error) or a hard error worth
 * surfacing.
 *
 * Graceful-skip covers the known keyless reality: the CLI is absent, the IP
 * is flagged as suspicious, or the keyless quota / rate limit is exhausted.
 * These must never make the user worse off than the local tool already did.
 *
 * @param errorText Error output or exception message. A missing or
 *                  non-string value is treated as empty text instead of throwing.
 * @param exitCode  Status reported by the runner, when available.
 * @returns A failure whose `reason` is the trimmed text (or a generic
 *          fallback when the text is blank) and whose `raw` is the text as given.
 */
export function classifyFirecrawlFailure(errorText: string, exitCode?: number): FirecrawlFailure {
  // Normalize once: every later use goes through `raw`, so an undefined
  // message can no longer throw on `.trim()`.
  const raw = typeof errorText === "string" ? errorText : "";
  const text = raw.toLowerCase();
  const reason = raw.trim();

  // Substrings that identify an expected, non-fatal condition.
  // "looks suspicious" also covers the longer "ip address looks suspicious".
  const gracefulMarkers = [
    "is not installed",
    "command not found",
    "looks suspicious",
    "rate limit",
    "too many requests",
  ];
  // Node reports a missing executable as "spawn <command> ENOENT".
  const spawnNotFound = /spawn \S+ enoent/.test(text);

  // NOTE(review): OS exit statuses are normally 0-255, so the 429 check only
  // matches if the runner reports an HTTP-style status — confirm against runCLI.
  const isGraceful =
    exitCode === 429 ||
    spawnNotFound ||
    gracefulMarkers.some((marker) => text.includes(marker));

  if (isGraceful) {
    return { kind: "graceful-skip", reason: reason || "firecrawl unavailable", raw };
  }
  return { kind: "hard-error", reason: reason || "firecrawl request failed", raw };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Parse the stdout of `firecrawl scrape --json` into a normalized result.
 *
 * The CLI prints the scrape `data` object as JSON (which includes `markdown`
 * and `metadata.scrapeId`). Outcomes:
 * - blank stdout: hard error;
 * - stdout that is not JSON: treated as raw markdown so something useful is
 *   still returned;
 * - JSON without a non-empty `markdown` string (including non-object JSON
 *   such as `null`): hard error, using the payload's own `error` string as
 *   the reason when it has one.
 *
 * `bytes` is the UTF-8 encoded size of the returned content, so multi-byte
 * characters are counted correctly.
 *
 * @param stdout Captured standard output of the CLI.
 * @param url    URL that was scraped; echoed back in the result.
 */
export function parseScrapeOutput(stdout: string, url: string): FirecrawlScrapeOutput {
  const fail = (reason: string): FirecrawlScrapeOutput => ({
    ok: false,
    content: "",
    url,
    bytes: 0,
    failure: { kind: "hard-error", reason },
  });

  const trimmed = stdout.trim();
  if (!trimmed) {
    return fail("Empty output from firecrawl scrape");
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    // Not JSON — assume raw markdown.
    return { ok: true, content: trimmed, url, bytes: Buffer.byteLength(trimmed, "utf8") };
  }

  // Only an object can carry the expected fields; anything else (null, a bare
  // number or string) is handled as "no markdown" rather than via a TypeError.
  const data: Record<string, unknown> =
    parsed !== null && typeof parsed === "object" ? (parsed as Record<string, unknown>) : {};

  const markdown = typeof data.markdown === "string" ? data.markdown : "";
  if (!markdown) {
    // Prefer the error message the payload itself carries, when present.
    const reported = typeof data.error === "string" ? data.error.trim() : "";
    return fail(reported || "firecrawl scrape returned no markdown");
  }

  const metadata: Record<string, unknown> =
    data.metadata !== null && typeof data.metadata === "object"
      ? (data.metadata as Record<string, unknown>)
      : {};

  return {
    ok: true,
    content: markdown,
    url,
    title: typeof metadata.title === "string" ? metadata.title : undefined,
    scrapeId: typeof metadata.scrapeId === "string" ? metadata.scrapeId : undefined,
    bytes: Buffer.byteLength(markdown, "utf8"),
  };
}
|
|
151
|
+
|
|
152
|
+
// ---------------------------------------------------------------------------
|
|
153
|
+
// Side-effectful CLI runner (keyless-only) — not unit tested
|
|
154
|
+
// ---------------------------------------------------------------------------
|
|
155
|
+
|
|
156
|
+
/** Captured outcome of one firecrawl CLI invocation. */
export interface FirecrawlCliResult {
  /** Exit status of the process; callers treat any non-zero value as failure. */
  exitCode: number;
  /** Everything the process wrote to standard output. */
  stdout: string;
  /** Everything the process wrote to standard error. */
  stderr: string;
}
|
|
161
|
+
|
|
162
|
+
/**
 * Run the firecrawl CLI under an isolated temporary home with no key env, so
 * it can only ever operate in keyless mode (no stored credentials, no
 * FIRECRAWL_API_KEY). The temporary directory is removed after the process
 * finishes, whether it succeeded or threw.
 *
 * Isolation covers every variable a CLI commonly uses to locate per-user
 * configuration, not just `HOME`: on Windows the home directory is taken
 * from `USERPROFILE` and application data from `APPDATA` / `LOCALAPPDATA`,
 * and on Linux an explicitly set `XDG_*_HOME` overrides the `HOME` default.
 *
 * @param args    Arguments passed to the `firecrawl` command.
 * @param signal  Optional abort signal forwarded to the runner.
 * @param timeout Optional timeout forwarded to the runner.
 * @returns Whatever `runCLI` resolves with for this invocation.
 */
export async function runFirecrawlCli(
  args: string[],
  signal?: AbortSignal,
  timeout?: number,
): Promise<FirecrawlCliResult> {
  const home = await mkdtemp(path.join(os.tmpdir(), "pi-firecrawl-"));
  try {
    const env: NodeJS.ProcessEnv = { ...process.env };

    // Force keyless: strip any credential the CLI would otherwise honor.
    // Names are matched case-insensitively because Windows environment
    // variables are, and the spread copy above is a plain object.
    const credentialVars = new Set(["FIRECRAWL_API_KEY", "FIRECRAWL_API_URL", "FIRECRAWL_OAUTH_TOKEN"]);
    for (const name of Object.keys(env)) {
      if (credentialVars.has(name.toUpperCase())) {
        delete env[name];
      }
    }

    // Point every per-user location at the throwaway directory.
    env.HOME = home;
    env.USERPROFILE = home;
    env.APPDATA = path.join(home, "AppData", "Roaming");
    env.LOCALAPPDATA = path.join(home, "AppData", "Local");
    env.XDG_CONFIG_HOME = path.join(home, ".config");
    env.XDG_DATA_HOME = path.join(home, ".local", "share");
    env.XDG_STATE_HOME = path.join(home, ".local", "state");

    return await runCLI({ command: "firecrawl", args, env, signal, timeout });
  } finally {
    // Cleanup is best-effort: a failed removal must not mask the CLI result.
    await rm(home, { recursive: true, force: true }).catch(() => { /* best-effort */ });
  }
}
|
|
186
|
+
|
|
187
|
+
/**
 * Build the graceful-skip failure reported when the Firecrawl fallback is
 * switched off.
 *
 * @param reason Text to report; defaults to a generic "disabled" message.
 */
function disabledFailure(reason = "Firecrawl fallback disabled"): FirecrawlFailure {
  const failure: FirecrawlFailure = { kind: "graceful-skip", reason };
  return failure;
}
|
|
190
|
+
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
// Search
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
/** Optional settings for a Firecrawl search; each maps to a CLI flag of the same name. */
export interface FirecrawlSearchOptions {
  /** Forwarded as `--limit`. */
  limit?: number;
  /** Result sources, forwarded comma-separated as `--sources`. */
  sources?: ("web" | "images" | "news")[];
  /** Result categories, forwarded comma-separated as `--categories`. */
  categories?: ("github" | "research" | "pdf")[];
  /** Forwarded as `--country`. */
  country?: string;
  /** Forwarded as `--location`. */
  location?: string;
  /** Forwarded as `--tbs`. */
  tbs?: string;
}
|
|
203
|
+
|
|
204
|
+
/** One normalized entry from a Firecrawl search response. */
export interface FirecrawlSearchResultItem {
  /** Address of the result; always present. */
  url: string;
  /** Result title, when provided. */
  title?: string;
  /** Short summary of the result, when provided. */
  description?: string;
  /** Category label of the result, when provided. */
  category?: string;
  /** Markdown content of the result, when provided. */
  markdown?: string;
}
|
|
211
|
+
|
|
212
|
+
/** Normalized result of a Firecrawl search, successful or not. */
export interface FirecrawlSearchOutput {
  /** `true` when the response was parsed; `false` when `failure` explains why not. */
  ok: boolean;
  /** Parsed result entries (empty on failure). */
  results: FirecrawlSearchResultItem[];
  /** The response envelope's `id`, when present. */
  searchId?: string;
  /** The response envelope's `creditsUsed`, when present. */
  creditsUsed?: number;
  /** The response envelope's `warning`, when present. */
  warning?: string;
  /** Present when the attempt did not produce results. */
  failure?: FirecrawlFailure;
}
|
|
220
|
+
|
|
221
|
+
/**
 * Fold domain filters into a search query using search operators, since the
 * firecrawl search CLI does not expose include/exclude domain flags directly.
 *
 * Included domains become a parenthesised `site:a OR site:b` clause and
 * excluded domains become `-site:x` terms. Both filters are applied when both
 * are supplied; previously the exclusions were dropped whenever any inclusion
 * was present. Blank domain entries are ignored so no bare `site:` operator
 * is emitted.
 *
 * @param query          The user's search terms; returned untouched when no
 *                       usable filter is given.
 * @param includeDomains Domains the results should be restricted to.
 * @param excludeDomains Domains the results must not come from.
 * @returns The query with the filter clauses appended, space-separated.
 */
export function buildSearchQuery(
  query: string,
  includeDomains?: string[],
  excludeDomains?: string[],
): string {
  // Drop entries that are empty or whitespace-only.
  const usable = (domains?: string[]): string[] =>
    (domains ?? []).map((d) => d.trim()).filter((d) => d.length > 0);

  const included = usable(includeDomains);
  const excluded = usable(excludeDomains);

  const parts: string[] = [query];
  if (included.length > 0) {
    parts.push(`(${included.map((d) => `site:${d}`).join(" OR ")})`);
  }
  if (excluded.length > 0) {
    parts.push(excluded.map((d) => `-site:${d}`).join(" "));
  }
  return parts.join(" ");
}
|
|
240
|
+
|
|
241
|
+
/**
 * Assemble the argument vector for `firecrawl search`.
 *
 * `--json` is always included so the whole response envelope (results, id,
 * creditsUsed) survives. Optional settings are appended only when set; list
 * settings are comma-joined and skipped when empty.
 *
 * @param query   Search terms; passed as the positional argument.
 * @param options Optional search settings (see `FirecrawlSearchOptions`).
 * @returns The CLI arguments, starting with the `search` subcommand.
 */
export function buildSearchArgs(query: string, options: FirecrawlSearchOptions): string[] {
  const { limit, sources, categories, country, tbs, location } = options;
  const argv: string[] = ["search", query, "--json"];

  if (limit !== undefined) {
    argv.push("--limit", String(limit));
  }
  if (sources && sources.length > 0) {
    argv.push("--sources", sources.join(","));
  }
  if (categories && categories.length > 0) {
    argv.push("--categories", categories.join(","));
  }

  // Plain string settings: emitted in this order, skipped when empty/unset.
  const textFlags: Array<[string, string | undefined]> = [
    ["--country", country],
    ["--tbs", tbs],
    ["--location", location],
  ];
  for (const [flag, value] of textFlags) {
    if (value) {
      argv.push(flag, value);
    }
  }
  return argv;
}
|
|
255
|
+
|
|
256
|
+
/**
 * Parse the stdout of `firecrawl search --json` (the response envelope).
 *
 * Results are collected from every source group of the envelope's `data`
 * object — `web`, then `news`, then `images` — so a search restricted to news
 * or images no longer comes back empty. Entries without a string `url` are
 * dropped. A `snippet` field is used as the description when `description`
 * is absent.
 * NOTE(review): the `news`/`images` groups and the `snippet` field follow
 * Firecrawl's documented search response — confirm against the CLI output.
 *
 * Failures (all `hard-error`):
 * - blank stdout;
 * - stdout that is not a JSON object;
 * - an envelope with `success: false` (its `error` string becomes the reason).
 *
 * @param stdout Captured standard output of the CLI.
 */
export function parseSearchOutput(stdout: string): FirecrawlSearchOutput {
  const fail = (reason: string): FirecrawlSearchOutput => ({
    ok: false,
    results: [],
    failure: { kind: "hard-error", reason },
  });
  const asString = (value: unknown): string | undefined =>
    typeof value === "string" ? value : undefined;

  const trimmed = stdout.trim();
  if (!trimmed) {
    return fail("Empty output from firecrawl search");
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    return fail("Unparseable firecrawl search output");
  }
  if (parsed === null || typeof parsed !== "object") {
    return fail("Unparseable firecrawl search output");
  }
  const envelope = parsed as Record<string, unknown>;

  // An explicit failure envelope must not be reported as an empty success.
  if (envelope.success === false) {
    return fail(asString(envelope.error)?.trim() || "firecrawl search failed");
  }

  const data: Record<string, unknown> =
    envelope.data !== null && typeof envelope.data === "object"
      ? (envelope.data as Record<string, unknown>)
      : {};

  const results: FirecrawlSearchResultItem[] = [];
  for (const group of ["web", "news", "images"]) {
    const entries = data[group];
    if (!Array.isArray(entries)) continue;
    for (const entry of entries) {
      if (entry === null || typeof entry !== "object") continue;
      const record = entry as Record<string, unknown>;
      const url = asString(record.url);
      if (!url) continue; // a result without an address is unusable
      results.push({
        title: asString(record.title),
        url,
        description: asString(record.description) ?? asString(record.snippet),
        category: asString(record.category),
        markdown: asString(record.markdown),
      });
    }
  }

  return {
    ok: true,
    results,
    creditsUsed: typeof envelope.creditsUsed === "number" ? envelope.creditsUsed : undefined,
    searchId: asString(envelope.id),
    warning: asString(envelope.warning),
  };
}
|
|
284
|
+
|
|
285
|
+
/**
 * Decide whether web_search should retry through Firecrawl.
 *
 * A fallback is wanted when the local search failed, or when it succeeded
 * but found nothing. A successful search with at least one result never
 * falls back.
 *
 * @param localOk     Whether the local search completed without error.
 * @param resultCount Number of results the local search produced.
 */
export function shouldFallbackSearch(localOk: boolean, resultCount: number): boolean {
  if (!localOk) {
    return true;
  }
  return resultCount === 0;
}
|
|
292
|
+
|
|
293
|
+
/**
 * Scrape a single URL via the firecrawl CLI in keyless mode. Never throws —
 * returns a normalized output whose `failure` field distinguishes graceful
 * skips (CLI absent / IP flagged / rate-limited / disabled) from hard errors.
 *
 * @param url     Page to scrape.
 * @param options Scrape settings forwarded to the argument builder.
 * @param signal  Optional abort signal forwarded to the CLI runner.
 * @returns The parsed scrape on success; otherwise `ok: false` with a
 *          classified `failure`.
 */
export async function scrapeKeyless(
  url: string,
  options: FirecrawlScrapeOptions,
  signal?: AbortSignal,
): Promise<FirecrawlScrapeOutput> {
  const failed = (failure: FirecrawlFailure): FirecrawlScrapeOutput => ({
    ok: false,
    content: "",
    url,
    bytes: 0,
    failure,
  });

  if (!isFirecrawlEnabled()) {
    return failed(disabledFailure());
  }

  let result: FirecrawlCliResult;
  try {
    // The CLI process is given 90 seconds for a scrape.
    result = await runFirecrawlCli(buildScrapeArgs(url, options), signal, 90_000);
  } catch (err: unknown) {
    // Anything thrown while launching/running the CLI is classified, not rethrown.
    const msg = err instanceof Error ? err.message : String(err);
    return failed(classifyFirecrawlFailure(msg));
  }

  if (result.exitCode !== 0) {
    // Prefer stderr; fall back to stdout when the CLI wrote its error there.
    return failed(classifyFirecrawlFailure(result.stderr || result.stdout, result.exitCode));
  }
  return parseScrapeOutput(result.stdout, url);
}
|
|
322
|
+
|
|
323
|
+
/**
 * Run a search through the firecrawl CLI in keyless mode. Never throws: every
 * failure path resolves to `ok: false` with a classified `failure`.
 *
 * @param query   Search terms.
 * @param options Search settings forwarded to the argument builder.
 * @param signal  Optional abort signal forwarded to the CLI runner.
 */
export async function searchKeyless(
  query: string,
  options: FirecrawlSearchOptions,
  signal?: AbortSignal,
): Promise<FirecrawlSearchOutput> {
  const failed = (failure: FirecrawlFailure): FirecrawlSearchOutput => ({
    ok: false,
    results: [],
    failure,
  });

  if (!isFirecrawlEnabled()) {
    return failed(disabledFailure());
  }

  let cli: FirecrawlCliResult;
  try {
    // The CLI process is given 90 seconds for a search.
    cli = await runFirecrawlCli(buildSearchArgs(query, options), signal, 90_000);
  } catch (err: any) {
    return failed(classifyFirecrawlFailure(err?.message ?? String(err)));
  }

  if (cli.exitCode === 0) {
    return parseSearchOutput(cli.stdout);
  }
  // Prefer stderr; fall back to stdout when the CLI wrote its error there.
  return failed(classifyFirecrawlFailure(cli.stderr || cli.stdout, cli.exitCode));
}
|
|
347
|
+
|
|
348
|
+
// ---------------------------------------------------------------------------
|
|
349
|
+
// Interact
|
|
350
|
+
// ---------------------------------------------------------------------------
|
|
351
|
+
|
|
352
|
+
/** What to do inside a Firecrawl interact session: a prompt or a code snippet. */
export interface FirecrawlInteractOptions {
  /** Task described in natural language. Must be given unless `code` is. */
  prompt?: string;
  /** Code to run in the browser sandbox. Must be given unless `prompt` is. */
  code?: string;
  /** Language of `code`; has no meaning without `code`. */
  language?: "node" | "python" | "bash";
  /** Timeout in seconds (1-300). */
  timeout?: number;
}
|
|
362
|
+
|
|
363
|
+
/** Normalized result of a Firecrawl interact call, successful or not. */
export interface FirecrawlInteractOutput {
  /** `true` when the call succeeded; `false` when `failure` explains why not. */
  ok: boolean;
  /** URL the interaction was run against. */
  url: string;
  /** The agent's answer (for a prompt) or the code's result/stdout. */
  output: string;
  /** Identifier of the scrape the session was bound to, when known. */
  scrapeId?: string;
  /** The response's `liveViewUrl`, when present. */
  liveViewUrl?: string;
  /** The response's `creditsUsed`, when present. */
  creditsUsed?: number;
  /** Present when the call did not succeed. */
  failure?: FirecrawlFailure;
}
|
|
373
|
+
|
|
374
|
+
/**
 * Assemble the argument vector for `firecrawl interact`, bound to a scrapeId.
 *
 * Order of emitted arguments: prompt (`-p`), code (`-c`), scrape id (`-s`),
 * language flag, `--timeout`, and finally `--json`. The `node` language adds
 * no flag; `python` and `bash` add `--python` / `--bash`.
 *
 * @param scrapeId Identifier of the scrape the session attaches to.
 * @param options  Prompt or code to run, plus language and timeout.
 */
export function buildInteractArgs(scrapeId: string, options: FirecrawlInteractOptions): string[] {
  const { prompt, code, language, timeout } = options;
  const argv: string[] = ["interact"];

  if (prompt !== undefined) {
    argv.push("-p", prompt);
  }
  if (code !== undefined) {
    argv.push("-c", code);
  }
  argv.push("-s", scrapeId);

  const languageFlag =
    language === "python" ? "--python" : language === "bash" ? "--bash" : undefined;
  if (languageFlag !== undefined) {
    argv.push(languageFlag);
  }
  if (timeout !== undefined) {
    argv.push("--timeout", String(timeout));
  }

  argv.push("--json");
  return argv;
}
|
|
386
|
+
|
|
387
|
+
/**
 * Assemble the argument vector for `firecrawl interact stop`.
 *
 * @param scrapeId Identifier of the scrape whose session should be stopped.
 */
export function buildInteractStopArgs(scrapeId: string): string[] {
  const argv: string[] = ["interact", "stop"];
  argv.push(scrapeId, "--json");
  return argv;
}
|
|
391
|
+
|
|
392
|
+
/**
 * Parse the stdout of `firecrawl interact --json` (the full response).
 *
 * The returned `output` is the first non-empty string among the response's
 * `output`, `result` and `stdout` fields, or `""` when none is set. `url` is
 * always returned empty here. Blank stdout, unparseable stdout, and a
 * response with `success: false` all yield `ok: false` with a `hard-error`
 * failure.
 *
 * @param stdout Captured standard output of the CLI.
 */
export function parseInteractOutput(stdout: string): FirecrawlInteractOutput {
  const fail = (reason: string): FirecrawlInteractOutput => ({
    ok: false,
    output: "",
    url: "",
    failure: { kind: "hard-error", reason },
  });

  const body = stdout.trim();
  if (body.length === 0) {
    return fail("Empty output from firecrawl interact");
  }

  try {
    const response = JSON.parse(body) as Record<string, unknown>;

    if (response.success === false) {
      return fail(typeof response.error === "string" ? response.error : "firecrawl interact failed");
    }

    // First candidate that is a non-empty string wins.
    const candidates = [response.output, response.result, response.stdout];
    const text = candidates.find((c): c is string => typeof c === "string" && c.length > 0) ?? "";

    const liveViewUrl = typeof response.liveViewUrl === "string" ? response.liveViewUrl : undefined;
    const creditsUsed = typeof response.creditsUsed === "number" ? response.creditsUsed : undefined;
    return { ok: true, output: text, url: "", liveViewUrl, creditsUsed };
  } catch {
    // Covers both invalid JSON and JSON that is not an object (e.g. `null`).
    return fail("Unparseable firecrawl interact output");
  }
}
|
|
420
|
+
|
|
421
|
+
/**
 * Decide whether web_browse should retry through Firecrawl interact.
 *
 * Runtime failures (CLI missing, batch execution failure) fall back. Errors
 * caused by malformed caller-provided actions do not, because the same input
 * would fail again on the cloud. Those are recognised by phrases in the
 * error message, compared case-insensitively.
 *
 * @param error The error raised by the local browse attempt.
 * @returns `false` for a local validation error, `true` for anything else.
 */
export function shouldFallbackBrowse(error: Error): boolean {
  const localValidationPhrases = [
    "requires non-empty",
    "requires non-negative",
    "unsupported browser action",
  ];
  const message = (error.message ?? "").toLowerCase();
  const isLocalValidationError = localValidationPhrases.some((phrase) => message.includes(phrase));
  return !isLocalValidationError;
}
|
|
440
|
+
|
|
441
|
+
/**
 * Drive a page via Firecrawl interact in keyless mode: scrape to start a
 * session, run one interact call (prompt or code), and ALWAYS stop the session
 * (even on error/abort) so no billable session is left open. Never throws.
 *
 * The interact process is allowed at least 120 seconds, and longer when the
 * caller asks for a longer interaction: `options.timeout` is in seconds and
 * may be up to 300, so a fixed 120-second limit would kill long interactions
 * before their own timeout could elapse.
 *
 * @param url     Page to interact with.
 * @param options Prompt or code to run, plus language and timeout.
 * @param signal  Optional abort signal for the scrape and interact calls. It
 *                is deliberately not passed to the stop call.
 * @returns The parsed interaction with `url` and `scrapeId` filled in, or
 *          `ok: false` with a classified `failure`.
 */
export async function interactKeyless(
  url: string,
  options: FirecrawlInteractOptions,
  signal?: AbortSignal,
): Promise<FirecrawlInteractOutput> {
  if (!isFirecrawlEnabled()) {
    return { ok: false, output: "", url, failure: disabledFailure() };
  }

  // 1. Scrape to obtain a scrapeId that an interact session binds to.
  const scrape = await scrapeKeyless(url, {}, signal);
  if (!scrape.ok || !scrape.scrapeId) {
    return {
      ok: false,
      output: "",
      url,
      failure: scrape.failure ?? { kind: "hard-error", reason: "firecrawl scrape returned no scrapeId" },
    };
  }
  const scrapeId = scrape.scrapeId;

  // Process limit: the requested interaction time plus a margin for startup
  // and teardown, never less than the previous fixed 120 seconds.
  const requestedMs =
    typeof options.timeout === "number" && Number.isFinite(options.timeout) && options.timeout > 0
      ? options.timeout * 1000
      : 0;
  const processTimeoutMs = Math.max(120_000, requestedMs + 15_000);

  // 2. Interact, then always stop (best-effort, independent of the user signal).
  try {
    let result: FirecrawlCliResult;
    try {
      result = await runFirecrawlCli(buildInteractArgs(scrapeId, options), signal, processTimeoutMs);
    } catch (err: unknown) {
      const msg = err instanceof Error ? err.message : String(err);
      return { ok: false, output: "", url, scrapeId, failure: classifyFirecrawlFailure(msg) };
    }
    if (result.exitCode !== 0) {
      // Prefer stderr; fall back to stdout when the CLI wrote its error there.
      const failure = classifyFirecrawlFailure(result.stderr || result.stdout, result.exitCode);
      return { ok: false, output: "", url, scrapeId, failure };
    }
    const parsed = parseInteractOutput(result.stdout);
    // The parser does not know the URL or scrape id; fill them in here.
    return { ...parsed, url, scrapeId };
  } finally {
    // No signal here: the session must be stopped even after the caller aborted.
    await runFirecrawlCli(buildInteractStopArgs(scrapeId), undefined, 30_000).catch(() => { /* best-effort */ });
  }
}
|
|
@@ -109,8 +109,7 @@ const webBatchFetchTool = defineTool({
|
|
|
109
109
|
label: "Web Batch Fetch",
|
|
110
110
|
description: [
|
|
111
111
|
"Fetch multiple web pages in parallel and return their content aggregated.",
|
|
112
|
-
"Use web_batch_fetch
|
|
113
|
-
"that the agent wants to read simultaneously for comparison or synthesis.",
|
|
112
|
+
"Use web_batch_fetch for 2–5 relevant URLs, whether discovered by search or provided by the user.",
|
|
114
113
|
"For a single page, use web_fetch instead.",
|
|
115
114
|
`Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
|
|
116
115
|
].join(" "),
|
package/extensions/web_browse.ts
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* filling forms, waiting for dynamic content) BEFORE its target content
|
|
9
9
|
* becomes available.
|
|
10
10
|
*
|
|
11
|
-
* For
|
|
11
|
+
* For pages that need no interaction, use `web_fetch` instead.
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
14
|
import {
|
|
@@ -28,6 +28,7 @@ import {
|
|
|
28
28
|
closeAgentBrowserSession,
|
|
29
29
|
} from "./utils/agent-browser";
|
|
30
30
|
import { writeWithFallback } from "./utils/output-sink";
|
|
31
|
+
import { interactKeyless, shouldFallbackBrowse, isFirecrawlEnabled } from "./utils/firecrawl";
|
|
31
32
|
import { abbreviateUrl, getErrorText, normalizeWhitespace } from "./utils/render-helpers";
|
|
32
33
|
|
|
33
34
|
export const WebBrowseActionSchema = Type.Object({
|
|
@@ -82,16 +83,35 @@ function formatBrowseStep(action: BrowseAction): string {
|
|
|
82
83
|
}
|
|
83
84
|
}
|
|
84
85
|
|
|
86
|
+
/**
 * Turn a list of browser actions into a natural-language instruction.
 *
 * Each recognised action becomes one phrase; the phrases are joined with
 * "; " into a "Perform these actions in order" sentence (omitted when there
 * are none). A closing sentence asks for the text of `selector` when one is
 * given, otherwise for the page's main textual content. Action types not
 * listed below contribute nothing.
 *
 * @param params.url      Target page (not referenced in the generated text).
 * @param params.actions  Actions to describe, in execution order.
 * @param params.selector Optional selector whose text should be returned.
 */
function synthesizeBrowsePrompt(params: { url: string; actions: BrowseAction[]; selector?: string }): string {
  const phraseFor = (action: BrowseAction): string | undefined => {
    switch (action.type) {
      case "click":
        return `click the element "${action.selector ?? ""}"`;
      case "fill":
      case "type":
        return `type "${action.value ?? ""}" into "${action.selector ?? ""}"`;
      case "press":
        return `press ${action.key ?? ""}`;
      case "scroll":
        return `scroll ${action.direction ?? "down"}`;
      case "wait":
        return "wait briefly";
      case "wait_selector":
        return `wait for "${action.selector ?? ""}" to appear`;
      default:
        return undefined;
    }
  };

  const phrases = params.actions
    .map(phraseFor)
    .filter((phrase): phrase is string => phrase !== undefined);

  let prompt = "";
  if (phrases.length > 0) {
    prompt += `Perform these actions in order: ${phrases.join("; ")}. `;
  }
  if (params.selector) {
    prompt += `Then return the text content of the element matching "${params.selector}".`;
  } else {
    prompt += "Then return the main textual content of the page.";
  }
  return prompt;
}
|
|
104
|
+
|
|
85
105
|
const webBrowseTool = defineTool({
|
|
86
106
|
name: "web_browse",
|
|
87
107
|
label: "Web Browse",
|
|
88
108
|
description: [
|
|
89
109
|
"Interact with a web page through a browser: navigate, click, fill forms, scroll,",
|
|
90
110
|
"wait for content, and then extract text.",
|
|
91
|
-
"Uses the agent-browser CLI
|
|
111
|
+
"Uses the agent-browser CLI with batched JSON commands.",
|
|
92
112
|
"Use web_browse when the target content requires interaction (clicking buttons,",
|
|
93
113
|
"scrolling, filling search boxes, waiting for JS to load) before it becomes available.",
|
|
94
|
-
"For
|
|
114
|
+
"For pages that need no interaction, use web_fetch instead.",
|
|
95
115
|
`Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
|
|
96
116
|
].join(" "),
|
|
97
117
|
promptSnippet: "Interact with a web page (click, scroll, fill) and extract content",
|
|
@@ -100,7 +120,7 @@ const webBrowseTool = defineTool({
|
|
|
100
120
|
"Use web_browse for SPAs, pagination (click 'Load more'), search forms, tab switching, and modal dialogs.",
|
|
101
121
|
"For static articles, docs, or blogs that load everything on first request, prefer web_fetch.",
|
|
102
122
|
"After web_search returns results, prefer web_fetch for reading individual articles.",
|
|
103
|
-
"
|
|
123
|
+
"Use web_browse directly when interaction is required; otherwise try web_fetch first.",
|
|
104
124
|
"Always provide a selector to extract only the relevant content area — avoid dumping full page text.",
|
|
105
125
|
],
|
|
106
126
|
parameters: WebBrowseParamsSchema,
|
|
@@ -195,6 +215,38 @@ const webBrowseTool = defineTool({
|
|
|
195
215
|
},
|
|
196
216
|
};
|
|
197
217
|
} catch (err: any) {
|
|
218
|
+
// Firecrawl keyless fallback: only on runtime failures (CLI missing /
|
|
219
|
+
// batch failure), never on local validation errors (bad caller actions).
|
|
220
|
+
if (isFirecrawlEnabled() && !signal?.aborted && shouldFallbackBrowse(err as Error)) {
|
|
221
|
+
const fb = await interactKeyless(
|
|
222
|
+
params.url,
|
|
223
|
+
{ prompt: synthesizeBrowsePrompt({ url: params.url, actions: params.actions as BrowseAction[], selector: params.selector }), timeout: 60 },
|
|
224
|
+
signal,
|
|
225
|
+
);
|
|
226
|
+
if (fb.ok) {
|
|
227
|
+
const preview = (fb.output || "").replace(/\s+/g, " ").trim().slice(0, 500);
|
|
228
|
+
const creditTag = fb.creditsUsed !== undefined ? `, ${fb.creditsUsed} credits` : "";
|
|
229
|
+
const rawText = `URL: ${params.url}\n(via Firecrawl keyless interact fallback${creditTag})\n\n---\n\n${fb.output || "(no content extracted)"}`;
|
|
230
|
+
const sink = await writeWithFallback(rawText, { tmpPrefix: "pi-web-browse-firecrawl-" });
|
|
231
|
+
return {
|
|
232
|
+
content: [{ type: "text", text: sink.text }],
|
|
233
|
+
details: {
|
|
234
|
+
title: "",
|
|
235
|
+
url: params.url,
|
|
236
|
+
fullOutputPath: sink.fullOutputPath,
|
|
237
|
+
preview,
|
|
238
|
+
selector: params.selector,
|
|
239
|
+
headless: params.headless ?? true,
|
|
240
|
+
actionCount,
|
|
241
|
+
steps,
|
|
242
|
+
viaFirecrawl: true,
|
|
243
|
+
creditsUsed: fb.creditsUsed,
|
|
244
|
+
},
|
|
245
|
+
};
|
|
246
|
+
}
|
|
247
|
+
// Graceful skip (CLI absent / IP flagged / rate-limited / disabled):
|
|
248
|
+
// fall through to the original local error.
|
|
249
|
+
}
|
|
198
250
|
throw new Error(`Error browsing ${params.url}: ${err.message ?? err}`);
|
|
199
251
|
} finally {
|
|
200
252
|
await closeAgentBrowserSession(session, signal);
|
|
@@ -249,6 +301,8 @@ const webBrowseTool = defineTool({
|
|
|
249
301
|
headless?: boolean;
|
|
250
302
|
actionCount?: number;
|
|
251
303
|
steps?: string[];
|
|
304
|
+
viaFirecrawl?: boolean;
|
|
305
|
+
creditsUsed?: number;
|
|
252
306
|
} | undefined;
|
|
253
307
|
|
|
254
308
|
if (isError) {
|
|
@@ -266,6 +320,9 @@ const webBrowseTool = defineTool({
|
|
|
266
320
|
}
|
|
267
321
|
|
|
268
322
|
let text = theme.fg("success", "✓ Browsed");
|
|
323
|
+
if (details?.viaFirecrawl) {
|
|
324
|
+
text += theme.fg("accent", " [Firecrawl keyless]");
|
|
325
|
+
}
|
|
269
326
|
if (details?.title) {
|
|
270
327
|
text += ` ${theme.fg("toolTitle", details.title)}`;
|
|
271
328
|
}
|