pi-web-toolkit 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +143 -18
- package/docs/adr/0001-firecrawl-keyless-cloud-fallback.md +5 -0
- package/docs/agents/issue-tracker.md +2 -2
- package/docs/guide.md +32 -7
- package/docs/tools.md +79 -10
- package/extensions/firecrawl_interact.ts +147 -0
- package/extensions/firecrawl_scrape.ts +154 -0
- package/extensions/firecrawl_search.ts +165 -0
- package/extensions/index.ts +7 -1
- package/extensions/utils/cli-runner.ts +4 -1
- package/extensions/utils/firecrawl.ts +484 -0
- package/extensions/web_batch_fetch.ts +1 -2
- package/extensions/web_browse.ts +61 -4
- package/extensions/web_fetch.ts +32 -10
- package/extensions/web_search.ts +85 -35
- package/package.json +9 -7
package/extensions/web_fetch.ts
CHANGED
|
@@ -25,6 +25,7 @@ import * as path from "node:path";
|
|
|
25
25
|
import { runScraplingWithFallback } from "./utils/scrapling";
|
|
26
26
|
import { extractPreview } from "./utils/content-preview";
|
|
27
27
|
import { writeWithFallback } from "./utils/output-sink";
|
|
28
|
+
import { scrapeKeyless } from "./utils/firecrawl";
|
|
28
29
|
import { abbreviateUrl, getDomain, getErrorText, normalizeWhitespace, formatExtraction } from "./utils/render-helpers";
|
|
29
30
|
|
|
30
31
|
export const WebFetchParamsSchema = Type.Object({
|
|
@@ -41,13 +42,13 @@ const webFetchTool = defineTool({
|
|
|
41
42
|
description: [
|
|
42
43
|
"Fetch and extract readable content from a web page URL.",
|
|
43
44
|
"Uses scrapling to download the page and convert it to clean markdown.",
|
|
44
|
-
"Use web_fetch
|
|
45
|
-
"
|
|
45
|
+
"Use web_fetch to read the full content of a specific result or user-provided URL.",
|
|
46
|
+
"Callers remain responsible for robots.txt and site terms; Scrapling extract commands do not enforce them automatically.",
|
|
46
47
|
`Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
|
|
47
48
|
].join(" "),
|
|
48
49
|
promptSnippet: "Fetch full page content from a URL as markdown",
|
|
49
50
|
promptGuidelines: [
|
|
50
|
-
"Use web_fetch to read a single
|
|
51
|
+
"Use web_fetch to read a single page (article, doc, or blog) that needs no interaction.",
|
|
51
52
|
"For a single URL, always use web_fetch instead of web_batch_fetch.",
|
|
52
53
|
"If the page is dynamic/JavaScript-heavy, the tool automatically uses browser automation.",
|
|
53
54
|
"When reading multiple (2–5) pages at once (e.g., after web_search), prefer web_batch_fetch over repeated web_fetch calls.",
|
|
@@ -68,15 +69,31 @@ const webFetchTool = defineTool({
|
|
|
68
69
|
signal,
|
|
69
70
|
);
|
|
70
71
|
|
|
71
|
-
|
|
72
|
-
|
|
72
|
+
let content: string;
|
|
73
|
+
let bytes: number;
|
|
74
|
+
let viaFirecrawl = false;
|
|
75
|
+
|
|
76
|
+
if (ok) {
|
|
77
|
+
content = await fs.promises.readFile(tmpFile, "utf-8");
|
|
78
|
+
bytes = (await fs.promises.stat(tmpFile)).size;
|
|
79
|
+
} else {
|
|
80
|
+
// Local scrapling failed — try the Firecrawl keyless fallback.
|
|
81
|
+
const localError = `Failed to fetch ${params.url}\n\nscrapling error:\n${stderr}`;
|
|
82
|
+
const fb = await scrapeKeyless(params.url, {}, signal);
|
|
83
|
+
if (fb.ok) {
|
|
84
|
+
content = fb.content;
|
|
85
|
+
bytes = fb.bytes;
|
|
86
|
+
viaFirecrawl = true;
|
|
87
|
+
} else {
|
|
88
|
+
// Graceful skip (CLI absent / IP flagged / rate-limited / disabled):
|
|
89
|
+
// never leave the user worse off — surface the original local error.
|
|
90
|
+
throw new Error(localError);
|
|
91
|
+
}
|
|
73
92
|
}
|
|
74
93
|
|
|
75
|
-
const content = await fs.promises.readFile(tmpFile, "utf-8");
|
|
76
|
-
const stats = await fs.promises.stat(tmpFile);
|
|
77
|
-
|
|
78
94
|
const preview = extractPreview(content, 500);
|
|
79
|
-
const
|
|
95
|
+
const viaTag = viaFirecrawl ? "\n(via Firecrawl keyless fallback)" : "";
|
|
96
|
+
const rawText = `Fetched: ${params.url}${viaTag}\nSize: ${bytes} bytes\n\n---\n\n${content}`;
|
|
80
97
|
const sink = await writeWithFallback(rawText, {
|
|
81
98
|
tmpPrefix: "pi-web-fetch-full-",
|
|
82
99
|
});
|
|
@@ -86,11 +103,12 @@ const webFetchTool = defineTool({
|
|
|
86
103
|
content: [{ type: "text", text: sink.text }],
|
|
87
104
|
details: {
|
|
88
105
|
url: params.url,
|
|
89
|
-
bytes
|
|
106
|
+
bytes,
|
|
90
107
|
fullOutputPath: tmpFull,
|
|
91
108
|
preview,
|
|
92
109
|
selector: params.selector,
|
|
93
110
|
stealthy: params.stealthy,
|
|
111
|
+
viaFirecrawl,
|
|
94
112
|
},
|
|
95
113
|
};
|
|
96
114
|
} catch (err: any) {
|
|
@@ -128,6 +146,7 @@ const webFetchTool = defineTool({
|
|
|
128
146
|
preview?: string;
|
|
129
147
|
selector?: string;
|
|
130
148
|
stealthy?: boolean;
|
|
149
|
+
viaFirecrawl?: boolean;
|
|
131
150
|
} | undefined;
|
|
132
151
|
|
|
133
152
|
if (isError) {
|
|
@@ -139,6 +158,9 @@ const webFetchTool = defineTool({
|
|
|
139
158
|
}
|
|
140
159
|
|
|
141
160
|
let text = theme.fg("success", "✓ Fetched");
|
|
161
|
+
if (details?.viaFirecrawl) {
|
|
162
|
+
text += theme.fg("accent", " [Firecrawl keyless]");
|
|
163
|
+
}
|
|
142
164
|
if (details?.url) {
|
|
143
165
|
text += ` ${theme.fg("dim", abbreviateUrl(details.url))}`;
|
|
144
166
|
}
|
package/extensions/web_search.ts
CHANGED
|
@@ -20,6 +20,7 @@ import {
|
|
|
20
20
|
import { Text } from "@earendil-works/pi-tui";
|
|
21
21
|
import { Type, type Static } from "typebox";
|
|
22
22
|
import { writeWithFallback } from "./utils/output-sink";
|
|
23
|
+
import { searchKeyless, shouldFallbackSearch } from "./utils/firecrawl";
|
|
23
24
|
import { abbreviateUrl, getDomain, getErrorText, normalizeWhitespace } from "./utils/render-helpers";
|
|
24
25
|
|
|
25
26
|
|
|
@@ -78,6 +79,9 @@ const webSearchTool = defineTool({
|
|
|
78
79
|
let fullOutputPath: string | undefined;
|
|
79
80
|
const MAX_PAGES = 3;
|
|
80
81
|
|
|
82
|
+
let localOk = true;
|
|
83
|
+
let localError: string | undefined;
|
|
84
|
+
|
|
81
85
|
try {
|
|
82
86
|
for (let page = 1; page <= MAX_PAGES; page++) {
|
|
83
87
|
const searchParams = new URLSearchParams({
|
|
@@ -120,51 +124,89 @@ const webSearchTool = defineTool({
|
|
|
120
124
|
break;
|
|
121
125
|
}
|
|
122
126
|
}
|
|
127
|
+
} catch (err: any) {
|
|
128
|
+
localOk = false;
|
|
129
|
+
localError = err.message ?? String(err);
|
|
130
|
+
}
|
|
123
131
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
132
|
+
// Firecrawl keyless fallback: when SearXNG errored OR returned nothing.
|
|
133
|
+
if (shouldFallbackSearch(localOk, allResults.length)) {
|
|
134
|
+
const fb = await searchKeyless(params.query, { limit: Math.min(maxResults, 10) }, signal);
|
|
135
|
+
if (fb.ok && fb.results.length > 0) {
|
|
136
|
+
const fbResults: SearxResult[] = fb.results.slice(0, maxResults).map((r) => ({
|
|
137
|
+
title: r.title ?? "(untitled)",
|
|
138
|
+
url: r.url,
|
|
139
|
+
content: r.description,
|
|
140
|
+
engine: "firecrawl",
|
|
141
|
+
}));
|
|
142
|
+
const creditTag = fb.creditsUsed !== undefined ? `, ${fb.creditsUsed} credits` : "";
|
|
143
|
+
const lines: string[] = [`Results for "${params.query}" (via Firecrawl keyless${creditTag}):`, ""];
|
|
144
|
+
for (let i = 0; i < fbResults.length; i++) {
|
|
145
|
+
const r = fbResults[i];
|
|
146
|
+
lines.push(`${i + 1}. ${r.title}`);
|
|
147
|
+
lines.push(` URL: ${r.url}`);
|
|
148
|
+
if (r.content) lines.push(` ${r.content.replace(/\s+/g, " ").trim()}`);
|
|
149
|
+
if (r.engine) lines.push(` [engine: ${r.engine}]`);
|
|
150
|
+
lines.push("");
|
|
128
151
|
}
|
|
152
|
+
const rawText = lines.join("\n");
|
|
153
|
+
const sink = await writeWithFallback(rawText, {
|
|
154
|
+
tmpPrefix: "pi-web-search-firecrawl-",
|
|
155
|
+
alwaysWriteFile: true,
|
|
156
|
+
});
|
|
129
157
|
return {
|
|
130
|
-
content: [{ type: "text", text }],
|
|
131
|
-
details: { query:
|
|
158
|
+
content: [{ type: "text", text: sink.text }],
|
|
159
|
+
details: { query: params.query, totalResults: fbResults.length, results: fbResults, fullOutputPath: sink.fullOutputPath, viaFirecrawl: true, creditsUsed: fb.creditsUsed },
|
|
132
160
|
};
|
|
133
161
|
}
|
|
162
|
+
// Graceful skip or empty Firecrawl: fall through to local handling.
|
|
163
|
+
}
|
|
134
164
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
];
|
|
165
|
+
if (!localOk) {
|
|
166
|
+
throw new Error(`Failed to query SearXNG at ${searxngUrl}: ${localError}`);
|
|
167
|
+
}
|
|
139
168
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
if (r.content) {
|
|
145
|
-
const snippet = r.content.replace(/\s+/g, " ").trim();
|
|
146
|
-
lines.push(` ${snippet}`);
|
|
147
|
-
}
|
|
148
|
-
if (r.engine) {
|
|
149
|
-
lines.push(` [engine: ${r.engine}]`);
|
|
150
|
-
}
|
|
151
|
-
lines.push("");
|
|
169
|
+
if (allResults.length === 0) {
|
|
170
|
+
let text = `No results found for "${finalQuery}".`;
|
|
171
|
+
if (suggestions && suggestions.length > 0) {
|
|
172
|
+
text += `\n\nSuggestions:\n${suggestions.map((s) => `- ${s}`).join("\n")}`;
|
|
152
173
|
}
|
|
153
|
-
|
|
154
|
-
const rawText = lines.join("\n");
|
|
155
|
-
const sink = await writeWithFallback(rawText, {
|
|
156
|
-
tmpPrefix: "pi-web-search-",
|
|
157
|
-
alwaysWriteFile: true,
|
|
158
|
-
});
|
|
159
|
-
fullOutputPath = sink.fullOutputPath;
|
|
160
|
-
|
|
161
174
|
return {
|
|
162
|
-
content: [{ type: "text", text
|
|
163
|
-
details: { query: finalQuery, totalResults:
|
|
175
|
+
content: [{ type: "text", text }],
|
|
176
|
+
details: { query: finalQuery, totalResults: 0, results: [] as SearxResult[], fullOutputPath: undefined as string | undefined, viaFirecrawl: false, creditsUsed: undefined },
|
|
164
177
|
};
|
|
165
|
-
} catch (err: any) {
|
|
166
|
-
throw new Error(`Failed to query SearXNG at ${searxngUrl}: ${err.message ?? err}`);
|
|
167
178
|
}
|
|
179
|
+
|
|
180
|
+
const lines: string[] = [
|
|
181
|
+
`Results for "${finalQuery}":`,
|
|
182
|
+
"",
|
|
183
|
+
];
|
|
184
|
+
|
|
185
|
+
for (let i = 0; i < Math.min(maxResults, allResults.length); i++) {
|
|
186
|
+
const r = allResults[i];
|
|
187
|
+
lines.push(`${i + 1}. ${r.title}`);
|
|
188
|
+
lines.push(` URL: ${r.url}`);
|
|
189
|
+
if (r.content) {
|
|
190
|
+
const snippet = r.content.replace(/\s+/g, " ").trim();
|
|
191
|
+
lines.push(` ${snippet}`);
|
|
192
|
+
}
|
|
193
|
+
if (r.engine) {
|
|
194
|
+
lines.push(` [engine: ${r.engine}]`);
|
|
195
|
+
}
|
|
196
|
+
lines.push("");
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
const rawText = lines.join("\n");
|
|
200
|
+
const sink = await writeWithFallback(rawText, {
|
|
201
|
+
tmpPrefix: "pi-web-search-",
|
|
202
|
+
alwaysWriteFile: true,
|
|
203
|
+
});
|
|
204
|
+
fullOutputPath = sink.fullOutputPath;
|
|
205
|
+
|
|
206
|
+
return {
|
|
207
|
+
content: [{ type: "text", text: sink.text }],
|
|
208
|
+
details: { query: finalQuery, totalResults: allResults.length, results: allResults.slice(0, maxResults), fullOutputPath, viaFirecrawl: false, creditsUsed: undefined },
|
|
209
|
+
};
|
|
168
210
|
},
|
|
169
211
|
|
|
170
212
|
renderCall(args, theme) {
|
|
@@ -190,6 +232,8 @@ const webSearchTool = defineTool({
|
|
|
190
232
|
totalResults?: number;
|
|
191
233
|
results?: Array<{ title?: string; url?: string; score?: number; engine?: string; content?: string }>;
|
|
192
234
|
fullOutputPath?: string;
|
|
235
|
+
viaFirecrawl?: boolean;
|
|
236
|
+
creditsUsed?: number;
|
|
193
237
|
} | undefined;
|
|
194
238
|
|
|
195
239
|
if (isError) {
|
|
@@ -207,7 +251,13 @@ const webSearchTool = defineTool({
|
|
|
207
251
|
const showing = details.results?.length ?? 0;
|
|
208
252
|
const total = details?.totalResults ?? 0;
|
|
209
253
|
let text = theme.fg("success", `✓ ${showing} unique results`);
|
|
210
|
-
if (total > showing) {
|
|
254
|
+
if (details?.viaFirecrawl) {
|
|
255
|
+
text += theme.fg("accent", " [Firecrawl keyless]");
|
|
256
|
+
}
|
|
257
|
+
if (details?.creditsUsed !== undefined) {
|
|
258
|
+
text += theme.fg("muted", ` ${details.creditsUsed} credits`);
|
|
259
|
+
}
|
|
260
|
+
if (!details?.viaFirecrawl && total > showing) {
|
|
211
261
|
text += theme.fg("dim", ` (${total} total)`);
|
|
212
262
|
}
|
|
213
263
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pi-web-toolkit",
|
|
3
|
-
"version": "0.2.1",
|
|
4
|
-
"description": "Web research toolkit for the pi coding agent. Search via SearXNG, fetch
|
|
3
|
+
"version": "0.3.0",
|
|
4
|
+
"description": "Web research toolkit for the pi coding agent. Search via SearXNG, fetch pages with scrapling, browse interactively via agent-browser, batch-read sources in parallel, and optionally fall back to Firecrawl Keyless (no API key) when a local backend fails.",
|
|
5
5
|
"author": "Wade Huang <fastwade11@gmail.com>",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"repository": {
|
|
@@ -12,18 +12,20 @@
|
|
|
12
12
|
"url": "https://github.com/Wade11s/pi-web-toolkit/issues"
|
|
13
13
|
},
|
|
14
14
|
"homepage": "https://github.com/Wade11s/pi-web-toolkit#readme",
|
|
15
|
-
"keywords": ["pi-package", "pi-extension", "web-search", "scrapling", "agent-browser"],
|
|
16
|
-
"files": ["extensions", "docs", "README.md", "package.json", "LICENSE"],
|
|
15
|
+
"keywords": ["pi-package", "pi-extension", "web-search", "scrapling", "agent-browser", "firecrawl"],
|
|
16
|
+
"files": ["extensions", "docs", "README.md", "CHANGELOG.md", "package.json", "LICENSE"],
|
|
17
17
|
"engines": {
|
|
18
18
|
"node": ">=22.0.0"
|
|
19
19
|
},
|
|
20
20
|
"scripts": {
|
|
21
21
|
"typecheck": "tsc --noEmit",
|
|
22
|
-
"test": "
|
|
23
|
-
"test:agent-browser": "
|
|
24
|
-
"test:
|
|
22
|
+
"test": "tsx test/content-preview/test.ts && tsx test/agent-browser/test.ts && tsx test/firecrawl/test.ts",
|
|
23
|
+
"test:agent-browser": "tsx test/agent-browser/test.ts",
|
|
24
|
+
"test:firecrawl": "tsx test/firecrawl/test.ts",
|
|
25
|
+
"test:approve": "tsx test/content-preview/test.ts --approve"
|
|
25
26
|
},
|
|
26
27
|
"devDependencies": {
|
|
28
|
+
"tsx": "^4.22.4",
|
|
27
29
|
"typescript": "^5.7.0"
|
|
28
30
|
},
|
|
29
31
|
"peerDependencies": {
|