pi-web-toolkit 0.2.1 → 0.3.0

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -25,6 +25,7 @@ import * as path from "node:path";
25
25
  import { runScraplingWithFallback } from "./utils/scrapling";
26
26
  import { extractPreview } from "./utils/content-preview";
27
27
  import { writeWithFallback } from "./utils/output-sink";
28
+ import { scrapeKeyless } from "./utils/firecrawl";
28
29
  import { abbreviateUrl, getDomain, getErrorText, normalizeWhitespace, formatExtraction } from "./utils/render-helpers";
29
30
 
30
31
  export const WebFetchParamsSchema = Type.Object({
@@ -41,13 +42,13 @@ const webFetchTool = defineTool({
41
42
  description: [
42
43
  "Fetch and extract readable content from a web page URL.",
43
44
  "Uses scrapling to download the page and convert it to clean markdown.",
44
- "Use web_fetch AFTER web_search to read the full content of a result page.",
45
- "Respects robots.txt and site ToS.",
45
+ "Use web_fetch to read the full content of a specific result or user-provided URL.",
46
+ "Callers remain responsible for robots.txt and site terms; Scrapling extract commands do not enforce them automatically.",
46
47
  `Output is truncated to ${DEFAULT_MAX_LINES} lines or ${formatSize(DEFAULT_MAX_BYTES)}; if truncated, full output is saved to a temp file.`,
47
48
  ].join(" "),
48
49
  promptSnippet: "Fetch full page content from a URL as markdown",
49
50
  promptGuidelines: [
50
- "Use web_fetch to read a single static page (article, doc, or blog) when given a specific URL.",
51
+ "Use web_fetch to read a single page (article, doc, or blog) that needs no interaction.",
51
52
  "For a single URL, always use web_fetch instead of web_batch_fetch.",
52
53
  "If the page is dynamic/JavaScript-heavy, the tool automatically uses browser automation.",
53
54
  "When reading multiple (2–5) pages at once (e.g., after web_search), prefer web_batch_fetch over repeated web_fetch calls.",
@@ -68,15 +69,31 @@ const webFetchTool = defineTool({
68
69
  signal,
69
70
  );
70
71
 
71
- if (!ok) {
72
- throw new Error(`Failed to fetch ${params.url}\n\nscrapling error:\n${stderr}`);
72
+ let content: string;
73
+ let bytes: number;
74
+ let viaFirecrawl = false;
75
+
76
+ if (ok) {
77
+ content = await fs.promises.readFile(tmpFile, "utf-8");
78
+ bytes = (await fs.promises.stat(tmpFile)).size;
79
+ } else {
80
+ // Local scrapling failed — try the Firecrawl keyless fallback.
81
+ const localError = `Failed to fetch ${params.url}\n\nscrapling error:\n${stderr}`;
82
+ const fb = await scrapeKeyless(params.url, {}, signal);
83
+ if (fb.ok) {
84
+ content = fb.content;
85
+ bytes = fb.bytes;
86
+ viaFirecrawl = true;
87
+ } else {
88
+ // Graceful skip (CLI absent / IP flagged / rate-limited / disabled):
89
+ // never leave the user worse off — surface the original local error.
90
+ throw new Error(localError);
91
+ }
73
92
  }
74
93
 
75
- const content = await fs.promises.readFile(tmpFile, "utf-8");
76
- const stats = await fs.promises.stat(tmpFile);
77
-
78
94
  const preview = extractPreview(content, 500);
79
- const rawText = `Fetched: ${params.url}\nSize: ${stats.size} bytes\n\n---\n\n${content}`;
95
+ const viaTag = viaFirecrawl ? "\n(via Firecrawl keyless fallback)" : "";
96
+ const rawText = `Fetched: ${params.url}${viaTag}\nSize: ${bytes} bytes\n\n---\n\n${content}`;
80
97
  const sink = await writeWithFallback(rawText, {
81
98
  tmpPrefix: "pi-web-fetch-full-",
82
99
  });
@@ -86,11 +103,12 @@ const webFetchTool = defineTool({
86
103
  content: [{ type: "text", text: sink.text }],
87
104
  details: {
88
105
  url: params.url,
89
- bytes: stats.size,
106
+ bytes,
90
107
  fullOutputPath: tmpFull,
91
108
  preview,
92
109
  selector: params.selector,
93
110
  stealthy: params.stealthy,
111
+ viaFirecrawl,
94
112
  },
95
113
  };
96
114
  } catch (err: any) {
@@ -128,6 +146,7 @@ const webFetchTool = defineTool({
128
146
  preview?: string;
129
147
  selector?: string;
130
148
  stealthy?: boolean;
149
+ viaFirecrawl?: boolean;
131
150
  } | undefined;
132
151
 
133
152
  if (isError) {
@@ -139,6 +158,9 @@ const webFetchTool = defineTool({
139
158
  }
140
159
 
141
160
  let text = theme.fg("success", "✓ Fetched");
161
+ if (details?.viaFirecrawl) {
162
+ text += theme.fg("accent", " [Firecrawl keyless]");
163
+ }
142
164
  if (details?.url) {
143
165
  text += ` ${theme.fg("dim", abbreviateUrl(details.url))}`;
144
166
  }
@@ -20,6 +20,7 @@ import {
20
20
  import { Text } from "@earendil-works/pi-tui";
21
21
  import { Type, type Static } from "typebox";
22
22
  import { writeWithFallback } from "./utils/output-sink";
23
+ import { searchKeyless, shouldFallbackSearch } from "./utils/firecrawl";
23
24
  import { abbreviateUrl, getDomain, getErrorText, normalizeWhitespace } from "./utils/render-helpers";
24
25
 
25
26
 
@@ -78,6 +79,9 @@ const webSearchTool = defineTool({
78
79
  let fullOutputPath: string | undefined;
79
80
  const MAX_PAGES = 3;
80
81
 
82
+ let localOk = true;
83
+ let localError: string | undefined;
84
+
81
85
  try {
82
86
  for (let page = 1; page <= MAX_PAGES; page++) {
83
87
  const searchParams = new URLSearchParams({
@@ -120,51 +124,89 @@ const webSearchTool = defineTool({
120
124
  break;
121
125
  }
122
126
  }
127
+ } catch (err: any) {
128
+ localOk = false;
129
+ localError = err.message ?? String(err);
130
+ }
123
131
 
124
- if (allResults.length === 0) {
125
- let text = `No results found for "${finalQuery}".`;
126
- if (suggestions && suggestions.length > 0) {
127
- text += `\n\nSuggestions:\n${suggestions.map((s) => `- ${s}`).join("\n")}`;
132
+ // Firecrawl keyless fallback: when SearXNG errored OR returned nothing.
133
+ if (shouldFallbackSearch(localOk, allResults.length)) {
134
+ const fb = await searchKeyless(params.query, { limit: Math.min(maxResults, 10) }, signal);
135
+ if (fb.ok && fb.results.length > 0) {
136
+ const fbResults: SearxResult[] = fb.results.slice(0, maxResults).map((r) => ({
137
+ title: r.title ?? "(untitled)",
138
+ url: r.url,
139
+ content: r.description,
140
+ engine: "firecrawl",
141
+ }));
142
+ const creditTag = fb.creditsUsed !== undefined ? `, ${fb.creditsUsed} credits` : "";
143
+ const lines: string[] = [`Results for "${params.query}" (via Firecrawl keyless${creditTag}):`, ""];
144
+ for (let i = 0; i < fbResults.length; i++) {
145
+ const r = fbResults[i];
146
+ lines.push(`${i + 1}. ${r.title}`);
147
+ lines.push(` URL: ${r.url}`);
148
+ if (r.content) lines.push(` ${r.content.replace(/\s+/g, " ").trim()}`);
149
+ if (r.engine) lines.push(` [engine: ${r.engine}]`);
150
+ lines.push("");
128
151
  }
152
+ const rawText = lines.join("\n");
153
+ const sink = await writeWithFallback(rawText, {
154
+ tmpPrefix: "pi-web-search-firecrawl-",
155
+ alwaysWriteFile: true,
156
+ });
129
157
  return {
130
- content: [{ type: "text", text }],
131
- details: { query: finalQuery, totalResults: 0, results: [] as SearxResult[], fullOutputPath: undefined as string | undefined },
158
+ content: [{ type: "text", text: sink.text }],
159
+ details: { query: params.query, totalResults: fbResults.length, results: fbResults, fullOutputPath: sink.fullOutputPath, viaFirecrawl: true, creditsUsed: fb.creditsUsed },
132
160
  };
133
161
  }
162
+ // Graceful skip or empty Firecrawl: fall through to local handling.
163
+ }
134
164
 
135
- const lines: string[] = [
136
- `Results for "${finalQuery}":`,
137
- "",
138
- ];
165
+ if (!localOk) {
166
+ throw new Error(`Failed to query SearXNG at ${searxngUrl}: ${localError}`);
167
+ }
139
168
 
140
- for (let i = 0; i < Math.min(maxResults, allResults.length); i++) {
141
- const r = allResults[i];
142
- lines.push(`${i + 1}. ${r.title}`);
143
- lines.push(` URL: ${r.url}`);
144
- if (r.content) {
145
- const snippet = r.content.replace(/\s+/g, " ").trim();
146
- lines.push(` ${snippet}`);
147
- }
148
- if (r.engine) {
149
- lines.push(` [engine: ${r.engine}]`);
150
- }
151
- lines.push("");
169
+ if (allResults.length === 0) {
170
+ let text = `No results found for "${finalQuery}".`;
171
+ if (suggestions && suggestions.length > 0) {
172
+ text += `\n\nSuggestions:\n${suggestions.map((s) => `- ${s}`).join("\n")}`;
152
173
  }
153
-
154
- const rawText = lines.join("\n");
155
- const sink = await writeWithFallback(rawText, {
156
- tmpPrefix: "pi-web-search-",
157
- alwaysWriteFile: true,
158
- });
159
- fullOutputPath = sink.fullOutputPath;
160
-
161
174
  return {
162
- content: [{ type: "text", text: sink.text }],
163
- details: { query: finalQuery, totalResults: allResults.length, results: allResults.slice(0, maxResults), fullOutputPath },
175
+ content: [{ type: "text", text }],
176
+ details: { query: finalQuery, totalResults: 0, results: [] as SearxResult[], fullOutputPath: undefined as string | undefined, viaFirecrawl: false, creditsUsed: undefined },
164
177
  };
165
- } catch (err: any) {
166
- throw new Error(`Failed to query SearXNG at ${searxngUrl}: ${err.message ?? err}`);
167
178
  }
179
+
180
+ const lines: string[] = [
181
+ `Results for "${finalQuery}":`,
182
+ "",
183
+ ];
184
+
185
+ for (let i = 0; i < Math.min(maxResults, allResults.length); i++) {
186
+ const r = allResults[i];
187
+ lines.push(`${i + 1}. ${r.title}`);
188
+ lines.push(` URL: ${r.url}`);
189
+ if (r.content) {
190
+ const snippet = r.content.replace(/\s+/g, " ").trim();
191
+ lines.push(` ${snippet}`);
192
+ }
193
+ if (r.engine) {
194
+ lines.push(` [engine: ${r.engine}]`);
195
+ }
196
+ lines.push("");
197
+ }
198
+
199
+ const rawText = lines.join("\n");
200
+ const sink = await writeWithFallback(rawText, {
201
+ tmpPrefix: "pi-web-search-",
202
+ alwaysWriteFile: true,
203
+ });
204
+ fullOutputPath = sink.fullOutputPath;
205
+
206
+ return {
207
+ content: [{ type: "text", text: sink.text }],
208
+ details: { query: finalQuery, totalResults: allResults.length, results: allResults.slice(0, maxResults), fullOutputPath, viaFirecrawl: false, creditsUsed: undefined },
209
+ };
168
210
  },
169
211
 
170
212
  renderCall(args, theme) {
@@ -190,6 +232,8 @@ const webSearchTool = defineTool({
190
232
  totalResults?: number;
191
233
  results?: Array<{ title?: string; url?: string; score?: number; engine?: string; content?: string }>;
192
234
  fullOutputPath?: string;
235
+ viaFirecrawl?: boolean;
236
+ creditsUsed?: number;
193
237
  } | undefined;
194
238
 
195
239
  if (isError) {
@@ -207,7 +251,13 @@ const webSearchTool = defineTool({
207
251
  const showing = details.results?.length ?? 0;
208
252
  const total = details?.totalResults ?? 0;
209
253
  let text = theme.fg("success", `✓ ${showing} unique results`);
210
- if (total > showing) {
254
+ if (details?.viaFirecrawl) {
255
+ text += theme.fg("accent", " [Firecrawl keyless]");
256
+ }
257
+ if (details?.creditsUsed !== undefined) {
258
+ text += theme.fg("muted", ` ${details.creditsUsed} credits`);
259
+ }
260
+ if (!details?.viaFirecrawl && total > showing) {
211
261
  text += theme.fg("dim", ` (${total} total)`);
212
262
  }
213
263
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "pi-web-toolkit",
3
- "version": "0.2.1",
4
- "description": "Web research toolkit for the pi coding agent. Search via SearXNG, fetch static pages with scrapling, browse interactively via agent-browser, and batch-read sources in parallel.",
3
+ "version": "0.3.0",
4
+ "description": "Web research toolkit for the pi coding agent. Search via SearXNG, fetch pages with scrapling, browse interactively via agent-browser, batch-read sources in parallel, and optionally fall back to Firecrawl Keyless (no API key) when a local backend fails.",
5
5
  "author": "Wade Huang <fastwade11@gmail.com>",
6
6
  "license": "MIT",
7
7
  "repository": {
@@ -12,18 +12,20 @@
12
12
  "url": "https://github.com/Wade11s/pi-web-toolkit/issues"
13
13
  },
14
14
  "homepage": "https://github.com/Wade11s/pi-web-toolkit#readme",
15
- "keywords": ["pi-package", "pi-extension", "web-search", "scrapling", "agent-browser"],
16
- "files": ["extensions", "docs", "README.md", "package.json", "LICENSE"],
15
+ "keywords": ["pi-package", "pi-extension", "web-search", "scrapling", "agent-browser", "firecrawl"],
16
+ "files": ["extensions", "docs", "README.md", "CHANGELOG.md", "package.json", "LICENSE"],
17
17
  "engines": {
18
18
  "node": ">=22.0.0"
19
19
  },
20
20
  "scripts": {
21
21
  "typecheck": "tsc --noEmit",
22
- "test": "npx tsx test/content-preview/test.ts && npx tsx test/agent-browser/test.ts",
23
- "test:agent-browser": "npx tsx test/agent-browser/test.ts",
24
- "test:approve": "npx tsx test/content-preview/test.ts --approve"
22
+ "test": "tsx test/content-preview/test.ts && tsx test/agent-browser/test.ts && tsx test/firecrawl/test.ts",
23
+ "test:agent-browser": "tsx test/agent-browser/test.ts",
24
+ "test:firecrawl": "tsx test/firecrawl/test.ts",
25
+ "test:approve": "tsx test/content-preview/test.ts --approve"
25
26
  },
26
27
  "devDependencies": {
28
+ "tsx": "^4.22.4",
27
29
  "typescript": "^5.7.0"
28
30
  },
29
31
  "peerDependencies": {