@pi-lab/webfetch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +18 -0
  2. package/dist/index.mjs +475 -0
  3. package/package.json +43 -0
package/README.md ADDED
@@ -0,0 +1,18 @@
1
+ # @pi-lab/webfetch
2
+
3
+ A web fetching extension for [pi coding agent](https://github.com/badlogic/pi-mono). Adds a `webfetch` tool — fetch any URL and get back clean Markdown, ready for the model to read.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pi install npm:@pi-lab/webfetch
9
+ ```
10
+
11
+ ## Features
12
+
13
+ - **HTML → Markdown** via [Mozilla Readability](https://github.com/mozilla/readability) (same engine as Firefox Reader Mode) + [Turndown](https://github.com/mixmark-io/turndown). Falls back to full-page conversion if Readability can't extract a main article.
14
+ - **Pagination** — large pages are sliced into chunks; the model reads page by page using `offset`.
15
+ - **Inline script index** — `<script>` tags are stripped from the Markdown body but listed as a numbered index at the end. The model can read any of them with `script=N`.
16
+ - **Redirect handling** — same-domain redirects are followed automatically (up to 10 hops); cross-domain redirects are surfaced to the model so it can decide whether to follow.
17
+ - **Binary downloads** — non-text responses (PDFs, images, etc.) are saved to `.pi/pi-lab/webfetch/tmp/` and the file path is returned.
18
+ - **LRU cache** — processed Markdown is cached in memory (default: 50 MB, 15 min TTL) so paginating the same URL doesn't re-fetch.
package/dist/index.mjs ADDED
@@ -0,0 +1,475 @@
1
+ import { Type } from "@sinclair/typebox";
2
+ import { join } from "node:path";
3
+ import { keyHint } from "@mariozechner/pi-coding-agent";
4
+ import { Text } from "@mariozechner/pi-tui";
5
+ import { LRUCache } from "lru-cache";
6
+ import { mkdir, writeFile } from "node:fs/promises";
7
+ import { Readability } from "@mozilla/readability";
8
+ import { parseHTML } from "linkedom";
9
+ import TurndownService from "turndown";
10
+ //#region src/config.ts
11
// Baseline settings: 20k-char pages, 50 MB cache budget, 15 min TTL.
const DEFAULT_CONFIG = {
  maxPageLength: 2e4,
  cache: {
    maxSizeBytes: 50 * 1024 * 1024,
    ttlMs: 900 * 1e3
  }
};
/**
 * Merge a partial user config over DEFAULT_CONFIG.
 * Missing top-level keys and missing cache sub-keys fall back to defaults;
 * a nullish/absent `partial` returns the defaults object itself.
 */
function mergeConfig(partial) {
  if (!partial) return DEFAULT_CONFIG;
  const maxPageLength = partial.maxPageLength ?? DEFAULT_CONFIG.maxPageLength;
  // Object.assign ignores undefined sources, so an absent `cache` is fine.
  const cache = Object.assign({}, DEFAULT_CONFIG.cache, partial.cache);
  return { maxPageLength, cache };
}
28
+ //#endregion
29
+ //#region src/cache.ts
30
/**
 * In-memory LRU cache for processed fetch results.
 * Each entry is sized by the UTF-8 byte length of its markdown plus all
 * extracted inline scripts, so the configured byte budget is respected.
 */
var WebFetchCache = class {
  cache;
  constructor(config) {
    this.cache = new LRUCache({
      maxSize: config.maxSizeBytes,
      sizeCalculation: (value) => {
        const scriptBytes = value.scripts.reduce(
          (total, script) => total + Buffer.byteLength(script.content, "utf8"),
          0
        );
        return Buffer.byteLength(value.markdown, "utf8") + scriptBytes;
      },
      ttl: config.ttlMs,
      allowStale: false
    });
  }
  /** Look up an entry; undefined on miss or TTL expiry. */
  get(key) {
    return this.cache.get(key);
  }
  /** Insert or refresh an entry under its normalized-URL key. */
  set(key, value) {
    this.cache.set(key, value);
  }
  /** Remove a single entry. */
  delete(key) {
    this.cache.delete(key);
  }
  /** Drop everything (called on session shutdown). */
  clear() {
    this.cache.clear();
  }
};
55
+ //#endregion
56
+ //#region src/normalize.ts
57
/**
 * Normalize a URL in a lossless way:
 * 1. Lowercase protocol and hostname (done by the WHATWG URL parser)
 * 2. Upgrade http → https
 * 3. Remove default ports (:80 for http is stripped at parse time;
 *    :443 is stripped after the https upgrade)
 *
 * Does NOT reorder query params or normalize trailing slashes.
 *
 * @param {string} rawUrl - URL to normalize; must be absolute.
 * @returns {string} the normalized, serialized URL
 * @throws {TypeError} if rawUrl is not a parseable absolute URL
 */
function normalizeUrl(rawUrl) {
  // `new URL` already lowercases scheme + host and drops scheme-default
  // ports, so no manual lowercasing is needed. (The previous version also
  // had an unreachable `:80 && http:` check after the https upgrade.)
  const url = new URL(rawUrl);
  if (url.protocol === "http:") url.protocol = "https:";
  // After the upgrade an explicit `http://host:443` would keep its port;
  // 443 is now the default, so clear it.
  if (url.protocol === "https:" && url.port === "443") url.port = "";
  return url.toString();
}
75
+ //#endregion
76
+ //#region src/fetch.ts
77
/**
 * Determine if two URLs are on the same domain.
 * Same domain = same protocol + same port + same hostname (ignoring a
 * leading "www." prefix). Unparseable input is treated as not-same.
 */
function isSameDomain(a, b) {
  try {
    const first = new URL(a);
    const second = new URL(b);
    const stripWww = (host) => host.replace(/^www\./, "");
    return (
      first.protocol === second.protocol &&
      first.port === second.port &&
      stripWww(first.hostname) === stripWww(second.hostname)
    );
  } catch {
    return false;
  }
}
92
// Known MIME type → file extension mapping for saved binary downloads.
const CONTENT_TYPE_EXTENSIONS = {
  "image/jpeg": ".jpg",
  "image/png": ".png",
  "image/gif": ".gif",
  "image/webp": ".webp",
  "image/svg+xml": ".svg",
  "application/pdf": ".pdf",
  "application/zip": ".zip",
  "application/json": ".json",
  "video/mp4": ".mp4",
  "audio/mpeg": ".mp3"
};
/**
 * Map a MIME type to a file extension, defaulting to ".bin".
 *
 * The content type comes from an untrusted response header, so guard the
 * lookup with Object.hasOwn: a bare `obj[key]` would hit the prototype
 * chain for keys like "constructor" or "toString" and return a function
 * instead of ".bin".
 *
 * @param {string} contentType - lowercased base MIME type (no parameters)
 * @returns {string} extension including the leading dot
 */
function extForContentType(contentType) {
  return Object.hasOwn(CONTENT_TYPE_EXTENSIONS, contentType)
    ? CONTENT_TYPE_EXTENSIONS[contentType]
    : ".bin";
}
107
/**
 * Fetch a URL, following same-domain redirects automatically.
 * Cross-domain redirects are returned as a redirect result for the LLM to
 * handle. Binary content is saved to tempDir and returned as a binary result.
 *
 * @param {string} normalizedUrl - URL already passed through normalizeUrl()
 * @param {string} tempDir - directory where binary payloads are written
 * @param {AbortSignal} signal - forwarded to fetch() for cancellation
 * @param {number} maxRedirects - maximum same-domain hops before giving up
 * @returns one of:
 *   { type: "text", content, contentType, url }
 *   { type: "binary", filePath, contentType, url }
 *   { type: "redirect", originalUrl, redirectUrl, statusCode }
 * @throws on non-OK HTTP status, a 3xx without a Location header, an
 *   unparsable Location value, or too many redirects
 */
async function fetchUrl(normalizedUrl, tempDir, signal, maxRedirects = 10) {
  let currentUrl = normalizedUrl;
  // hop 0 is the initial request, so up to maxRedirects redirects follow it.
  for (let hop = 0; hop <= maxRedirects; hop++) {
    const response = await fetch(currentUrl, {
      signal,
      // Handle redirects manually so cross-domain hops can be surfaced
      // to the caller instead of silently followed.
      redirect: "manual",
      headers: {
        Accept: "text/markdown, text/plain, text/html, */*",
        "User-Agent": "pi/webfetch"
      }
    });
    if (response.status >= 300 && response.status < 400) {
      const location = response.headers.get("location");
      if (!location) throw new Error(`Redirect ${response.status} with no Location header`);
      let redirectUrl;
      try {
        // Location may be relative; resolve against the current URL.
        redirectUrl = new URL(location, currentUrl).toString();
      } catch {
        throw new Error(`Invalid redirect location: ${location}`);
      }
      if (isSameDomain(currentUrl, redirectUrl)) {
        currentUrl = normalizeUrl(redirectUrl);
        continue;
      }
      return {
        type: "redirect",
        originalUrl: normalizedUrl,
        redirectUrl,
        statusCode: response.status
      };
    }
    if (!response.ok) throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    // Strip MIME parameters (e.g. "; charset=utf-8") before classifying.
    const baseContentType = (response.headers.get("content-type") ?? "").split(";")[0].trim().toLowerCase();
    if (baseContentType.startsWith("text/")) {
      return {
        type: "text",
        content: await response.text(),
        contentType: baseContentType,
        url: currentUrl
      };
    }
    // Non-text payload: persist to disk and hand back the path.
    await mkdir(tempDir, { recursive: true });
    const ext = extForContentType(baseContentType);
    // Date.now() alone can collide when two fetches complete in the same
    // millisecond; a random suffix keeps files from overwriting each other.
    const suffix = Math.random().toString(36).slice(2, 8);
    const filePath = join(tempDir, `webfetch-${Date.now()}-${suffix}${ext}`);
    const buffer = await response.arrayBuffer();
    await writeFile(filePath, Buffer.from(buffer));
    return {
      type: "binary",
      filePath,
      contentType: baseContentType,
      url: currentUrl
    };
  }
  throw new Error(`Too many redirects (max ${maxRedirects})`);
}
164
+ //#endregion
165
+ //#region src/content.ts
166
// Shared Turndown singleton — constructed lazily on first use.
let turndownInstance = null;
/**
 * Return the module-wide TurndownService, creating it on first call.
 * Configured for ATX headings, fenced code blocks, and "-" bullets;
 * style/script/noscript elements are dropped, pre/code kept verbatim.
 */
function getTurndown() {
  if (turndownInstance !== null) return turndownInstance;
  const service = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-"
  });
  service.remove(["style", "script", "noscript"]);
  service.keep(["pre", "code"]);
  turndownInstance = service;
  return service;
}
183
/**
 * Extract inline <script> elements (no src attribute) from a parsed document.
 * External scripts are skipped — they have no inline content — and so are
 * scripts whose body is empty after trimming. Each entry carries a
 * sequential index, the content length, an 80-char whitespace-collapsed
 * preview, and the full content.
 */
function extractInlineScripts(document) {
  const scripts = [];
  for (const el of document.querySelectorAll("script:not([src])")) {
    const content = (el.textContent ?? "").trim();
    if (!content) continue;
    scripts.push({
      index: scripts.length,
      length: content.length,
      preview: content.slice(0, 80).replace(/\s+/g, " "),
      content
    });
  }
  return scripts;
}
203
/**
 * Process HTML content:
 * 1. Try Mozilla Readability to extract the main article.
 * 2. If extraction kept less than 10% of the raw HTML (or failed), fall
 *    back to converting the full page to Markdown.
 * Inline scripts are extracted up front (before Readability mutates the
 * document) and returned separately.
 */
async function processHtml(html, _url) {
  const turndown = getTurndown();
  const { document } = parseHTML(html);
  const scripts = extractInlineScripts(document);
  try {
    const content = new Readability(document).parse()?.content;
    // Only trust Readability when it retained at least 10% of the page.
    if (content && content.length / html.length >= .1) {
      return {
        markdown: turndown.turndown(content),
        scripts,
        method: "readability"
      };
    }
  } catch {
    // Readability can throw on pathological markup — best-effort fallback
    // to full-page conversion below.
  }
  return {
    markdown: turndown.turndown(html),
    scripts,
    method: "full-html"
  };
}
229
/**
 * Process plain text / markdown content.
 * No conversion is needed — the text is returned unchanged; this exists
 * as the counterpart to processHtml so callers have a uniform entry point.
 */
function processPlainText(text) {
  return text;
}
235
+ //#endregion
236
+ //#region src/tool.ts
237
/**
 * Render the inline-script index appended to text results.
 * Returns "" when there are no scripts; otherwise a newline-joined block
 * with one right-aligned "[i] N chars preview" row per script.
 */
function formatScriptIndex(scripts) {
  if (!scripts.length) return "";
  // Right-align the char counts to the widest value.
  const maxLen = Math.max(...scripts.map((s) => s.length));
  const width = String(maxLen).length;
  const rows = scripts.map(
    (s) => ` [${s.index}] ${String(s.length).padStart(width)} chars ${s.preview}`
  );
  const header = `Inline scripts (${scripts.length}, call webfetch with script=N to read full content):`;
  return ["", header, ...rows].join("\n");
}
247
/**
 * Render a paginated page-content result for the model: URL line, a
 * length/offset status line, a "---" separator, the content slice, and
 * (when present) the inline-script index.
 */
function formatTextResult(output, scripts) {
  const status = output.truncated
    ? `Offset: ${output.offset} / ${output.total_length} chars — truncated, call again with offset=${output.offset + output.returned_length}`
    : `Length: ${output.total_length} chars`;
  const parts = [`URL: ${output.url}`, status, "", "---", "", output.content];
  const index = formatScriptIndex(scripts);
  if (index) parts.push(index);
  return parts.join("\n");
}
259
/**
 * Render a paginated inline-script result: like formatTextResult but the
 * header names the script index and no script index block is appended.
 */
function formatScriptResult(url, scriptIndex, output) {
  const status = output.truncated
    ? `Offset: ${output.offset} / ${output.total_length} chars — truncated, call again with offset=${output.offset + output.returned_length}`
    : `Length: ${output.total_length} chars`;
  return [`URL: ${url} — script ${scriptIndex}`, status, "", "---", "", output.content].join("\n");
}
269
/** Render a saved-binary result: file path, content type, and source URL. */
function formatBinaryResult(output) {
  return `BINARY FILE: ${output.file_path}\nContent-Type: ${output.content_type}\nURL: ${output.url}`;
}
276
/** Render a cross-domain redirect: status + both URLs, then the advisory message. */
function formatRedirectResult(output) {
  const headline = `REDIRECT ${output.status_code}: ${output.original_url} → ${output.redirect_url}`;
  return `${headline}\n${output.message}`;
}
279
/**
 * Register the `webfetch` tool on the pi instance.
 *
 * @param pi - the pi extension API (event bus + tool registry)
 * @param config - merged config; supplies maxPageLength (pagination page
 *   size) and the cache size/TTL limits
 */
function registerWebFetchTool(pi, config) {
	// One cache per registration, keyed by normalized URL.
	const cache = new WebFetchCache(config.cache);
	// Release cached markdown when the session ends.
	pi.on("session_shutdown", async () => {
		cache.clear();
	});
	pi.registerTool({
		name: "webfetch",
		label: "Web Fetch",
		// Tool description presented to the model.
		description: [
			"Fetch content from a URL and return it as Markdown text.",
			"Handles HTML extraction via Mozilla Readability and pagination for large pages.",
			"Inline scripts are listed in an index at the end — use the `script` parameter to read a specific one.",
			"Non-text content (images, PDFs, etc.) is saved to a local file and the path is returned.",
			"Cross-domain redirects are reported back so you can decide whether to follow them."
		].join(" "),
		promptSnippet: "Fetch and read web page content from a URL",
		promptGuidelines: [
			"Use webfetch to retrieve content from URLs instead of suggesting the user open a browser.",
			"For paginated results, increment `offset` by `returned_length` and call webfetch again until `truncated` is false.",
			"If the page has inline scripts listed at the end, use `script=N` to read one if it might contain relevant data.",
			"If webfetch returns a redirect result, call it again with the `redirect_url`."
		],
		parameters: Type.Object({
			url: Type.String({ description: "The URL to fetch." }),
			script: Type.Optional(Type.Number({ description: "Index of an inline script to read (from the script index at the end of a previous response). Supports the same `offset` and `max_length` pagination as normal page content." })),
			offset: Type.Optional(Type.Number({ description: "Starting character position for pagination. Defaults to 0." })),
			max_length: Type.Optional(Type.Number({ description: `Maximum characters to return in this call. Defaults to ${config.maxPageLength}.` }))
		}),
		// Main entry point: fetch (or reuse the cached entry), then slice
		// either the page markdown or one inline script per offset/max_length.
		async execute(_toolCallId, params, signal, onUpdate, ctx) {
			const { url, script: scriptIndex, offset = 0, max_length } = params;
			const maxLength = max_length ?? config.maxPageLength;
			let normalizedUrl;
			try {
				normalizedUrl = normalizeUrl(url);
			} catch {
				throw new Error(`Invalid URL: ${url}`);
			}
			// Binary downloads are written here, relative to the session cwd.
			const tempDir = join(ctx.cwd, ".pi", "pi-lab", "webfetch", "tmp");
			let entry = cache.get(normalizedUrl);
			if (!entry) {
				// Cache miss: stream a progress update, then fetch.
				onUpdate?.({
					content: [{
						type: "text",
						text: `Fetching ${normalizedUrl}…`
					}],
					details: {}
				});
				const result = await fetchUrl(normalizedUrl, tempDir, signal);
				// Cross-domain redirect: report back instead of following,
				// and do NOT cache (nothing was processed).
				if (result.type === "redirect") {
					const output = {
						redirect: true,
						original_url: result.originalUrl,
						redirect_url: result.redirectUrl,
						status_code: result.statusCode,
						message: "This URL redirects to a different domain. Call webfetch again with `redirect_url` to fetch the content."
					};
					return {
						content: [{
							type: "text",
							text: formatRedirectResult(output)
						}],
						details: output
					};
				}
				// Non-text payload was already saved to disk; return the path.
				if (result.type === "binary") {
					const output = {
						file_path: result.filePath,
						content_type: result.contentType,
						url: result.url
					};
					return {
						content: [{
							type: "text",
							text: formatBinaryResult(output)
						}],
						details: output
					};
				}
				onUpdate?.({
					content: [{
						type: "text",
						text: "Processing content…"
					}],
					details: {}
				});
				// HTML goes through Readability → Markdown; any other text/*
				// (markdown, plain text) is passed through unchanged.
				if (result.contentType === "text/html") {
					const processed = await processHtml(result.content, normalizedUrl);
					entry = {
						markdown: processed.markdown,
						scripts: processed.scripts
					};
				} else entry = {
					markdown: processPlainText(result.content),
					scripts: []
				};
				cache.set(normalizedUrl, entry);
			}
			// script=N: paginate over that inline script instead of the page.
			if (scriptIndex !== void 0) {
				const script = entry.scripts.find((s) => s.index === scriptIndex);
				if (!script) throw new Error(`Script ${scriptIndex} not found. Available indices: ${entry.scripts.map((s) => s.index).join(", ") || "none"}`);
				const total = script.content.length;
				const slice = script.content.slice(offset, offset + maxLength);
				const output = {
					content: slice,
					truncated: offset + maxLength < total,
					total_length: total,
					offset,
					returned_length: slice.length,
					url: normalizedUrl
				};
				return {
					content: [{
						type: "text",
						text: formatScriptResult(normalizedUrl, scriptIndex, output)
					}],
					details: output
				};
			}
			// Default path: paginate over the processed page markdown.
			const totalLength = entry.markdown.length;
			const slice = entry.markdown.slice(offset, offset + maxLength);
			const output = {
				content: slice,
				truncated: offset + maxLength < totalLength,
				total_length: totalLength,
				offset,
				returned_length: slice.length,
				url: normalizedUrl
			};
			return {
				content: [{
					type: "text",
					text: formatTextResult(output, entry.scripts)
				}],
				details: output
			};
		},
		// One-line TUI summary of the call: "webfetch <url> [· script=N] [· offset=M]".
		// NOTE: `if (args.offset)` deliberately skips offset=0.
		renderCall(args, theme, context) {
			const text = context.lastComponent ?? new Text("", 0, 0);
			let line = theme.fg("toolTitle", theme.bold("webfetch "));
			line += theme.fg("accent", args.url ?? "");
			if (args.script !== void 0) line += theme.fg("muted", ` · script=${args.script}`);
			if (args.offset) line += theme.fg("dim", ` · offset=${args.offset}`);
			text.setText(line);
			return text;
		},
		// TUI result rendering: redirect and binary results get one-liners;
		// text results show a header plus up to 10 content lines unless expanded.
		renderResult(result, options, theme, context) {
			const text = context.lastComponent ?? new Text("", 0, 0);
			if (options.isPartial) {
				text.setText(theme.fg("muted", "Fetching…"));
				return text;
			}
			if (context.isError || !result.details) {
				const raw = result.content.find((c) => c.type === "text")?.text ?? "";
				text.setText(theme.fg("error", raw));
				return text;
			}
			const details = result.details;
			// Discriminate the three detail shapes by their marker keys.
			if ("redirect" in details) {
				text.setText(theme.fg("warning", `↪ REDIRECT ${details.status_code}: `) + theme.fg("accent", details.redirect_url));
				return text;
			}
			if ("file_path" in details) {
				text.setText(theme.fg("success", "✓ ") + theme.fg("muted", details.content_type) + theme.fg("dim", ` → ${details.file_path}`));
				return text;
			}
			const allLines = details.content.split("\n");
			const maxLines = options.expanded ? allLines.length : 10;
			const displayLines = allLines.slice(0, maxLines);
			const remaining = allLines.length - maxLines;
			const header = theme.fg("dim", details.url) + (details.truncated ? theme.fg("muted", ` · ${details.returned_length.toLocaleString()} / ${details.total_length.toLocaleString()} chars`) : theme.fg("muted", ` · ${details.total_length.toLocaleString()} chars`));
			let body = "\n" + displayLines.map((l) => theme.fg("toolOutput", l)).join("\n");
			if (remaining > 0) body += theme.fg("muted", `\n… (${remaining} more lines, `) + keyHint("app.tools.expand", "to expand") + theme.fg("muted", ")");
			text.setText(header + body);
			return text;
		}
	});
}
456
+ //#endregion
457
+ //#region src/index.ts
458
/**
 * WebFetch extension entry point for the pi coding agent.
 *
 * Registers the `webfetch` tool, which fetches URLs and returns Markdown:
 * URL normalization (http→https, default-port stripping), same-domain
 * redirect following with cross-domain redirects surfaced to the LLM,
 * Readability-based HTML extraction, an inline-script index readable via
 * `script=N`, an in-memory LRU cache keyed on the normalized URL, and
 * offset/max_length pagination.
 */
function src_default(pi) {
  // No user overrides are supplied here, so the default config applies.
  const config = mergeConfig();
  registerWebFetchTool(pi, config);
}
474
+ //#endregion
475
+ export { DEFAULT_CONFIG, src_default as default, mergeConfig, registerWebFetchTool };
package/package.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "name": "@pi-lab/webfetch",
3
+ "version": "0.1.0",
4
+ "description": "WebFetch tool extension for pi coding agent",
5
+ "keywords": [
6
+ "pi-package"
7
+ ],
8
+ "license": "MIT",
9
+ "type": "module",
10
+ "files": [
11
+ "dist",
12
+ "README.md"
13
+ ],
14
+ "pi": {
15
+ "extensions": [
16
+ "./dist/index.mjs"
17
+ ]
18
+ },
19
+ "devDependencies": {
20
+ "@mariozechner/pi-coding-agent": "^0.67.68",
21
+ "@mariozechner/pi-tui": "^0.67.68",
22
+ "@sinclair/typebox": "^0.34.49",
23
+ "@types/mozilla__readability": "^0.4.0",
24
+ "@types/node": "^25.6.0",
25
+ "@types/turndown": "^5.0.6",
26
+ "tsdown": "^0.21.9"
27
+ },
28
+ "peerDependencies": {
29
+ "@mariozechner/pi-coding-agent": "^0.67.68",
30
+ "@mariozechner/pi-tui": "^0.67.68",
31
+ "@sinclair/typebox": "^0.34.49"
32
+ },
33
+ "dependencies": {
34
+ "@mozilla/readability": "^0.5.0",
35
+ "fflate": "^0.8.2",
36
+ "linkedom": "^0.18.10",
37
+ "lru-cache": "^11.0.0",
38
+ "turndown": "^7.2.0"
39
+ },
40
+ "scripts": {
41
+ "build": "tsdown"
42
+ }
43
+ }