@pi-lab/webfetch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +18 -0
  2. package/dist/index.mjs +475 -0
  3. package/package.json +43 -0
package/README.md ADDED
@@ -0,0 +1,18 @@
1
+ # @pi-lab/webfetch
2
+
3
+ A web fetching extension for [pi coding agent](https://github.com/badlogic/pi-mono). Adds a `webfetch` tool — fetch any URL and get back clean Markdown, ready for the model to read.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pi install npm:@pi-lab/webfetch
9
+ ```
10
+
11
+ ## Features
12
+
13
+ - **HTML → Markdown** via [Mozilla Readability](https://github.com/mozilla/readability) (same engine as Firefox Reader Mode) + [Turndown](https://github.com/mixmark-io/turndown). Falls back to full-page conversion if Readability can't extract a main article.
14
+ - **Pagination** — large pages are sliced into chunks; the model reads page by page using `offset`.
15
+ - **Inline script index** — `<script>` tags are stripped from the Markdown body but listed as a numbered index at the end. The model can read any of them with `script=N`.
16
+ - **Redirect handling** — same-domain redirects are followed automatically (up to 10 hops); cross-domain redirects are surfaced to the model so it can decide whether to follow.
17
+ - **Binary downloads** — non-text responses (PDFs, images, etc.) are saved to `.pi/pi-lab/webfetch/tmp/` and the file path is returned.
18
+ - **LRU cache** — processed Markdown is cached in memory (default: 50 MB, 15 min TTL) so paginating the same URL doesn't re-fetch.
package/dist/index.mjs ADDED
@@ -0,0 +1,475 @@
1
+ import { Type } from "@sinclair/typebox";
2
+ import { join } from "node:path";
3
+ import { keyHint } from "@mariozechner/pi-coding-agent";
4
+ import { Text } from "@mariozechner/pi-tui";
5
+ import { LRUCache } from "lru-cache";
6
+ import { mkdir, writeFile } from "node:fs/promises";
7
+ import { Readability } from "@mozilla/readability";
8
+ import { parseHTML } from "linkedom";
9
+ import TurndownService from "turndown";
10
+ //#region src/config.ts
11
// Baseline settings: 20k-char pages, 50 MB cache budget, 15 min TTL.
const DEFAULT_CONFIG = {
  maxPageLength: 2e4,
  cache: {
    maxSizeBytes: 50 * 1024 * 1024,
    ttlMs: 900 * 1e3
  }
};
/**
 * Merge a partial user config over DEFAULT_CONFIG.
 * Missing top-level keys and missing cache sub-keys fall back to defaults;
 * a nullish/absent `partial` returns the defaults object itself.
 */
function mergeConfig(partial) {
  if (!partial) return DEFAULT_CONFIG;
  const maxPageLength = partial.maxPageLength ?? DEFAULT_CONFIG.maxPageLength;
  // Object.assign ignores undefined sources, so an absent `cache` is fine.
  const cache = Object.assign({}, DEFAULT_CONFIG.cache, partial.cache);
  return { maxPageLength, cache };
}
28
+ //#endregion
29
+ //#region src/cache.ts
30
/**
 * In-memory LRU cache for processed fetch results.
 * Each entry is sized by the UTF-8 byte length of its markdown plus all
 * extracted inline scripts, so the configured byte budget is respected.
 */
var WebFetchCache = class {
  cache;
  constructor(config) {
    this.cache = new LRUCache({
      maxSize: config.maxSizeBytes,
      sizeCalculation: (value) => {
        const scriptBytes = value.scripts.reduce(
          (total, script) => total + Buffer.byteLength(script.content, "utf8"),
          0
        );
        return Buffer.byteLength(value.markdown, "utf8") + scriptBytes;
      },
      ttl: config.ttlMs,
      allowStale: false
    });
  }
  /** Look up an entry; undefined on miss or TTL expiry. */
  get(key) {
    return this.cache.get(key);
  }
  /** Insert or refresh an entry under its normalized-URL key. */
  set(key, value) {
    this.cache.set(key, value);
  }
  /** Remove a single entry. */
  delete(key) {
    this.cache.delete(key);
  }
  /** Drop everything (called on session shutdown). */
  clear() {
    this.cache.clear();
  }
};
55
+ //#endregion
56
+ //#region src/normalize.ts
57
/**
 * Normalize a URL in a lossless way:
 * 1. Lowercase protocol and hostname (done by the WHATWG URL parser)
 * 2. Upgrade http → https
 * 3. Remove default ports (:80 for http is stripped at parse time;
 *    :443 is stripped after the https upgrade)
 *
 * Does NOT reorder query params or normalize trailing slashes.
 *
 * @param {string} rawUrl - URL to normalize; must be absolute.
 * @returns {string} the normalized, serialized URL
 * @throws {TypeError} if rawUrl is not a parseable absolute URL
 */
function normalizeUrl(rawUrl) {
  // `new URL` already lowercases scheme + host and drops scheme-default
  // ports, so no manual lowercasing is needed. (The previous version also
  // had an unreachable `:80 && http:` check after the https upgrade.)
  const url = new URL(rawUrl);
  if (url.protocol === "http:") url.protocol = "https:";
  // After the upgrade an explicit `http://host:443` would keep its port;
  // 443 is now the default, so clear it.
  if (url.protocol === "https:" && url.port === "443") url.port = "";
  return url.toString();
}
75
+ //#endregion
76
+ //#region src/fetch.ts
77
/**
 * Determine if two URLs are on the same domain.
 * Same domain = same protocol + same port + same hostname (ignoring a
 * leading "www." prefix). Unparseable input is treated as not-same.
 */
function isSameDomain(a, b) {
  try {
    const first = new URL(a);
    const second = new URL(b);
    const stripWww = (host) => host.replace(/^www\./, "");
    return (
      first.protocol === second.protocol &&
      first.port === second.port &&
      stripWww(first.hostname) === stripWww(second.hostname)
    );
  } catch {
    return false;
  }
}
92
// Known MIME type → file extension mapping for saved binary downloads.
const CONTENT_TYPE_EXTENSIONS = {
  "image/jpeg": ".jpg",
  "image/png": ".png",
  "image/gif": ".gif",
  "image/webp": ".webp",
  "image/svg+xml": ".svg",
  "application/pdf": ".pdf",
  "application/zip": ".zip",
  "application/json": ".json",
  "video/mp4": ".mp4",
  "audio/mpeg": ".mp3"
};
/**
 * Map a MIME type to a file extension, defaulting to ".bin".
 *
 * The content type comes from an untrusted response header, so guard the
 * lookup with Object.hasOwn: a bare `obj[key]` would hit the prototype
 * chain for keys like "constructor" or "toString" and return a function
 * instead of ".bin".
 *
 * @param {string} contentType - lowercased base MIME type (no parameters)
 * @returns {string} extension including the leading dot
 */
function extForContentType(contentType) {
  return Object.hasOwn(CONTENT_TYPE_EXTENSIONS, contentType)
    ? CONTENT_TYPE_EXTENSIONS[contentType]
    : ".bin";
}
107
/**
 * Fetch a URL, following same-domain redirects automatically.
 * Cross-domain redirects are returned as a redirect result for the LLM to
 * handle. Binary content is saved to tempDir and returned as a binary result.
 *
 * @param {string} normalizedUrl - URL already passed through normalizeUrl()
 * @param {string} tempDir - directory where binary payloads are written
 * @param {AbortSignal} signal - forwarded to fetch() for cancellation
 * @param {number} maxRedirects - maximum same-domain hops before giving up
 * @returns one of:
 *   { type: "text", content, contentType, url }
 *   { type: "binary", filePath, contentType, url }
 *   { type: "redirect", originalUrl, redirectUrl, statusCode }
 * @throws on non-OK HTTP status, a 3xx without a Location header, an
 *   unparsable Location value, or too many redirects
 */
async function fetchUrl(normalizedUrl, tempDir, signal, maxRedirects = 10) {
  let currentUrl = normalizedUrl;
  // hop 0 is the initial request, so up to maxRedirects redirects follow it.
  for (let hop = 0; hop <= maxRedirects; hop++) {
    const response = await fetch(currentUrl, {
      signal,
      // Handle redirects manually so cross-domain hops can be surfaced
      // to the caller instead of silently followed.
      redirect: "manual",
      headers: {
        Accept: "text/markdown, text/plain, text/html, */*",
        "User-Agent": "pi/webfetch"
      }
    });
    if (response.status >= 300 && response.status < 400) {
      const location = response.headers.get("location");
      if (!location) throw new Error(`Redirect ${response.status} with no Location header`);
      let redirectUrl;
      try {
        // Location may be relative; resolve against the current URL.
        redirectUrl = new URL(location, currentUrl).toString();
      } catch {
        throw new Error(`Invalid redirect location: ${location}`);
      }
      if (isSameDomain(currentUrl, redirectUrl)) {
        currentUrl = normalizeUrl(redirectUrl);
        continue;
      }
      return {
        type: "redirect",
        originalUrl: normalizedUrl,
        redirectUrl,
        statusCode: response.status
      };
    }
    if (!response.ok) throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    // Strip MIME parameters (e.g. "; charset=utf-8") before classifying.
    const baseContentType = (response.headers.get("content-type") ?? "").split(";")[0].trim().toLowerCase();
    if (baseContentType.startsWith("text/")) {
      return {
        type: "text",
        content: await response.text(),
        contentType: baseContentType,
        url: currentUrl
      };
    }
    // Non-text payload: persist to disk and hand back the path.
    await mkdir(tempDir, { recursive: true });
    const ext = extForContentType(baseContentType);
    // Date.now() alone can collide when two fetches complete in the same
    // millisecond; a random suffix keeps files from overwriting each other.
    const suffix = Math.random().toString(36).slice(2, 8);
    const filePath = join(tempDir, `webfetch-${Date.now()}-${suffix}${ext}`);
    const buffer = await response.arrayBuffer();
    await writeFile(filePath, Buffer.from(buffer));
    return {
      type: "binary",
      filePath,
      contentType: baseContentType,
      url: currentUrl
    };
  }
  throw new Error(`Too many redirects (max ${maxRedirects})`);
}
164
+ //#endregion
165
+ //#region src/content.ts
166
// Shared Turndown singleton — constructed lazily on first use.
let turndownInstance = null;
/**
 * Return the module-wide TurndownService, creating it on first call.
 * Configured for ATX headings, fenced code blocks, and "-" bullets;
 * style/script/noscript elements are dropped, pre/code kept verbatim.
 */
function getTurndown() {
  if (turndownInstance !== null) return turndownInstance;
  const service = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-"
  });
  service.remove(["style", "script", "noscript"]);
  service.keep(["pre", "code"]);
  turndownInstance = service;
  return service;
}
183
/**
 * Extract inline <script> elements (no src attribute) from a parsed document.
 * External scripts are skipped — they have no inline content — and so are
 * scripts whose body is empty after trimming. Each entry carries a
 * sequential index, the content length, an 80-char whitespace-collapsed
 * preview, and the full content.
 */
function extractInlineScripts(document) {
  const scripts = [];
  for (const el of document.querySelectorAll("script:not([src])")) {
    const content = (el.textContent ?? "").trim();
    if (!content) continue;
    scripts.push({
      index: scripts.length,
      length: content.length,
      preview: content.slice(0, 80).replace(/\s+/g, " "),
      content
    });
  }
  return scripts;
}
203
/**
 * Process HTML content:
 * 1. Try Mozilla Readability to extract the main article.
 * 2. If extraction kept less than 10% of the raw HTML (or failed), fall
 *    back to converting the full page to Markdown.
 * Inline scripts are extracted up front (before Readability mutates the
 * document) and returned separately.
 */
async function processHtml(html, _url) {
  const turndown = getTurndown();
  const { document } = parseHTML(html);
  const scripts = extractInlineScripts(document);
  try {
    const content = new Readability(document).parse()?.content;
    // Only trust Readability when it retained at least 10% of the page.
    if (content && content.length / html.length >= .1) {
      return {
        markdown: turndown.turndown(content),
        scripts,
        method: "readability"
      };
    }
  } catch {
    // Readability can throw on pathological markup — best-effort fallback
    // to full-page conversion below.
  }
  return {
    markdown: turndown.turndown(html),
    scripts,
    method: "full-html"
  };
}
229
/**
 * Process plain text / markdown content.
 * No conversion is needed — the text is returned unchanged; this exists
 * as the counterpart to processHtml so callers have a uniform entry point.
 */
function processPlainText(text) {
  return text;
}
235
+ //#endregion
236
+ //#region src/tool.ts
237
/**
 * Render the inline-script index appended to text results.
 * Returns "" when there are no scripts; otherwise a newline-joined block
 * with one right-aligned "[i] N chars preview" row per script.
 */
function formatScriptIndex(scripts) {
  if (!scripts.length) return "";
  // Right-align the char counts to the widest value.
  const maxLen = Math.max(...scripts.map((s) => s.length));
  const width = String(maxLen).length;
  const rows = scripts.map(
    (s) => ` [${s.index}] ${String(s.length).padStart(width)} chars ${s.preview}`
  );
  const header = `Inline scripts (${scripts.length}, call webfetch with script=N to read full content):`;
  return ["", header, ...rows].join("\n");
}
247
/**
 * Render a paginated page-content result for the model: URL line, a
 * length/offset status line, a "---" separator, the content slice, and
 * (when present) the inline-script index.
 */
function formatTextResult(output, scripts) {
  const status = output.truncated
    ? `Offset: ${output.offset} / ${output.total_length} chars — truncated, call again with offset=${output.offset + output.returned_length}`
    : `Length: ${output.total_length} chars`;
  const parts = [`URL: ${output.url}`, status, "", "---", "", output.content];
  const index = formatScriptIndex(scripts);
  if (index) parts.push(index);
  return parts.join("\n");
}
259
/**
 * Render a paginated inline-script result: like formatTextResult but the
 * header names the script index and no script index block is appended.
 */
function formatScriptResult(url, scriptIndex, output) {
  const status = output.truncated
    ? `Offset: ${output.offset} / ${output.total_length} chars — truncated, call again with offset=${output.offset + output.returned_length}`
    : `Length: ${output.total_length} chars`;
  return [`URL: ${url} — script ${scriptIndex}`, status, "", "---", "", output.content].join("\n");
}
269
/** Render a saved-binary result: file path, content type, and source URL. */
function formatBinaryResult(output) {
  return `BINARY FILE: ${output.file_path}\nContent-Type: ${output.content_type}\nURL: ${output.url}`;
}
276
/** Render a cross-domain redirect: status + both URLs, then the advisory message. */
function formatRedirectResult(output) {
  const headline = `REDIRECT ${output.status_code}: ${output.original_url} → ${output.redirect_url}`;
  return `${headline}\n${output.message}`;
}
279
/**
 * Register the `webfetch` tool on the pi instance.
 *
 * @param pi - the pi extension API (event bus + tool registry)
 * @param config - merged config; supplies maxPageLength (pagination page
 *   size) and the cache size/TTL limits
 */
function registerWebFetchTool(pi, config) {
	// One cache per registration, keyed by normalized URL.
	const cache = new WebFetchCache(config.cache);
	// Release cached markdown when the session ends.
	pi.on("session_shutdown", async () => {
		cache.clear();
	});
	pi.registerTool({
		name: "webfetch",
		label: "Web Fetch",
		// Tool description presented to the model.
		description: [
			"Fetch content from a URL and return it as Markdown text.",
			"Handles HTML extraction via Mozilla Readability and pagination for large pages.",
			"Inline scripts are listed in an index at the end — use the `script` parameter to read a specific one.",
			"Non-text content (images, PDFs, etc.) is saved to a local file and the path is returned.",
			"Cross-domain redirects are reported back so you can decide whether to follow them."
		].join(" "),
		promptSnippet: "Fetch and read web page content from a URL",
		promptGuidelines: [
			"Use webfetch to retrieve content from URLs instead of suggesting the user open a browser.",
			"For paginated results, increment `offset` by `returned_length` and call webfetch again until `truncated` is false.",
			"If the page has inline scripts listed at the end, use `script=N` to read one if it might contain relevant data.",
			"If webfetch returns a redirect result, call it again with the `redirect_url`."
		],
		parameters: Type.Object({
			url: Type.String({ description: "The URL to fetch." }),
			script: Type.Optional(Type.Number({ description: "Index of an inline script to read (from the script index at the end of a previous response). Supports the same `offset` and `max_length` pagination as normal page content." })),
			offset: Type.Optional(Type.Number({ description: "Starting character position for pagination. Defaults to 0." })),
			max_length: Type.Optional(Type.Number({ description: `Maximum characters to return in this call. Defaults to ${config.maxPageLength}.` }))
		}),
		// Main entry point: fetch (or reuse the cached entry), then slice
		// either the page markdown or one inline script per offset/max_length.
		async execute(_toolCallId, params, signal, onUpdate, ctx) {
			const { url, script: scriptIndex, offset = 0, max_length } = params;
			const maxLength = max_length ?? config.maxPageLength;
			let normalizedUrl;
			try {
				normalizedUrl = normalizeUrl(url);
			} catch {
				throw new Error(`Invalid URL: ${url}`);
			}
			// Binary downloads are written here, relative to the session cwd.
			const tempDir = join(ctx.cwd, ".pi", "pi-lab", "webfetch", "tmp");
			let entry = cache.get(normalizedUrl);
			if (!entry) {
				// Cache miss: stream a progress update, then fetch.
				onUpdate?.({
					content: [{
						type: "text",
						text: `Fetching ${normalizedUrl}…`
					}],
					details: {}
				});
				const result = await fetchUrl(normalizedUrl, tempDir, signal);
				// Cross-domain redirect: report back instead of following,
				// and do NOT cache (nothing was processed).
				if (result.type === "redirect") {
					const output = {
						redirect: true,
						original_url: result.originalUrl,
						redirect_url: result.redirectUrl,
						status_code: result.statusCode,
						message: "This URL redirects to a different domain. Call webfetch again with `redirect_url` to fetch the content."
					};
					return {
						content: [{
							type: "text",
							text: formatRedirectResult(output)
						}],
						details: output
					};
				}
				// Non-text payload was already saved to disk; return the path.
				if (result.type === "binary") {
					const output = {
						file_path: result.filePath,
						content_type: result.contentType,
						url: result.url
					};
					return {
						content: [{
							type: "text",
							text: formatBinaryResult(output)
						}],
						details: output
					};
				}
				onUpdate?.({
					content: [{
						type: "text",
						text: "Processing content…"
					}],
					details: {}
				});
				// HTML goes through Readability → Markdown; any other text/*
				// (markdown, plain text) is passed through unchanged.
				if (result.contentType === "text/html") {
					const processed = await processHtml(result.content, normalizedUrl);
					entry = {
						markdown: processed.markdown,
						scripts: processed.scripts
					};
				} else entry = {
					markdown: processPlainText(result.content),
					scripts: []
				};
				cache.set(normalizedUrl, entry);
			}
			// script=N: paginate over that inline script instead of the page.
			if (scriptIndex !== void 0) {
				const script = entry.scripts.find((s) => s.index === scriptIndex);
				if (!script) throw new Error(`Script ${scriptIndex} not found. Available indices: ${entry.scripts.map((s) => s.index).join(", ") || "none"}`);
				const total = script.content.length;
				const slice = script.content.slice(offset, offset + maxLength);
				const output = {
					content: slice,
					truncated: offset + maxLength < total,
					total_length: total,
					offset,
					returned_length: slice.length,
					url: normalizedUrl
				};
				return {
					content: [{
						type: "text",
						text: formatScriptResult(normalizedUrl, scriptIndex, output)
					}],
					details: output
				};
			}
			// Default path: paginate over the processed page markdown.
			const totalLength = entry.markdown.length;
			const slice = entry.markdown.slice(offset, offset + maxLength);
			const output = {
				content: slice,
				truncated: offset + maxLength < totalLength,
				total_length: totalLength,
				offset,
				returned_length: slice.length,
				url: normalizedUrl
			};
			return {
				content: [{
					type: "text",
					text: formatTextResult(output, entry.scripts)
				}],
				details: output
			};
		},
		// One-line TUI summary of the call: "webfetch <url> [· script=N] [· offset=M]".
		// NOTE: `if (args.offset)` deliberately skips offset=0.
		renderCall(args, theme, context) {
			const text = context.lastComponent ?? new Text("", 0, 0);
			let line = theme.fg("toolTitle", theme.bold("webfetch "));
			line += theme.fg("accent", args.url ?? "");
			if (args.script !== void 0) line += theme.fg("muted", ` · script=${args.script}`);
			if (args.offset) line += theme.fg("dim", ` · offset=${args.offset}`);
			text.setText(line);
			return text;
		},
		// TUI result rendering: redirect and binary results get one-liners;
		// text results show a header plus up to 10 content lines unless expanded.
		renderResult(result, options, theme, context) {
			const text = context.lastComponent ?? new Text("", 0, 0);
			if (options.isPartial) {
				text.setText(theme.fg("muted", "Fetching…"));
				return text;
			}
			if (context.isError || !result.details) {
				const raw = result.content.find((c) => c.type === "text")?.text ?? "";
				text.setText(theme.fg("error", raw));
				return text;
			}
			const details = result.details;
			// Discriminate the three detail shapes by their marker keys.
			if ("redirect" in details) {
				text.setText(theme.fg("warning", `↪ REDIRECT ${details.status_code}: `) + theme.fg("accent", details.redirect_url));
				return text;
			}
			if ("file_path" in details) {
				text.setText(theme.fg("success", "✓ ") + theme.fg("muted", details.content_type) + theme.fg("dim", ` → ${details.file_path}`));
				return text;
			}
			const allLines = details.content.split("\n");
			const maxLines = options.expanded ? allLines.length : 10;
			const displayLines = allLines.slice(0, maxLines);
			const remaining = allLines.length - maxLines;
			const header = theme.fg("dim", details.url) + (details.truncated ? theme.fg("muted", ` · ${details.returned_length.toLocaleString()} / ${details.total_length.toLocaleString()} chars`) : theme.fg("muted", ` · ${details.total_length.toLocaleString()} chars`));
			let body = "\n" + displayLines.map((l) => theme.fg("toolOutput", l)).join("\n");
			if (remaining > 0) body += theme.fg("muted", `\n… (${remaining} more lines, `) + keyHint("app.tools.expand", "to expand") + theme.fg("muted", ")");
			text.setText(header + body);
			return text;
		}
	});
}
456
+ //#endregion
457
+ //#region src/index.ts
458
/**
 * WebFetch extension entry point for the pi coding agent.
 *
 * Registers the `webfetch` tool, which fetches URLs and returns Markdown:
 * URL normalization (http→https, default-port stripping), same-domain
 * redirect following with cross-domain redirects surfaced to the LLM,
 * Readability-based HTML extraction, an inline-script index readable via
 * `script=N`, an in-memory LRU cache keyed on the normalized URL, and
 * offset/max_length pagination.
 */
function src_default(pi) {
  // No user overrides are supplied here, so the default config applies.
  const config = mergeConfig();
  registerWebFetchTool(pi, config);
}
474
+ //#endregion
475
+ export { DEFAULT_CONFIG, src_default as default, mergeConfig, registerWebFetchTool };
package/package.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "name": "@pi-lab/webfetch",
3
+ "version": "0.1.0",
4
+ "description": "WebFetch tool extension for pi coding agent",
5
+ "keywords": [
6
+ "pi-package"
7
+ ],
8
+ "license": "MIT",
9
+ "type": "module",
10
+ "files": [
11
+ "dist",
12
+ "README.md"
13
+ ],
14
+ "pi": {
15
+ "extensions": [
16
+ "./dist/index.mjs"
17
+ ]
18
+ },
19
+ "devDependencies": {
20
+ "@mariozechner/pi-coding-agent": "^0.67.68",
21
+ "@mariozechner/pi-tui": "^0.67.68",
22
+ "@sinclair/typebox": "^0.34.49",
23
+ "@types/mozilla__readability": "^0.4.0",
24
+ "@types/node": "^25.6.0",
25
+ "@types/turndown": "^5.0.6",
26
+ "tsdown": "^0.21.9"
27
+ },
28
+ "peerDependencies": {
29
+ "@mariozechner/pi-coding-agent": "^0.67.68",
30
+ "@mariozechner/pi-tui": "^0.67.68",
31
+ "@sinclair/typebox": "^0.34.49"
32
+ },
33
+ "dependencies": {
34
+ "@mozilla/readability": "^0.5.0",
35
+ "fflate": "^0.8.2",
36
+ "linkedom": "^0.18.10",
37
+ "lru-cache": "^11.0.0",
38
+ "turndown": "^7.2.0"
39
+ },
40
+ "scripts": {
41
+ "build": "tsdown"
42
+ }
43
+ }