@rjshrjndrn/pi-fetch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Makefile ADDED
@@ -0,0 +1,63 @@
1
+ .PHONY: install test typecheck build publish patch minor major clean help
2
+
3
+ # ── Config ───────────────────────────────────────────────────────────────────
4
+
5
+ PACKAGE_NAME := $(shell node -p "require('./package.json').name" 2>/dev/null)
6
+ VERSION := $(shell node -p "require('./package.json').version" 2>/dev/null)
7
+
8
+ # ── Default ──────────────────────────────────────────────────────────────────
9
+
10
+ help:
11
+ @echo "$(PACKAGE_NAME) v$(VERSION)"
12
+ @echo ""
13
+ @echo " make install Install dependencies"
14
+ @echo " make test Run tests"
15
+ @echo " make typecheck Type-check TypeScript (no emit)"
16
+ @echo " make build install + typecheck + test"
17
+ @echo " make patch Bump patch version and publish"
18
+ @echo " make minor Bump minor version and publish"
19
+ @echo " make major Bump major version and publish"
20
+ @echo " make publish Publish current version to npm"
21
+ @echo " make clean Remove node_modules"
22
+
23
+ # ── Core tasks ───────────────────────────────────────────────────────────────
24
+
25
+ install:
26
+ npm install
27
+
28
+ test:
29
+ npm test
30
+
31
+ typecheck:
32
+ npx tsc --noEmit --strict --skipLibCheck --moduleResolution bundler --module esnext \
33
+ --target esnext --allowImportingTsExtensions \
34
+ extensions/index.ts $(wildcard src/*.ts)
35
+
36
+ build: install typecheck test
37
+ @echo "✓ Build complete — $(PACKAGE_NAME) v$(VERSION)"
38
+
39
+ # ── Publishing ───────────────────────────────────────────────────────────────
40
+
41
+ publish: clean build
42
+ npm publish --access public
43
+ @echo "✓ Published $(PACKAGE_NAME) v$(VERSION)"
44
+
45
+ patch: clean build
46
+ npm version patch
47
+ npm publish --access public
48
+ @echo "✓ Published $(PACKAGE_NAME) v$(shell node -p "require('./package.json').version")"
49
+
50
+ minor: clean build
51
+ npm version minor
52
+ npm publish --access public
53
+ @echo "✓ Published $(PACKAGE_NAME) v$(shell node -p "require('./package.json').version")"
54
+
55
+ major: clean build
56
+ npm version major
57
+ npm publish --access public
58
+ @echo "✓ Published $(PACKAGE_NAME) v$(shell node -p "require('./package.json').version")"
59
+
60
+ # ── Cleanup ──────────────────────────────────────────────────────────────────
61
+
62
+ clean:
63
+ rm -rf node_modules
package/README.md ADDED
@@ -0,0 +1,73 @@
1
+ # pi-fetch
2
+
3
+ Web content extraction for [pi](https://github.com/badlogic/pi-mono). Fetch any URL as clean Markdown — no headless browser required.
4
+
5
+ Powered by [Defuddle](https://github.com/kepano/defuddle) by [Steph Ango](https://github.com/kepano) (creator of [Obsidian Web Clipper](https://obsidian.md/clipper)).
6
+
7
+ ## How it works
8
+
9
+ Registers a `web_fetch` tool that the LLM can call with any URL. Under the hood:
10
+
11
+ 1. **[JSDOM](https://github.com/jsdom/jsdom)** fetches and parses the HTML (lightweight, no browser engine)
12
+ 2. **[Defuddle](https://github.com/kepano/defuddle)** extracts the main content, stripping navigation, ads, sidebars, cookie banners, and clutter
13
+ 3. Returns clean **Markdown** with metadata (title, author, description, word count)
14
+
15
+ ```
16
+ web_fetch https://www.npmjs.com/package/defuddle
17
+
18
+ # defuddle
19
+
20
+ **Author:** kepano · **Domain:** npmjs.com · **Words:** 342
21
+
22
+ > Get the main content of any page as Markdown.
23
+
24
+ ---
25
+
26
+ Defuddle extracts the main content from web pages...
27
+ ```
28
+
29
+ Output is automatically truncated to stay within pi's context limits.
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ # As a pi package
35
+ pi install npm:@rjshrjndrn/pi-fetch
36
+
37
+ # Or test locally
38
+ pi -e ./extensions/index.ts
39
+ ```
40
+
41
+ ## Usage
42
+
43
+ Once installed, just ask pi to fetch a URL:
44
+
45
+ ```
46
+ fetch https://docs.example.com/getting-started
47
+ ```
48
+
49
+ Or the LLM will use `web_fetch` automatically when it needs to read a webpage.
50
+
51
+ ## Limitations
52
+
53
+ - **No JavaScript rendering** — JSDOM does not execute JavaScript. SPAs that require JS to render content will return empty or minimal results. For those, you'll still need a headless browser.
54
+ - **Some sites block non-browser requests** — sites with aggressive bot detection may reject the request.
55
+ - **Output truncation** — very large pages are truncated to 50KB / 2000 lines to protect context window.
56
+
57
+ ## Development
58
+
59
+ ```bash
60
+ cd pi-fetch
61
+ npm install
62
+ npm test
63
+ ```
64
+
65
+ ## Credits
66
+
67
+ - **[Defuddle](https://github.com/kepano/defuddle)** by [Steph Ango (kepano)](https://github.com/kepano) — content extraction engine
68
+ - **[JSDOM](https://github.com/jsdom/jsdom)** — HTML parsing without a browser
69
+ - **[pi](https://github.com/badlogic/pi-mono)** by [Mario Zechner (badlogic)](https://github.com/badlogic) — agent framework
70
+
71
+ ## License
72
+
73
+ MIT
package/extensions/index.ts ADDED
@@ -0,0 +1,55 @@
1
+ import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"
2
+ import {
3
+ truncateHead,
4
+ DEFAULT_MAX_BYTES,
5
+ DEFAULT_MAX_LINES,
6
+ formatSize,
7
+ } from "@mariozechner/pi-coding-agent"
8
+ import { Type } from "@sinclair/typebox"
9
+ import { fetchPage, formatResult } from "../src/fetch.js"
10
+
11
+ export default function (pi: ExtensionAPI) {
12
+ pi.registerTool({
13
+ name: "web_fetch",
14
+ label: "Web Fetch",
15
+ description:
16
+ "Fetch a webpage and extract its main content as clean Markdown. " +
17
+ "Strips navigation, ads, sidebars, and clutter. " +
18
+ "Returns title, author, description, word count, and the article content.",
19
+ promptSnippet:
20
+ "Fetch a URL and return its readable content as clean Markdown",
21
+ promptGuidelines: [
22
+ "Use web_fetch when the user provides a URL to read, analyze, or summarize.",
23
+ "Prefer web_fetch over bash curl for reading webpage content — it extracts clean text and saves tokens.",
24
+ ],
25
+ parameters: Type.Object({
26
+ url: Type.String({ description: "URL of the webpage to fetch" }),
27
+ }),
28
+
29
+ async execute(_toolCallId, params, signal, _onUpdate, _ctx) {
30
+ const result = await fetchPage(params.url, signal ?? undefined)
31
+ const output = formatResult(result)
32
+
33
+ // Truncate if output exceeds context limits
34
+ const truncation = truncateHead(output, {
35
+ maxLines: DEFAULT_MAX_LINES,
36
+ maxBytes: DEFAULT_MAX_BYTES,
37
+ })
38
+
39
+ let text = truncation.content
40
+ if (truncation.truncated) {
41
+ text += `\n\n[Truncated: showing ${formatSize(truncation.outputBytes)} of ${formatSize(truncation.totalBytes)}]`
42
+ }
43
+
44
+ return {
45
+ content: [{ type: "text", text }],
46
+ details: {
47
+ url: result.url,
48
+ title: result.title,
49
+ wordCount: result.wordCount,
50
+ truncated: truncation.truncated,
51
+ },
52
+ }
53
+ },
54
+ })
55
+ }
package/package.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "name": "@rjshrjndrn/pi-fetch",
3
+ "version": "0.1.0",
4
+ "description": "Web content extraction for pi — fetch any URL as clean Markdown using Defuddle",
5
+ "keywords": [
6
+ "pi-package",
7
+ "pi-extension",
8
+ "web-fetch",
9
+ "markdown",
10
+ "defuddle",
11
+ "readability"
12
+ ],
13
+ "license": "MIT",
14
+ "type": "module",
15
+ "pi": {
16
+ "extensions": [
17
+ "./extensions"
18
+ ]
19
+ },
20
+ "dependencies": {
21
+ "defuddle": "^0.14.0"
22
+ },
23
+ "peerDependencies": {
24
+ "@mariozechner/pi-coding-agent": "*",
25
+ "@sinclair/typebox": "*"
26
+ },
27
+ "devDependencies": {
28
+ "@types/node": "^20.0.0",
29
+ "typescript": "^5.4.0",
30
+ "vitest": "^1.6.0"
31
+ },
32
+ "scripts": {
33
+ "test": "vitest run",
34
+ "test:watch": "vitest"
35
+ }
36
+ }
package/src/fetch.ts ADDED
@@ -0,0 +1,87 @@
1
+ export interface FetchResult {
2
+ url: string
3
+ title: string | null
4
+ author: string | null
5
+ description: string | null
6
+ domain: string | null
7
+ published: string | null
8
+ wordCount: number | null
9
+ content: string
10
+ }
11
+
12
+ const USER_AGENT =
13
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
14
+
15
+ /**
16
+ * Fetch a URL and extract its main content as Markdown via Defuddle.
17
+ *
18
+ * Uses native fetch + Defuddle's Node API (no headless browser) to
19
+ * strip navigation, ads, sidebars, and clutter.
20
+ */
21
+ export async function fetchPage(
22
+ url: string,
23
+ signal?: AbortSignal
24
+ ): Promise<FetchResult> {
25
+ const { Defuddle } = await import("defuddle/node")
26
+
27
+ let html: string
28
+ try {
29
+ const response = await fetch(url, {
30
+ signal,
31
+ headers: { "User-Agent": USER_AGENT },
32
+ })
33
+ if (!response.ok) {
34
+ throw new Error(`HTTP ${response.status} ${response.statusText}`)
35
+ }
36
+ html = await response.text()
37
+ } catch (err: any) {
38
+ if (err?.name === "AbortError") throw err
39
+ throw new Error(`Failed to fetch ${url}: ${err?.message ?? err}`)
40
+ }
41
+
42
+ const result = await Defuddle(html, url, { markdown: true })
43
+
44
+ return {
45
+ url,
46
+ title: result.title ?? null,
47
+ author: result.author ?? null,
48
+ description: result.description ?? null,
49
+ domain: result.domain ?? null,
50
+ published: result.published ?? null,
51
+ wordCount: result.wordCount ?? null,
52
+ content: result.content ?? "",
53
+ }
54
+ }
55
+
56
+ /**
57
+ * Format a FetchResult into a human/LLM-readable Markdown string.
58
+ */
59
+ export function formatResult(result: FetchResult): string {
60
+ const lines: string[] = []
61
+
62
+ if (result.title) lines.push(`# ${result.title}`)
63
+ const meta: string[] = []
64
+ if (result.author) meta.push(`**Author:** ${result.author}`)
65
+ if (result.domain) meta.push(`**Domain:** ${result.domain}`)
66
+ if (result.published) meta.push(`**Published:** ${result.published}`)
67
+ if (result.wordCount) meta.push(`**Words:** ${result.wordCount}`)
68
+ if (meta.length > 0) {
69
+ lines.push("")
70
+ lines.push(meta.join(" · "))
71
+ }
72
+
73
+ if (result.description) {
74
+ lines.push("")
75
+ lines.push(`> ${result.description}`)
76
+ }
77
+
78
+ if (lines.length > 0) {
79
+ lines.push("")
80
+ lines.push("---")
81
+ }
82
+
83
+ lines.push("")
84
+ lines.push(result.content || "(no content extracted)")
85
+
86
+ return lines.join("\n")
87
+ }
package/test/fetch.test.ts ADDED
@@ -0,0 +1,82 @@
1
+ import { describe, it, expect } from "vitest"
2
+ import { formatResult, type FetchResult } from "../src/fetch.js"
3
+
4
+ describe("formatResult", () => {
5
+ it("formats a full result with all metadata", () => {
6
+ const result: FetchResult = {
7
+ url: "https://example.com/article",
8
+ title: "Test Article",
9
+ author: "Jane Doe",
10
+ description: "A test article about testing",
11
+ domain: "example.com",
12
+ published: "2026-01-15",
13
+ wordCount: 500,
14
+ content: "This is the article body.",
15
+ }
16
+
17
+ const output = formatResult(result)
18
+ expect(output).toContain("# Test Article")
19
+ expect(output).toContain("**Author:** Jane Doe")
20
+ expect(output).toContain("**Domain:** example.com")
21
+ expect(output).toContain("**Published:** 2026-01-15")
22
+ expect(output).toContain("**Words:** 500")
23
+ expect(output).toContain("> A test article about testing")
24
+ expect(output).toContain("---")
25
+ expect(output).toContain("This is the article body.")
26
+ })
27
+
28
+ it("handles minimal result with no metadata", () => {
29
+ const result: FetchResult = {
30
+ url: "https://example.com",
31
+ title: null,
32
+ author: null,
33
+ description: null,
34
+ domain: null,
35
+ published: null,
36
+ wordCount: null,
37
+ content: "Just content.",
38
+ }
39
+
40
+ const output = formatResult(result)
41
+ expect(output).not.toContain("#")
42
+ expect(output).not.toContain("**Author:**")
43
+ expect(output).not.toContain("---")
44
+ expect(output).toContain("Just content.")
45
+ })
46
+
47
+ it("handles empty content", () => {
48
+ const result: FetchResult = {
49
+ url: "https://example.com",
50
+ title: "Empty Page",
51
+ author: null,
52
+ description: null,
53
+ domain: null,
54
+ published: null,
55
+ wordCount: null,
56
+ content: "",
57
+ }
58
+
59
+ const output = formatResult(result)
60
+ expect(output).toContain("# Empty Page")
61
+ expect(output).toContain("(no content extracted)")
62
+ })
63
+
64
+ it("includes only available metadata fields", () => {
65
+ const result: FetchResult = {
66
+ url: "https://example.com",
67
+ title: "Partial",
68
+ author: "Author Only",
69
+ description: null,
70
+ domain: null,
71
+ published: null,
72
+ wordCount: 100,
73
+ content: "Body text.",
74
+ }
75
+
76
+ const output = formatResult(result)
77
+ expect(output).toContain("**Author:** Author Only")
78
+ expect(output).toContain("**Words:** 100")
79
+ expect(output).not.toContain("**Domain:**")
80
+ expect(output).not.toContain("**Published:**")
81
+ })
82
+ })