@rjshrjndrn/pi-fetch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Makefile +63 -0
- package/README.md +73 -0
- package/extensions/index.ts +55 -0
- package/package.json +36 -0
- package/src/fetch.ts +87 -0
- package/tests/fetch.test.ts +82 -0
package/Makefile
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
.PHONY: install test typecheck build publish patch minor major clean help
|
|
2
|
+
|
|
3
|
+
# ── Config ───────────────────────────────────────────────────────────────────
|
|
4
|
+
|
|
5
|
+
PACKAGE_NAME := $(shell node -p "require('./package.json').name" 2>/dev/null)
|
|
6
|
+
VERSION := $(shell node -p "require('./package.json').version" 2>/dev/null)
|
|
7
|
+
|
|
8
|
+
# ── Default ──────────────────────────────────────────────────────────────────
|
|
9
|
+
|
|
10
|
+
help:
|
|
11
|
+
@echo "$(PACKAGE_NAME) v$(VERSION)"
|
|
12
|
+
@echo ""
|
|
13
|
+
@echo " make install Install dependencies"
|
|
14
|
+
@echo " make test Run tests"
|
|
15
|
+
@echo " make typecheck Type-check TypeScript (no emit)"
|
|
16
|
+
@echo " make build install + typecheck + test"
|
|
17
|
+
@echo " make patch Bump patch version and publish"
|
|
18
|
+
@echo " make minor Bump minor version and publish"
|
|
19
|
+
@echo " make major Bump major version and publish"
|
|
20
|
+
@echo " make publish Publish current version to npm"
|
|
21
|
+
@echo " make clean Remove node_modules"
|
|
22
|
+
|
|
23
|
+
# ── Core tasks ───────────────────────────────────────────────────────────────
|
|
24
|
+
|
|
25
|
+
install:
|
|
26
|
+
npm install
|
|
27
|
+
|
|
28
|
+
test:
|
|
29
|
+
npm test
|
|
30
|
+
|
|
31
|
+
typecheck:
|
|
32
|
+
npx tsc --noEmit --strict --skipLibCheck --moduleResolution bundler --module esnext \
|
|
33
|
+
--target esnext --allowImportingTsExtensions \
|
|
34
|
+
extensions/index.ts $(wildcard src/*.ts)
|
|
35
|
+
|
|
36
|
+
build: install typecheck test
|
|
37
|
+
@echo "✓ Build complete — $(PACKAGE_NAME) v$(VERSION)"
|
|
38
|
+
|
|
39
|
+
# ── Publishing ───────────────────────────────────────────────────────────────
|
|
40
|
+
|
|
41
|
+
publish: clean build
|
|
42
|
+
npm publish --access public
|
|
43
|
+
@echo "✓ Published $(PACKAGE_NAME) v$(VERSION)"
|
|
44
|
+
|
|
45
|
+
patch: clean build
|
|
46
|
+
npm version patch
|
|
47
|
+
npm publish --access public
|
|
48
|
+
@echo "✓ Published $(PACKAGE_NAME) v$(shell node -p "require('./package.json').version")"
|
|
49
|
+
|
|
50
|
+
minor: clean build
|
|
51
|
+
npm version minor
|
|
52
|
+
npm publish --access public
|
|
53
|
+
@echo "✓ Published $(PACKAGE_NAME) v$(shell node -p "require('./package.json').version")"
|
|
54
|
+
|
|
55
|
+
major: clean build
|
|
56
|
+
npm version major
|
|
57
|
+
npm publish --access public
|
|
58
|
+
@echo "✓ Published $(PACKAGE_NAME) v$(shell node -p "require('./package.json').version")"
|
|
59
|
+
|
|
60
|
+
# ── Cleanup ──────────────────────────────────────────────────────────────────
|
|
61
|
+
|
|
62
|
+
clean:
|
|
63
|
+
rm -rf node_modules
|
package/README.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# pi-fetch
|
|
2
|
+
|
|
3
|
+
Web content extraction for [pi](https://github.com/badlogic/pi-mono). Fetch any URL as clean Markdown — no headless browser required.
|
|
4
|
+
|
|
5
|
+
Powered by [Defuddle](https://github.com/kepano/defuddle) by [Steph Ango](https://github.com/kepano) (creator of [Obsidian Web Clipper](https://obsidian.md/clipper)).
|
|
6
|
+
|
|
7
|
+
## How it works
|
|
8
|
+
|
|
9
|
+
Registers a `web_fetch` tool that the LLM can call with any URL. Under the hood:
|
|
10
|
+
|
|
11
|
+
1. **[JSDOM](https://github.com/jsdom/jsdom)** fetches and parses the HTML (lightweight, no browser engine)
|
|
12
|
+
2. **[Defuddle](https://github.com/kepano/defuddle)** extracts the main content, stripping navigation, ads, sidebars, cookie banners, and clutter
|
|
13
|
+
3. Returns clean **Markdown** with metadata (title, author, description, word count)
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
web_fetch https://www.npmjs.com/package/defuddle
|
|
17
|
+
|
|
18
|
+
# defuddle
|
|
19
|
+
|
|
20
|
+
**Author:** kepano · **Domain:** npmjs.com · **Words:** 342
|
|
21
|
+
|
|
22
|
+
> Get the main content of any page as Markdown.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
Defuddle extracts the main content from web pages...
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Output is automatically truncated to stay within pi's context limits.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
# As a pi package
|
|
35
|
+
pi install npm:@rjshrjndrn/pi-fetch
|
|
36
|
+
|
|
37
|
+
# Or test locally
|
|
38
|
+
pi -e ./extensions/index.ts
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
Once installed, just ask pi to fetch a URL:
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
fetch https://docs.example.com/getting-started
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Or the LLM will use `web_fetch` automatically when it needs to read a webpage.
|
|
50
|
+
|
|
51
|
+
## Limitations
|
|
52
|
+
|
|
53
|
+
- **No JavaScript rendering** — JSDOM does not execute JavaScript. SPAs that require JS to render content will return empty or minimal results. For those, you'll still need a headless browser.
|
|
54
|
+
- **Some sites block non-browser requests** — sites with aggressive bot detection may reject the request.
|
|
55
|
+
- **Output truncation** — very large pages are truncated to 50KB / 2000 lines to protect context window.
|
|
56
|
+
|
|
57
|
+
## Development
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
cd pi-fetch
|
|
61
|
+
npm install
|
|
62
|
+
npm test
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Credits
|
|
66
|
+
|
|
67
|
+
- **[Defuddle](https://github.com/kepano/defuddle)** by [Steph Ango (kepano)](https://github.com/kepano) — content extraction engine
|
|
68
|
+
- **[JSDOM](https://github.com/jsdom/jsdom)** — HTML parsing without a browser
|
|
69
|
+
- **[pi](https://github.com/badlogic/pi-mono)** by [Mario Zechner (badlogic)](https://github.com/badlogic) — agent framework
|
|
70
|
+
|
|
71
|
+
## License
|
|
72
|
+
|
|
73
|
+
MIT
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"
|
|
2
|
+
import {
|
|
3
|
+
truncateHead,
|
|
4
|
+
DEFAULT_MAX_BYTES,
|
|
5
|
+
DEFAULT_MAX_LINES,
|
|
6
|
+
formatSize,
|
|
7
|
+
} from "@mariozechner/pi-coding-agent"
|
|
8
|
+
import { Type } from "@sinclair/typebox"
|
|
9
|
+
import { fetchPage, formatResult } from "../src/fetch.js"
|
|
10
|
+
|
|
11
|
+
export default function (pi: ExtensionAPI) {
|
|
12
|
+
pi.registerTool({
|
|
13
|
+
name: "web_fetch",
|
|
14
|
+
label: "Web Fetch",
|
|
15
|
+
description:
|
|
16
|
+
"Fetch a webpage and extract its main content as clean Markdown. " +
|
|
17
|
+
"Strips navigation, ads, sidebars, and clutter. " +
|
|
18
|
+
"Returns title, author, description, word count, and the article content.",
|
|
19
|
+
promptSnippet:
|
|
20
|
+
"Fetch a URL and return its readable content as clean Markdown",
|
|
21
|
+
promptGuidelines: [
|
|
22
|
+
"Use web_fetch when the user provides a URL to read, analyze, or summarize.",
|
|
23
|
+
"Prefer web_fetch over bash curl for reading webpage content — it extracts clean text and saves tokens.",
|
|
24
|
+
],
|
|
25
|
+
parameters: Type.Object({
|
|
26
|
+
url: Type.String({ description: "URL of the webpage to fetch" }),
|
|
27
|
+
}),
|
|
28
|
+
|
|
29
|
+
async execute(_toolCallId, params, signal, _onUpdate, _ctx) {
|
|
30
|
+
const result = await fetchPage(params.url, signal ?? undefined)
|
|
31
|
+
const output = formatResult(result)
|
|
32
|
+
|
|
33
|
+
// Truncate if output exceeds context limits
|
|
34
|
+
const truncation = truncateHead(output, {
|
|
35
|
+
maxLines: DEFAULT_MAX_LINES,
|
|
36
|
+
maxBytes: DEFAULT_MAX_BYTES,
|
|
37
|
+
})
|
|
38
|
+
|
|
39
|
+
let text = truncation.content
|
|
40
|
+
if (truncation.truncated) {
|
|
41
|
+
text += `\n\n[Truncated: showing ${formatSize(truncation.outputBytes)} of ${formatSize(truncation.totalBytes)}]`
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
return {
|
|
45
|
+
content: [{ type: "text", text }],
|
|
46
|
+
details: {
|
|
47
|
+
url: result.url,
|
|
48
|
+
title: result.title,
|
|
49
|
+
wordCount: result.wordCount,
|
|
50
|
+
truncated: truncation.truncated,
|
|
51
|
+
},
|
|
52
|
+
}
|
|
53
|
+
},
|
|
54
|
+
})
|
|
55
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@rjshrjndrn/pi-fetch",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Web content extraction for pi — fetch any URL as clean Markdown using Defuddle",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"pi-package",
|
|
7
|
+
"pi-extension",
|
|
8
|
+
"web-fetch",
|
|
9
|
+
"markdown",
|
|
10
|
+
"defuddle",
|
|
11
|
+
"readability"
|
|
12
|
+
],
|
|
13
|
+
"license": "MIT",
|
|
14
|
+
"type": "module",
|
|
15
|
+
"pi": {
|
|
16
|
+
"extensions": [
|
|
17
|
+
"./extensions"
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
"dependencies": {
|
|
21
|
+
"defuddle": "^0.14.0"
|
|
22
|
+
},
|
|
23
|
+
"peerDependencies": {
|
|
24
|
+
"@mariozechner/pi-coding-agent": "*",
|
|
25
|
+
"@sinclair/typebox": "*"
|
|
26
|
+
},
|
|
27
|
+
"devDependencies": {
|
|
28
|
+
"@types/node": "^20.0.0",
|
|
29
|
+
"typescript": "^5.4.0",
|
|
30
|
+
"vitest": "^1.6.0"
|
|
31
|
+
},
|
|
32
|
+
"scripts": {
|
|
33
|
+
"test": "vitest run",
|
|
34
|
+
"test:watch": "vitest"
|
|
35
|
+
}
|
|
36
|
+
}
|
package/src/fetch.ts
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/**
 * Metadata and Markdown content extracted from a fetched webpage.
 * Optional metadata fields fall back to null when unavailable
 * (see fetchPage, which maps missing Defuddle fields to null).
 */
export interface FetchResult {
  // The URL that was requested.
  url: string
  // Page title, if one was detected.
  title: string | null
  // Article author, if detected.
  author: string | null
  // Page description, if present.
  description: string | null
  // Source domain (e.g. "example.com"), if detected.
  domain: string | null
  // Publication date string, if detected — format varies by site.
  published: string | null
  // Word count of the extracted content, if computed.
  wordCount: number | null
  // Main content as Markdown; empty string when nothing was extracted.
  content: string
}
|
|
11
|
+
|
|
12
|
+
// Present as a mainstream desktop browser: some sites serve reduced content
// or reject requests from non-browser user agents (see README "Limitations").
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Fetch a URL and extract its main content as Markdown via Defuddle.
|
|
17
|
+
*
|
|
18
|
+
* Uses native fetch + Defuddle's Node API (no headless browser) to
|
|
19
|
+
* strip navigation, ads, sidebars, and clutter.
|
|
20
|
+
*/
|
|
21
|
+
export async function fetchPage(
|
|
22
|
+
url: string,
|
|
23
|
+
signal?: AbortSignal
|
|
24
|
+
): Promise<FetchResult> {
|
|
25
|
+
const { Defuddle } = await import("defuddle/node")
|
|
26
|
+
|
|
27
|
+
let html: string
|
|
28
|
+
try {
|
|
29
|
+
const response = await fetch(url, {
|
|
30
|
+
signal,
|
|
31
|
+
headers: { "User-Agent": USER_AGENT },
|
|
32
|
+
})
|
|
33
|
+
if (!response.ok) {
|
|
34
|
+
throw new Error(`HTTP ${response.status} ${response.statusText}`)
|
|
35
|
+
}
|
|
36
|
+
html = await response.text()
|
|
37
|
+
} catch (err: any) {
|
|
38
|
+
if (err?.name === "AbortError") throw err
|
|
39
|
+
throw new Error(`Failed to fetch ${url}: ${err?.message ?? err}`)
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
const result = await Defuddle(html, url, { markdown: true })
|
|
43
|
+
|
|
44
|
+
return {
|
|
45
|
+
url,
|
|
46
|
+
title: result.title ?? null,
|
|
47
|
+
author: result.author ?? null,
|
|
48
|
+
description: result.description ?? null,
|
|
49
|
+
domain: result.domain ?? null,
|
|
50
|
+
published: result.published ?? null,
|
|
51
|
+
wordCount: result.wordCount ?? null,
|
|
52
|
+
content: result.content ?? "",
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Format a FetchResult into a human/LLM-readable Markdown string.
|
|
58
|
+
*/
|
|
59
|
+
export function formatResult(result: FetchResult): string {
|
|
60
|
+
const lines: string[] = []
|
|
61
|
+
|
|
62
|
+
if (result.title) lines.push(`# ${result.title}`)
|
|
63
|
+
const meta: string[] = []
|
|
64
|
+
if (result.author) meta.push(`**Author:** ${result.author}`)
|
|
65
|
+
if (result.domain) meta.push(`**Domain:** ${result.domain}`)
|
|
66
|
+
if (result.published) meta.push(`**Published:** ${result.published}`)
|
|
67
|
+
if (result.wordCount) meta.push(`**Words:** ${result.wordCount}`)
|
|
68
|
+
if (meta.length > 0) {
|
|
69
|
+
lines.push("")
|
|
70
|
+
lines.push(meta.join(" · "))
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
if (result.description) {
|
|
74
|
+
lines.push("")
|
|
75
|
+
lines.push(`> ${result.description}`)
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if (lines.length > 0) {
|
|
79
|
+
lines.push("")
|
|
80
|
+
lines.push("---")
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
lines.push("")
|
|
84
|
+
lines.push(result.content || "(no content extracted)")
|
|
85
|
+
|
|
86
|
+
return lines.join("\n")
|
|
87
|
+
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest"
|
|
2
|
+
import { formatResult, type FetchResult } from "../src/fetch.js"
|
|
3
|
+
|
|
4
|
+
describe("formatResult", () => {
|
|
5
|
+
it("formats a full result with all metadata", () => {
|
|
6
|
+
const result: FetchResult = {
|
|
7
|
+
url: "https://example.com/article",
|
|
8
|
+
title: "Test Article",
|
|
9
|
+
author: "Jane Doe",
|
|
10
|
+
description: "A test article about testing",
|
|
11
|
+
domain: "example.com",
|
|
12
|
+
published: "2026-01-15",
|
|
13
|
+
wordCount: 500,
|
|
14
|
+
content: "This is the article body.",
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
const output = formatResult(result)
|
|
18
|
+
expect(output).toContain("# Test Article")
|
|
19
|
+
expect(output).toContain("**Author:** Jane Doe")
|
|
20
|
+
expect(output).toContain("**Domain:** example.com")
|
|
21
|
+
expect(output).toContain("**Published:** 2026-01-15")
|
|
22
|
+
expect(output).toContain("**Words:** 500")
|
|
23
|
+
expect(output).toContain("> A test article about testing")
|
|
24
|
+
expect(output).toContain("---")
|
|
25
|
+
expect(output).toContain("This is the article body.")
|
|
26
|
+
})
|
|
27
|
+
|
|
28
|
+
it("handles minimal result with no metadata", () => {
|
|
29
|
+
const result: FetchResult = {
|
|
30
|
+
url: "https://example.com",
|
|
31
|
+
title: null,
|
|
32
|
+
author: null,
|
|
33
|
+
description: null,
|
|
34
|
+
domain: null,
|
|
35
|
+
published: null,
|
|
36
|
+
wordCount: null,
|
|
37
|
+
content: "Just content.",
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
const output = formatResult(result)
|
|
41
|
+
expect(output).not.toContain("#")
|
|
42
|
+
expect(output).not.toContain("**Author:**")
|
|
43
|
+
expect(output).not.toContain("---")
|
|
44
|
+
expect(output).toContain("Just content.")
|
|
45
|
+
})
|
|
46
|
+
|
|
47
|
+
it("handles empty content", () => {
|
|
48
|
+
const result: FetchResult = {
|
|
49
|
+
url: "https://example.com",
|
|
50
|
+
title: "Empty Page",
|
|
51
|
+
author: null,
|
|
52
|
+
description: null,
|
|
53
|
+
domain: null,
|
|
54
|
+
published: null,
|
|
55
|
+
wordCount: null,
|
|
56
|
+
content: "",
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
const output = formatResult(result)
|
|
60
|
+
expect(output).toContain("# Empty Page")
|
|
61
|
+
expect(output).toContain("(no content extracted)")
|
|
62
|
+
})
|
|
63
|
+
|
|
64
|
+
it("includes only available metadata fields", () => {
|
|
65
|
+
const result: FetchResult = {
|
|
66
|
+
url: "https://example.com",
|
|
67
|
+
title: "Partial",
|
|
68
|
+
author: "Author Only",
|
|
69
|
+
description: null,
|
|
70
|
+
domain: null,
|
|
71
|
+
published: null,
|
|
72
|
+
wordCount: 100,
|
|
73
|
+
content: "Body text.",
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
const output = formatResult(result)
|
|
77
|
+
expect(output).toContain("**Author:** Author Only")
|
|
78
|
+
expect(output).toContain("**Words:** 100")
|
|
79
|
+
expect(output).not.toContain("**Domain:**")
|
|
80
|
+
expect(output).not.toContain("**Published:**")
|
|
81
|
+
})
|
|
82
|
+
})
|