@rjshrjndrn/pi-fetch 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/package.json +4 -2
- package/src/fetch.ts +42 -5
package/README.md
CHANGED
|
@@ -8,7 +8,7 @@ Powered by [Defuddle](https://github.com/kepano/defuddle) by [Steph Ango](https:
|
|
|
8
8
|
|
|
9
9
|
Registers a `web_fetch` tool that the LLM can call with any URL. Under the hood:
|
|
10
10
|
|
|
11
|
-
1. **
|
|
11
|
+
1. **Native `fetch`** retrieves the HTML (lightweight, no browser engine)
|
|
12
12
|
2. **[Defuddle](https://github.com/kepano/defuddle)** extracts the main content, stripping navigation, ads, sidebars, cookie banners, and clutter
|
|
13
13
|
3. Returns clean **Markdown** with metadata (title, author, description, word count)
|
|
14
14
|
|
|
@@ -32,7 +32,10 @@ Output is automatically truncated to stay within pi's context limits.
|
|
|
32
32
|
|
|
33
33
|
```bash
|
|
34
34
|
# As a pi package
|
|
35
|
-
pi install npm
|
|
35
|
+
pi install npm:@rjshrjndrn/pi-fetch
|
|
36
|
+
|
|
37
|
+
# Or test without installing
|
|
38
|
+
pi -e npm:@rjshrjndrn/pi-fetch
|
|
36
39
|
|
|
37
40
|
# Or test locally
|
|
38
41
|
pi -e ./extensions/index.ts
|
|
@@ -50,7 +53,7 @@ Or the LLM will use `web_fetch` automatically when it needs to read a webpage.
|
|
|
50
53
|
|
|
51
54
|
## Limitations
|
|
52
55
|
|
|
53
|
-
- **No JavaScript rendering** —
|
|
56
|
+
- **No JavaScript rendering** — uses native `fetch`, not a browser. SPAs that require JS to render content will return empty or minimal results. For those, you'll still need a headless browser.
|
|
54
57
|
- **Some sites block non-browser requests** — sites with aggressive bot detection may reject the request.
|
|
55
58
|
- **Output truncation** — very large pages are truncated to 50KB / 2000 lines to protect the context window.
|
|
56
59
|
|
|
@@ -65,7 +68,6 @@ npm test
|
|
|
65
68
|
## Credits
|
|
66
69
|
|
|
67
70
|
- **[Defuddle](https://github.com/kepano/defuddle)** by [Steph Ango (kepano)](https://github.com/kepano) — content extraction engine
|
|
68
|
-
- **[JSDOM](https://github.com/jsdom/jsdom)** — HTML parsing without a browser
|
|
69
71
|
- **[pi](https://github.com/badlogic/pi-mono)** by [Mario Zechner (badlogic)](https://github.com/badlogic) — agent framework
|
|
70
72
|
|
|
71
73
|
## License
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@rjshrjndrn/pi-fetch",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.2",
|
|
4
4
|
"description": "Web content extraction for pi — fetch any URL as clean Markdown using Defuddle",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pi-package",
|
|
@@ -18,13 +18,15 @@
|
|
|
18
18
|
]
|
|
19
19
|
},
|
|
20
20
|
"dependencies": {
|
|
21
|
-
"defuddle": "^0.14.0"
|
|
21
|
+
"defuddle": "^0.14.0",
|
|
22
|
+
"jsdom": "^29.0.1"
|
|
22
23
|
},
|
|
23
24
|
"peerDependencies": {
|
|
24
25
|
"@mariozechner/pi-coding-agent": "*",
|
|
25
26
|
"@sinclair/typebox": "*"
|
|
26
27
|
},
|
|
27
28
|
"devDependencies": {
|
|
29
|
+
"@types/jsdom": "^28.0.1",
|
|
28
30
|
"@types/node": "^20.0.0",
|
|
29
31
|
"typescript": "^5.4.0",
|
|
30
32
|
"vitest": "^1.6.0"
|
package/src/fetch.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { JSDOM } from "jsdom"
|
|
2
|
+
|
|
1
3
|
export interface FetchResult {
|
|
2
4
|
url: string
|
|
3
5
|
title: string | null
|
|
@@ -10,13 +12,32 @@ export interface FetchResult {
|
|
|
10
12
|
}
|
|
11
13
|
|
|
12
14
|
const USER_AGENT =
|
|
13
|
-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
|
15
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* Browser-like request headers to reduce 403 blocks from bot detection.
|
|
19
|
+
*/
|
|
20
|
+
const BROWSER_HEADERS: Record<string, string> = {
|
|
21
|
+
"User-Agent": USER_AGENT,
|
|
22
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
|
23
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
24
|
+
"Accept-Encoding": "gzip, deflate, br",
|
|
25
|
+
"Cache-Control": "no-cache",
|
|
26
|
+
"Sec-Fetch-Dest": "document",
|
|
27
|
+
"Sec-Fetch-Mode": "navigate",
|
|
28
|
+
"Sec-Fetch-Site": "none",
|
|
29
|
+
"Sec-Fetch-User": "?1",
|
|
30
|
+
"Upgrade-Insecure-Requests": "1",
|
|
31
|
+
}
|
|
14
32
|
|
|
15
33
|
/**
|
|
16
34
|
* Fetch a URL and extract its main content as Markdown via Defuddle.
|
|
17
35
|
*
|
|
18
|
-
* Uses native fetch +
|
|
36
|
+
* Uses native fetch + JSDOM + Defuddle (no headless browser) to
|
|
19
37
|
* strip navigation, ads, sidebars, and clutter.
|
|
38
|
+
*
|
|
39
|
+
* JSDOM provides full DOM API support (including getComputedStyle)
|
|
40
|
+
* which Defuddle needs for accurate hidden-element detection.
|
|
20
41
|
*/
|
|
21
42
|
export async function fetchPage(
|
|
22
43
|
url: string,
|
|
@@ -26,10 +47,25 @@ export async function fetchPage(
|
|
|
26
47
|
|
|
27
48
|
let html: string
|
|
28
49
|
try {
|
|
29
|
-
|
|
50
|
+
let response = await fetch(url, {
|
|
30
51
|
signal,
|
|
31
|
-
|
|
52
|
+
redirect: "follow",
|
|
53
|
+
headers: BROWSER_HEADERS,
|
|
32
54
|
})
|
|
55
|
+
|
|
56
|
+
// Cloudflare blocks requests that fake a browser UA but fail the TLS
|
|
57
|
+
// fingerprint check. Retrying with an honest UA often passes through.
|
|
58
|
+
if (
|
|
59
|
+
response.status === 403 &&
|
|
60
|
+
response.headers.get("cf-mitigated") === "challenge"
|
|
61
|
+
) {
|
|
62
|
+
response = await fetch(url, {
|
|
63
|
+
signal,
|
|
64
|
+
redirect: "follow",
|
|
65
|
+
headers: { ...BROWSER_HEADERS, "User-Agent": "pi-fetch" },
|
|
66
|
+
})
|
|
67
|
+
}
|
|
68
|
+
|
|
33
69
|
if (!response.ok) {
|
|
34
70
|
throw new Error(`HTTP ${response.status} ${response.statusText}`)
|
|
35
71
|
}
|
|
@@ -39,7 +75,8 @@ export async function fetchPage(
|
|
|
39
75
|
throw new Error(`Failed to fetch ${url}: ${err?.message ?? err}`)
|
|
40
76
|
}
|
|
41
77
|
|
|
42
|
-
const
|
|
78
|
+
const dom = new JSDOM(html, { url, pretendToBeVisual: true })
|
|
79
|
+
const result = await Defuddle(dom.window.document, url, { markdown: true })
|
|
43
80
|
|
|
44
81
|
return {
|
|
45
82
|
url,
|