@rjshrjndrn/pi-fetch 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -8,7 +8,7 @@ Powered by [Defuddle](https://github.com/kepano/defuddle) by [Steph Ango](https:
8
8
 
9
9
  Registers a `web_fetch` tool that the LLM can call with any URL. Under the hood:
10
10
 
11
- 1. **[JSDOM](https://github.com/jsdom/jsdom)** fetches and parses the HTML (lightweight, no browser engine)
11
+ 1. **Native `fetch`** retrieves the HTML and **[JSDOM](https://github.com/jsdom/jsdom)** parses it into a DOM (lightweight, no browser engine)
12
12
  2. **[Defuddle](https://github.com/kepano/defuddle)** extracts the main content, stripping navigation, ads, sidebars, cookie banners, and clutter
13
13
  3. Returns clean **Markdown** with metadata (title, author, description, word count)
14
14
 
@@ -32,7 +32,10 @@ Output is automatically truncated to stay within pi's context limits.
32
32
 
33
33
  ```bash
34
34
  # As a pi package
35
- pi install npm:pi-fetch
35
+ pi install npm:@rjshrjndrn/pi-fetch
36
+
37
+ # Or test without installing
38
+ pi -e npm:@rjshrjndrn/pi-fetch
36
39
 
37
40
  # Or test locally
38
41
  pi -e ./extensions/index.ts
@@ -50,7 +53,7 @@ Or the LLM will use `web_fetch` automatically when it needs to read a webpage.
50
53
 
51
54
  ## Limitations
52
55
 
53
- - **No JavaScript rendering** — JSDOM does not execute JavaScript. SPAs that require JS to render content will return empty or minimal results. For those, you'll still need a headless browser.
56
+ - **No JavaScript rendering** — uses native `fetch`, not a browser. SPAs that require JS to render content will return empty or minimal results. For those, you'll still need a headless browser.
54
57
  - **Some sites block non-browser requests** — sites with aggressive bot detection may reject the request.
55
58
  - **Output truncation** — very large pages are truncated to 50KB / 2000 lines to protect context window.
56
59
 
@@ -65,7 +68,6 @@ npm test
65
68
  ## Credits
66
69
 
67
70
  - **[Defuddle](https://github.com/kepano/defuddle)** by [Steph Ango (kepano)](https://github.com/kepano) — content extraction engine
68
- - **[JSDOM](https://github.com/jsdom/jsdom)** — HTML parsing without a browser
69
71
  - **[pi](https://github.com/badlogic/pi-mono)** by [Mario Zechner (badlogic)](https://github.com/badlogic) — agent framework
70
72
 
71
73
  ## License
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@rjshrjndrn/pi-fetch",
3
- "version": "0.1.0",
3
+ "version": "0.1.2",
4
4
  "description": "Web content extraction for pi — fetch any URL as clean Markdown using Defuddle",
5
5
  "keywords": [
6
6
  "pi-package",
@@ -18,13 +18,15 @@
18
18
  ]
19
19
  },
20
20
  "dependencies": {
21
- "defuddle": "^0.14.0"
21
+ "defuddle": "^0.14.0",
22
+ "jsdom": "^29.0.1"
22
23
  },
23
24
  "peerDependencies": {
24
25
  "@mariozechner/pi-coding-agent": "*",
25
26
  "@sinclair/typebox": "*"
26
27
  },
27
28
  "devDependencies": {
29
+ "@types/jsdom": "^28.0.1",
28
30
  "@types/node": "^20.0.0",
29
31
  "typescript": "^5.4.0",
30
32
  "vitest": "^1.6.0"
package/src/fetch.ts CHANGED
@@ -1,3 +1,5 @@
1
+ import { JSDOM } from "jsdom"
2
+
1
3
  export interface FetchResult {
2
4
  url: string
3
5
  title: string | null
@@ -10,13 +12,32 @@ export interface FetchResult {
10
12
  }
11
13
 
12
14
  const USER_AGENT =
13
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
15
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
16
+
17
+ /**
18
+ * Browser-like request headers to reduce 403 blocks from bot detection.
19
+ */
20
+ const BROWSER_HEADERS: Record<string, string> = {
21
+ "User-Agent": USER_AGENT,
22
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
23
+ "Accept-Language": "en-US,en;q=0.9",
24
+ "Accept-Encoding": "gzip, deflate, br",
25
+ "Cache-Control": "no-cache",
26
+ "Sec-Fetch-Dest": "document",
27
+ "Sec-Fetch-Mode": "navigate",
28
+ "Sec-Fetch-Site": "none",
29
+ "Sec-Fetch-User": "?1",
30
+ "Upgrade-Insecure-Requests": "1",
31
+ }
14
32
 
15
33
  /**
16
34
  * Fetch a URL and extract its main content as Markdown via Defuddle.
17
35
  *
18
- * Uses native fetch + Defuddle's Node API (no headless browser) to
36
+ * Uses native fetch + JSDOM + Defuddle (no headless browser) to
19
37
  * strip navigation, ads, sidebars, and clutter.
38
+ *
39
+ * JSDOM provides full DOM API support (including getComputedStyle)
40
+ * which Defuddle needs for accurate hidden-element detection.
20
41
  */
21
42
  export async function fetchPage(
22
43
  url: string,
@@ -26,10 +47,25 @@ export async function fetchPage(
26
47
 
27
48
  let html: string
28
49
  try {
29
- const response = await fetch(url, {
50
+ let response = await fetch(url, {
30
51
  signal,
31
- headers: { "User-Agent": USER_AGENT },
52
+ redirect: "follow",
53
+ headers: BROWSER_HEADERS,
32
54
  })
55
+
56
+ // Cloudflare blocks requests that fake a browser UA but fail the TLS
57
+ // fingerprint check. Retrying with an honest UA often passes through.
58
+ if (
59
+ response.status === 403 &&
60
+ response.headers.get("cf-mitigated") === "challenge"
61
+ ) {
62
+ response = await fetch(url, {
63
+ signal,
64
+ redirect: "follow",
65
+ headers: { ...BROWSER_HEADERS, "User-Agent": "pi-fetch" },
66
+ })
67
+ }
68
+
33
69
  if (!response.ok) {
34
70
  throw new Error(`HTTP ${response.status} ${response.statusText}`)
35
71
  }
@@ -39,7 +75,8 @@ export async function fetchPage(
39
75
  throw new Error(`Failed to fetch ${url}: ${err?.message ?? err}`)
40
76
  }
41
77
 
42
- const result = await Defuddle(html, url, { markdown: true })
78
+ const dom = new JSDOM(html, { url, pretendToBeVisual: true })
79
+ const result = await Defuddle(dom.window.document, url, { markdown: true })
43
80
 
44
81
  return {
45
82
  url,