@rjshrjndrn/pi-fetch 0.1.1 → 0.1.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects those changes as the package versions appear in their respective public registries.
package/README.md CHANGED
@@ -34,6 +34,9 @@ Output is automatically truncated to stay within pi's context limits.
34
34
  # As a pi package
35
35
  pi install npm:@rjshrjndrn/pi-fetch
36
36
 
37
+ # Or test without installing
38
+ pi -e npm:@rjshrjndrn/pi-fetch
39
+
37
40
  # Or test locally
38
41
  pi -e ./extensions/index.ts
39
42
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@rjshrjndrn/pi-fetch",
3
- "version": "0.1.1",
3
+ "version": "0.1.2",
4
4
  "description": "Web content extraction for pi — fetch any URL as clean Markdown using Defuddle",
5
5
  "keywords": [
6
6
  "pi-package",
@@ -18,13 +18,15 @@
18
18
  ]
19
19
  },
20
20
  "dependencies": {
21
- "defuddle": "^0.14.0"
21
+ "defuddle": "^0.14.0",
22
+ "jsdom": "^29.0.1"
22
23
  },
23
24
  "peerDependencies": {
24
25
  "@mariozechner/pi-coding-agent": "*",
25
26
  "@sinclair/typebox": "*"
26
27
  },
27
28
  "devDependencies": {
29
+ "@types/jsdom": "^28.0.1",
28
30
  "@types/node": "^20.0.0",
29
31
  "typescript": "^5.4.0",
30
32
  "vitest": "^1.6.0"
package/src/fetch.ts CHANGED
@@ -1,3 +1,5 @@
1
+ import { JSDOM } from "jsdom"
2
+
1
3
  export interface FetchResult {
2
4
  url: string
3
5
  title: string | null
@@ -10,13 +12,32 @@ export interface FetchResult {
10
12
  }
11
13
 
12
14
  const USER_AGENT =
13
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
15
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
16
+
17
+ /**
18
+ * Browser-like request headers to reduce 403 blocks from bot detection.
19
+ */
20
+ const BROWSER_HEADERS: Record<string, string> = {
21
+ "User-Agent": USER_AGENT,
22
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
23
+ "Accept-Language": "en-US,en;q=0.9",
24
+ "Accept-Encoding": "gzip, deflate, br",
25
+ "Cache-Control": "no-cache",
26
+ "Sec-Fetch-Dest": "document",
27
+ "Sec-Fetch-Mode": "navigate",
28
+ "Sec-Fetch-Site": "none",
29
+ "Sec-Fetch-User": "?1",
30
+ "Upgrade-Insecure-Requests": "1",
31
+ }
14
32
 
15
33
  /**
16
34
  * Fetch a URL and extract its main content as Markdown via Defuddle.
17
35
  *
18
- * Uses native fetch + Defuddle's Node API (no headless browser) to
36
+ * Uses native fetch + JSDOM + Defuddle (no headless browser) to
19
37
  * strip navigation, ads, sidebars, and clutter.
38
+ *
39
+ * JSDOM provides full DOM API support (including getComputedStyle)
40
+ * which Defuddle needs for accurate hidden-element detection.
20
41
  */
21
42
  export async function fetchPage(
22
43
  url: string,
@@ -26,10 +47,25 @@ export async function fetchPage(
26
47
 
27
48
  let html: string
28
49
  try {
29
- const response = await fetch(url, {
50
+ let response = await fetch(url, {
30
51
  signal,
31
- headers: { "User-Agent": USER_AGENT },
52
+ redirect: "follow",
53
+ headers: BROWSER_HEADERS,
32
54
  })
55
+
56
+ // Cloudflare blocks requests that fake a browser UA but fail the TLS
57
+ // fingerprint check. Retrying with an honest UA often passes through.
58
+ if (
59
+ response.status === 403 &&
60
+ response.headers.get("cf-mitigated") === "challenge"
61
+ ) {
62
+ response = await fetch(url, {
63
+ signal,
64
+ redirect: "follow",
65
+ headers: { ...BROWSER_HEADERS, "User-Agent": "pi-fetch" },
66
+ })
67
+ }
68
+
33
69
  if (!response.ok) {
34
70
  throw new Error(`HTTP ${response.status} ${response.statusText}`)
35
71
  }
@@ -39,7 +75,8 @@ export async function fetchPage(
39
75
  throw new Error(`Failed to fetch ${url}: ${err?.message ?? err}`)
40
76
  }
41
77
 
42
- const result = await Defuddle(html, url, { markdown: true })
78
+ const dom = new JSDOM(html, { url, pretendToBeVisual: true })
79
+ const result = await Defuddle(dom.window.document, url, { markdown: true })
43
80
 
44
81
  return {
45
82
  url,