pi-web-access 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/README.md +3 -3
- package/extract.ts +125 -5
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,21 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file.
|
|
4
4
|
|
|
5
|
+
## [0.7.3] - 2026-02-05
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
- Jina Reader fallback for JS-rendered pages. When Readability returns insufficient content (cookie notices, consent walls, SPA shells), the extraction chain now tries Jina Reader (`r.jina.ai`) before falling back to Gemini. Jina handles JavaScript rendering server-side and returns clean markdown. No API key required.
|
|
9
|
+
- JS-render detection heuristic (`isLikelyJSRendered`) produces more specific error messages when pages appear to load content dynamically.
|
|
10
|
+
- Actionable guidance when all extraction methods fail, listing steps to configure Gemini API or use `web_search` instead.
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
- HTTP fetch headers now mimic Chrome (realistic `User-Agent`, `Sec-Fetch-*`, `Accept-Language`) instead of the default Node.js user agent. Reduces blocks from bot-detection systems.
|
|
14
|
+
- Short Readability output (< 500 chars) is now treated as a content failure, triggering the fallback chain. Previously, a 266-char cookie notice was returned as "successful" content.
|
|
15
|
+
- Extraction fallback order is now: HTTP+Readability → RSC → Jina Reader → Gemini URL Context → Gemini Web → error with guidance.
|
|
16
|
+
|
|
17
|
+
### Fixed
|
|
18
|
+
- `parseTimestamp` now rejects negative values in colon-separated format (`-1:30`, `1:-30`). Previously only the numeric path (`-90`) rejected negatives, while the colon path computed and returned negative seconds.
|
|
19
|
+
|
|
5
20
|
## [0.7.2] - 2026-02-03
|
|
6
21
|
|
|
7
22
|
### Added
|
package/README.md
CHANGED
|
@@ -18,7 +18,7 @@ https://github.com/user-attachments/assets/cac6a17a-1eeb-4dde-9818-cdf85d8ea98f
|
|
|
18
18
|
|
|
19
19
|
**Video Understanding** — Point it at a YouTube video or local screen recording and ask questions about what's on screen. Full transcripts, visual descriptions, and frame extraction at exact timestamps.
|
|
20
20
|
|
|
21
|
-
**Smart Fallbacks** — Every capability has a fallback chain. Search tries Perplexity, then Gemini API, then Gemini Web. YouTube tries Gemini Web, then API, then Perplexity. Blocked pages retry through Gemini extraction. Something always works.
|
|
21
|
+
**Smart Fallbacks** — Every capability has a fallback chain. Search tries Perplexity, then Gemini API, then Gemini Web. YouTube tries Gemini Web, then API, then Perplexity. Blocked pages retry through Jina Reader and Gemini extraction. Something always works.
|
|
22
22
|
|
|
23
23
|
**GitHub Cloning** — GitHub URLs are cloned locally instead of scraped. The agent gets real file contents and a local path to explore, not rendered HTML.
|
|
24
24
|
|
|
@@ -164,7 +164,7 @@ PDF URLs are extracted as text and saved to `~/Downloads/` as markdown. The agen
|
|
|
164
164
|
|
|
165
165
|
### Blocked pages
|
|
166
166
|
|
|
167
|
-
When Readability fails or
|
|
167
|
+
When Readability fails or returns only a cookie notice, the extension retries via Jina Reader (handles JS rendering server-side, no API key needed), then Gemini URL Context API, then Gemini Web extraction. Handles SPAs, JS-heavy pages, and anti-bot protections transparently. Also parses Next.js RSC flight data when present.
|
|
168
168
|
|
|
169
169
|
## How It Works
|
|
170
170
|
|
|
@@ -174,7 +174,7 @@ fetch_content(url)
|
|
|
174
174
|
→ GitHub URL? Clone repo, return file contents + local path
|
|
175
175
|
→ YouTube URL? Gemini Web → Gemini API → Perplexity
|
|
176
176
|
→ HTTP fetch → PDF? Extract text, save to ~/Downloads/
|
|
177
|
-
→ HTML? Readability → RSC parser → Gemini fallback
|
|
177
|
+
→ HTML? Readability → RSC parser → Jina Reader → Gemini fallback
|
|
178
178
|
→ Text/JSON/Markdown? Return directly
|
|
179
179
|
```
|
|
180
180
|
|
package/extract.ts
CHANGED
|
@@ -15,6 +15,7 @@ const DEFAULT_TIMEOUT_MS = 30000;
|
|
|
15
15
|
const CONCURRENT_LIMIT = 3;
|
|
16
16
|
|
|
17
17
|
const NON_RECOVERABLE_ERRORS = ["Unsupported content type", "Response too large"];
|
|
18
|
+
const MIN_USEFUL_CONTENT = 500;
|
|
18
19
|
|
|
19
20
|
const turndown = new TurndownService({
|
|
20
21
|
headingStyle: "atx",
|
|
@@ -51,11 +52,69 @@ export interface ExtractOptions {
|
|
|
51
52
|
model?: string;
|
|
52
53
|
}
|
|
53
54
|
|
|
55
|
+
const JINA_READER_BASE = "https://r.jina.ai/";
|
|
56
|
+
const JINA_TIMEOUT_MS = 30000;
|
|
57
|
+
|
|
58
|
+
/**
 * Fetches a page through Jina Reader (`r.jina.ai`) and returns its markdown.
 *
 * Used as a fallback when direct HTTP extraction fails. The Reader response is
 * expected to contain a "Markdown Content:" marker; everything after the
 * marker is treated as the page body.
 *
 * @param url - The page to extract. It is appended verbatim to the Reader base URL.
 * @param signal - Optional caller abort signal, combined with an internal timeout.
 * @returns The extracted content, or `null` when the request fails, is aborted,
 *   the marker is missing, or the returned markdown looks empty / unrendered.
 *   This function never throws; all failures are reported as `null`.
 */
async function extractWithJinaReader(
  url: string,
  signal?: AbortSignal,
): Promise<ExtractedContent | null> {
  const CONTENT_MARKER = "Markdown Content:";
  // Anything shorter than this is assumed to be a placeholder, not real content.
  const MIN_MARKDOWN_LENGTH = 100;

  // Last path segment of the URL, or the raw URL when there is none or the
  // URL cannot be parsed. Parsing must not throw here: by the time a title is
  // needed the request has already been logged as complete.
  const fallbackTitle = (): string => {
    try {
      return new URL(url).pathname.split("/").pop() || url;
    } catch {
      return url;
    }
  };

  const jinaUrl = JINA_READER_BASE + url;
  const activityId = activityMonitor.logStart({ type: "api", query: `jina: ${url}` });

  try {
    const res = await fetch(jinaUrl, {
      headers: {
        "Accept": "text/markdown",
        "X-No-Cache": "true",
      },
      // Abort on whichever fires first: our own timeout or the caller's signal.
      signal: AbortSignal.any([
        AbortSignal.timeout(JINA_TIMEOUT_MS),
        ...(signal ? [signal] : []),
      ]),
    });

    if (!res.ok) {
      activityMonitor.logComplete(activityId, res.status);
      return null;
    }

    const content = await res.text();
    activityMonitor.logComplete(activityId, res.status);

    const markerIndex = content.indexOf(CONTENT_MARKER);
    if (markerIndex < 0) {
      return null;
    }

    const markdownPart = content.slice(markerIndex + CONTENT_MARKER.length).trim();

    // Reject failed JS rendering or minimal content so the caller can move on
    // to the next fallback.
    if (
      markdownPart.length < MIN_MARKDOWN_LENGTH ||
      markdownPart.startsWith("Loading...") ||
      markdownPart.startsWith("Please enable JavaScript")
    ) {
      return null;
    }

    const title = extractHeadingTitle(markdownPart) ?? fallbackTitle();
    return { url, title, content: markdownPart, error: null };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    const name = err instanceof Error ? err.name : "";

    // Cancellation and timeouts are expected outcomes, not errors. Check the
    // error name and the caller's signal as well as the message: a caller may
    // abort with a custom reason whose message never mentions "abort".
    const wasAborted =
      signal?.aborted === true ||
      name === "AbortError" ||
      name === "TimeoutError" ||
      message.toLowerCase().includes("abort");

    if (wasAborted) {
      activityMonitor.logComplete(activityId, 0);
    } else {
      activityMonitor.logError(activityId, message);
    }
    return null;
  }
}
|
|
112
|
+
|
|
54
113
|
function parseTimestamp(ts: string): number | null {
|
|
55
114
|
const num = Number(ts);
|
|
56
115
|
if (!isNaN(num) && num >= 0) return Math.floor(num);
|
|
57
116
|
const parts = ts.split(":").map(Number);
|
|
58
|
-
if (parts.some(isNaN)) return null;
|
|
117
|
+
if (parts.some(p => isNaN(p) || p < 0)) return null;
|
|
59
118
|
if (parts.length === 3) return Math.floor(parts[0] * 3600 + parts[1] * 60 + parts[2]);
|
|
60
119
|
if (parts.length === 2) return Math.floor(parts[0] * 60 + parts[1]);
|
|
61
120
|
return null;
|
|
@@ -286,10 +345,45 @@ export async function extractContent(
|
|
|
286
345
|
if (!httpResult.error || signal?.aborted) return httpResult;
|
|
287
346
|
if (NON_RECOVERABLE_ERRORS.some(prefix => httpResult.error!.startsWith(prefix))) return httpResult;
|
|
288
347
|
|
|
348
|
+
const jinaResult = await extractWithJinaReader(url, signal);
|
|
349
|
+
if (jinaResult) return jinaResult;
|
|
350
|
+
|
|
289
351
|
const geminiResult = await extractWithUrlContext(url, signal)
|
|
290
352
|
?? await extractWithGeminiWeb(url, signal);
|
|
291
353
|
|
|
292
|
-
|
|
354
|
+
if (geminiResult) return geminiResult;
|
|
355
|
+
|
|
356
|
+
const guidance = [
|
|
357
|
+
httpResult.error,
|
|
358
|
+
"",
|
|
359
|
+
"Fallback options:",
|
|
360
|
+
" \u2022 Set GEMINI_API_KEY in ~/.pi/web-search.json",
|
|
361
|
+
" \u2022 Sign into gemini.google.com in Chrome",
|
|
362
|
+
" \u2022 Use web_search to find content about this topic",
|
|
363
|
+
].join("\n");
|
|
364
|
+
return { ...httpResult, error: guidance };
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
/**
 * Heuristic check for pages whose content is produced client-side.
 *
 * A document is considered JavaScript-rendered when its `<body>` holds very
 * little visible text while the document as a whole references several
 * scripts (the typical shape of an SPA shell).
 *
 * @param html - Raw HTML of the fetched document.
 * @returns `true` when the body text is shorter than 500 characters and the
 *   document contains more than 3 `<script>` tags; `false` otherwise, including
 *   when no `<body>` opening tag is present.
 */
function isLikelyJSRendered(html: string): boolean {
  const MAX_SHELL_TEXT_LENGTH = 500;
  const SCRIPT_TAG_THRESHOLD = 3;

  const bodyOpen = /<body\b[^>]*>/i.exec(html);
  if (!bodyOpen) return false;

  const bodyStart = bodyOpen.index + bodyOpen[0].length;

  // The closing tag is optional in HTML and may be missing from truncated
  // responses, so default to end-of-document. Use the LAST closing tag: an
  // earlier "</body>" may just be text inside an inline script.
  let bodyEnd = html.length;
  const closeTag = /<\/body\s*>/gi;
  closeTag.lastIndex = bodyStart;
  for (let m = closeTag.exec(html); m !== null; m = closeTag.exec(html)) {
    bodyEnd = m.index;
  }

  const bodyHtml = html.slice(bodyStart, bodyEnd);

  // Reduce the body to its visible text. Comments, scripts and styles are
  // removed together with their contents (an unterminated block runs to the
  // end of the body); remaining tags are dropped and whitespace is collapsed.
  const textContent = bodyHtml
    .replace(/<!--[\s\S]*?(?:-->|$)/g, "")
    .replace(/<script\b[\s\S]*?(?:<\/script\s*>|$)/gi, "")
    .replace(/<style\b[\s\S]*?(?:<\/style\s*>|$)/gi, "")
    .replace(/<[^>]+>/g, "")
    .replace(/\s+/g, " ")
    .trim();

  // Scripts are counted across the whole document, not just the body,
  // because bundles are commonly referenced from <head>.
  const scriptCount = (html.match(/<script\b/gi) || []).length;

  // Heuristic: little text content but many scripts suggests JS rendering.
  return textContent.length < MAX_SHELL_TEXT_LENGTH && scriptCount > SCRIPT_TAG_THRESHOLD;
}
|
|
294
388
|
|
|
295
389
|
async function extractViaHttp(
|
|
@@ -310,8 +404,15 @@ async function extractViaHttp(
|
|
|
310
404
|
const response = await fetch(url, {
|
|
311
405
|
signal: controller.signal,
|
|
312
406
|
headers: {
|
|
313
|
-
"User-Agent": "Mozilla/5.0 (
|
|
314
|
-
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
407
|
+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
|
408
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
|
409
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
410
|
+
"Cache-Control": "no-cache",
|
|
411
|
+
"Sec-Fetch-Dest": "document",
|
|
412
|
+
"Sec-Fetch-Mode": "navigate",
|
|
413
|
+
"Sec-Fetch-Site": "none",
|
|
414
|
+
"Sec-Fetch-User": "?1",
|
|
415
|
+
"Upgrade-Insecure-Requests": "1",
|
|
315
416
|
},
|
|
316
417
|
});
|
|
317
418
|
|
|
@@ -395,16 +496,35 @@ async function extractViaHttp(
|
|
|
395
496
|
}
|
|
396
497
|
|
|
397
498
|
activityMonitor.logComplete(activityId, response.status);
|
|
499
|
+
|
|
500
|
+
// Provide more specific error message
|
|
501
|
+
const jsRendered = isLikelyJSRendered(text);
|
|
502
|
+
const errorMsg = jsRendered
|
|
503
|
+
? "Page appears to be JavaScript-rendered (content loads dynamically)"
|
|
504
|
+
: "Could not extract readable content from HTML structure";
|
|
505
|
+
|
|
398
506
|
return {
|
|
399
507
|
url,
|
|
400
508
|
title: "",
|
|
401
509
|
content: "",
|
|
402
|
-
error:
|
|
510
|
+
error: errorMsg,
|
|
403
511
|
};
|
|
404
512
|
}
|
|
405
513
|
|
|
406
514
|
const markdown = turndown.turndown(article.content);
|
|
407
515
|
activityMonitor.logComplete(activityId, response.status);
|
|
516
|
+
|
|
517
|
+
if (markdown.length < MIN_USEFUL_CONTENT) {
|
|
518
|
+
return {
|
|
519
|
+
url,
|
|
520
|
+
title: article.title || "",
|
|
521
|
+
content: markdown,
|
|
522
|
+
error: isLikelyJSRendered(text)
|
|
523
|
+
? "Page appears to be JavaScript-rendered (content loads dynamically)"
|
|
524
|
+
: "Extracted content appears incomplete",
|
|
525
|
+
};
|
|
526
|
+
}
|
|
527
|
+
|
|
408
528
|
return { url, title: article.title || "", content: markdown, error: null };
|
|
409
529
|
} catch (err) {
|
|
410
530
|
const message = err instanceof Error ? err.message : String(err);
|
package/package.json
CHANGED