pi-web-access 0.7.2 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,21 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file.
4
4
 
5
+ ## [0.7.3] - 2026-02-05
6
+
7
+ ### Added
8
+ - Jina Reader fallback for JS-rendered pages. When Readability returns insufficient content (cookie notices, consent walls, SPA shells), the extraction chain now tries Jina Reader (`r.jina.ai`) before falling back to Gemini. Jina handles JavaScript rendering server-side and returns clean markdown. No API key required.
9
+ - JS-render detection heuristic (`isLikelyJSRendered`) produces more specific error messages when pages appear to load content dynamically.
10
+ - Actionable guidance when all extraction methods fail, listing steps to configure Gemini API or use `web_search` instead.
11
+
12
+ ### Changed
13
+ - HTTP fetch headers now mimic Chrome (realistic `User-Agent`, `Sec-Fetch-*`, `Accept-Language`) instead of the previous `Mozilla/5.0 (compatible; pi-agent/1.0)` user agent. This reduces blocks from bot-detection systems.
14
+ - Short Readability output (< 500 chars) is now treated as a content failure, triggering the fallback chain. Previously, a 266-char cookie notice was returned as "successful" content.
15
+ - Extraction fallback order is now: HTTP+Readability → RSC → Jina Reader → Gemini URL Context → Gemini Web → error with guidance.
16
+
17
+ ### Fixed
18
+ - `parseTimestamp` now rejects negative values in colon-separated format (`-1:30`, `1:-30`). Previously only the numeric path (`-90`) rejected negatives, while the colon path accepted negative components and returned a negative (`-1:30` → `-30`) or silently wrong (`1:-30` → `30`) number of seconds.
19
+
5
20
  ## [0.7.2] - 2026-02-03
6
21
 
7
22
  ### Added
package/README.md CHANGED
@@ -18,7 +18,7 @@ https://github.com/user-attachments/assets/cac6a17a-1eeb-4dde-9818-cdf85d8ea98f
18
18
 
19
19
  **Video Understanding** — Point it at a YouTube video or local screen recording and ask questions about what's on screen. Full transcripts, visual descriptions, and frame extraction at exact timestamps.
20
20
 
21
- **Smart Fallbacks** — Every capability has a fallback chain. Search tries Perplexity, then Gemini API, then Gemini Web. YouTube tries Gemini Web, then API, then Perplexity. Blocked pages retry through Gemini extraction. Something always works.
21
+ **Smart Fallbacks** — Every capability has a fallback chain. Search tries Perplexity, then Gemini API, then Gemini Web. YouTube tries Gemini Web, then API, then Perplexity. Blocked pages retry through Jina Reader and Gemini extraction. Something always works.
22
22
 
23
23
  **GitHub Cloning** — GitHub URLs are cloned locally instead of scraped. The agent gets real file contents and a local path to explore, not rendered HTML.
24
24
 
@@ -164,7 +164,7 @@ PDF URLs are extracted as text and saved to `~/Downloads/` as markdown. The agen
164
164
 
165
165
  ### Blocked pages
166
166
 
167
- When Readability fails or a site blocks bot traffic, the extension retries via Gemini URL Context API or Gemini Web extraction. Handles SPAs, JS-heavy pages, and anti-bot protections transparently. Also parses Next.js RSC flight data when present.
167
+ When Readability fails or returns only a cookie notice, the extension retries via Jina Reader (handles JS rendering server-side, no API key needed), then Gemini URL Context API, then Gemini Web extraction. Handles SPAs, JS-heavy pages, and anti-bot protections transparently. Also parses Next.js RSC flight data when present.
168
168
 
169
169
  ## How It Works
170
170
 
@@ -174,7 +174,7 @@ fetch_content(url)
174
174
  → GitHub URL? Clone repo, return file contents + local path
175
175
  → YouTube URL? Gemini Web → Gemini API → Perplexity
176
176
  → HTTP fetch → PDF? Extract text, save to ~/Downloads/
177
- → HTML? Readability → RSC parser → Gemini fallback
177
+ → HTML? Readability → RSC parser → Jina Reader → Gemini fallback
178
178
  → Text/JSON/Markdown? Return directly
179
179
  ```
180
180
 
package/extract.ts CHANGED
@@ -15,6 +15,7 @@ const DEFAULT_TIMEOUT_MS = 30000;
15
15
  const CONCURRENT_LIMIT = 3;
16
16
 
17
17
  const NON_RECOVERABLE_ERRORS = ["Unsupported content type", "Response too large"];
18
+ const MIN_USEFUL_CONTENT = 500;
18
19
 
19
20
  const turndown = new TurndownService({
20
21
  headingStyle: "atx",
@@ -51,11 +52,69 @@ export interface ExtractOptions {
51
52
  model?: string;
52
53
  }
53
54
 
55
+ const JINA_READER_BASE = "https://r.jina.ai/";
56
+ const JINA_TIMEOUT_MS = 30000;
57
+
58
+ async function extractWithJinaReader(
59
+ url: string,
60
+ signal?: AbortSignal,
61
+ ): Promise<ExtractedContent | null> {
62
+ const jinaUrl = JINA_READER_BASE + url;
63
+
64
+ const activityId = activityMonitor.logStart({ type: "api", query: `jina: ${url}` });
65
+
66
+ try {
67
+ const res = await fetch(jinaUrl, {
68
+ headers: {
69
+ "Accept": "text/markdown",
70
+ "X-No-Cache": "true",
71
+ },
72
+ signal: AbortSignal.any([
73
+ AbortSignal.timeout(JINA_TIMEOUT_MS),
74
+ ...(signal ? [signal] : []),
75
+ ]),
76
+ });
77
+
78
+ if (!res.ok) {
79
+ activityMonitor.logComplete(activityId, res.status);
80
+ return null;
81
+ }
82
+
83
+ const content = await res.text();
84
+ activityMonitor.logComplete(activityId, res.status);
85
+
86
+ const contentStart = content.indexOf("Markdown Content:");
87
+ if (contentStart < 0) {
88
+ return null;
89
+ }
90
+
91
+ const markdownPart = content.slice(contentStart + 17).trim(); // 17 = "Markdown Content:".length
92
+
93
+ // Check for failed JS rendering or minimal content
94
+ if (markdownPart.length < 100 ||
95
+ markdownPart.startsWith("Loading...") ||
96
+ markdownPart.startsWith("Please enable JavaScript")) {
97
+ return null;
98
+ }
99
+
100
+ const title = extractHeadingTitle(markdownPart) ?? (new URL(url).pathname.split("/").pop() || url);
101
+ return { url, title, content: markdownPart, error: null };
102
+ } catch (err) {
103
+ const message = err instanceof Error ? err.message : String(err);
104
+ if (message.toLowerCase().includes("abort")) {
105
+ activityMonitor.logComplete(activityId, 0);
106
+ } else {
107
+ activityMonitor.logError(activityId, message);
108
+ }
109
+ return null;
110
+ }
111
+ }
112
+
54
113
  function parseTimestamp(ts: string): number | null {
55
114
  const num = Number(ts);
56
115
  if (!isNaN(num) && num >= 0) return Math.floor(num);
57
116
  const parts = ts.split(":").map(Number);
58
- if (parts.some(isNaN)) return null;
117
+ if (parts.some(p => isNaN(p) || p < 0)) return null;
59
118
  if (parts.length === 3) return Math.floor(parts[0] * 3600 + parts[1] * 60 + parts[2]);
60
119
  if (parts.length === 2) return Math.floor(parts[0] * 60 + parts[1]);
61
120
  return null;
@@ -286,10 +345,45 @@ export async function extractContent(
286
345
  if (!httpResult.error || signal?.aborted) return httpResult;
287
346
  if (NON_RECOVERABLE_ERRORS.some(prefix => httpResult.error!.startsWith(prefix))) return httpResult;
288
347
 
348
+ const jinaResult = await extractWithJinaReader(url, signal);
349
+ if (jinaResult) return jinaResult;
350
+
289
351
  const geminiResult = await extractWithUrlContext(url, signal)
290
352
  ?? await extractWithGeminiWeb(url, signal);
291
353
 
292
- return geminiResult ?? httpResult;
354
+ if (geminiResult) return geminiResult;
355
+
356
+ const guidance = [
357
+ httpResult.error,
358
+ "",
359
+ "Fallback options:",
360
+ " \u2022 Set GEMINI_API_KEY in ~/.pi/web-search.json",
361
+ " \u2022 Sign into gemini.google.com in Chrome",
362
+ " \u2022 Use web_search to find content about this topic",
363
+ ].join("\n");
364
+ return { ...httpResult, error: guidance };
365
+ }
366
+
367
+ function isLikelyJSRendered(html: string): boolean {
368
+ // Extract body content
369
+ const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
370
+ if (!bodyMatch) return false;
371
+
372
+ const bodyHtml = bodyMatch[1];
373
+
374
+ // Strip tags to get text content
375
+ const textContent = bodyHtml
376
+ .replace(/<script[\s\S]*?<\/script>/gi, "")
377
+ .replace(/<style[\s\S]*?<\/style>/gi, "")
378
+ .replace(/<[^>]+>/g, "")
379
+ .replace(/\s+/g, " ")
380
+ .trim();
381
+
382
+ // Count scripts
383
+ const scriptCount = (html.match(/<script/gi) || []).length;
384
+
385
+ // Heuristic: little text content but many scripts suggests JS rendering
386
+ return textContent.length < 500 && scriptCount > 3;
293
387
  }
294
388
 
295
389
  async function extractViaHttp(
@@ -310,8 +404,15 @@ async function extractViaHttp(
310
404
  const response = await fetch(url, {
311
405
  signal: controller.signal,
312
406
  headers: {
313
- "User-Agent": "Mozilla/5.0 (compatible; pi-agent/1.0)",
314
- Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
407
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
408
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
409
+ "Accept-Language": "en-US,en;q=0.9",
410
+ "Cache-Control": "no-cache",
411
+ "Sec-Fetch-Dest": "document",
412
+ "Sec-Fetch-Mode": "navigate",
413
+ "Sec-Fetch-Site": "none",
414
+ "Sec-Fetch-User": "?1",
415
+ "Upgrade-Insecure-Requests": "1",
315
416
  },
316
417
  });
317
418
 
@@ -395,16 +496,35 @@ async function extractViaHttp(
395
496
  }
396
497
 
397
498
  activityMonitor.logComplete(activityId, response.status);
499
+
500
+ // Provide more specific error message
501
+ const jsRendered = isLikelyJSRendered(text);
502
+ const errorMsg = jsRendered
503
+ ? "Page appears to be JavaScript-rendered (content loads dynamically)"
504
+ : "Could not extract readable content from HTML structure";
505
+
398
506
  return {
399
507
  url,
400
508
  title: "",
401
509
  content: "",
402
- error: "Could not extract readable content",
510
+ error: errorMsg,
403
511
  };
404
512
  }
405
513
 
406
514
  const markdown = turndown.turndown(article.content);
407
515
  activityMonitor.logComplete(activityId, response.status);
516
+
517
+ if (markdown.length < MIN_USEFUL_CONTENT) {
518
+ return {
519
+ url,
520
+ title: article.title || "",
521
+ content: markdown,
522
+ error: isLikelyJSRendered(text)
523
+ ? "Page appears to be JavaScript-rendered (content loads dynamically)"
524
+ : "Extracted content appears incomplete",
525
+ };
526
+ }
527
+
408
528
  return { url, title: article.title || "", content: markdown, error: null };
409
529
  } catch (err) {
410
530
  const message = err instanceof Error ? err.message : String(err);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-web-access",
3
- "version": "0.7.2",
3
+ "version": "0.7.3",
4
4
  "description": "Web search, URL fetching, GitHub repo cloning, PDF extraction, YouTube video understanding, and local video analysis for Pi coding agent",
5
5
  "type": "module",
6
6
  "keywords": [