@apmantza/greedysearch-pi 1.9.0 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/fetcher.mjs CHANGED
@@ -1,652 +1,666 @@
1
- // src/fetcher.mjs — HTTP source fetching with Readability extraction
2
-
3
- import { Readability } from "@mozilla/readability";
4
- import { JSDOM } from "jsdom";
5
- import TurndownService from "turndown";
6
-
7
- const turndown = new TurndownService({
8
- headingStyle: "atx",
9
- bulletListMarker: "-",
10
- codeBlockStyle: "fenced",
11
- });
12
-
13
- // Strip data URLs from markdown
14
- turndown.addRule("removeDataUrls", {
15
- filter: (node) =>
16
- node.tagName === "IMG" && node.getAttribute("src")?.startsWith("data:"),
17
- replacement: () => "",
18
- });
19
-
20
- const DEFAULT_USER_AGENT =
21
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";
22
-
23
- const DEFAULT_HEADERS = {
24
- "user-agent": DEFAULT_USER_AGENT,
25
- accept:
26
- "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
27
- "accept-language": "en-US,en;q=0.9",
28
- "accept-encoding": "gzip, deflate, br",
29
- "cache-control": "no-cache",
30
- pragma: "no-cache",
31
- // Sec-CH-UA client hints must match the User-Agent (Chrome 122 on Windows).
32
- // Inconsistency between UA and Client Hints is a strong bot signal.
33
- "sec-ch-ua":
34
- '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
35
- "sec-ch-ua-mobile": "?0",
36
- "sec-ch-ua-platform": '"Windows"',
37
- "sec-fetch-dest": "document",
38
- "sec-fetch-mode": "navigate",
39
- "sec-fetch-site": "none",
40
- "sec-fetch-user": "?1",
41
- "upgrade-insecure-requests": "1",
42
- };
43
-
44
- /** Blocked private/internal URL patterns */
45
- const PRIVATE_URL_PATTERNS = [
46
- /^localhost$/i,
47
- /^127\.\d+\.\d+\.\d+$/,
48
- /^0\.0\.0\.0$/,
49
- /^\[::1\]$/,
50
- /^10\./, // RFC1918 - Class A
51
- /^172\.(1[6-9]|2\d|3[01])\./, // RFC1918 - Class B
52
- /^192\.168\./, // RFC1918 - Class C
53
- /^169\.254\./, // Link-local
54
- /^fc00:/i, // IPv6 unique local
55
- /^fe80:/i, // IPv6 link-local
56
- /\.local$/i,
57
- /\.internal$/i,
58
- /\.localhost$/i,
59
- ];
60
-
61
- /**
62
- * Check if URL is a private/internal address that should not be fetched
63
- * @param {string} url - URL to check
64
- * @returns {{blocked: boolean, reason?: string}}
65
- */
66
- export function isPrivateUrl(url) {
67
- try {
68
- const parsed = new URL(url);
69
- const hostname = parsed.hostname.toLowerCase();
70
-
71
- for (const pattern of PRIVATE_URL_PATTERNS) {
72
- if (pattern.test(hostname)) {
73
- return {
74
- blocked: true,
75
- reason: `Private/internal address: ${hostname}`,
76
- };
77
- }
78
- }
79
-
80
- // Block file:// protocol
81
- if (parsed.protocol === "file:") {
82
- return { blocked: true, reason: "File protocol not allowed" };
83
- }
84
-
85
- return { blocked: false };
86
- } catch (error) {
87
- return { blocked: true, reason: `Invalid URL: ${error.message}` };
88
- }
89
- }
90
-
91
- /**
92
- * Rewrite GitHub blob URLs to raw.githubusercontent.com
93
- * github.com/owner/repo/blob/ref/path → raw.githubusercontent.com/owner/repo/ref/path
94
- * @param {string} url - URL to rewrite
95
- * @returns {string} - Rewritten URL or original if not applicable
96
- */
97
- export function rewriteGitHubUrl(url) {
98
- try {
99
- const parsed = new URL(url);
100
-
101
- // Only process github.com
102
- if (
103
- !(
104
- parsed.hostname === "github.com" ||
105
- parsed.hostname.endsWith(".github.com")
106
- )
107
- ) {
108
- return url;
109
- }
110
-
111
- // Parse path: /owner/repo/blob/ref/path/to/file
112
- const parts = parsed.pathname.split("/").filter(Boolean);
113
- if (parts.length < 5) {
114
- return url; // Not a blob URL (need owner, repo, 'blob', ref, path...)
115
- }
116
-
117
- const [owner, repo, type, ref, ...fileParts] = parts;
118
-
119
- // Must be /blob/ path
120
- if (type !== "blob") {
121
- return url;
122
- }
123
-
124
- // Build raw URL
125
- const rawPath = fileParts.join("/");
126
- const rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${ref}/${rawPath}`;
127
-
128
- return rawUrl;
129
- } catch {
130
- // If parsing fails, return original
131
- return url;
132
- }
133
- }
134
-
135
- /**
136
- * Fetch a URL via HTTP and extract readable content
137
- * @param {string} url - URL to fetch
138
- * @param {object} options - Options
139
- * @param {number} [options.timeoutMs=15000] - Request timeout
140
- * @param {string} [options.userAgent] - Custom user agent
141
- * @param {AbortSignal} [options.signal] - Abort signal
142
- * @returns {Promise<FetchResult>}
143
- */
144
- export async function fetchSourceHttp(url, options = {}) {
145
- // Security: Block private/internal URLs
146
- const privateCheck = isPrivateUrl(url);
147
- if (privateCheck.blocked) {
148
- return {
149
- ok: false,
150
- url,
151
- finalUrl: url,
152
- status: 403,
153
- error: `Blocked: ${privateCheck.reason}`,
154
- needsBrowser: false,
155
- };
156
- }
157
-
158
- // Rewrite GitHub blob URLs to raw.githubusercontent.com
159
- const originalUrl = url;
160
- url = rewriteGitHubUrl(url);
161
- if (url !== originalUrl) {
162
- console.error(
163
- `[fetcher] Rewrote GitHub URL: ${originalUrl.slice(0, 60)}... → raw.githubusercontent.com`,
164
- );
165
- }
166
-
167
- const { timeoutMs = 15000, userAgent, signal } = options;
168
-
169
- const controller = new AbortController();
170
- const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
171
-
172
- // Link external signal if provided
173
- if (signal) {
174
- signal.addEventListener("abort", () => controller.abort(), { once: true });
175
- }
176
-
177
- try {
178
- const response = await fetch(url, {
179
- method: "GET",
180
- headers: {
181
- ...DEFAULT_HEADERS,
182
- "user-agent": userAgent || DEFAULT_USER_AGENT,
183
- },
184
- redirect: "follow",
185
- signal: controller.signal,
186
- });
187
-
188
- clearTimeout(timeoutId);
189
-
190
- const contentType = response.headers.get("content-type") || "";
191
- const finalUrl = response.url;
192
- const lastModified = response.headers.get("last-modified") || "";
193
-
194
- // Handle raw text/plain from GitHub (raw file content)
195
- let isRawGitHub = false;
196
- try {
197
- const finalHost = new URL(finalUrl).hostname.toLowerCase();
198
- isRawGitHub = finalHost === "raw.githubusercontent.com";
199
- } catch {}
200
- if (contentType.includes("text/plain") && isRawGitHub) {
201
- const text = await response.text();
202
- return {
203
- ok: true,
204
- url: originalUrl,
205
- finalUrl,
206
- status: response.status,
207
- title: finalUrl.split("/").pop() || "GitHub File",
208
- byline: "",
209
- siteName: "GitHub",
210
- lang: "",
211
- publishedTime: lastModified,
212
- lastModified,
213
- markdown: text,
214
- contentLength: text.length,
215
- excerpt: text.slice(0, 300).replaceAll(/\n/g, " "),
216
- needsBrowser: false,
217
- };
218
- }
219
-
220
- // Check for non-HTML content
221
- if (
222
- !contentType.includes("text/html") &&
223
- !contentType.includes("application/xhtml")
224
- ) {
225
- return {
226
- ok: false,
227
- url,
228
- finalUrl,
229
- status: response.status,
230
- error: `Unsupported content type: ${contentType}`,
231
- needsBrowser: false,
232
- };
233
- }
234
-
235
- const html = await response.text();
236
-
237
- // Quick bot detection check (pass both original and final URL for redirect detection)
238
- const quickCheck = detectBotBlock(response.status, html, finalUrl, url);
239
- if (quickCheck.blocked) {
240
- return {
241
- ok: false,
242
- url,
243
- finalUrl,
244
- status: response.status,
245
- error: `Blocked: ${quickCheck.reason}`,
246
- needsBrowser: true,
247
- };
248
- }
249
-
250
- // Extract content with Readability
251
- const extracted = extractContent(html, finalUrl);
252
-
253
- // Quality check: if content looks suspicious or too short, recommend browser
254
- const quality = checkContentQuality(extracted);
255
- if (!quality.ok) {
256
- return {
257
- ok: false,
258
- url,
259
- finalUrl,
260
- status: response.status,
261
- error: `Low quality content: ${quality.reason}`,
262
- needsBrowser: true,
263
- };
264
- }
265
-
266
- return {
267
- ok: true,
268
- url,
269
- finalUrl,
270
- status: response.status,
271
- title: extracted.title,
272
- byline: extracted.byline,
273
- siteName: extracted.siteName,
274
- lang: extracted.lang,
275
- publishedTime: extracted.publishedTime || lastModified,
276
- lastModified,
277
- markdown: extracted.markdown,
278
- excerpt: extracted.excerpt,
279
- contentLength: extracted.markdown.length,
280
- needsBrowser: false,
281
- };
282
- } catch (error) {
283
- clearTimeout(timeoutId);
284
-
285
- // Check for network errors that might work with browser
286
- const needsBrowser = isNetworkErrorRetryableWithBrowser(error);
287
-
288
- return {
289
- ok: false,
290
- url,
291
- finalUrl: url,
292
- status: 0,
293
- error: error.message,
294
- needsBrowser,
295
- };
296
- }
297
- }
298
-
299
- /**
300
- * Detect if HTTP response indicates bot blocking
301
- * Checks first 30KB of HTML for performance
302
- */
303
- export function detectBotBlock(status, html, finalUrl, originalUrl) {
304
- const title =
305
- html.match(/<title[^>]*>([^<]*)<\/title>/i)?.[1]?.toLowerCase() || "";
306
- const sample = html.slice(0, 30000).toLowerCase();
307
- const combined = `${title} ${sample}`;
308
-
309
- // Status-based blocks
310
- if (status === 403 || status === 429 || status === 503) {
311
- return { blocked: true, reason: `HTTP ${status}` };
312
- }
313
-
314
- // Content-based blocks - more specific patterns to avoid false positives
315
- const blockSignals = [
316
- // Captcha: must be in context of challenge (not just mentioned on page)
317
- {
318
- pattern: /class=["'][^"']*captcha["']|<div[^>]*id=["']captcha/i,
319
- reason: "captcha",
320
- },
321
- {
322
- pattern: /g-recaptcha|data-sitekey|i['"]m not a robot/i,
323
- reason: "captcha",
324
- },
325
-
326
- // Cloudflare challenge pages
327
- {
328
- pattern:
329
- /checking your browser.{0,100}please wait|cf-browser-verification/i,
330
- reason: "cloudflare challenge",
331
- },
332
- {
333
- pattern:
334
- /just a moment.{0,50}security check|ddos protection by cloudflare/i,
335
- reason: "cloudflare challenge",
336
- },
337
-
338
- // Bot detection
339
- {
340
- pattern: /unusual traffic.{0,50}from your computer network/i,
341
- reason: "unusual traffic",
342
- },
343
- {
344
- pattern: /bot detected|automated.{0,20}request/i,
345
- reason: "bot detection",
346
- },
347
-
348
- // JavaScript requirements (specific patterns)
349
- {
350
- pattern:
351
- /enable\s+javascript\s+to\s+view|javascript\s+is\s+required.{0,50}enabled/i,
352
- reason: "requires javascript",
353
- },
354
-
355
- // Access denied
356
- { pattern: /access denied|accessdenied/i, reason: "access denied" },
357
-
358
- // Anubis (new proof-of-work anti-bot system)
359
- {
360
- pattern: /protected by anubis|anubis uses a proof-of-work/i,
361
- reason: "anubis challenge",
362
- },
363
- ];
364
-
365
- for (const signal of blockSignals) {
366
- if (signal.pattern.test(combined)) {
367
- return { blocked: true, reason: signal.reason };
368
- }
369
- }
370
-
371
- // Check for login redirect (different hostname, auth patterns)
372
- const loginRedirect = detectLoginRedirect(originalUrl, finalUrl, html);
373
- if (loginRedirect) {
374
- return { blocked: true, reason: loginRedirect };
375
- }
376
-
377
- return { blocked: false };
378
- }
379
-
380
- /** Known authentication/login domains. */
381
- const AUTH_DOMAINS = [
382
- "accounts.google.com",
383
- "login.microsoftonline.com",
384
- "login.live.com",
385
- "auth0.com",
386
- "okta.com",
387
- "auth.mozilla.auth0.com",
388
- "id.atlassian.com",
389
- ];
390
-
391
- /** Hostname prefixes that indicate an auth/login service. */
392
- const AUTH_HOSTNAME_PREFIXES = [
393
- "login.",
394
- "signin.",
395
- "auth.",
396
- "sso.",
397
- "accounts.",
398
- "idp.",
399
- ];
400
-
401
- /** Content patterns that indicate a login wall when combined with a hostname redirect. */
402
- const LOGIN_CONTENT_PATTERNS = [
403
- "sign in to continue",
404
- "log in to continue",
405
- "authentication required",
406
- "create an account to continue",
407
- "subscribe to continue reading",
408
- "members only",
409
- ];
410
-
411
- /**
412
- * Detects redirect-to-login pages: sites that return 200 but redirect to an
413
- * auth domain or serve a login form instead of the requested content.
414
- */
415
- function detectLoginRedirect(requestedUrl, finalUrl, html) {
416
- try {
417
- const requested = new URL(requestedUrl);
418
- const final = new URL(finalUrl);
419
-
420
- // Same hostname = not a redirect to login
421
- if (requested.hostname.toLowerCase() === final.hostname.toLowerCase()) {
422
- return undefined;
423
- }
424
-
425
- const finalHost = final.hostname.toLowerCase();
426
-
427
- // Check for known auth domains
428
- if (
429
- AUTH_DOMAINS.some((d) => finalHost === d || finalHost.endsWith(`.${d}`))
430
- ) {
431
- return `redirected to login (${final.hostname})`;
432
- }
433
-
434
- // Check for auth-related hostname prefixes
435
- if (AUTH_HOSTNAME_PREFIXES.some((p) => finalHost.startsWith(p))) {
436
- return `redirected to login (${final.hostname})`;
437
- }
438
-
439
- // Check for login content patterns (only when redirected)
440
- const sample = html.slice(0, 20000).toLowerCase();
441
- if (LOGIN_CONTENT_PATTERNS.some((p) => sample.includes(p))) {
442
- return `redirected to login page (${final.hostname})`;
443
- }
444
- } catch {
445
- // URL parsing failures are not login redirects
446
- }
447
-
448
- return undefined;
449
- }
450
-
451
- /**
452
- * Check if a network error might succeed with browser fallback
453
- */
454
- function isNetworkErrorRetryableWithBrowser(error) {
455
- const message = error.message.toLowerCase();
456
- return (
457
- message.includes("fetch failed") ||
458
- message.includes("unable to verify") || // TLS issues
459
- message.includes("certificate") ||
460
- message.includes("timeout")
461
- );
462
- }
463
-
464
/**
 * Extract a date string from <meta> tags (Open Graph, schema.org, standard)
 *
 * Selectors are tried in priority order; for the first element that carries a
 * non-blank `content` (or, failing that, `datetime`) attribute, that value is
 * returned trimmed. The value is returned as written by the page - it is NOT
 * validated or normalised to ISO 8601.
 *
 * @param {Document} document - Parsed document to inspect
 * @returns {string} Publisher-supplied date string, or "" when none is found
 */
function extractMetaDate(document) {
  if (typeof document?.querySelector !== "function") {
    return "";
  }

  const selectors = [
    'meta[property="article:published_time"]',
    'meta[name="article:published_time"]',
    'meta[property="og:published_time"]',
    'meta[name="publication_date"]',
    'meta[name="date"]',
    'meta[itemprop="datePublished"]',
    'time[itemprop="datePublished"]',
    'meta[name="DC.date"]',
  ];

  for (const selector of selectors) {
    const element = document.querySelector(selector);
    if (!element) {
      continue;
    }
    // Trim before testing so a whitespace-only attribute does not win over a
    // real date further down the list.
    const value =
      element.getAttribute("content")?.trim() ||
      element.getAttribute("datetime")?.trim() ||
      "";
    if (value) {
      return value;
    }
  }

  return "";
}
487
-
488
/**
 * Extract readable content using Mozilla Readability + Turndown
 *
 * Strategy, in order:
 *  1. Readability article → Markdown via Turndown.
 *  2. Fallback: plain text of <body> with script/style/nav/footer/header/aside
 *     removed.
 *  3. Last resort: an empty result titled with the URL.
 *
 * @param {string} html - Raw HTML of the page
 * @param {string} url - Page URL, handed to JSDOM as the document URL
 * @returns {{title: string, byline: string, siteName: string, lang: string,
 *   publishedTime: string, markdown: string, excerpt: string}}
 */
export function extractContent(html, url) {
  const dom = new JSDOM(html, { url });

  try {
    const document = dom.window.document;

    // Try Readability first. parse() modifies the DOM it is given, so run it
    // on a clone: the metadata lookup and the body-text fallback below must
    // see the page as it was served.
    const article = new Readability(document.cloneNode(true)).parse();

    if (article?.content) {
      const cleanMarkdown = turndown
        .turndown(article.content)
        .replaceAll(/\n{3,}/g, "\n\n")
        .trim();

      return {
        title: article.title || document.title || url,
        byline: article.byline || "",
        siteName: article.siteName || "",
        lang: article.lang || "",
        publishedTime: article.publishedTime || extractMetaDate(document) || "",
        markdown: cleanMarkdown,
        excerpt: cleanMarkdown.slice(0, 300).replaceAll(/\n/g, " "),
      };
    }

    // Fallback: extract body text
    const body = document.body;
    if (body) {
      // Remove script/style/nav/footer
      const clone = body.cloneNode(true);
      for (const el of clone.querySelectorAll(
        "script, style, nav, footer, header, aside",
      )) {
        el.remove();
      }
      const cleanText = (clone.textContent || "").replaceAll(/\s+/g, " ").trim();

      return {
        title: document.title || url,
        byline: "",
        siteName: "",
        lang: "",
        publishedTime: extractMetaDate(document),
        markdown: cleanText,
        excerpt: cleanText.slice(0, 300),
      };
    }

    // Last resort
    return {
      title: url,
      byline: "",
      siteName: "",
      lang: "",
      publishedTime: "",
      markdown: "",
      excerpt: "",
    };
  } finally {
    // Shut the jsdom window down so each parsed page releases its resources;
    // everything returned above is plain strings.
    dom.window.close();
  }
}
550
-
551
/**
 * Check if extracted content quality is sufficient
 * Returns { ok: true } or { ok: false, reason: string }
 *
 * Content fails when it is shorter than 100 characters after trimming, when
 * it contains a phrase typical of challenge / loading / login pages, or when
 * the title looks like a Cloudflare challenge. A missing or non-string
 * `markdown` is treated as empty content rather than throwing.
 *
 * @param {{markdown?: string, title?: string}} extracted - Extraction result
 * @returns {{ok: true} | {ok: false, reason: string}}
 */
export function checkContentQuality(extracted) {
  const rawMarkdown =
    typeof extracted?.markdown === "string" ? extracted.markdown : "";
  const trimmed = rawMarkdown.trim();

  // Minimum content length check
  if (trimmed.length < 100) {
    return { ok: false, reason: "content too short (< 100 chars)" };
  }

  const markdown = trimmed.toLowerCase();
  const title =
    typeof extracted.title === "string" ? extracted.title.toLowerCase() : "";

  // Suspicious content patterns that indicate bot block or incomplete extraction.
  // NOTE(review): these match anywhere in the text, so a long genuine article
  // containing e.g. "just a moment" is rejected too - confirm that is intended.
  const suspiciousPatterns = [
    { pattern: /\bloading\b.{0,50}?\bplease wait\b/is, desc: "loading page" },
    {
      pattern: /please\s+ensure\s+javascript\s+is\s+enabled/i,
      desc: "requires javascript",
    },
    {
      pattern: /enable\s+javascript\s+to\s+view/i,
      desc: "requires javascript",
    },
    {
      pattern: /just\s+a\s+moment\b/i,
      desc: "cloudflare challenge detected in content",
    },
    { pattern: /verify\s+you\s+are\s+human/i, desc: "human verification" },
    { pattern: /captcha\s+required/i, desc: "captcha in extracted content" },
    { pattern: /access\s+denied/i, desc: "access denied in content" },
    {
      pattern: /^\s*sign\s+in\s*$|^\s*log\s+in\s*$/im,
      desc: "login form only",
    },
  ];

  const suspicious = suspiciousPatterns.find(({ pattern }) =>
    pattern.test(markdown),
  );
  if (suspicious) {
    return { ok: false, reason: suspicious.desc };
  }

  // Title-based checks
  if (
    title.includes("just a moment") ||
    title.includes("checking your browser")
  ) {
    return { ok: false, reason: "cloudflare challenge page detected in title" };
  }

  return { ok: true };
}
604
-
605
/**
 * Predict if a URL will likely need browser fallback (before attempting HTTP)
 *
 * Returns true when the host is a known JS-heavy site (or a subdomain of
 * one), when a path segment is exactly "playground", "demo" or "app", or when
 * the fragment looks like client-side hash routing ("#/..." or "#!/...").
 * Whole segments are compared so "/apple" or "/democracy" do not count, and
 * a plain in-page anchor such as "#install" is not treated as routing.
 *
 * @param {string} url - URL to check
 * @returns {boolean} false for unparseable URLs
 */
export function shouldUseBrowser(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }

  // Known JS-heavy sites
  const jsHeavyDomains = [
    "react.dev",
    "nextjs.org",
    "vuejs.org",
    "angular.io",
    "svelte.dev",
    "docs.expo.dev",
    "tailwindcss.com",
    "storybook.js.org",
  ];
  const hostname = parsed.hostname.toLowerCase();
  if (
    jsHeavyDomains.some(
      (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
    )
  ) {
    return true;
  }

  // Single-page app indicators in URL
  const spaSegments = new Set(["playground", "demo", "app"]);
  const segments = parsed.pathname.toLowerCase().split("/").filter(Boolean);
  if (segments.some((segment) => spaSegments.has(segment))) {
    return true;
  }

  // Hash-based routing often indicates SPA
  return /^#!?\//.test(parsed.hash);
}
1
+ // src/fetcher.mjs — HTTP source fetching with Readability extraction
2
+
3
+ import { Readability } from "@mozilla/readability";
4
+ import { JSDOM } from "jsdom";
5
+ import TurndownService from "turndown";
6
+
7
// Shared HTML → Markdown converter: ATX ("#") headings, "-" bullets and
// fenced code blocks.
const turndown = new TurndownService({
  headingStyle: "atx",
  bulletListMarker: "-",
  codeBlockStyle: "fenced",
});

// Strip <img> elements whose source is an inline data: URI from the markdown.
turndown.addRule("removeDataUrls", {
  filter: (node) => {
    if (node.tagName !== "IMG") {
      return false;
    }
    // URI schemes are case-insensitive and attribute values may carry leading
    // whitespace, so normalise before comparing.
    const src = node.getAttribute("src") ?? "";
    return src.trimStart().toLowerCase().startsWith("data:");
  },
  replacement: () => "",
});
19
+
20
/** User-Agent string sent when the caller does not supply one. */
const DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

/**
 * Browser-like request headers sent with every fetch.
 * Frozen so that no code path can accidentally mutate the shared defaults;
 * build per-request variants with object spread instead.
 */
const DEFAULT_HEADERS = Object.freeze({
  "user-agent": DEFAULT_USER_AGENT,
  accept:
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "accept-language": "en-US,en;q=0.9",
  "accept-encoding": "gzip, deflate, br",
  "cache-control": "no-cache",
  pragma: "no-cache",
  // Sec-CH-UA client hints must match the User-Agent (Chrome 122 on Windows).
  // Inconsistency between UA and Client Hints is a strong bot signal.
  "sec-ch-ua":
    '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
  "sec-ch-ua-mobile": "?0",
  "sec-ch-ua-platform": '"Windows"',
  "sec-fetch-dest": "document",
  "sec-fetch-mode": "navigate",
  "sec-fetch-site": "none",
  "sec-fetch-user": "?1",
  "upgrade-insecure-requests": "1",
});
43
+
44
/**
 * Blocked private/internal host patterns.
 *
 * Patterns are tested against the canonical host produced by
 * canonicalizeHostForPrivateCheck(): lower-case, without IPv6 brackets and
 * without a trailing root dot. IPv4 patterns are anchored to a full dotted
 * quad so that DNS names such as "10.example.com" are not mistaken for
 * private addresses.
 */
const PRIVATE_URL_PATTERNS = [
  /^localhost$/i,
  /^127\.\d+\.\d+\.\d+$/, // IPv4 loopback
  /^0\.0\.0\.0$/,
  /^::1?$/, // IPv6 loopback (::1) and unspecified address (::)
  /^10\.\d+\.\d+\.\d+$/, // RFC1918 - Class A
  /^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$/, // RFC1918 - Class B
  /^192\.168\.\d+\.\d+$/, // RFC1918 - Class C
  /^169\.254\.\d+\.\d+$/, // Link-local
  /^f[cd][0-9a-f]{2}:/i, // IPv6 unique local (fc00::/7)
  /^fe[89ab][0-9a-f]:/i, // IPv6 link-local (fe80::/10)
  /\.local$/i,
  /\.internal$/i,
  /\.localhost$/i,
];

/**
 * Reduce a URL hostname to the form the private-address patterns expect.
 * @param {string} hostname - `URL.hostname` value
 * @returns {string} Lower-cased host without brackets or trailing dot; an
 *   IPv4-mapped IPv6 address is returned as its dotted-quad IPv4 form
 */
function canonicalizeHostForPrivateCheck(hostname) {
  let host = hostname.toLowerCase();

  // URL.hostname keeps the square brackets around IPv6 literals.
  if (host.startsWith("[") && host.endsWith("]")) {
    host = host.slice(1, -1);
  }

  // "localhost." (fully-qualified form) names the same host as "localhost".
  if (host.endsWith(".")) {
    host = host.slice(0, -1);
  }

  // The URL parser serialises [::ffff:127.0.0.1] as ::ffff:7f00:1; unpack the
  // two trailing hextets back into an IPv4 address so the IPv4 rules apply.
  const mapped = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(host);
  if (mapped) {
    const high = Number.parseInt(mapped[1], 16);
    const low = Number.parseInt(mapped[2], 16);
    host = [high >> 8, high & 0xff, low >> 8, low & 0xff].join(".");
  }

  return host;
}

/**
 * Check if URL is a private/internal address that should not be fetched
 * @param {string} url - URL to check
 * @returns {{blocked: boolean, reason?: string}} `blocked` is true for
 *   private/internal hosts, `file:` URLs and unparseable input
 */
export function isPrivateUrl(url) {
  try {
    const parsed = new URL(url);
    const hostname = parsed.hostname.toLowerCase();
    const canonicalHost = canonicalizeHostForPrivateCheck(hostname);

    if (PRIVATE_URL_PATTERNS.some((pattern) => pattern.test(canonicalHost))) {
      return {
        blocked: true,
        reason: `Private/internal address: ${hostname}`,
      };
    }

    // Block file:// protocol
    if (parsed.protocol === "file:") {
      return { blocked: true, reason: "File protocol not allowed" };
    }

    return { blocked: false };
  } catch (error) {
    // Fail closed: anything that cannot be parsed is not fetched.
    return { blocked: true, reason: `Invalid URL: ${error.message}` };
  }
}
90
+
91
/**
 * Rewrite GitHub blob URLs to raw.githubusercontent.com
 * github.com/owner/repo/blob/ref/path → raw.githubusercontent.com/owner/repo/ref/path
 *
 * Only the repository web host (github.com / www.github.com) is rewritten;
 * other *.github.com subdomains do not serve repository blob pages, so a URL
 * on those hosts is returned untouched. The query string and fragment of a
 * rewritten URL are dropped.
 *
 * @param {string} url - URL to rewrite
 * @returns {string} - Rewritten URL or original if not applicable
 */
export function rewriteGitHubUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // If parsing fails, return original
    return url;
  }

  const isWebProtocol =
    parsed.protocol === "https:" || parsed.protocol === "http:";
  const isRepoHost =
    parsed.hostname === "github.com" || parsed.hostname === "www.github.com";
  if (!isWebProtocol || !isRepoHost) {
    return url;
  }

  // Parse path: /owner/repo/blob/ref/path/to/file
  const [owner, repo, type, ref, ...fileParts] = parsed.pathname
    .split("/")
    .filter(Boolean);

  // Need owner, repo, 'blob', ref and at least one path segment.
  if (type !== "blob" || fileParts.length === 0) {
    return url;
  }

  return `https://raw.githubusercontent.com/${owner}/${repo}/${ref}/${fileParts.join("/")}`;
}
134
+
135
/**
 * Build the header set for one request.
 * @param {string} [userAgent] - Caller-supplied user agent, if any
 * @returns {Record<string, string>} Fresh header object (safe to mutate)
 */
function buildRequestHeaders(userAgent) {
  if (!userAgent || userAgent === DEFAULT_USER_AGENT) {
    return { ...DEFAULT_HEADERS, "user-agent": DEFAULT_USER_AGENT };
  }

  const headers = { ...DEFAULT_HEADERS, "user-agent": userAgent };
  // The default Sec-CH-UA client hints describe the default User-Agent.
  // Sending them next to a different User-Agent is the very UA/hint
  // inconsistency that marks a request as a bot, so drop them.
  delete headers["sec-ch-ua"];
  delete headers["sec-ch-ua-mobile"];
  delete headers["sec-ch-ua-platform"];
  return headers;
}

/**
 * Fetch a URL via HTTP and extract readable content
 *
 * Never rejects: every failure is reported as a result object with
 * `ok: false`, an `error` message and a `needsBrowser` hint.
 *
 * @param {string} url - URL to fetch
 * @param {object} options - Options
 * @param {number} [options.timeoutMs=15000] - Request timeout; covers both the
 *   response headers and the body download
 * @param {string} [options.userAgent] - Custom user agent
 * @param {AbortSignal} [options.signal] - Abort signal
 * @returns {Promise<FetchResult>}
 */
export async function fetchSourceHttp(url, options = {}) {
  // Security: Block private/internal URLs
  const privateCheck = isPrivateUrl(url);
  if (privateCheck.blocked) {
    return {
      ok: false,
      url,
      finalUrl: url,
      status: 403,
      error: `Blocked: ${privateCheck.reason}`,
      needsBrowser: false,
    };
  }

  // Rewrite GitHub blob URLs to raw.githubusercontent.com
  const originalUrl = url;
  const requestUrl = rewriteGitHubUrl(url);
  if (requestUrl !== originalUrl) {
    console.error(
      `[fetcher] Rewrote GitHub URL: ${originalUrl.slice(0, 60)}... → raw.githubusercontent.com`,
    );
  }

  const { timeoutMs = 15000, userAgent, signal } = options;

  const controller = new AbortController();
  const timeoutId = setTimeout(() => {
    // Abort with an explicit reason: fetch rejects with this error, so the
    // message seen by the catch block actually mentions the timeout.
    controller.abort(
      Object.assign(new Error(`Request timeout after ${timeoutMs}ms`), {
        name: "TimeoutError",
      }),
    );
  }, timeoutMs);

  // Link external signal if provided (including one that is already aborted)
  const onExternalAbort = () => controller.abort(signal.reason);
  if (signal) {
    if (signal.aborted) {
      onExternalAbort();
    } else {
      signal.addEventListener("abort", onExternalAbort, { once: true });
    }
  }

  try {
    const response = await fetch(requestUrl, {
      method: "GET",
      headers: buildRequestHeaders(userAgent),
      redirect: "follow",
      signal: controller.signal,
    });

    const contentType = response.headers.get("content-type") || "";
    const finalUrl = response.url || requestUrl;
    const lastModified = response.headers.get("last-modified") || "";

    // Security: a public URL may redirect to a private/internal host. The
    // request has already been made at this point, but its content must not
    // be handed back to the caller.
    const redirectCheck = isPrivateUrl(finalUrl);
    if (redirectCheck.blocked) {
      return {
        ok: false,
        url: requestUrl,
        finalUrl,
        status: 403,
        error: `Blocked: redirect to ${redirectCheck.reason}`,
        needsBrowser: false,
      };
    }

    // Handle raw text/plain from GitHub (raw file content)
    const isRawGitHub =
      new URL(finalUrl).hostname.toLowerCase() === "raw.githubusercontent.com";
    if (isRawGitHub && contentType.includes("text/plain")) {
      if (!response.ok) {
        // Error responses from the raw host are text/plain too; do not pass
        // an error body off as file content.
        return {
          ok: false,
          url: originalUrl,
          finalUrl,
          status: response.status,
          error: `HTTP ${response.status}`,
          // If we rewrote the URL ourselves, the original github.com page
          // may still be readable through a browser.
          needsBrowser: requestUrl !== originalUrl,
        };
      }

      const text = await response.text();
      return {
        ok: true,
        url: originalUrl,
        finalUrl,
        status: response.status,
        title: finalUrl.split("/").pop() || "GitHub File",
        byline: "",
        siteName: "GitHub",
        lang: "",
        publishedTime: lastModified,
        lastModified,
        markdown: text,
        contentLength: text.length,
        excerpt: text.slice(0, 300).replaceAll(/\n/g, " "),
        needsBrowser: false,
      };
    }

    // Check for non-HTML content
    const isHtml =
      contentType.includes("text/html") ||
      contentType.includes("application/xhtml");
    if (!isHtml) {
      return {
        ok: false,
        url: requestUrl,
        finalUrl,
        status: response.status,
        error: `Unsupported content type: ${contentType}`,
        needsBrowser: false,
      };
    }

    const html = await response.text();

    // Quick bot detection check (pass both original and final URL for redirect detection)
    const quickCheck = detectBotBlock(
      response.status,
      html,
      finalUrl,
      requestUrl,
    );
    if (quickCheck.blocked) {
      return {
        ok: false,
        url: requestUrl,
        finalUrl,
        status: response.status,
        error: `Blocked: ${quickCheck.reason}`,
        needsBrowser: true,
      };
    }

    // Extract content with Readability
    const extracted = extractContent(html, finalUrl);

    // Quality check: if content looks suspicious or too short, recommend browser
    const quality = checkContentQuality(extracted);
    if (!quality.ok) {
      return {
        ok: false,
        url: requestUrl,
        finalUrl,
        status: response.status,
        error: `Low quality content: ${quality.reason}`,
        needsBrowser: true,
      };
    }

    return {
      ok: true,
      url: requestUrl,
      finalUrl,
      status: response.status,
      title: extracted.title,
      byline: extracted.byline,
      siteName: extracted.siteName,
      lang: extracted.lang,
      publishedTime: extracted.publishedTime || lastModified,
      lastModified,
      markdown: extracted.markdown,
      excerpt: extracted.excerpt,
      contentLength: extracted.markdown.length,
      needsBrowser: false,
    };
  } catch (error) {
    // An abort reason supplied by the caller is not guaranteed to be an Error.
    const failure = error instanceof Error ? error : new Error(String(error));

    return {
      ok: false,
      url: requestUrl,
      finalUrl: requestUrl,
      status: 0,
      error: failure.message,
      // Check for network errors that might work with browser
      needsBrowser: isNetworkErrorRetryableWithBrowser(failure),
    };
  } finally {
    // The timer stays armed until here so that a stalled body download is
    // still subject to the timeout.
    clearTimeout(timeoutId);
    signal?.removeEventListener("abort", onExternalAbort);
    // Releases the connection on paths that returned without reading the
    // body; a no-op once the body has been fully consumed.
    controller.abort();
  }
}
298
+
299
/**
 * Content signatures of bot-block / challenge pages, checked in order.
 * Patterns with a bounded `.{0,N}` gap carry the `s` flag so the gap may span
 * line breaks: challenge pages routinely put the two phrases in separate
 * elements on separate lines.
 */
const BOT_BLOCK_SIGNALS = [
  // Captcha: must be in context of challenge (not just mentioned on page)
  {
    pattern: /class=["'][^"']*captcha["']|<div[^>]*id=["']captcha/i,
    reason: "captcha",
  },
  {
    pattern: /g-recaptcha|data-sitekey|i['"]m not a robot/i,
    reason: "captcha",
  },

  // Cloudflare challenge pages
  {
    pattern:
      /checking your browser.{0,100}please wait|cf-browser-verification/is,
    reason: "cloudflare challenge",
  },
  {
    pattern:
      /just a moment.{0,50}security check|ddos protection by cloudflare/is,
    reason: "cloudflare challenge",
  },

  // Bot detection
  {
    pattern: /unusual traffic.{0,50}from your computer network/is,
    reason: "unusual traffic",
  },
  {
    pattern: /bot detected|automated.{0,20}request/is,
    reason: "bot detection",
  },

  // JavaScript requirements (specific patterns)
  {
    pattern:
      /enable\s+javascript\s+to\s+view|javascript\s+is\s+required.{0,50}enabled/is,
    reason: "requires javascript",
  },

  // Access denied
  { pattern: /access denied|accessdenied/i, reason: "access denied" },

  // Anubis (new proof-of-work anti-bot system)
  {
    pattern: /protected by anubis|anubis uses a proof-of-work/i,
    reason: "anubis challenge",
  },
];

/**
 * Detect if HTTP response indicates bot blocking
 *
 * Checks, in order: the HTTP status (403/429/503), the content signatures in
 * BOT_BLOCK_SIGNALS against the page title plus the first 30,000 characters
 * of HTML, and finally a redirect to a login page.
 *
 * @param {number} status - HTTP status code of the response
 * @param {string} html - Response body; a non-string value is treated as empty
 * @param {string} finalUrl - URL after redirects
 * @param {string} originalUrl - URL that was requested
 * @returns {{blocked: boolean, reason?: string}}
 */
export function detectBotBlock(status, html, finalUrl, originalUrl) {
  // Status-based blocks (checked first: no need to inspect the body)
  if (status === 403 || status === 429 || status === 503) {
    return { blocked: true, reason: `HTTP ${status}` };
  }

  const source = typeof html === "string" ? html : "";
  const title =
    source.match(/<title[^>]*>([^<]*)<\/title>/i)?.[1]?.toLowerCase() || "";
  const sample = source.slice(0, 30000).toLowerCase();
  const combined = `${title} ${sample}`;

  // Content-based blocks - more specific patterns to avoid false positives
  const hit = BOT_BLOCK_SIGNALS.find(({ pattern }) => pattern.test(combined));
  if (hit) {
    return { blocked: true, reason: hit.reason };
  }

  // Check for login redirect (different hostname, auth patterns)
  const loginRedirect = detectLoginRedirect(originalUrl, finalUrl, source);
  if (loginRedirect) {
    return { blocked: true, reason: loginRedirect };
  }

  return { blocked: false };
}
379
+
380
/** Known authentication/login domains. */
const AUTH_DOMAINS = [
  "accounts.google.com",
  "login.microsoftonline.com",
  "login.live.com",
  "auth0.com",
  "okta.com",
  "auth.mozilla.auth0.com",
  "id.atlassian.com",
];

/** Hostname prefixes that indicate an auth/login service. */
const AUTH_HOSTNAME_PREFIXES = [
  "login.",
  "signin.",
  "auth.",
  "sso.",
  "accounts.",
  "idp.",
];

/** Content patterns that indicate a login wall when combined with a hostname redirect. */
const LOGIN_CONTENT_PATTERNS = [
  "sign in to continue",
  "log in to continue",
  "authentication required",
  "create an account to continue",
  "subscribe to continue reading",
  "members only",
];

/**
 * Drop a leading "www." so that example.com and www.example.com compare equal.
 * @param {string} host - Lower-cased hostname
 * @returns {string}
 */
function withoutWwwPrefix(host) {
  return host.startsWith("www.") ? host.slice(4) : host;
}

/**
 * Detects redirect-to-login pages: sites that return 200 but redirect to an
 * auth domain or serve a login form instead of the requested content.
 *
 * A redirect only counts when the final host differs from the requested one;
 * a bare "www." canonicalisation (example.com ↔ www.example.com) is not a
 * redirect, otherwise an ordinary article mentioning e.g. "members only"
 * would be reported as a login wall.
 *
 * @param {string} requestedUrl - URL that was requested
 * @param {string} finalUrl - URL after redirects
 * @param {string} html - Response body; only the first 20,000 characters are scanned
 * @returns {string|undefined} Block reason, or undefined when no login
 *   redirect was detected (including when either URL cannot be parsed)
 */
function detectLoginRedirect(requestedUrl, finalUrl, html) {
  try {
    const requestedHost = new URL(requestedUrl).hostname.toLowerCase();
    const final = new URL(finalUrl);
    const finalHost = final.hostname.toLowerCase();

    // Same hostname = not a redirect to login
    if (withoutWwwPrefix(requestedHost) === withoutWwwPrefix(finalHost)) {
      return undefined;
    }

    // Check for known auth domains
    const isAuthDomain = AUTH_DOMAINS.some(
      (domain) => finalHost === domain || finalHost.endsWith(`.${domain}`),
    );
    // Check for auth-related hostname prefixes
    const hasAuthPrefix = AUTH_HOSTNAME_PREFIXES.some((prefix) =>
      finalHost.startsWith(prefix),
    );
    if (isAuthDomain || hasAuthPrefix) {
      return `redirected to login (${final.hostname})`;
    }

    // Check for login content patterns (only when redirected)
    const sample = html.slice(0, 20000).toLowerCase();
    if (LOGIN_CONTENT_PATTERNS.some((phrase) => sample.includes(phrase))) {
      return `redirected to login page (${final.hostname})`;
    }
  } catch {
    // URL parsing failures (or a non-string body) are not login redirects
  }

  return undefined;
}
450
+
451
/**
 * Decide whether a failed HTTP fetch is worth retrying through the browser.
 *
 * The decision is based purely on the error text: generic fetch failures,
 * TLS/certificate verification problems and timeouts are considered
 * retryable.
 *
 * @param {unknown} error - Whatever was thrown by the HTTP attempt. Usually an
 *   `Error`, but a thrown string or an object without a `message` is accepted.
 * @returns {boolean} `true` when the message contains a retryable hint;
 *   `false` otherwise, including when no message text is available.
 */
function isNetworkErrorRetryableWithBrowser(error) {
  // JavaScript can throw any value, so do not assume `error.message` exists:
  // crashing here would hide the original failure from the caller.
  const rawMessage = typeof error === "string" ? error : error?.message;
  if (typeof rawMessage !== "string") {
    return false;
  }

  const message = rawMessage.toLowerCase();
  const retryableHints = [
    "fetch failed",
    "unable to verify", // TLS issues
    "certificate",
    "timeout",
  ];
  return retryableHints.some((hint) => message.includes(hint));
}
463
+
464
/**
 * Look up a publication date in the document's metadata (Open Graph,
 * schema.org, Dublin Core and plain `<meta name="date">` style tags).
 *
 * Selectors are tried in priority order. For each match the `content`
 * attribute is preferred, then `datetime`. The attribute value is returned
 * as-is; it is not validated or normalised.
 *
 * @param {Document} document - DOM document to query.
 * @returns {string} The first non-empty attribute value found, or "" when
 *   none of the selectors yields one.
 */
function extractMetaDate(document) {
  const dateSelectors = [
    'meta[property="article:published_time"]',
    'meta[name="article:published_time"]',
    'meta[property="og:published_time"]',
    'meta[name="publication_date"]',
    'meta[name="date"]',
    'meta[itemprop="datePublished"]',
    'time[itemprop="datePublished"]',
    'meta[name="DC.date"]',
  ];

  for (const selector of dateSelectors) {
    const node = document.querySelector(selector);
    if (node == null) {
      continue;
    }

    const value = node.getAttribute("content") || node.getAttribute("datetime");
    if (value) {
      return value;
    }
  }

  return "";
}
487
+
488
/**
 * Extract readable content from an HTML page using Mozilla Readability and
 * convert it to Markdown with Turndown.
 *
 * Strategy, in order:
 *   1. Readability article extraction, converted to Markdown.
 *   2. Plain text of `<body>` with script/style/nav/footer/header/aside removed.
 *   3. An empty result that carries only the URL as title.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} url - URL of the page; passed to JSDOM as the document URL
 *   and used as the title of last resort.
 * @returns {{title: string, byline: string, siteName: string, lang: string,
 *   publishedTime: string, markdown: string, excerpt: string}}
 */
export function extractContent(html, url) {
  const dom = new JSDOM(html, { url });
  const document = dom.window.document;

  // Read the metadata date before Readability touches anything, so elements
  // such as <time itemprop="datePublished"> are still in place.
  const metaDate = extractMetaDate(document);

  // Readability.parse() works by modifying the DOM it is given. Parse a clone
  // so the body-text fallback below still sees the original document.
  const article = new Readability(document.cloneNode(true)).parse();

  if (article?.content) {
    const markdown = turndown.turndown(article.content);
    // Collapse runs of blank lines left behind by the HTML → Markdown conversion.
    const cleanMarkdown = markdown.replaceAll(/\n{3,}/g, "\n\n").trim();

    return {
      title: article.title || document.title || url,
      byline: article.byline || "",
      siteName: article.siteName || "",
      lang: article.lang || "",
      publishedTime: article.publishedTime || metaDate,
      markdown: cleanMarkdown,
      excerpt: cleanMarkdown.slice(0, 300).replaceAll("\n", " "),
    };
  }

  // Fallback: extract body text
  const body = document.body;
  if (body) {
    // Work on a copy and drop non-content elements before reading the text.
    const clone = body.cloneNode(true);
    clone
      .querySelectorAll("script, style, nav, footer, header, aside")
      .forEach((el) => el.remove());
    const cleanText = (clone.textContent || "").replaceAll(/\s+/g, " ").trim();

    return {
      title: document.title || url,
      byline: "",
      siteName: "",
      lang: "",
      publishedTime: metaDate,
      markdown: cleanText,
      excerpt: cleanText.slice(0, 300),
    };
  }

  // Last resort
  return {
    title: url,
    byline: "",
    siteName: "",
    lang: "",
    publishedTime: "",
    markdown: "",
    excerpt: "",
  };
}
550
+
551
/**
 * Check whether extracted content looks like real page content rather than a
 * bot block, challenge page, login form or loading placeholder.
 *
 * Checks run in a fixed order and the first failure wins:
 *   1. trimmed markdown shorter than 100 characters;
 *   2. suspicious phrases anywhere in the markdown (case-insensitive);
 *   3. a line consisting only of "sign in" / "log in";
 *   4. challenge wording in the title.
 *
 * @param {{markdown?: string, title?: string}} extracted - Result of content
 *   extraction. A missing or null `markdown` is treated as empty content.
 * @returns {{ok: true} | {ok: false, reason: string}}
 */
export function checkContentQuality(extracted) {
  // Guard both fields the same way: a missing markdown means "no content",
  // not a TypeError in the caller's quality gate.
  const trimmed = String(extracted?.markdown ?? "").trim();
  const title = String(extracted?.title ?? "").toLowerCase();

  // Minimum content length check
  if (trimmed.length < 100) {
    return { ok: false, reason: "content too short (< 100 chars)" };
  }

  const lc = trimmed.toLowerCase();

  // Suspicious content patterns that indicate bot block or incomplete extraction.
  // Plain substring checks are used instead of regexes to avoid ReDoS
  // (SonarCloud javasecurity:S5852). Every needle of an entry must be present.
  const suspiciousPhrases = [
    { needles: ["loading", "please wait"], desc: "loading page" },
    {
      needles: ["please ensure javascript is enabled"],
      desc: "requires javascript",
    },
    { needles: ["enable javascript to view"], desc: "requires javascript" },
    {
      needles: ["just a moment"],
      desc: "cloudflare challenge detected in content",
    },
    { needles: ["verify you are human"], desc: "human verification" },
    { needles: ["captcha required"], desc: "captcha in extracted content" },
    { needles: ["access denied"], desc: "access denied in content" },
  ];

  for (const { needles, desc } of suspiciousPhrases) {
    if (needles.every((needle) => lc.includes(needle))) {
      return { ok: false, reason: desc };
    }
  }

  // A line that is nothing but "sign in" / "log in". All quantifiers are
  // bounded, so this regex cannot backtrack excessively.
  const loginOnlyLine =
    /^\s{0,10}sign\s{1,5}in\s{0,10}$|^\s{0,10}log\s{1,5}in\s{0,10}$/im;
  if (loginOnlyLine.test(lc)) {
    return { ok: false, reason: "login form only" };
  }

  // Title-based checks
  if (
    title.includes("just a moment") ||
    title.includes("checking your browser")
  ) {
    return { ok: false, reason: "cloudflare challenge page detected in title" };
  }

  return { ok: true };
}
618
+
619
/**
 * Predict if a URL will likely need browser fallback (before attempting HTTP).
 *
 * Returns `true` when any of these holds:
 *   - the host is, or is a subdomain of, a known JavaScript-heavy site;
 *   - the path contains a whole segment named `playground`, `demo` or `app`
 *     (optionally plural), so `/app/settings` matches but `/apple-event` and
 *     `/democracy` do not;
 *   - the fragment looks like client-side routing (`#/…` or `#!…`). Ordinary
 *     in-page anchors such as `#history` do not count.
 *
 * @param {string} url - URL to check
 * @returns {boolean} `false` for unparsable URLs.
 */
export function shouldUseBrowser(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }

  const hostname = parsed.hostname.toLowerCase();
  const pathname = parsed.pathname.toLowerCase();

  // Known JS-heavy sites
  const jsHeavyDomains = [
    "react.dev",
    "nextjs.org",
    "vuejs.org",
    "angular.io",
    "svelte.dev",
    "docs.expo.dev",
    "tailwindcss.com",
    "storybook.js.org",
  ];

  if (
    jsHeavyDomains.some((d) => hostname === d || hostname.endsWith(`.${d}`))
  ) {
    return true;
  }

  // Single-page app indicators in the URL: match complete path segments only,
  // otherwise article slugs that merely start with "app" or "demo" are misrouted.
  if (/\/(?:playground|demo|app)s?(?:\/|$)/.test(pathname)) {
    return true;
  }

  // Hash-based routing uses "#/route" or "#!/route"; a bare "#section" is
  // just an anchor into a regular document.
  return parsed.hash.startsWith("#/") || parsed.hash.startsWith("#!");
}