@apmantza/greedysearch-pi 1.9.0 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/README.md +11 -1
- package/bin/launch-visible.mjs +65 -0
- package/bin/launch.mjs +442 -417
- package/bin/search.mjs +757 -679
- package/extractors/bing-copilot.mjs +490 -374
- package/extractors/common.mjs +703 -596
- package/extractors/consent.mjs +421 -388
- package/extractors/selectors.mjs +55 -54
- package/index.ts +176 -177
- package/package.json +8 -3
- package/skills/greedy-search/skill.md +5 -19
- package/src/fetcher.mjs +666 -652
- package/src/formatters/synthesis.ts +1 -5
- package/src/search/output.mjs +23 -1
- package/src/search/research.mjs +1581 -0
- package/src/search/sources.mjs +488 -466
- package/src/search/synthesis-runner.mjs +52 -46
- package/src/tools/greedy-search-handler.ts +298 -124
- package/test.mjs +971 -534
package/src/fetcher.mjs
CHANGED
|
@@ -1,652 +1,666 @@
|
|
|
1
|
-
// src/fetcher.mjs — HTTP source fetching with Readability extraction
|
|
2
|
-
|
|
3
|
-
import { Readability } from "@mozilla/readability";
|
|
4
|
-
import { JSDOM } from "jsdom";
|
|
5
|
-
import TurndownService from "turndown";
|
|
6
|
-
|
|
7
|
-
// Module-wide HTML → Markdown converter: ATX headings, "-" bullets and fenced
// code blocks.
const turndown = new TurndownService({
  codeBlockStyle: "fenced",
  bulletListMarker: "-",
  headingStyle: "atx",
});

// Emit nothing for <img> elements whose src is a data: URL.
turndown.addRule("removeDataUrls", {
  filter(node) {
    if (node.tagName !== "IMG") {
      return false;
    }
    return node.getAttribute("src")?.startsWith("data:");
  },
  replacement() {
    return "";
  },
});
|
|
19
|
-
|
|
20
|
-
// User-Agent sent when the caller does not supply one (Chrome 122 on Windows).
const DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

// Baseline request headers for every HTTP fetch.
const DEFAULT_HEADERS = {
  // Identity and content negotiation
  "user-agent": DEFAULT_USER_AGENT,
  "accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "accept-language": "en-US,en;q=0.9",
  "accept-encoding": "gzip, deflate, br",

  // Ask for a fresh copy
  "cache-control": "no-cache",
  "pragma": "no-cache",

  // Sec-CH-UA client hints must match the User-Agent (Chrome 122 on Windows).
  // Inconsistency between UA and Client Hints is a strong bot signal.
  "sec-ch-ua":
    '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
  "sec-ch-ua-mobile": "?0",
  "sec-ch-ua-platform": '"Windows"',

  // Fetch metadata describing a top-level, user-initiated navigation
  "sec-fetch-dest": "document",
  "sec-fetch-mode": "navigate",
  "sec-fetch-site": "none",
  "sec-fetch-user": "?1",
  "upgrade-insecure-requests": "1",
};
|
|
43
|
-
|
|
44
|
-
/**
 * Blocked private/internal hostname patterns.
 *
 * Patterns are tested against the lowercased `URL.hostname`. For IPv6 literals
 * that value keeps its square brackets (e.g. "[fe80::1]"), so the IPv6
 * patterns anchor on the leading "[".
 */
const PRIVATE_URL_PATTERNS = [
  /^localhost$/i,
  /^127\.\d+\.\d+\.\d+$/,
  /^0\.0\.0\.0$/,
  /^\[::1?\]$/, // IPv6 loopback (::1) and unspecified (::)
  /^10\./, // RFC1918 - Class A
  /^172\.(1[6-9]|2\d|3[01])\./, // RFC1918 - Class B
  /^192\.168\./, // RFC1918 - Class C
  /^169\.254\./, // Link-local
  /^\[f[cd][0-9a-f]{2}:/i, // IPv6 unique local (fc00::/7, i.e. fc.. and fd..)
  /^\[fe[89ab][0-9a-f]:/i, // IPv6 link-local (fe80::/10)
  /\.local$/i,
  /\.internal$/i,
  /\.localhost$/i,
];

/**
 * Check if URL is a private/internal address that should not be fetched.
 *
 * The hostname is normalised before matching: it is lowercased, a single
 * trailing dot is dropped ("localhost." names the same host as "localhost"),
 * and an IPv4-mapped IPv6 literal is additionally checked in its dotted IPv4
 * form. `file:` URLs and unparseable URLs are always blocked.
 *
 * @param {string} url - URL to check
 * @returns {{blocked: boolean, reason?: string}}
 */
export function isPrivateUrl(url) {
  try {
    const parsed = new URL(url);
    const hostname = parsed.hostname.toLowerCase().replace(/\.$/, "");

    // Every spelling of the host that has to pass the pattern list.
    const candidates = [hostname];

    // The URL parser serialises "[::ffff:127.0.0.1]" as "[::ffff:7f00:1]";
    // rebuild the dotted form so the IPv4 patterns apply to it too.
    const mapped = /^\[::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})\]$/.exec(hostname);
    if (mapped) {
      const high = Number.parseInt(mapped[1], 16);
      const low = Number.parseInt(mapped[2], 16);
      candidates.push(
        `${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`,
      );
    }

    for (const candidate of candidates) {
      if (PRIVATE_URL_PATTERNS.some((pattern) => pattern.test(candidate))) {
        return {
          blocked: true,
          reason: `Private/internal address: ${hostname}`,
        };
      }
    }

    // Block file:// protocol
    if (parsed.protocol === "file:") {
      return { blocked: true, reason: "File protocol not allowed" };
    }

    return { blocked: false };
  } catch (error) {
    return { blocked: true, reason: `Invalid URL: ${error.message}` };
  }
}
|
|
90
|
-
|
|
91
|
-
/**
 * Map a GitHub "blob" page URL to its raw.githubusercontent.com counterpart:
 * github.com/owner/repo/blob/ref/path → raw.githubusercontent.com/owner/repo/ref/path
 *
 * Anything that is not on github.com (or a subdomain of it), is not a /blob/
 * path with at least one file segment, or cannot be parsed is returned
 * untouched. Query string and fragment are not carried over.
 *
 * @param {string} url - URL to rewrite
 * @returns {string} - Rewritten URL or original if not applicable
 */
export function rewriteGitHubUrl(url) {
  let target;
  try {
    target = new URL(url);
  } catch {
    // Unparseable input is handed back as-is.
    return url;
  }

  const host = target.hostname;
  const isGitHubHost = host === "github.com" || host.endsWith(".github.com");
  if (!isGitHubHost) {
    return url;
  }

  // Expected layout: owner / repo / "blob" / ref / file path segments...
  const segments = target.pathname.split("/").filter((part) => part !== "");
  const looksLikeBlob = segments.length >= 5 && segments[2] === "blob";
  if (!looksLikeBlob) {
    return url;
  }

  // Keep everything except the "blob" marker at index 2.
  const rawSegments = [segments[0], segments[1], ...segments.slice(3)];
  return `https://raw.githubusercontent.com/${rawSegments.join("/")}`;
}
|
|
134
|
-
|
|
135
|
-
/**
 * Fetch a URL via HTTP and extract readable content.
 *
 * Private/internal targets are refused before any request is made, and a
 * response that was redirected to such a target is rejected as well. GitHub
 * blob URLs are fetched from raw.githubusercontent.com instead. HTML responses
 * go through bot-block detection, Readability extraction and a quality check.
 * On a failed result, `needsBrowser` tells the caller whether a browser-based
 * retry may help.
 *
 * In every result `url` is the URL the caller passed in; `finalUrl` is the URL
 * that answered, or the URL that was requested when no response arrived.
 * Network and extraction errors are reported as `ok: false` results rather
 * than thrown.
 *
 * @param {string} url - URL to fetch
 * @param {object} options - Options
 * @param {number} [options.timeoutMs=15000] - Time budget for the request, including reading the body
 * @param {string} [options.userAgent] - Custom user agent
 * @param {AbortSignal} [options.signal] - Abort signal
 * @returns {Promise<FetchResult>}
 */
export async function fetchSourceHttp(url, options = {}) {
  // Security: Block private/internal URLs
  const privateCheck = isPrivateUrl(url);
  if (privateCheck.blocked) {
    return {
      ok: false,
      url,
      finalUrl: url,
      status: 403,
      error: `Blocked: ${privateCheck.reason}`,
      needsBrowser: false,
    };
  }

  // Rewrite GitHub blob URLs to raw.githubusercontent.com
  const originalUrl = url;
  url = rewriteGitHubUrl(url);
  if (url !== originalUrl) {
    console.error(
      `[fetcher] Rewrote GitHub URL: ${originalUrl.slice(0, 60)}... → raw.githubusercontent.com`,
    );
  }

  const { timeoutMs = 15000, userAgent, signal } = options;

  const controller = new AbortController();

  // Remember that the timer fired: the resulting abort error carries no
  // "timeout" wording of its own, so it must be reported explicitly.
  let timedOut = false;
  const timeoutId = setTimeout(() => {
    timedOut = true;
    controller.abort();
  }, timeoutMs);

  // Link external signal if provided. An "abort" listener never fires for a
  // signal that is already aborted, so that case is handled up front.
  const onExternalAbort = () => controller.abort();
  if (signal) {
    if (signal.aborted) {
      controller.abort();
    } else {
      signal.addEventListener("abort", onExternalAbort, { once: true });
    }
  }

  try {
    const response = await fetch(url, {
      method: "GET",
      headers: {
        ...DEFAULT_HEADERS,
        "user-agent": userAgent || DEFAULT_USER_AGENT,
      },
      redirect: "follow",
      signal: controller.signal,
    });

    const contentType = response.headers.get("content-type") || "";
    const finalUrl = response.url;
    const lastModified = response.headers.get("last-modified") || "";

    // Security: a public URL may redirect to a private/internal one. The
    // redirect has already been followed at this point, but its content must
    // not be handed back to the caller.
    if (response.redirected) {
      const redirectCheck = isPrivateUrl(finalUrl);
      if (redirectCheck.blocked) {
        return {
          ok: false,
          url: originalUrl,
          finalUrl,
          status: 403,
          error: `Blocked: ${redirectCheck.reason}`,
          needsBrowser: false,
        };
      }
    }

    // Handle raw text/plain from GitHub (raw file content)
    let isRawGitHub = false;
    try {
      const finalHost = new URL(finalUrl).hostname.toLowerCase();
      isRawGitHub = finalHost === "raw.githubusercontent.com";
    } catch {
      // An unparseable final URL simply is not a raw GitHub response.
    }
    if (contentType.includes("text/plain") && isRawGitHub) {
      // An error status here is not file content (for example when the
      // blob → raw rewrite produced a path that does not exist). Report the
      // failure and let the caller retry the original URL in a browser.
      if (response.status >= 400) {
        return {
          ok: false,
          url: originalUrl,
          finalUrl,
          status: response.status,
          error: `HTTP ${response.status}`,
          needsBrowser: true,
        };
      }

      const text = await response.text();
      return {
        ok: true,
        url: originalUrl,
        finalUrl,
        status: response.status,
        title: finalUrl.split("/").pop() || "GitHub File",
        byline: "",
        siteName: "GitHub",
        lang: "",
        publishedTime: lastModified,
        lastModified,
        markdown: text,
        contentLength: text.length,
        excerpt: text.slice(0, 300).replaceAll(/\n/g, " "),
        needsBrowser: false,
      };
    }

    // Check for non-HTML content
    if (
      !contentType.includes("text/html") &&
      !contentType.includes("application/xhtml")
    ) {
      return {
        ok: false,
        url: originalUrl,
        finalUrl,
        status: response.status,
        error: `Unsupported content type: ${contentType}`,
        needsBrowser: false,
      };
    }

    const html = await response.text();

    // Quick bot detection check. The URL that was actually requested (after
    // any GitHub rewrite) is passed so redirect detection compares hosts
    // correctly.
    const quickCheck = detectBotBlock(response.status, html, finalUrl, url);
    if (quickCheck.blocked) {
      return {
        ok: false,
        url: originalUrl,
        finalUrl,
        status: response.status,
        error: `Blocked: ${quickCheck.reason}`,
        needsBrowser: true,
      };
    }

    // Extract content with Readability
    const extracted = extractContent(html, finalUrl);

    // Quality check: if content looks suspicious or too short, recommend browser
    const quality = checkContentQuality(extracted);
    if (!quality.ok) {
      return {
        ok: false,
        url: originalUrl,
        finalUrl,
        status: response.status,
        error: `Low quality content: ${quality.reason}`,
        needsBrowser: true,
      };
    }

    return {
      ok: true,
      url: originalUrl,
      finalUrl,
      status: response.status,
      title: extracted.title,
      byline: extracted.byline,
      siteName: extracted.siteName,
      lang: extracted.lang,
      publishedTime: extracted.publishedTime || lastModified,
      lastModified,
      markdown: extracted.markdown,
      excerpt: extracted.excerpt,
      contentLength: extracted.markdown.length,
      needsBrowser: false,
    };
  } catch (error) {
    // Check for network errors that might work with browser. A timeout is
    // always treated as retryable.
    const needsBrowser =
      timedOut || isNetworkErrorRetryableWithBrowser(error);

    return {
      ok: false,
      url: originalUrl,
      finalUrl: url,
      status: 0,
      error: timedOut ? `Request timeout after ${timeoutMs}ms` : error.message,
      needsBrowser,
    };
  } finally {
    // The timer stays armed until here so that it also bounds the body read;
    // the listener is removed so a long-lived external signal does not
    // accumulate one callback per fetch.
    clearTimeout(timeoutId);
    if (signal) {
      signal.removeEventListener("abort", onExternalAbort);
    }
  }
}
|
|
298
|
-
|
|
299
|
-
/**
 * Decide whether an HTTP response looks like a bot block or access wall.
 *
 * Checks, in order: blocking status codes (403/429/503), known challenge
 * markers in the lowercased page title plus the first 30KB of HTML, and
 * finally a redirect to a login page.
 *
 * @param {number} status - HTTP status code of the response
 * @param {string} html - Response body
 * @param {string} finalUrl - URL that served the response
 * @param {string} originalUrl - URL that was requested
 * @returns {{blocked: boolean, reason?: string}}
 */
export function detectBotBlock(status, html, finalUrl, originalUrl) {
  const titleMatch = html.match(/<title[^>]*>([^<]*)<\/title>/i);
  const pageTitle = titleMatch ? titleMatch[1].toLowerCase() : "";
  const haystack = `${pageTitle} ${html.slice(0, 30000).toLowerCase()}`;

  // Status-based blocks
  if ([403, 429, 503].includes(status)) {
    return { blocked: true, reason: `HTTP ${status}` };
  }

  // Content-based blocks as [pattern, reason] pairs; the first match wins.
  // Patterns are deliberately specific to avoid false positives.
  const contentSignals = [
    // Captcha: must be in context of challenge (not just mentioned on page)
    [/class=["'][^"']*captcha["']|<div[^>]*id=["']captcha/i, "captcha"],
    [/g-recaptcha|data-sitekey|i['"]m not a robot/i, "captcha"],

    // Cloudflare challenge pages
    [
      /checking your browser.{0,100}please wait|cf-browser-verification/i,
      "cloudflare challenge",
    ],
    [
      /just a moment.{0,50}security check|ddos protection by cloudflare/i,
      "cloudflare challenge",
    ],

    // Bot detection
    [/unusual traffic.{0,50}from your computer network/i, "unusual traffic"],
    [/bot detected|automated.{0,20}request/i, "bot detection"],

    // JavaScript requirements (specific patterns)
    [
      /enable\s+javascript\s+to\s+view|javascript\s+is\s+required.{0,50}enabled/i,
      "requires javascript",
    ],

    // Access denied
    [/access denied|accessdenied/i, "access denied"],

    // Anubis (proof-of-work anti-bot system)
    [/protected by anubis|anubis uses a proof-of-work/i, "anubis challenge"],
  ];

  const hit = contentSignals.find(([pattern]) => pattern.test(haystack));
  if (hit) {
    return { blocked: true, reason: hit[1] };
  }

  // Check for login redirect (different hostname, auth patterns)
  const loginReason = detectLoginRedirect(originalUrl, finalUrl, html);
  return loginReason
    ? { blocked: true, reason: loginReason }
    : { blocked: false };
}
|
|
379
|
-
|
|
380
|
-
/** Hosts of well-known authentication services (exact host or any subdomain). */
const AUTH_DOMAINS = [
  "accounts.google.com",
  "login.microsoftonline.com",
  "login.live.com",
  "auth0.com",
  "okta.com",
  "auth.mozilla.auth0.com",
  "id.atlassian.com",
];

/** Leading hostname labels that mark a host as an auth/login service. */
const AUTH_HOSTNAME_PREFIXES = [
  "login.",
  "signin.",
  "auth.",
  "sso.",
  "accounts.",
  "idp.",
];

/** Lowercase phrases that signal a login wall once the host has changed. */
const LOGIN_CONTENT_PATTERNS = [
  "sign in to continue",
  "log in to continue",
  "authentication required",
  "create an account to continue",
  "subscribe to continue reading",
  "members only",
];

/**
 * Spot responses that landed on a login page instead of the requested
 * content.
 *
 * Only redirects to a different hostname are considered. The landing host is
 * matched against the known auth domains and auth hostname prefixes; failing
 * that, the first 20000 characters of the HTML are searched for login-wall
 * phrases.
 *
 * @returns {string|undefined} A reason string, or undefined when this is not a
 *   login redirect (including when either URL cannot be parsed).
 */
function detectLoginRedirect(requestedUrl, finalUrl, html) {
  try {
    const requestedHost = new URL(requestedUrl).hostname.toLowerCase();
    const landing = new URL(finalUrl);
    const landingHost = landing.hostname.toLowerCase();

    // Only a change of hostname can be a redirect to login.
    if (requestedHost !== landingHost) {
      const onKnownAuthDomain = AUTH_DOMAINS.some(
        (domain) =>
          landingHost === domain || landingHost.endsWith(`.${domain}`),
      );
      const hasAuthPrefix = AUTH_HOSTNAME_PREFIXES.some((prefix) =>
        landingHost.startsWith(prefix),
      );
      if (onKnownAuthDomain || hasAuthPrefix) {
        return `redirected to login (${landing.hostname})`;
      }

      // Login-wall wording only counts because the host already changed.
      const head = html.slice(0, 20000).toLowerCase();
      for (const phrase of LOGIN_CONTENT_PATTERNS) {
        if (head.includes(phrase)) {
          return `redirected to login page (${landing.hostname})`;
        }
      }
    }
  } catch {
    // URL parsing failures are not login redirects
  }

  return undefined;
}
|
|
450
|
-
|
|
451
|
-
/**
 * Tell whether a failed request is worth retrying through the browser
 * fallback, judged by case-insensitive markers in the error message.
 *
 * @param {{message: string}} error - Error raised by the HTTP request
 * @returns {boolean}
 */
function isNetworkErrorRetryableWithBrowser(error) {
  const retryableMarkers = [
    "fetch failed",
    "unable to verify", // TLS issues
    "certificate",
    "timeout",
  ];
  const text = error.message.toLowerCase();
  return retryableMarkers.some((marker) => text.includes(marker));
}
|
|
463
|
-
|
|
464
|
-
/**
 * Look up a publication date in the document's metadata (Open Graph,
 * schema.org and standard tags).
 *
 * Selectors are tried in priority order; the first element with a non-empty
 * `content` (or, failing that, `datetime`) attribute wins. The value is
 * returned exactly as written in the page.
 *
 * @param {Document} document - Parsed DOM document
 * @returns {string} The attribute value, or "" when nothing was found.
 */
function extractMetaDate(document) {
  const dateSelectors = [
    'meta[property="article:published_time"]',
    'meta[name="article:published_time"]',
    'meta[property="og:published_time"]',
    'meta[name="publication_date"]',
    'meta[name="date"]',
    'meta[itemprop="datePublished"]',
    'time[itemprop="datePublished"]',
    'meta[name="DC.date"]',
  ];

  for (const selector of dateSelectors) {
    const node = document.querySelector(selector);
    if (node == null) {
      continue;
    }
    const value = node.getAttribute("content") || node.getAttribute("datetime");
    if (value) {
      return value;
    }
  }

  return "";
}
|
|
487
|
-
|
|
488
|
-
/**
 * Turn an HTML page into readable markdown plus metadata.
 *
 * Three strategies are tried in order:
 *  1. Mozilla Readability, with the article HTML converted by Turndown;
 *  2. the plain text of <body> with script/style/nav/footer/header/aside
 *     elements removed;
 *  3. an empty result that carries only the URL as its title.
 *
 * @param {string} html - Page markup
 * @param {string} url - URL of the page, passed to JSDOM as the document URL
 * @returns {{title: string, byline: string, siteName: string, lang: string,
 *   publishedTime: string, markdown: string, excerpt: string}}
 */
export function extractContent(html, url) {
  const { document } = new JSDOM(html, { url }).window;

  // Try Readability first
  const article = new Readability(document).parse();
  if (article?.content) {
    // Collapse runs of three or more newlines down to one blank line.
    const markdown = turndown
      .turndown(article.content)
      .replaceAll(/\n{3,}/g, "\n\n")
      .trim();

    return {
      title: article.title || document.title || url,
      byline: article.byline || "",
      siteName: article.siteName || "",
      lang: article.lang || "",
      publishedTime: article.publishedTime || extractMetaDate(document) || "",
      markdown,
      excerpt: markdown.slice(0, 300).replaceAll(/\n/g, " "),
    };
  }

  const { body } = document;
  if (!body) {
    // Last resort
    return {
      title: url,
      byline: "",
      siteName: "",
      lang: "",
      publishedTime: "",
      markdown: "",
      excerpt: "",
    };
  }

  // Fallback: extract body text from a copy with script/style and page
  // chrome elements removed.
  const stripped = body.cloneNode(true);
  for (const el of stripped.querySelectorAll(
    "script, style, nav, footer, header, aside",
  )) {
    el.remove();
  }
  const plainText = (stripped.textContent || "").replaceAll(/\s+/g, " ").trim();

  return {
    title: document.title || url,
    byline: "",
    siteName: "",
    lang: "",
    publishedTime: extractMetaDate(document),
    markdown: plainText,
    excerpt: plainText.slice(0, 300),
  };
}
|
|
550
|
-
|
|
551
|
-
/**
|
|
552
|
-
* Check if extracted content quality is sufficient
|
|
553
|
-
* Returns { ok: true } or { ok: false, reason: string }
|
|
554
|
-
*/
|
|
555
|
-
export function checkContentQuality(extracted) {
|
|
556
|
-
const markdown = extracted.markdown.trim().toLowerCase();
|
|
557
|
-
const title = (extracted.title || "").toLowerCase();
|
|
558
|
-
|
|
559
|
-
// Minimum content length check
|
|
560
|
-
if (extracted.markdown.trim().length < 100) {
|
|
561
|
-
return { ok: false, reason: "content too short (< 100 chars)" };
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
// Suspicious content patterns that indicate bot block or incomplete extraction
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
{
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
}
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
return true;
|
|
646
|
-
}
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
1
|
+
// src/fetcher.mjs — HTTP source fetching with Readability extraction
|
|
2
|
+
|
|
3
|
+
import { Readability } from "@mozilla/readability";
|
|
4
|
+
import { JSDOM } from "jsdom";
|
|
5
|
+
import TurndownService from "turndown";
|
|
6
|
+
|
|
7
|
+
// Module-wide HTML → Markdown converter: ATX headings, "-" bullets and fenced
// code blocks.
const turndown = new TurndownService({
  codeBlockStyle: "fenced",
  bulletListMarker: "-",
  headingStyle: "atx",
});

// Emit nothing for <img> elements whose src is a data: URL.
turndown.addRule("removeDataUrls", {
  filter(node) {
    if (node.tagName !== "IMG") {
      return false;
    }
    return node.getAttribute("src")?.startsWith("data:");
  },
  replacement() {
    return "";
  },
});
|
|
19
|
+
|
|
20
|
+
// User-Agent sent when the caller does not supply one (Chrome 122 on Windows).
const DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

// Baseline request headers for every HTTP fetch.
const DEFAULT_HEADERS = {
  // Identity and content negotiation
  "user-agent": DEFAULT_USER_AGENT,
  "accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
  "accept-language": "en-US,en;q=0.9",
  "accept-encoding": "gzip, deflate, br",

  // Ask for a fresh copy
  "cache-control": "no-cache",
  "pragma": "no-cache",

  // Sec-CH-UA client hints must match the User-Agent (Chrome 122 on Windows).
  // Inconsistency between UA and Client Hints is a strong bot signal.
  "sec-ch-ua":
    '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
  "sec-ch-ua-mobile": "?0",
  "sec-ch-ua-platform": '"Windows"',

  // Fetch metadata describing a top-level, user-initiated navigation
  "sec-fetch-dest": "document",
  "sec-fetch-mode": "navigate",
  "sec-fetch-site": "none",
  "sec-fetch-user": "?1",
  "upgrade-insecure-requests": "1",
};
|
|
43
|
+
|
|
44
|
+
/**
 * Blocked private/internal hostname patterns.
 *
 * Patterns are tested against the lowercased `URL.hostname`. For IPv6 literals
 * that value keeps its square brackets (e.g. "[fe80::1]"), so the IPv6
 * patterns anchor on the leading "[".
 */
const PRIVATE_URL_PATTERNS = [
  /^localhost$/i,
  /^127\.\d+\.\d+\.\d+$/,
  /^0\.0\.0\.0$/,
  /^\[::1?\]$/, // IPv6 loopback (::1) and unspecified (::)
  /^10\./, // RFC1918 - Class A
  /^172\.(1[6-9]|2\d|3[01])\./, // RFC1918 - Class B
  /^192\.168\./, // RFC1918 - Class C
  /^169\.254\./, // Link-local
  /^\[f[cd][0-9a-f]{2}:/i, // IPv6 unique local (fc00::/7, i.e. fc.. and fd..)
  /^\[fe[89ab][0-9a-f]:/i, // IPv6 link-local (fe80::/10)
  /\.local$/i,
  /\.internal$/i,
  /\.localhost$/i,
];

/**
 * Check if URL is a private/internal address that should not be fetched.
 *
 * The hostname is normalised before matching: it is lowercased, a single
 * trailing dot is dropped ("localhost." names the same host as "localhost"),
 * and an IPv4-mapped IPv6 literal is additionally checked in its dotted IPv4
 * form. `file:` URLs and unparseable URLs are always blocked.
 *
 * @param {string} url - URL to check
 * @returns {{blocked: boolean, reason?: string}}
 */
export function isPrivateUrl(url) {
  try {
    const parsed = new URL(url);
    const hostname = parsed.hostname.toLowerCase().replace(/\.$/, "");

    // Every spelling of the host that has to pass the pattern list.
    const candidates = [hostname];

    // The URL parser serialises "[::ffff:127.0.0.1]" as "[::ffff:7f00:1]";
    // rebuild the dotted form so the IPv4 patterns apply to it too.
    const mapped = /^\[::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})\]$/.exec(hostname);
    if (mapped) {
      const high = Number.parseInt(mapped[1], 16);
      const low = Number.parseInt(mapped[2], 16);
      candidates.push(
        `${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`,
      );
    }

    for (const candidate of candidates) {
      if (PRIVATE_URL_PATTERNS.some((pattern) => pattern.test(candidate))) {
        return {
          blocked: true,
          reason: `Private/internal address: ${hostname}`,
        };
      }
    }

    // Block file:// protocol
    if (parsed.protocol === "file:") {
      return { blocked: true, reason: "File protocol not allowed" };
    }

    return { blocked: false };
  } catch (error) {
    return { blocked: true, reason: `Invalid URL: ${error.message}` };
  }
}
|
|
90
|
+
|
|
91
|
+
/**
 * Map a GitHub "blob" page URL to its raw.githubusercontent.com counterpart:
 * github.com/owner/repo/blob/ref/path → raw.githubusercontent.com/owner/repo/ref/path
 *
 * Anything that is not on github.com (or a subdomain of it), is not a /blob/
 * path with at least one file segment, or cannot be parsed is returned
 * untouched. Query string and fragment are not carried over.
 *
 * @param {string} url - URL to rewrite
 * @returns {string} - Rewritten URL or original if not applicable
 */
export function rewriteGitHubUrl(url) {
  let target;
  try {
    target = new URL(url);
  } catch {
    // Unparseable input is handed back as-is.
    return url;
  }

  const host = target.hostname;
  const isGitHubHost = host === "github.com" || host.endsWith(".github.com");
  if (!isGitHubHost) {
    return url;
  }

  // Expected layout: owner / repo / "blob" / ref / file path segments...
  const segments = target.pathname.split("/").filter((part) => part !== "");
  const looksLikeBlob = segments.length >= 5 && segments[2] === "blob";
  if (!looksLikeBlob) {
    return url;
  }

  // Keep everything except the "blob" marker at index 2.
  const rawSegments = [segments[0], segments[1], ...segments.slice(3)];
  return `https://raw.githubusercontent.com/${rawSegments.join("/")}`;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Fetch a URL via HTTP and extract readable content.
 *
 * Private/internal targets are refused before any request is made, and a
 * response that was redirected to such a target is rejected as well. GitHub
 * blob URLs are fetched from raw.githubusercontent.com instead. HTML responses
 * go through bot-block detection, Readability extraction and a quality check.
 * On a failed result, `needsBrowser` tells the caller whether a browser-based
 * retry may help.
 *
 * In every result `url` is the URL the caller passed in; `finalUrl` is the URL
 * that answered, or the URL that was requested when no response arrived.
 * Network and extraction errors are reported as `ok: false` results rather
 * than thrown.
 *
 * @param {string} url - URL to fetch
 * @param {object} options - Options
 * @param {number} [options.timeoutMs=15000] - Time budget for the request, including reading the body
 * @param {string} [options.userAgent] - Custom user agent
 * @param {AbortSignal} [options.signal] - Abort signal
 * @returns {Promise<FetchResult>}
 */
export async function fetchSourceHttp(url, options = {}) {
  // Security: Block private/internal URLs
  const privateCheck = isPrivateUrl(url);
  if (privateCheck.blocked) {
    return {
      ok: false,
      url,
      finalUrl: url,
      status: 403,
      error: `Blocked: ${privateCheck.reason}`,
      needsBrowser: false,
    };
  }

  // Rewrite GitHub blob URLs to raw.githubusercontent.com
  const originalUrl = url;
  url = rewriteGitHubUrl(url);
  if (url !== originalUrl) {
    console.error(
      `[fetcher] Rewrote GitHub URL: ${originalUrl.slice(0, 60)}... → raw.githubusercontent.com`,
    );
  }

  const { timeoutMs = 15000, userAgent, signal } = options;

  const controller = new AbortController();

  // Remember that the timer fired: the resulting abort error carries no
  // "timeout" wording of its own, so it must be reported explicitly.
  let timedOut = false;
  const timeoutId = setTimeout(() => {
    timedOut = true;
    controller.abort();
  }, timeoutMs);

  // Link external signal if provided. An "abort" listener never fires for a
  // signal that is already aborted, so that case is handled up front.
  const onExternalAbort = () => controller.abort();
  if (signal) {
    if (signal.aborted) {
      controller.abort();
    } else {
      signal.addEventListener("abort", onExternalAbort, { once: true });
    }
  }

  try {
    const response = await fetch(url, {
      method: "GET",
      headers: {
        ...DEFAULT_HEADERS,
        "user-agent": userAgent || DEFAULT_USER_AGENT,
      },
      redirect: "follow",
      signal: controller.signal,
    });

    const contentType = response.headers.get("content-type") || "";
    const finalUrl = response.url;
    const lastModified = response.headers.get("last-modified") || "";

    // Security: a public URL may redirect to a private/internal one. The
    // redirect has already been followed at this point, but its content must
    // not be handed back to the caller.
    if (response.redirected) {
      const redirectCheck = isPrivateUrl(finalUrl);
      if (redirectCheck.blocked) {
        return {
          ok: false,
          url: originalUrl,
          finalUrl,
          status: 403,
          error: `Blocked: ${redirectCheck.reason}`,
          needsBrowser: false,
        };
      }
    }

    // Handle raw text/plain from GitHub (raw file content)
    let isRawGitHub = false;
    try {
      const finalHost = new URL(finalUrl).hostname.toLowerCase();
      isRawGitHub = finalHost === "raw.githubusercontent.com";
    } catch {
      // An unparseable final URL simply is not a raw GitHub response.
    }
    if (contentType.includes("text/plain") && isRawGitHub) {
      // An error status here is not file content (for example when the
      // blob → raw rewrite produced a path that does not exist). Report the
      // failure and let the caller retry the original URL in a browser.
      if (response.status >= 400) {
        return {
          ok: false,
          url: originalUrl,
          finalUrl,
          status: response.status,
          error: `HTTP ${response.status}`,
          needsBrowser: true,
        };
      }

      const text = await response.text();
      return {
        ok: true,
        url: originalUrl,
        finalUrl,
        status: response.status,
        title: finalUrl.split("/").pop() || "GitHub File",
        byline: "",
        siteName: "GitHub",
        lang: "",
        publishedTime: lastModified,
        lastModified,
        markdown: text,
        contentLength: text.length,
        excerpt: text.slice(0, 300).replaceAll(/\n/g, " "),
        needsBrowser: false,
      };
    }

    // Check for non-HTML content
    if (
      !contentType.includes("text/html") &&
      !contentType.includes("application/xhtml")
    ) {
      return {
        ok: false,
        url: originalUrl,
        finalUrl,
        status: response.status,
        error: `Unsupported content type: ${contentType}`,
        needsBrowser: false,
      };
    }

    const html = await response.text();

    // Quick bot detection check. The URL that was actually requested (after
    // any GitHub rewrite) is passed so redirect detection compares hosts
    // correctly.
    const quickCheck = detectBotBlock(response.status, html, finalUrl, url);
    if (quickCheck.blocked) {
      return {
        ok: false,
        url: originalUrl,
        finalUrl,
        status: response.status,
        error: `Blocked: ${quickCheck.reason}`,
        needsBrowser: true,
      };
    }

    // Extract content with Readability
    const extracted = extractContent(html, finalUrl);

    // Quality check: if content looks suspicious or too short, recommend browser
    const quality = checkContentQuality(extracted);
    if (!quality.ok) {
      return {
        ok: false,
        url: originalUrl,
        finalUrl,
        status: response.status,
        error: `Low quality content: ${quality.reason}`,
        needsBrowser: true,
      };
    }

    return {
      ok: true,
      url: originalUrl,
      finalUrl,
      status: response.status,
      title: extracted.title,
      byline: extracted.byline,
      siteName: extracted.siteName,
      lang: extracted.lang,
      publishedTime: extracted.publishedTime || lastModified,
      lastModified,
      markdown: extracted.markdown,
      excerpt: extracted.excerpt,
      contentLength: extracted.markdown.length,
      needsBrowser: false,
    };
  } catch (error) {
    // Check for network errors that might work with browser. A timeout is
    // always treated as retryable.
    const needsBrowser =
      timedOut || isNetworkErrorRetryableWithBrowser(error);

    return {
      ok: false,
      url: originalUrl,
      finalUrl: url,
      status: 0,
      error: timedOut ? `Request timeout after ${timeoutMs}ms` : error.message,
      needsBrowser,
    };
  } finally {
    // The timer stays armed until here so that it also bounds the body read;
    // the listener is removed so a long-lived external signal does not
    // accumulate one callback per fetch.
    clearTimeout(timeoutId);
    if (signal) {
      signal.removeEventListener("abort", onExternalAbort);
    }
  }
}
|
|
298
|
+
|
|
299
|
+
/**
 * Detect whether an HTTP response looks like an anti-bot block page.
 *
 * The status code is checked first because 403/429/503 are conclusive on
 * their own. Otherwise the page <title> plus the first 30KB of the HTML
 * (lowercased) are scanned for known challenge markers, and finally the
 * redirect is checked for a login wall.
 *
 * @param {number} status - HTTP status code of the response
 * @param {string} html - Response body; non-string values are treated as empty
 * @param {string} finalUrl - URL after redirects
 * @param {string} originalUrl - URL that was originally requested
 * @returns {{ blocked: boolean, reason?: string }} `reason` is set only when blocked
 */
export function detectBotBlock(status, html, finalUrl, originalUrl) {
  // Status-based blocks: no need to inspect the body at all
  if (status === 403 || status === 429 || status === 503) {
    return { blocked: true, reason: `HTTP ${status}` };
  }

  const source = typeof html === "string" ? html : "";
  // The title is looked up in the whole document; only the body sample is capped
  const title =
    source.match(/<title[^>]*>([^<]*)<\/title>/i)?.[1]?.toLowerCase() || "";
  const sample = source.slice(0, 30000).toLowerCase();
  const combined = `${title} ${sample}`;

  // Content-based blocks - more specific patterns to avoid false positives
  const blockSignals = [
    // Captcha: must be in context of challenge (not just mentioned on page)
    {
      pattern: /class=["'][^"']*captcha["']|<div[^>]*id=["']captcha/i,
      reason: "captcha",
    },
    {
      pattern: /g-recaptcha|data-sitekey|i['"]m not a robot/i,
      reason: "captcha",
    },

    // Cloudflare challenge pages
    {
      pattern:
        /checking your browser.{0,100}please wait|cf-browser-verification/i,
      reason: "cloudflare challenge",
    },
    {
      pattern:
        /just a moment.{0,50}security check|ddos protection by cloudflare/i,
      reason: "cloudflare challenge",
    },

    // Bot detection
    {
      pattern: /unusual traffic.{0,50}from your computer network/i,
      reason: "unusual traffic",
    },
    {
      pattern: /bot detected|automated.{0,20}request/i,
      reason: "bot detection",
    },

    // JavaScript requirements (specific patterns)
    {
      pattern:
        /enable\s+javascript\s+to\s+view|javascript\s+is\s+required.{0,50}enabled/i,
      reason: "requires javascript",
    },

    // Access denied
    { pattern: /access denied|accessdenied/i, reason: "access denied" },

    // Anubis (new proof-of-work anti-bot system)
    {
      pattern: /protected by anubis|anubis uses a proof-of-work/i,
      reason: "anubis challenge",
    },
  ];

  // First matching signal wins, in declaration order
  const hit = blockSignals.find((signal) => signal.pattern.test(combined));
  if (hit) {
    return { blocked: true, reason: hit.reason };
  }

  // Check for login redirect (different hostname, auth patterns)
  const loginRedirect = detectLoginRedirect(originalUrl, finalUrl, source);
  if (loginRedirect) {
    return { blocked: true, reason: loginRedirect };
  }

  return { blocked: false };
}
|
|
379
|
+
|
|
380
|
+
/** Known authentication/login domains (matched exactly or as a parent domain). */
const AUTH_DOMAINS = Object.freeze([
  "accounts.google.com",
  "login.microsoftonline.com",
  "login.live.com",
  "auth0.com",
  "okta.com",
  "auth.mozilla.auth0.com",
  "id.atlassian.com",
]);

/** Hostname prefixes that indicate an auth/login service. */
const AUTH_HOSTNAME_PREFIXES = Object.freeze([
  "login.",
  "signin.",
  "auth.",
  "sso.",
  "accounts.",
  "idp.",
]);

/** Content patterns that indicate a login wall when combined with a hostname redirect. */
const LOGIN_CONTENT_PATTERNS = Object.freeze([
  "sign in to continue",
  "log in to continue",
  "authentication required",
  "create an account to continue",
  "subscribe to continue reading",
  "members only",
]);

/**
 * Detects redirect-to-login pages: the response landed on a different
 * hostname than the one requested, and that hostname is a known auth domain,
 * carries an auth-style prefix, or serves login-wall wording.
 *
 * @param {string} requestedUrl - URL that was originally requested
 * @param {string} finalUrl - URL after redirects
 * @param {string} html - Response body; only the first 20,000 characters are scanned
 * @returns {string|undefined} Human-readable reason, or undefined when no
 *   login redirect is detected (including when either URL cannot be parsed)
 */
function detectLoginRedirect(requestedUrl, finalUrl, html) {
  let requested;
  let landed;
  try {
    requested = new URL(requestedUrl);
    landed = new URL(finalUrl);
  } catch {
    // URL parsing failures are not login redirects
    return undefined;
  }

  const finalHost = landed.hostname.toLowerCase();

  // Same hostname = not a redirect to login
  if (requested.hostname.toLowerCase() === finalHost) {
    return undefined;
  }

  // Known auth domains, or any of their subdomains
  const isAuthDomain = AUTH_DOMAINS.some(
    (domain) => finalHost === domain || finalHost.endsWith(`.${domain}`),
  );
  // Auth-related hostname prefixes
  const hasAuthPrefix = AUTH_HOSTNAME_PREFIXES.some((prefix) =>
    finalHost.startsWith(prefix),
  );
  if (isAuthDomain || hasAuthPrefix) {
    return `redirected to login (${landed.hostname})`;
  }

  // Login wording only counts because we already know the host changed
  const sample =
    typeof html === "string" ? html.slice(0, 20000).toLowerCase() : "";
  if (LOGIN_CONTENT_PATTERNS.some((phrase) => sample.includes(phrase))) {
    return `redirected to login page (${landed.hostname})`;
  }

  return undefined;
}
|
|
450
|
+
|
|
451
|
+
/**
 * Check if a network error might succeed with browser fallback.
 *
 * Matching is a case-insensitive substring test on the error message. Any
 * thrown value is accepted: a string is used as the message itself, and a
 * value without a string message is treated as not retryable instead of
 * raising a second error inside the caller's catch handler.
 *
 * @param {unknown} error - Value caught from the failed request
 * @returns {boolean} true when the message mentions a fetch failure, a TLS
 *   verification/certificate problem, or a timeout
 */
function isNetworkErrorRetryableWithBrowser(error) {
  const rawMessage = typeof error === "string" ? error : error?.message;
  if (typeof rawMessage !== "string") {
    return false;
  }

  const message = rawMessage.toLowerCase();
  const retryableMarkers = [
    "fetch failed",
    "unable to verify", // TLS issues
    "certificate",
    "timeout",
  ];
  return retryableMarkers.some((marker) => message.includes(marker));
}
|
|
463
|
+
|
|
464
|
+
/**
 * Extract a publication date string from <meta>/<time> tags (Open Graph,
 * schema.org, Dublin Core, standard names).
 *
 * Selectors are tried in priority order and the first non-blank value wins.
 * The value is returned as written in the page (trimmed); it is not parsed
 * or normalized, so it is not guaranteed to be an ISO 8601 string.
 *
 * @param {Document} document - Parsed document to query
 * @returns {string} The date attribute value, or an empty string when none is found
 */
function extractMetaDate(document) {
  const selectors = [
    'meta[property="article:published_time"]',
    'meta[name="article:published_time"]',
    'meta[property="og:published_time"]',
    'meta[name="publication_date"]',
    'meta[name="date"]',
    'meta[itemprop="datePublished"]',
    'time[itemprop="datePublished"]',
    'meta[name="DC.date"]',
  ];

  for (const selector of selectors) {
    const el = document.querySelector(selector);
    // <meta> carries the date in `content`, <time> in `datetime`
    const raw = el?.getAttribute("content") || el?.getAttribute("datetime") || "";
    const value = raw.trim();
    // A blank attribute must not hide a usable value on a later selector
    if (value) return value;
  }
  return "";
}
|
|
487
|
+
|
|
488
|
+
/**
 * Extract readable content using Mozilla Readability + Turndown.
 *
 * Strategy, in order:
 *   1. Readability article converted to markdown by Turndown
 *   2. Plain text of <body> with script/style/nav/footer/header/aside removed
 *   3. An empty result that carries only the URL as title
 *
 * @param {string} html - Raw HTML of the page
 * @param {string} url - Page URL; handed to JSDOM as the document URL and
 *   used as the title of last resort
 * @returns {{ title: string, byline: string, siteName: string, lang: string,
 *   publishedTime: string, markdown: string, excerpt: string }}
 */
export function extractContent(html, url) {
  const dom = new JSDOM(html, { url });

  try {
    const document = dom.window.document;

    // Readability mutates the document it parses, so give it a clone and keep
    // the original intact for the meta-date lookup and the body fallback.
    const article = new Readability(document.cloneNode(true)).parse();

    if (article?.content) {
      const markdown = turndown.turndown(article.content);
      // Collapse runs of 3+ newlines into a single blank line
      const cleanMarkdown = markdown.replaceAll(/\n{3,}/g, "\n\n").trim();

      const publishedTime =
        article.publishedTime || extractMetaDate(document) || "";

      return {
        title: article.title || document.title || url,
        byline: article.byline || "",
        siteName: article.siteName || "",
        lang: article.lang || "",
        publishedTime,
        markdown: cleanMarkdown,
        excerpt: cleanMarkdown.slice(0, 300).replaceAll(/\n/g, " "),
      };
    }

    // Fallback: extract body text
    const body = document.body;
    if (body) {
      // Work on a copy so the removals below do not touch the document
      const clone = body.cloneNode(true);
      clone
        .querySelectorAll("script, style, nav, footer, header, aside")
        .forEach((el) => el.remove());
      const text = clone.textContent || "";
      const cleanText = text.replaceAll(/\s+/g, " ").trim();

      return {
        title: document.title || url,
        byline: "",
        siteName: "",
        lang: "",
        publishedTime: extractMetaDate(document),
        markdown: cleanText,
        excerpt: cleanText.slice(0, 300),
      };
    }

    // Last resort
    return {
      title: url,
      byline: "",
      siteName: "",
      lang: "",
      publishedTime: "",
      markdown: "",
      excerpt: "",
    };
  } finally {
    // Only plain strings leave this function, so the window can be released
    dom.window.close();
  }
}
|
|
550
|
+
|
|
551
|
+
/**
 * Check if extracted content quality is sufficient.
 *
 * Checks run in this order and the first failure is reported:
 *   1. trimmed markdown shorter than 100 characters
 *   2. wording in the markdown that suggests a block/challenge/login page
 *   3. a Cloudflare challenge title
 *
 * A missing or non-string `markdown` is treated as empty content, so the
 * function reports "content too short" instead of throwing.
 *
 * @param {{ markdown?: string, title?: string }} extracted - Result of content extraction
 * @returns {{ ok: true } | { ok: false, reason: string }}
 */
export function checkContentQuality(extracted) {
  const rawMarkdown =
    typeof extracted?.markdown === "string" ? extracted.markdown : "";
  const trimmed = rawMarkdown.trim();
  // Normalized once; every text check below runs against this copy
  const lc = trimmed.toLowerCase();
  const title = (
    typeof extracted?.title === "string" ? extracted.title : ""
  ).toLowerCase();

  // Minimum content length check
  if (trimmed.length < 100) {
    return { ok: false, reason: "content too short (< 100 chars)" };
  }

  // Suspicious content patterns that indicate bot block or incomplete extraction
  // Use simple string checks instead of regex to avoid ReDoS (SonarCloud javasecurity:S5852)
  const suspiciousChecks = [
    {
      check: () => lc.includes("loading") && lc.includes("please wait"),
      desc: "loading page",
    },
    {
      check: () => lc.includes("please ensure javascript is enabled"),
      desc: "requires javascript",
    },
    {
      check: () => lc.includes("enable javascript to view"),
      desc: "requires javascript",
    },
    {
      check: () => lc.includes("just a moment"),
      desc: "cloudflare challenge detected in content",
    },
    {
      check: () => lc.includes("verify you are human"),
      desc: "human verification",
    },
    {
      check: () => lc.includes("captcha required"),
      desc: "captcha in extracted content",
    },
    {
      check: () => lc.includes("access denied"),
      desc: "access denied in content",
    },
    {
      // A line consisting solely of "sign in" / "log in"; quantifiers are bounded
      check: () =>
        /^\s{0,10}sign\s{1,5}in\s{0,10}$|^\s{0,10}log\s{1,5}in\s{0,10}$/im.test(lc),
      desc: "login form only",
    },
  ];

  const failed = suspiciousChecks.find(({ check }) => check());
  if (failed) {
    return { ok: false, reason: failed.desc };
  }

  // Title-based checks
  if (
    title.includes("just a moment") ||
    title.includes("checking your browser")
  ) {
    return { ok: false, reason: "cloudflare challenge page detected in title" };
  }

  return { ok: true };
}
|
|
618
|
+
|
|
619
|
+
/**
 * Predict if a URL will likely need browser fallback (before attempting HTTP).
 *
 * Returns true when any of these holds:
 *   - the host is (a subdomain of) a known JS-heavy documentation site
 *   - the path contains a whole segment named `playground`, `demo` or `app`
 *     (segment match, so `/apple` or `/democracy` do not qualify)
 *   - the fragment looks like client-side routing (`#/...` or `#!...`);
 *     plain in-page anchors such as `#installation` do not qualify
 *
 * @param {string} url - URL to check
 * @returns {boolean} false for unparseable URLs
 */
export function shouldUseBrowser(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }

  const hostname = parsed.hostname.toLowerCase();
  const pathname = parsed.pathname.toLowerCase();

  // Known JS-heavy sites
  const jsHeavyDomains = [
    "react.dev",
    "nextjs.org",
    "vuejs.org",
    "angular.io",
    "svelte.dev",
    "docs.expo.dev",
    "tailwindcss.com",
    "storybook.js.org",
  ];
  const isJsHeavyHost = jsHeavyDomains.some(
    (domain) => hostname === domain || hostname.endsWith(`.${domain}`),
  );
  if (isJsHeavyHost) {
    return true;
  }

  // Single-page app indicators in URL: the keyword must be a complete segment
  if (/\/(?:playground|demo|app)(?:\/|$)/.test(pathname)) {
    return true;
  }

  // Hash-based routing often indicates SPA; a bare anchor does not
  return parsed.hash.startsWith("#/") || parsed.hash.startsWith("#!");
}
|