@vakra-dev/reader 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +205 -0
- package/README.md +658 -0
- package/dist/cli/index.js +3046 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/index.d.ts +1216 -0
- package/dist/index.js +3073 -0
- package/dist/index.js.map +1 -0
- package/package.json +87 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,3073 @@
|
|
|
1
|
+
// src/client.ts
|
|
2
|
+
import HeroCore from "@ulixee/hero-core";
|
|
3
|
+
import { TransportBridge } from "@ulixee/net";
|
|
4
|
+
import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
5
|
+
|
|
6
|
+
// src/scraper.ts
|
|
7
|
+
import pLimit from "p-limit";
|
|
8
|
+
|
|
9
|
+
// src/cloudflare/detector.ts
|
|
10
|
+
// Simple `#id` / `.class` selectors checked against the page markup by detectChallenge.
var CHALLENGE_DOM_SELECTORS = [
  "#challenge-running",
  "#challenge-stage",
  "#challenge-form",
  ".cf-browser-verification"
];
// Lower-case phrases whose presence in the markup is reported as a challenge signal.
var CHALLENGE_TEXT_PATTERNS = [
  "verifying you are human",
  "checking if the site connection is secure",
  "this process is automatic. your browser will redirect"
];
// Lower-case phrases whose presence in the markup is reported as a "blocked" signal.
var BLOCKED_SIGNALS = [
  "you have been blocked",
  "access to this page has been denied",
  "sorry, you have been blocked",
  "access denied",
  "403 forbidden"
];

/**
 * Tells whether a simple `#id` or `.class` selector is present in lower-cased markup.
 *
 * The literal selector text still counts (that was the only check before, and it
 * only fires when inline CSS/JS mentions the selector). In addition, an element
 * carrying the id, or the class as a whole token, is now recognised.
 *
 * @param {string} htmlLower - Page markup, already lower-cased.
 * @param {string} selector - Selector of the form `#id` or `.class`.
 * @returns {boolean} True when the selector text or a matching attribute is found.
 */
function markupContainsSelector(htmlLower, selector) {
  const needle = selector.toLowerCase();
  if (htmlLower.includes(needle)) {
    return true;
  }
  const escaped = needle.slice(1).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  if (needle.startsWith("#")) {
    // Leading whitespace keeps attributes such as `data-id` from matching.
    return new RegExp(`\\sid\\s*=\\s*(["'])${escaped}\\1`).test(htmlLower);
  }
  if (needle.startsWith(".")) {
    // The class must appear as a complete token inside the attribute value.
    return new RegExp(
      `\\sclass\\s*=\\s*(?:"[^"]*|'[^']*)(?<![\\w-])${escaped}(?![\\w-])`
    ).test(htmlLower);
  }
  return false;
}

/**
 * Inspects the current page markup for challenge / block signals.
 *
 * @param {object} hero - Object exposing `document.documentElement.outerHTML` (awaited).
 * @returns {Promise<{isChallenge: boolean, type: string, confidence: number, signals: string[]}>}
 *   `type` is "js_challenge", "blocked" or "none"; `confidence` is 100 when any
 *   signal fired and 0 otherwise. Never rejects: failures are reported through
 *   `signals` with `isChallenge: false`.
 */
async function detectChallenge(hero) {
  const signals = [];
  let type = "none";
  try {
    if (!hero.document) {
      return {
        isChallenge: false,
        type: "none",
        confidence: 0,
        signals: ["No document available"]
      };
    }
    const html = await hero.document.documentElement.outerHTML;
    const htmlLower = html.toLowerCase();
    for (const selector of CHALLENGE_DOM_SELECTORS) {
      if (markupContainsSelector(htmlLower, selector)) {
        signals.push(`Challenge element: ${selector}`);
        type = "js_challenge";
      }
    }
    for (const pattern of CHALLENGE_TEXT_PATTERNS) {
      if (htmlLower.includes(pattern)) {
        signals.push(`Challenge text: "${pattern}"`);
        type = type === "none" ? "js_challenge" : type;
      }
    }
    if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
      signals.push('Challenge text: "waiting for...to respond"');
      type = type === "none" ? "js_challenge" : type;
    }
    // A block phrase overrides any challenge type; only the first hit is recorded.
    for (const pattern of BLOCKED_SIGNALS) {
      if (htmlLower.includes(pattern)) {
        signals.push(`Blocked: "${pattern}"`);
        type = "blocked";
        break;
      }
    }
    const isChallenge = signals.length > 0;
    const confidence = isChallenge ? 100 : 0;
    return {
      isChallenge,
      type: isChallenge ? type : "none",
      confidence,
      signals
    };
  } catch (error) {
    // Non-Error throwables have no `.message`; fall back to their string form.
    const reason = error instanceof Error ? error.message : String(error);
    return {
      isChallenge: false,
      type: "none",
      confidence: 0,
      signals: [`Error during detection: ${reason}`]
    };
  }
}
|
|
82
|
+
/**
 * Convenience wrapper around detectChallenge.
 *
 * @param {object} hero - Browser handle passed straight to detectChallenge.
 * @returns {Promise<boolean>} The `isChallenge` flag of the detection result.
 */
async function isChallengePage(hero) {
  const { isChallenge } = await detectChallenge(hero);
  return isChallenge;
}
|
|
86
|
+
|
|
87
|
+
// src/cloudflare/handler.ts
|
|
88
|
+
/**
 * Polls until the page leaves the challenge state or the time budget runs out.
 *
 * Resolution is reported when the URL differs from `options.initialUrl`
 * ("url_redirect") or when detectChallenge no longer reports a challenge
 * ("signals_cleared"). Otherwise the result is `{ resolved: false, method: "timeout" }`.
 *
 * @param {object} hero - Browser handle (`url`, `waitForLoad`, `waitForPaintingStable`).
 * @param {object} options - `maxWaitMs` (default 45000), `pollIntervalMs` (default 500),
 *   `verbose` (default false) and `initialUrl`.
 * @returns {Promise<{resolved: boolean, method: string, waitedMs: number}>}
 */
async function waitForChallengeResolution(hero, options) {
  const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
  const startTime = Date.now();
  const log = (msg) => verbose && console.log(` ${msg}`);

  // Shared tail of both success paths: announce, let the page settle, build the result.
  const settle = async (headline, waitingNote, method, waitedMs) => {
    log(headline);
    log(waitingNote);
    try {
      await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
      log(` DOMContentLoaded`);
    } catch {
      log(` DOMContentLoaded timeout, continuing...`);
    }
    await hero.waitForPaintingStable().catch(() => {
    });
    log(` Page stabilized`);
    return { resolved: true, method, waitedMs };
  };

  while (Date.now() - startTime < maxWaitMs) {
    const elapsed = Date.now() - startTime;
    try {
      const currentUrl = await hero.url;
      if (currentUrl !== initialUrl) {
        // `return await` keeps failures inside this try, as the inline code did.
        return await settle(
          `\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`,
          ` Waiting for new page to load...`,
          "url_redirect",
          elapsed
        );
      }
    } catch {
      // Best effort: a failed URL check falls through to signal detection.
    }
    const detection = await detectChallenge(hero);
    if (!detection.isChallenge) {
      return settle(
        `\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`,
        ` Waiting for page to load...`,
        "signals_cleared",
        elapsed
      );
    }
    log(
      `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
    );
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  return {
    resolved: false,
    method: "timeout",
    waitedMs: Date.now() - startTime
  };
}
|
|
138
|
+
/**
 * Polls `hero.document.querySelector(selector)` until it yields a truthy value
 * or `maxWaitMs` has elapsed.
 *
 * The pause between polls is 300 ms, clamped to the time remaining, so the call
 * no longer overshoots a short `maxWaitMs` by a full poll interval.
 *
 * @param {object} hero - Browser handle exposing `document.querySelector`.
 * @param {string} selector - Selector to look for.
 * @param {number} maxWaitMs - Time budget in milliseconds.
 * @param {boolean} [verbose=false] - Log progress to the console.
 * @returns {Promise<{found: boolean, waitedMs: number}>}
 */
async function waitForSelector(hero, selector, maxWaitMs, verbose = false) {
  const POLL_INTERVAL_MS = 300;
  const startTime = Date.now();
  const log = (msg) => verbose && console.log(` ${msg}`);
  const remainingMs = () => maxWaitMs - (Date.now() - startTime);
  log(`Waiting for selector: "${selector}"`);
  while (remainingMs() > 0) {
    try {
      const element = await hero.document.querySelector(selector);
      if (element) {
        const elapsed = Date.now() - startTime;
        log(`\u2713 Selector found after ${(elapsed / 1e3).toFixed(1)}s`);
        return { found: true, waitedMs: elapsed };
      }
    } catch {
      // Best effort: lookup errors are treated as "not found yet".
    }
    const pauseMs = Math.min(POLL_INTERVAL_MS, remainingMs());
    if (pauseMs > 0) {
      await new Promise((resolve) => setTimeout(resolve, pauseMs));
    }
  }
  log(`\u2717 Selector not found within timeout`);
  return { found: false, waitedMs: Date.now() - startTime };
}
|
|
157
|
+
/**
 * Detects a challenge on the current page and, if one is present, waits for it
 * to resolve.
 *
 * @param {object} hero - Browser handle.
 * @param {object} [options={}] - Forwarded to waitForChallengeResolution together
 *   with the URL read at entry as `initialUrl`.
 * @returns {Promise<{resolved: boolean, method: string, waitedMs: number}>}
 *   Immediately `{ resolved: true, method: "signals_cleared", waitedMs: 0 }`
 *   when no challenge is detected.
 */
async function handleChallenge(hero, options = {}) {
  const initialUrl = await hero.url;
  const { isChallenge } = await detectChallenge(hero);
  return isChallenge
    ? waitForChallengeResolution(hero, { ...options, initialUrl })
    : { resolved: true, method: "signals_cleared", waitedMs: 0 };
}
|
|
168
|
+
|
|
169
|
+
// src/formatters/markdown.ts
|
|
170
|
+
import TurndownService from "turndown";
|
|
171
|
+
// Shared Turndown converter used by htmlToMarkdown for every page.
// The options pin the Markdown dialect: `#`-style (ATX) headings, `---` rules,
// `-` bullets, fenced code blocks, `*`/`**` emphasis and inline links.
var turndownService = new TurndownService({
  headingStyle: "atx",
  hr: "---",
  bulletListMarker: "-",
  codeBlockStyle: "fenced",
  fence: "```",
  emDelimiter: "*",
  strongDelimiter: "**",
  linkStyle: "inlined",
  linkReferenceStyle: "full"
});
|
|
182
|
+
/**
 * Assembles the Markdown report: optional header, table of contents (only when
 * there is more than one page), then one section per page, joined by blank lines.
 *
 * @param {Array<object>} pages - Scraped pages.
 * @param {string} baseUrl - URL the scrape started from.
 * @param {*} scrapedAt - Timestamp accepted by `new Date()`.
 * @param {number} duration - Scrape duration in milliseconds.
 * @param {object} website - Site-level metadata.
 * @param {boolean} [includeMetadata=true] - Emit the header block.
 * @returns {string} The Markdown document.
 */
function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
  const header = includeMetadata
    ? [createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length)]
    : [];
  const toc = pages.length > 1 ? [createMarkdownTOC(pages)] : [];
  const body = pages.map((page, index) => createMarkdownPage(page, index + 1));
  return [...header, ...toc, ...body].join("\n\n");
}
|
|
193
|
+
/**
 * Builds the Markdown header block describing the scrape.
 *
 * @param {string} baseUrl - URL the scrape started from.
 * @param {*} scrapedAt - Timestamp accepted by `new Date()`.
 * @param {number} duration - Scrape duration in milliseconds.
 * @param {object} website - Site metadata; `title`, `description`, `author`
 *   and `language` are used when truthy.
 * @param {number} totalPages - Number of pages in the report.
 * @returns {string} Header text; every line ends with a newline.
 */
function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
  const title = website.title || extractDomainFromUrl(baseUrl);
  const description = website.description || "";
  const lines = [
    `# Website Scrape: ${title}`,
    "",
    `**Base URL:** ${baseUrl}`,
    `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}`,
    `**Duration:** ${duration}ms`,
    `**Total pages:** ${totalPages}`
  ];
  if (description) {
    lines.push(`**Description:** ${description}`);
  }
  if (website.author) {
    lines.push(`**Author:** ${website.author}`);
  }
  if (website.language) {
    lines.push(`**Language:** ${website.language}`);
  }
  return lines.map((line) => `${line}\n`).join("");
}
|
|
221
|
+
/**
 * Builds the Markdown table of contents, one numbered entry per page.
 *
 * Entries are indented by `page.depth` spaces and link to
 * `#page-<n>-<slug>`, where the slug is derived from the page title.
 *
 * @param {Array<{title?: string, depth: number}>} pages - Scraped pages.
 * @returns {string} The TOC section.
 */
function createMarkdownTOC(pages) {
  const entries = pages.map((page, index) => {
    const indent = " ".repeat(page.depth);
    const pageNumber = index + 1;
    const title = page.title || `Page ${pageNumber}`;
    const anchor = title
      .replace(/[#[\]/\\:*?"<>|]/g, "")
      .trim()
      .toLowerCase()
      .replace(/\s+/g, "-")
      .replace(/[^a-z0-9-]/g, "");
    return `${indent}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})\n`;
  });
  return `## Table of Contents\n\n${entries.join("")}`;
}
|
|
234
|
+
/**
 * Renders one scraped page as a Markdown section: a rule, a heading with an
 * explicit `{#page-<n>-<slug>}` anchor, a metadata block, a rule, and the page
 * body converted from HTML.
 *
 * The `**Title:**` line uses the same `Page <n>` fallback as the heading, so an
 * untitled page no longer prints `undefined` or an empty title.
 *
 * @param {{title?: string, url: string, depth: number, fetchedAt: *, html: string}} page
 * @param {number} pageNumber - 1-based position of the page in the report.
 * @returns {string} The Markdown section.
 */
function createMarkdownPage(page, pageNumber) {
  const title = page.title || `Page ${pageNumber}`;
  const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
  const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
  const preamble = [
    "---",
    "",
    `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}`,
    "",
    `**URL:** ${page.url}`,
    `**Title:** ${title}`,
    `**Depth:** ${page.depth}`,
    `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}`,
    "",
    "---",
    "",
    ""
  ].join("\n");
  return preamble + htmlToMarkdown(page.html);
}
|
|
260
|
+
/**
 * Converts HTML to Markdown with the shared Turndown instance.
 *
 * If the conversion throws, a warning is logged and the input is returned with
 * everything that looks like a tag removed and surrounding whitespace trimmed.
 *
 * @param {string} html - HTML to convert.
 * @returns {string} Markdown, or tag-stripped text on failure.
 */
function htmlToMarkdown(html) {
  let markdown;
  try {
    markdown = turndownService.turndown(html);
  } catch (error) {
    console.warn("Error converting HTML to Markdown:", error);
    markdown = html.replace(/<[^>]*>/g, "").trim();
  }
  return markdown;
}
|
|
268
|
+
/**
 * Returns the hostname of `url`, or "Unknown" when it cannot be parsed.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Hostname or the literal "Unknown".
 */
function extractDomainFromUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return "Unknown";
  }
  return parsed.hostname;
}
|
|
275
|
+
|
|
276
|
+
// src/formatters/html.ts
|
|
277
|
+
/**
 * Renders the complete standalone HTML report.
 *
 * All site metadata interpolated into the document shell is HTML-escaped,
 * including the `<title>` text and the `lang` / `charset` attribute values,
 * which were previously inserted verbatim. The page bodies themselves are
 * emitted by generatePageHTML.
 *
 * @param {Array<object>} pages - Scraped pages.
 * @param {string} baseUrl - URL the scrape started from.
 * @param {*} scrapedAt - Timestamp accepted by `new Date()`.
 * @param {number} duration - Scrape duration in milliseconds.
 * @param {object} website - Site metadata (`title`, `language`, `charset`,
 *   `description`, `author`, ...).
 * @returns {string} The HTML document.
 */
function formatToHTML(pages, baseUrl, scrapedAt, duration, website) {
  const safeTitle = escapeHtml(website.title || extractDomainFromUrl2(baseUrl));
  const safeLang = escapeHtml(website.language || "en");
  const safeCharset = escapeHtml(website.charset || "UTF-8");
  const safeBaseUrl = escapeHtml(baseUrl);
  const html = `<!DOCTYPE html>
<html lang="${safeLang}">
<head>
<meta charset="${safeCharset}">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Scrape: ${safeTitle}</title>
${generateMetaTags(website)}
<style>
${generateCSS()}
</style>
</head>
<body>
<header class="header">
<h1>Website Scrape: ${safeTitle}</h1>
<div class="meta-info">
<p><strong>Base URL:</strong> <a href="${safeBaseUrl}" target="_blank">${safeBaseUrl}</a></p>
<p><strong>Scraped at:</strong> ${new Date(scrapedAt).toLocaleString()}</p>
<p><strong>Duration:</strong> ${duration}ms</p>
<p><strong>Total pages:</strong> ${pages.length}</p>
${website.description ? `<p><strong>Description:</strong> ${escapeHtml(website.description)}</p>` : ""}
${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
</div>
</header>

${pages.length > 1 ? generateTOC(pages) : ""}

<main class="content">
${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
</main>

<footer class="footer">
<p>Generated by Reader JS/TS SDK</p>
</footer>

<script>
${generateJavaScript()}
</script>
</body>
</html>`;
  return html;
}
|
|
322
|
+
/**
 * Builds the `<meta>` / `<link>` tags for the report's `<head>` from the site
 * metadata. A tag is emitted only when its source value is truthy; every value
 * is passed through escapeHtml.
 *
 * @param {object} website - Site metadata, optionally with `openGraph` and
 *   `twitter` sub-objects and a `keywords` array.
 * @returns {string} The tags joined by "\n " (empty string when there are none).
 */
function generateMetaTags(website) {
  const tags = [];
  const addMeta = (attr, key, value) => {
    if (value) {
      tags.push(`<meta ${attr}="${key}" content="${escapeHtml(value)}">`);
    }
  };
  const addLink = (rel, href) => {
    if (href) {
      tags.push(`<link rel="${rel}" href="${escapeHtml(href)}">`);
    }
  };

  addMeta("name", "description", website.description);
  addMeta("name", "author", website.author);
  // Keywords are gated on the array itself, so an empty array still emits a tag.
  if (website.keywords) {
    tags.push(`<meta name="keywords" content="${escapeHtml(website.keywords.join(", "))}">`);
  }
  addMeta("name", "robots", website.robots);
  addMeta("name", "theme-color", website.themeColor);
  addLink("icon", website.favicon);
  addLink("canonical", website.canonical);

  const og = website.openGraph;
  if (og) {
    addMeta("property", "og:title", og.title);
    addMeta("property", "og:description", og.description);
    addMeta("property", "og:type", og.type);
    addMeta("property", "og:url", og.url);
    addMeta("property", "og:image", og.image);
    addMeta("property", "og:site_name", og.siteName);
    addMeta("property", "og:locale", og.locale);
  }

  const twitter = website.twitter;
  if (twitter) {
    addMeta("name", "twitter:card", twitter.card);
    addMeta("name", "twitter:site", twitter.site);
    addMeta("name", "twitter:creator", twitter.creator);
    addMeta("name", "twitter:title", twitter.title);
    addMeta("name", "twitter:description", twitter.description);
    addMeta("name", "twitter:image", twitter.image);
  }
  return tags.join("\n ");
}
|
|
372
|
+
/**
 * Returns the stylesheet that formatToHTML embeds in the report's `<style>` element.
 *
 * The CSS is a fixed string covering the report shell (header, table of
 * contents, page cards, footer) plus one `max-width: 768px` media query.
 * Leading and trailing whitespace is trimmed before returning.
 *
 * @returns {string} CSS source text.
 */
function generateCSS() {
  return `
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}

body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
color: #333;
background-color: #f8f9fa;
}

.header {
background: white;
padding: 2rem;
border-bottom: 1px solid #e9ecef;
margin-bottom: 2rem;
}

.header h1 {
color: #2c3e50;
margin-bottom: 1rem;
font-size: 2rem;
}

.meta-info {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 0.5rem;
}

.meta-info p {
margin: 0.25rem 0;
font-size: 0.9rem;
color: #6c757d;
}

.toc {
background: white;
padding: 1.5rem;
margin: 2rem 0;
border-radius: 8px;
border: 1px solid #e9ecef;
}

.toc h2 {
color: #2c3e50;
margin-bottom: 1rem;
font-size: 1.25rem;
}

.toc ul {
list-style: none;
}

.toc li {
margin: 0.5rem 0;
}

.toc a {
color: #007bff;
text-decoration: none;
transition: color 0.2s;
}

.toc a:hover {
color: #0056b3;
text-decoration: underline;
}

.content {
max-width: 800px;
margin: 0 auto;
padding: 0 1rem;
}

.page {
background: white;
margin: 2rem 0;
padding: 2rem;
border-radius: 8px;
border: 1px solid #e9ecef;
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
}

.page-header {
border-bottom: 2px solid #e9ecef;
padding-bottom: 1rem;
margin-bottom: 2rem;
}

.page-header h2 {
color: #2c3e50;
margin-bottom: 0.5rem;
font-size: 1.5rem;
}

.page-meta {
display: flex;
flex-wrap: wrap;
gap: 1rem;
font-size: 0.9rem;
color: #6c757d;
}

.page-content {
line-height: 1.8;
}

.page-content h1, .page-content h2, .page-content h3,
.page-content h4, .page-content h5, .page-content h6 {
color: #2c3e50;
margin: 1.5rem 0 0.5rem 0;
}

.page-content p {
margin: 1rem 0;
}

.page-content a {
color: #007bff;
text-decoration: none;
}

.page-content a:hover {
text-decoration: underline;
}

.page-content code {
background: #f8f9fa;
padding: 0.2rem 0.4rem;
border-radius: 4px;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 0.9em;
}

.page-content pre {
background: #f8f9fa;
padding: 1rem;
border-radius: 4px;
overflow-x: auto;
margin: 1rem 0;
}

.page-content blockquote {
border-left: 4px solid #007bff;
padding-left: 1rem;
margin: 1rem 0;
color: #6c757d;
}

.footer {
text-align: center;
padding: 2rem;
margin-top: 3rem;
border-top: 1px solid #e9ecef;
color: #6c757d;
font-size: 0.9rem;
}

@media (max-width: 768px) {
.header {
padding: 1rem;
}

.header h1 {
font-size: 1.5rem;
}

.page {
padding: 1rem;
}

.page-meta {
flex-direction: column;
gap: 0.5rem;
}
}
`.trim();
}
|
|
555
|
+
/**
 * Builds the HTML table of contents: one `<li>` per page linking to `#page-<n>`.
 *
 * @param {Array<{title?: string}>} pages - Scraped pages.
 * @returns {string} A `<nav class="toc">` fragment with a leading newline.
 */
function generateTOC(pages) {
  const items = [];
  pages.forEach((page, index) => {
    const pageNumber = index + 1;
    const label = page.title || `Page ${pageNumber}`;
    items.push(`<li><a href="#page-${pageNumber}">${pageNumber}. ${escapeHtml(label)}</a></li>`);
  });
  return [
    "",
    '<nav class="toc">',
    "<h2>Table of Contents</h2>",
    "<ul>",
    items.join("\n"),
    "</ul>",
    "</nav>"
  ].join("\n");
}
|
|
570
|
+
/**
 * Renders one scraped page as an `<article id="page-<n>">` card.
 *
 * Title and URL are escaped; `page.html` is inserted verbatim as the card body.
 *
 * @param {{title?: string, url: string, depth: number, fetchedAt: *, html: string}} page
 * @param {number} pageNumber - 1-based position of the page in the report.
 * @returns {string} HTML fragment with a leading newline.
 */
function generatePageHTML(page, pageNumber) {
  const heading = escapeHtml(page.title || `Page ${pageNumber}`);
  const hrefValue = escapeHtml(page.url);
  const linkText = escapeHtml(page.url);
  const fetched = new Date(page.fetchedAt).toLocaleString();
  return [
    "",
    `<article class="page" id="page-${pageNumber}">`,
    '<div class="page-header">',
    `<h2>${pageNumber}. ${heading}</h2>`,
    '<div class="page-meta">',
    `<span><strong>URL:</strong> <a href="${hrefValue}" target="_blank">${linkText}</a></span>`,
    `<span><strong>Depth:</strong> ${page.depth}</span>`,
    `<span><strong>Fetched:</strong> ${fetched}</span>`,
    "</div>",
    "</div>",
    '<div class="page-content">',
    `${page.html}`,
    "</div>",
    "</article>"
  ].join("\n");
}
|
|
590
|
+
/**
 * Returns the inline script embedded at the end of the HTML report.
 *
 * The script adds smooth scrolling to in-page (`#...`) links and bolds the TOC
 * entry of the page card currently at the top of the viewport.
 *
 * Link targets are resolved with `getElementById` rather than by feeding the
 * raw href to `querySelector`: the report contains scraped HTML, whose hash
 * links (`href="#"`, ids with `:` or `.`) are not valid selectors and made the
 * click handler throw. The default navigation is now suppressed only when a
 * target element was actually found.
 *
 * @returns {string} JavaScript source text.
 */
function generateJavaScript() {
  return `
// Smooth scrolling for in-page links
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
anchor.addEventListener('click', function (e) {
const target = document.getElementById(this.getAttribute('href').slice(1));
if (target) {
e.preventDefault();
target.scrollIntoView({
behavior: 'smooth',
block: 'start'
});
}
});
});

// Highlight current section in TOC
window.addEventListener('scroll', function() {
const pages = document.querySelectorAll('.page');
const tocLinks = document.querySelectorAll('.toc a');

let currentPage = null;
pages.forEach(page => {
const rect = page.getBoundingClientRect();
if (rect.top <= 100) {
currentPage = page;
}
});

tocLinks.forEach(link => {
link.style.fontWeight = 'normal';
const target = document.getElementById(link.getAttribute('href').slice(1));
if (target && target === currentPage) {
link.style.fontWeight = 'bold';
}
});
});
`;
}
|
|
629
|
+
/**
 * Escapes the characters that are significant in HTML text and attribute
 * values: `&`, `<`, `>`, `"`, `'` and `/`.
 *
 * `null` / `undefined` yield an empty string; other non-string values are
 * converted with `String()` first.
 *
 * @param {*} text - Value to escape.
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
    "/": "&#x2F;"
  };
  const input = text == null ? "" : String(text);
  // Single pass, so the `&` of an inserted entity is never escaped twice.
  return input.replace(/[&<>"'/]/g, (ch) => entities[ch]);
}
|
|
632
|
+
/**
 * Returns the hostname of `url`, or "Unknown" when it cannot be parsed.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Hostname or the literal "Unknown".
 */
function extractDomainFromUrl2(url) {
  let hostname = "Unknown";
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input keeps the "Unknown" placeholder.
  }
  return hostname;
}
|
|
639
|
+
|
|
640
|
+
// src/formatters/json.ts
|
|
641
|
+
/**
 * Serialises the scrape as pretty-printed JSON (2-space indent), including the
 * raw HTML of every page.
 *
 * @param {Array<object>} pages - Scraped pages (`url`, `title`, `markdown`,
 *   `html`, `fetchedAt`, `depth`).
 * @param {string} baseUrl - URL the scrape started from.
 * @param {*} scrapedAt - Scrape timestamp, stored as given.
 * @param {number} duration - Scrape duration, stored as given.
 * @param {object} website - Site metadata, stored as given.
 * @returns {string} JSON text with `metadata` and `pages` keys.
 */
function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
  const toEntry = (page, index) => {
    const { url, title, markdown, html, fetchedAt, depth } = page;
    return {
      index: index + 1,
      url,
      title,
      markdown,
      html,
      fetchedAt,
      depth,
      wordCount: countWords(markdown),
      readingTime: estimateReadingTime(markdown)
    };
  };
  const metadata = { baseUrl, totalPages: pages.length, scrapedAt, duration, website };
  return JSON.stringify({ metadata, pages: pages.map(toEntry) }, null, 2);
}
|
|
664
|
+
/**
 * Serialises the scrape as pretty-printed JSON (2-space indent) without the
 * per-page `html` field.
 *
 * @param {Array<object>} pages - Scraped pages (`url`, `title`, `markdown`,
 *   `fetchedAt`, `depth`).
 * @param {string} baseUrl - URL the scrape started from.
 * @param {*} scrapedAt - Scrape timestamp, stored as given.
 * @param {number} duration - Scrape duration, stored as given.
 * @param {object} website - Site metadata, stored as given.
 * @returns {string} JSON text with `metadata` and `pages` keys.
 */
function formatToJsonLite(pages, baseUrl, scrapedAt, duration, website) {
  const toEntry = (page, index) => {
    const { url, title, markdown, fetchedAt, depth } = page;
    return {
      index: index + 1,
      url,
      title,
      markdown,
      fetchedAt,
      depth,
      wordCount: countWords(markdown),
      readingTime: estimateReadingTime(markdown)
    };
  };
  const metadata = { baseUrl, totalPages: pages.length, scrapedAt, duration, website };
  return JSON.stringify({ metadata, pages: pages.map(toEntry) }, null, 2);
}
|
|
686
|
+
/**
 * Counts the words in a Markdown string after stripping common markup.
 *
 * The stripping order matters and fixes two earlier defects:
 *  - fenced code blocks are removed before inline code, otherwise the
 *    inline-code rule consumes the fence back-ticks and the block's content is
 *    counted as prose;
 *  - images are unwrapped before links, otherwise the link rule turns
 *    `![alt](src)` into `!alt` and the image rule never matches.
 *
 * @param {string} markdown - Markdown source. Non-string or empty input counts as 0.
 * @returns {number} Number of whitespace-separated words.
 */
function countWords(markdown) {
  if (typeof markdown !== "string" || markdown.length === 0) {
    return 0;
  }
  const plainText = markdown
    .replace(/```[\s\S]*?```/g, "") // fenced code blocks (dropped entirely)
    .replace(/#{1,6}\s+/g, "") // heading markers
    .replace(/\*\*(.*?)\*\*/g, "$1") // bold
    .replace(/\*(.*?)\*/g, "$1") // emphasis
    .replace(/`(.*?)`/g, "$1") // inline code
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1") // images -> alt text
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") // links -> link text
    .replace(/^\s*[-*+]\s+/gm, "") // bullet markers
    .replace(/^\s*\d+\.\s+/gm, "") // ordered-list markers
    .replace(/^\s*>\s+/gm, "") // block-quote markers
    .replace(/\n{3,}/g, "\n\n")
    .trim();
  return plainText.split(/\s+/).filter((word) => word.length > 0).length;
}
|
|
690
|
+
/**
 * Estimates reading time as the word count divided by 200, rounded up.
 *
 * @param {string} markdown - Markdown source.
 * @returns {number} Whole number of 200-word units.
 */
function estimateReadingTime(markdown) {
  const WORDS_PER_UNIT = 200;
  return Math.ceil(countWords(markdown) / WORDS_PER_UNIT);
}
|
|
694
|
+
|
|
695
|
+
// src/formatters/text.ts
|
|
696
|
+
import { parseHTML } from "linkedom";
|
|
697
|
+
/**
 * Assembles the plain-text report: optional header followed by one section per
 * page, joined by blank lines. Page separators are shown only when there is
 * more than one page.
 *
 * @param {Array<object>} pages - Scraped pages.
 * @param {string} baseUrl - URL the scrape started from.
 * @param {*} scrapedAt - Timestamp accepted by `new Date()`.
 * @param {number} duration - Scrape duration in milliseconds.
 * @param {object} website - Site-level metadata.
 * @param {boolean} [includeMetadata=true] - Emit the header block.
 * @returns {string} The text document.
 */
function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
  const header = includeMetadata
    ? [createTextHeader(baseUrl, scrapedAt, duration, website, pages.length)]
    : [];
  const body = pages.map((page, index) => createTextPage(page, index + 1, pages.length > 1));
  return [...header, ...body].join("\n\n");
}
|
|
705
|
+
/**
 * Builds the plain-text header block describing the scrape.
 *
 * @param {string} baseUrl - URL the scrape started from.
 * @param {*} scrapedAt - Timestamp accepted by `new Date()`.
 * @param {number} duration - Scrape duration in milliseconds.
 * @param {object} website - Site metadata; `title`, `description`, `author`
 *   and `language` are used when truthy.
 * @param {number} totalPages - Number of pages in the report.
 * @returns {string} Header lines joined by "\n" (no trailing newline).
 */
function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
  const heading = website.title || extractDomainFromUrl3(baseUrl);
  const fixedLines = [
    `=== ${heading} ===`,
    "",
    `URL: ${baseUrl}`,
    `Scraped: ${new Date(scrapedAt).toLocaleString()}`,
    `Duration: ${duration}ms`,
    `Pages: ${totalPages}`
  ];
  const optionalLines = [
    ["Description", website.description],
    ["Author", website.author],
    ["Language", website.language]
  ]
    .filter(([, value]) => value)
    .map(([label, value]) => `${label}: ${value}`);
  return [...fixedLines, ...optionalLines].join("\n");
}
|
|
725
|
+
/**
 * Renders one page as plain text, optionally preceded by a banner (rule, page
 * number and title, URL, rule).
 *
 * @param {{title?: string, url: string, html: string}} page - Scraped page.
 * @param {number} pageNumber - 1-based position of the page in the report.
 * @param {boolean} showSeparator - Emit the banner before the page text.
 * @returns {string} The page section.
 */
function createTextPage(page, pageNumber, showSeparator) {
  const rule = "\u2500".repeat(60);
  const banner = showSeparator
    ? [rule, `Page ${pageNumber}: ${page.title || "Untitled"}`, `URL: ${page.url}`, rule]
    : [];
  return [...banner, htmlToPlainText(page.html)].join("\n");
}
|
|
737
|
+
/**
 * Extracts readable text from HTML.
 *
 * Non-content elements (script, style, noscript, svg, canvas, template) are
 * removed, the remaining text content is taken from `<body>` (falling back to
 * the root element), and whitespace is normalised: runs of spaces/tabs collapse
 * to one space, spaces around line breaks are dropped, and three or more
 * consecutive newlines collapse to two.
 *
 * @param {string} html - HTML source.
 * @returns {string} Trimmed plain text.
 */
function htmlToPlainText(html) {
  const { document } = parseHTML(html);
  for (const tag of ["script", "style", "noscript", "svg", "canvas", "template"]) {
    document.querySelectorAll(tag).forEach((el) => el.remove());
  }
  const rawText = document.body?.textContent || document.documentElement?.textContent || "";
  return rawText
    .replace(/[ \t]+/g, " ")
    .replace(/\n[ \t]+/g, "\n")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
|
751
|
+
/**
 * Returns the hostname of `url`, or "Unknown" when it cannot be parsed.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Hostname or the literal "Unknown".
 */
function extractDomainFromUrl3(url) {
  const parse = () => new URL(url).hostname;
  try {
    return parse();
  } catch {
    return "Unknown";
  }
}
|
|
758
|
+
|
|
759
|
+
// src/utils/content-cleaner.ts
|
|
760
|
+
import { parseHTML as parseHTML2 } from "linkedom";
|
|
761
|
+
// CSS selectors whose matching elements cleanHtml removes on every call,
// regardless of its options. Each entry is passed to querySelectorAll on its
// own, in the order listed.
var ALWAYS_REMOVE_SELECTORS = [
  // Navigation and menus
  "nav",
  "header nav",
  "footer nav",
  ".nav",
  ".navigation",
  ".menu",
  ".navbar",
  ".sidebar",
  ".aside",
  // Header and footer elements
  "header",
  "footer",
  ".site-header",
  ".page-header",
  ".site-footer",
  ".page-footer",
  // Social media and sharing
  ".social",
  ".share",
  ".sharing",
  ".twitter",
  ".facebook",
  ".linkedin",
  ".instagram",
  // Comments and discussions
  ".comments",
  ".comment",
  ".discussion",
  ".disqus",
  // Forms and interactive elements
  "form",
  "input",
  "button:not([type='submit'])",
  "select",
  "textarea",
  // Scripts and styles
  "script",
  "style",
  "noscript",
  // Hidden elements
  "[hidden]",
  "[style*='display: none']",
  "[style*='display:none']",
  // Common utility classes
  ".cookie",
  ".cookie-banner",
  ".popup",
  ".modal",
  ".overlay",
  ".notification",
  // Breadcrumbs
  ".breadcrumb",
  ".breadcrumbs",
  ".breadcrumb-trail"
];
|
|
818
|
+
// CSS selectors whose matching elements cleanHtml removes only when its
// `removeAds` option is true (the default).
// NOTE(review): the `[attr*='...']` entries are substring matches, so
// `[class*='ad-']` also hits classes such as "read-more" or "download-link",
// and `[class*='banner']` hits any class containing "banner" — confirm this
// breadth is intended.
var AD_SELECTORS = [
  // Ads and promotions
  ".ad",
  ".ads",
  ".advertisement",
  ".promotion",
  ".sponsored",
  "[class*='ad-']",
  "[id*='ad-']",
  "[class*='advert']",
  "[id*='advert']",
  "[class*='banner']",
  "[id*='banner']",
  ".google-ad",
  ".adsense",
  "[data-ad]",
  "[data-ads]",
  "ins.adsbygoogle",
  // Tracking
  "[class*='tracking']",
  "[id*='tracking']",
  "[class*='analytics']",
  "[id*='analytics']"
];
|
|
842
|
+
/**
 * Strips boilerplate from an HTML document and returns the cleaned markup.
 *
 * Steps, in order: remove elements matching ALWAYS_REMOVE_SELECTORS; remove
 * elements matching AD_SELECTORS (when `removeAds`); remove base64 images
 * (when `removeBase64Images`); remove comment nodes; rewrite relative URLs
 * against `baseUrl`.
 *
 * @param {string} html - HTML source.
 * @param {string} baseUrl - Base for resolving relative `src` / `href` values.
 * @param {{removeAds?: boolean, removeBase64Images?: boolean}} [options={}] -
 *   Both flags default to true.
 * @returns {string} `outerHTML` of the root element, or the original `html`
 *   when that is empty or unavailable.
 */
function cleanHtml(html, baseUrl, options = {}) {
  const { removeAds = true, removeBase64Images = true } = options;
  const { document } = parseHTML2(html);

  // A selector that fails must not stop the remaining ones.
  const stripMatches = (selectors) => {
    for (const selector of selectors) {
      try {
        document.querySelectorAll(selector).forEach((el) => el.remove());
      } catch {
      }
    }
  };

  stripMatches(ALWAYS_REMOVE_SELECTORS);
  if (removeAds) {
    stripMatches(AD_SELECTORS);
  }
  if (removeBase64Images) {
    removeBase64ImagesFromDocument(document);
  }

  // 128 is NodeFilter.SHOW_COMMENT. Nodes are collected first so the tree is
  // not mutated while the walker is traversing it.
  const walker = document.createTreeWalker(document, 128);
  const comments = [];
  while (walker.nextNode()) {
    comments.push(walker.currentNode);
  }
  for (const comment of comments) {
    comment.parentNode?.removeChild(comment);
  }

  convertRelativeUrls(document, baseUrl);
  return document.documentElement?.outerHTML || html;
}
|
|
875
|
+
/**
 * Strip inline `data:` image payloads from a parsed document, in place.
 *
 * - `<img>` elements whose `src` starts with `data:` are removed.
 * - Inline `style` attributes containing `data:image` have their
 *   `background` / `background-image` declaration cut out; the attribute is
 *   dropped entirely if nothing but whitespace remains.
 * - `<source>` elements with a `data:` `src` or `srcset` are removed.
 *
 * @param {Document} document - Parsed document to mutate.
 * @returns {void}
 */
function removeBase64ImagesFromDocument(document) {
  const dataBackground = /background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi;
  const removeEvery = (selector) => {
    document.querySelectorAll(selector).forEach((node) => {
      node.remove();
    });
  };

  removeEvery("img[src^='data:']");

  document.querySelectorAll("[style*='data:image']").forEach((node) => {
    const inlineStyle = node.getAttribute("style");
    if (!inlineStyle) {
      return;
    }
    const remaining = inlineStyle.replace(dataBackground, "");
    if (remaining.trim()) {
      node.setAttribute("style", remaining);
      return;
    }
    node.removeAttribute("style");
  });

  removeEvery("source[src^='data:'], source[srcset*='data:']");
}
|
|
894
|
+
/**
 * Rewrite relative `src` and `href` attribute values to absolute URLs, in place.
 *
 * Values left untouched:
 * - anything starting with `http://` or `https://` (already absolute),
 * - protocol-relative values (`//host/...`),
 * - for `src`: `data:` URLs,
 * - for `href`: `#fragment`, `mailto:`, `tel:` and `javascript:` links.
 *
 * A value that cannot be resolved against `baseUrl` is left as-is.
 *
 * @param {Document} document - Parsed document to mutate.
 * @param {string} baseUrl - Base URL used for resolution.
 * @returns {void}
 */
function convertRelativeUrls(document, baseUrl) {
  // Require the full "http(s)://" prefix. A bare "http" prefix check would
  // also skip relative paths such as "http-guide/img.png" or "https-setup.html".
  const absoluteHttp = /^https?:\/\//;
  const srcExempt = ["data:"];
  const hrefExempt = ["#", "mailto:", "tel:", "javascript:"];

  const absolutize = (attribute, exemptPrefixes) => {
    document.querySelectorAll(`[${attribute}]`).forEach((el) => {
      const value = el.getAttribute(attribute);
      if (!value || absoluteHttp.test(value) || value.startsWith("//")) {
        return;
      }
      if (exemptPrefixes.some((prefix) => value.startsWith(prefix))) {
        return;
      }
      try {
        el.setAttribute(attribute, new URL(value, baseUrl).toString());
      } catch {
        // Unresolvable value (e.g. invalid base): keep the original attribute.
      }
    });
  };

  absolutize("src", srcExempt);
  absolutize("href", hrefExempt);
}
|
|
914
|
+
/**
 * Thin alias for cleanHtml(): forwards all three arguments unchanged and
 * returns its result.
 *
 * @param {string} html - Raw HTML to clean.
 * @param {string} baseUrl - Base URL passed through to cleanHtml().
 * @param {object} [options] - Options passed through to cleanHtml().
 * @returns {string} Whatever cleanHtml() returns.
 */
function cleanContent(html, baseUrl, options = {}) {
|
|
915
|
+
return cleanHtml(html, baseUrl, options);
|
|
916
|
+
}
|
|
917
|
+
|
|
918
|
+
// src/utils/metadata-extractor.ts
|
|
919
|
+
import { parseHTML as parseHTML3 } from "linkedom";
|
|
920
|
+
|
|
921
|
+
// src/utils/url-helpers.ts
|
|
922
|
+
import { URL as URL2 } from "url";
|
|
923
|
+
import RE2 from "re2";
|
|
924
|
+
/**
 * Resolve `relative` against `base`.
 *
 * @param {string} relative - URL or path to resolve.
 * @param {string} base - Base URL.
 * @returns {string} The absolute URL, or `relative` unchanged when the pair
 *   cannot be parsed.
 */
function resolveUrl(relative, base) {
  let resolved;
  try {
    resolved = new URL2(relative, base).toString();
  } catch {
    resolved = relative;
  }
  return resolved;
}
|
|
931
|
+
/**
 * Report whether `string` parses as an absolute URL.
 *
 * @param {string} string - Candidate URL.
 * @returns {boolean} `true` when the URL constructor accepts it.
 */
function isValidUrl(string) {
  let parsed = null;
  try {
    parsed = new URL2(string);
  } catch {
    // Not parseable: `parsed` stays null.
  }
  return parsed !== null;
}
|
|
939
|
+
/**
 * Produce an absolute URL string with the fragment removed.
 *
 * Values starting with `http://` or `https://` are parsed on their own;
 * anything else is resolved against `baseUrl`.
 *
 * @param {string} url - Absolute or relative URL.
 * @param {string} [baseUrl] - Base for relative input.
 * @returns {string} Normalized absolute URL without `#fragment`.
 * @throws {Error} `Invalid URL: <url>` for any failure, including a relative
 *   `url` with no `baseUrl` (the more specific inner error is replaced).
 */
function normalizeUrl(url, baseUrl) {
  try {
    const hasHttpScheme = ["http://", "https://"].some((scheme) => url.startsWith(scheme));
    if (!hasHttpScheme && !baseUrl) {
      throw new Error("Relative URL requires base URL");
    }
    const parsed = hasHttpScheme ? new URL2(url) : new URL2(url, baseUrl);
    parsed.hash = "";
    return parsed.toString();
  } catch {
    throw new Error(`Invalid URL: ${url}`);
  }
}
|
|
955
|
+
/**
 * Return the hostname of an absolute URL.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The parsed `hostname` (no port).
 * @throws {Error} `Invalid URL for domain extraction: <url>` when parsing fails.
 */
function extractBaseDomain(url) {
  let hostname;
  try {
    ({ hostname } = new URL2(url));
  } catch {
    throw new Error(`Invalid URL for domain extraction: ${url}`);
  }
  return hostname;
}
|
|
963
|
+
/**
 * Reduce a hostname to its registrable-looking root.
 *
 * Hostnames with at most two labels are returned unchanged. Otherwise the
 * last two labels are kept, or the last three when the hostname ends in one
 * of a small hard-coded list of two-part suffixes (e.g. `co.uk`).
 *
 * @param {string} hostname - Hostname such as `blog.example.co.uk`.
 * @returns {string} Root domain such as `example.co.uk`.
 */
function getRootDomain(hostname) {
  const labels = hostname.split(".");
  if (labels.length <= 2) {
    return hostname;
  }
  const twoPartSuffixes = new Set(["co.uk", "com.au", "co.nz", "com.br", "co.jp", "co.kr", "com.mx", "org.uk"]);
  const trailingPair = labels.slice(-2).join(".");
  const labelsToKeep = twoPartSuffixes.has(trailingPair) ? 3 : 2;
  return labels.slice(-labelsToKeep).join(".");
}
|
|
975
|
+
/**
 * Decide whether two URLs belong to the same site.
 *
 * They match when their hostnames are identical, or when getRootDomain()
 * yields the same value for both (so sibling subdomains count as the same).
 *
 * @param {string} url - URL under test.
 * @param {string} baseUrl - Reference URL.
 * @returns {boolean} `false` as well when either URL cannot be parsed.
 */
function isSameDomain(url, baseUrl) {
  try {
    const candidateHost = extractBaseDomain(url);
    const referenceHost = extractBaseDomain(baseUrl);
    return candidateHost === referenceHost || getRootDomain(candidateHost) === getRootDomain(referenceHost);
  } catch {
    return false;
  }
}
|
|
989
|
+
/**
 * Build a lowercase de-duplication key for a URL.
 *
 * The query string is dropped; the fragment, if any, is kept. Input that
 * does not parse as a URL is simply lowercased.
 *
 * @param {string} url - URL to key.
 * @returns {string} Lowercased key.
 */
function getUrlKey(url) {
  let key = url;
  try {
    const parsed = new URL2(url);
    parsed.search = "";
    key = parsed.toString();
  } catch {
    // Unparseable input: fall back to the raw string.
  }
  return key.toLowerCase();
}
|
|
998
|
+
/**
 * Validate a list of URLs for scraping.
 *
 * Each entry must be a non-empty string that, once trimmed, parses as a URL
 * and starts with `http://` or `https://`. Accepted URLs are trimmed and
 * de-duplicated, keeping first-seen order.
 *
 * @param {Array<*>} urls - Candidate URLs.
 * @returns {{isValid: boolean, validUrls: string[], errors: Array<{url: string, error: string}>}}
 *   `isValid` is `true` only when at least one URL was accepted and no entry
 *   was rejected.
 */
function validateUrls(urls) {
  if (!urls || urls.length === 0) {
    return {
      isValid: false,
      validUrls: [],
      errors: [{ url: "", error: "At least one URL is required" }]
    };
  }

  const accepted = new Set();
  const errors = [];
  const reject = (url, error) => {
    errors.push({ url, error });
  };

  for (const candidate of urls) {
    if (!candidate || typeof candidate !== "string") {
      reject(String(candidate), "URL must be a non-empty string");
      continue;
    }
    const trimmed = candidate.trim();
    if (trimmed === "") {
      reject(String(candidate), "URL cannot be empty");
    } else if (!isValidUrl(trimmed)) {
      reject(trimmed, "Invalid URL format");
    } else if (!trimmed.startsWith("http://") && !trimmed.startsWith("https://")) {
      reject(trimmed, "URL must start with http:// or https://");
    } else {
      accepted.add(trimmed);
    }
  }

  const validUrls = [...accepted];
  return {
    isValid: validUrls.length > 0 && errors.length === 0,
    validUrls,
    errors
  };
}
|
|
1041
|
+
/**
 * Test a URL against a list of regular-expression sources.
 *
 * Each pattern is compiled case-insensitively with RE2. A pattern that
 * fails to compile counts as "no match" instead of raising.
 *
 * @param {string} url - URL to test.
 * @param {string[]} [patterns] - Regex sources.
 * @returns {boolean} `true` as soon as one pattern matches; `false` for an
 *   empty or missing list.
 */
function matchesPatterns(url, patterns) {
  if (!patterns || patterns.length === 0) {
    return false;
  }
  for (const pattern of patterns) {
    try {
      if (new RE2(pattern, "i").test(url)) {
        return true;
      }
    } catch {
      // Invalid pattern: skip it and try the next one.
    }
  }
  return false;
}
|
|
1054
|
+
/**
 * Apply include/exclude pattern filters to a URL.
 *
 * With a non-empty include list, the URL must match at least one include
 * pattern. With a non-empty exclude list, it must match none of them.
 * Excludes are checked after includes.
 *
 * @param {string} url - URL to test.
 * @param {string[]} [includePatterns] - Regex sources that admit a URL.
 * @param {string[]} [excludePatterns] - Regex sources that reject a URL.
 * @returns {boolean} Whether the URL passes both filters.
 */
function shouldIncludeUrl(url, includePatterns, excludePatterns) {
  const hasIncludes = Boolean(includePatterns) && includePatterns.length > 0;
  if (hasIncludes && !matchesPatterns(url, includePatterns)) {
    return false;
  }
  const hasExcludes = Boolean(excludePatterns) && excludePatterns.length > 0;
  return !(hasExcludes && matchesPatterns(url, excludePatterns));
}
|
|
1067
|
+
/**
 * Heuristically decide whether a URL points at main content.
 *
 * The lowercased URL is rejected when it matches any boilerplate path
 * pattern (legal, contact, about, auth, e-commerce utility, asset, API) or
 * ends with one of a few document/archive/binary extensions.
 *
 * @param {string} url - URL to classify.
 * @returns {boolean} `true` when nothing marks the URL as non-content.
 */
function isContentUrl(url) {
  const normalized = url.toLowerCase();
  const boilerplatePatterns = [
    // Legal and policy pages
    /\/(privacy|terms|tos|legal|cookie|gdpr|disclaimer|imprint|impressum)\b/i,
    /\/(privacy-policy|terms-of-service|terms-of-use|terms-and-conditions)\b/i,
    /\/(cookie-policy|data-protection|acceptable-use|user-agreement)\b/i,
    /\/(refund|cancellation|shipping|return)-?(policy)?\b/i,
    // Contact and support pages (only when they are the final path segment)
    /\/(contact|support|help|faq|feedback)\/?$/i,
    // About-style pages (only when they are the final path segment)
    /\/(about-us|careers|jobs|press|investors|team)\/?$/i,
    // Authentication and admin areas
    /\/(admin|login|auth|account|dashboard|profile|settings)\//i,
    // E-commerce utility pages
    /\/(cart|checkout|payment|subscription|wishlist)\//i,
    // File downloads and assets
    /\/(uploads|assets|files|static|media|resources)\//i,
    // API endpoints
    /\/(api|graphql|rest|webhook)\//i
  ];
  const downloadExtensions = [".pdf", ".doc", ".docx", ".xls", ".xlsx", ".zip", ".exe"];

  const isBoilerplate = boilerplatePatterns.some((pattern) => pattern.test(normalized));
  const isDownload = downloadExtensions.some((ext) => normalized.endsWith(ext));
  return !isBoilerplate && !isDownload;
}
|
|
1097
|
+
/**
 * Decide whether a discovered URL should be queued for crawling.
 *
 * A URL is rejected when any of these holds:
 * - `currentDepth` exceeds `maxDepth`;
 * - its getUrlKey() is already in `visited`;
 * - it is not on the same domain as `baseUrl`;
 * - the last path segment has a non-page file extension;
 * - it matches a skip pattern (assets, auth, API, tracking, dev areas,
 *   utility pages, social paths, legal/contact/about pages);
 * - it has a query string and the URL contains a download/share-style keyword;
 * - it has fewer than two non-empty `/`-separated parts.
 *
 * @param {string} url - Candidate URL.
 * @param {string} baseUrl - Crawl root, used for the same-domain check.
 * @param {number} maxDepth - Maximum crawl depth.
 * @param {number} currentDepth - Depth of `url`.
 * @param {{has(key: string): boolean}} visited - Keys of URLs already seen.
 * @returns {boolean} `true` when the URL should be crawled.
 */
function shouldCrawlUrl(url, baseUrl, maxDepth, currentDepth, visited) {
  if (currentDepth > maxDepth) {
    return false;
  }
  if (visited.has(getUrlKey(url))) {
    return false;
  }
  if (!isSameDomain(url, baseUrl)) {
    return false;
  }

  const lowerUrl = url.toLowerCase();
  const skipExtensions = new Set([
    // Documents, archives and installers
    ".pdf",
    ".doc",
    ".docx",
    ".xls",
    ".xlsx",
    ".ppt",
    ".pptx",
    ".zip",
    ".rar",
    ".tar",
    ".gz",
    ".bz2",
    ".7z",
    ".exe",
    ".dmg",
    ".pkg",
    ".deb",
    ".rpm",
    ".apk",
    ".ipa",
    // Image files
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".bmp",
    ".svg",
    ".webp",
    ".ico",
    ".favicon",
    // Video files
    ".mp4",
    ".avi",
    ".mov",
    ".wmv",
    ".flv",
    ".webm",
    // Audio files
    ".mp3",
    ".wav",
    ".ogg",
    ".m4a",
    ".aac",
    // Font files
    ".woff",
    ".woff2",
    ".ttf",
    ".otf",
    ".eot",
    // Style and script files
    ".css",
    ".js",
    ".mjs",
    ".ts",
    ".jsx",
    ".tsx",
    // Data and config files
    ".json",
    ".xml",
    ".txt",
    ".md",
    ".rss",
    ".atom",
    ".sitemap",
    ".robots",
    ".webmanifest"
  ]);

  // Compare only the extension of the final path segment. A substring search
  // over the whole URL also fires on hostnames (".deb" in "www.debian.org")
  // and on slugs ("/next.js-tutorial"), silently excluding real pages.
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    // Not an absolute URL: approximate the path by dropping query and fragment.
    pathname = url.split(/[?#]/)[0];
  }
  const lastSegment = pathname.slice(pathname.lastIndexOf("/") + 1).toLowerCase();
  const dotIndex = lastSegment.lastIndexOf(".");
  if (dotIndex !== -1 && skipExtensions.has(lastSegment.slice(dotIndex))) {
    return false;
  }

  const skipPatterns = [
    // File downloads and assets
    /\/(uploads|assets|files|static|media|resources)\//i,
    // Authentication and admin areas
    /\/(admin|login|auth|account|dashboard|profile|settings)\//i,
    // API endpoints
    /\/(api|graphql|rest|ws:|webhook)\//i,
    // Common tracking and analytics
    /\/(analytics|tracking|pixel|beacon|ads)\//i,
    // Development and testing areas
    /\/(test|dev|staging|beta|demo)\//i,
    // Common utility and service pages
    /\/(search|cart|checkout|payment|subscription)\//i,
    // Social media and external services
    /\/(facebook|twitter|instagram|youtube|linkedin|github)\//i,
    // Legal and policy pages
    /\/(privacy|terms|tos|legal|cookie|gdpr|disclaimer|imprint|impressum)\b/i,
    /\/(privacy-policy|terms-of-service|terms-of-use|terms-and-conditions)\b/i,
    /\/(cookie-policy|data-protection|acceptable-use|user-agreement)\b/i,
    /\/(refund|cancellation|shipping|return)-?(policy)?\b/i,
    // Contact and support pages (usually not main content)
    /\/(contact|support|help|faq|feedback)\/?$/i,
    // About pages that are typically boilerplate
    /\/(about-us|careers|jobs|press|investors|team)\/?$/i
  ];
  if (skipPatterns.some((pattern) => pattern.test(url))) {
    return false;
  }

  // Keyword search covers the whole URL, not just the query string.
  const actionKeywords = ["download", "file", "attachment", "export", "print", "share", "email"];
  if (url.includes("?") && actionKeywords.some((keyword) => lowerUrl.includes(keyword))) {
    return false;
  }

  if (url.split("/").filter(Boolean).length < 2 && url.split("?")[0].split("/").length <= 2) {
    return false;
  }
  return true;
}
|
|
1222
|
+
|
|
1223
|
+
// src/utils/metadata-extractor.ts
|
|
1224
|
+
/**
 * Thin alias for extractWebsiteMetadata(): forwards both arguments unchanged
 * and returns its result.
 *
 * @param {string} html - HTML to inspect.
 * @param {string} baseUrl - Base URL passed through to extractWebsiteMetadata().
 * @returns {object} Whatever extractWebsiteMetadata() returns.
 */
function extractMetadata(html, baseUrl) {
|
|
1225
|
+
return extractWebsiteMetadata(html, baseUrl);
|
|
1226
|
+
}
|
|
1227
|
+
/**
 * Parse an HTML string and collect page-level metadata.
 *
 * Every field is `null` when the page does not provide it. `image` prefers
 * `og:image` and falls back to `twitter:image`.
 *
 * @param {string} html - HTML to inspect.
 * @param {string} baseUrl - Base URL used for the favicon and canonical link.
 * @returns {object} Metadata with keys `title`, `description`, `author`,
 *   `language`, `charset`, `favicon`, `canonical`, `image`, `keywords`,
 *   `robots`, `themeColor`, `openGraph`, `twitter` (in that order).
 */
function extractWebsiteMetadata(html, baseUrl) {
  const { document } = parseHTML3(html);
  const meta = (name) => extractMetaContent(document, name);
  // Property order doubles as evaluation order and as the key order of the result.
  return {
    title: extractTitle(document),
    description: meta("description"),
    author: meta("author"),
    language: extractLanguage(document),
    charset: extractCharset(document),
    favicon: extractFavicon(document, baseUrl),
    canonical: extractCanonical(document, baseUrl),
    image: meta("og:image") || meta("twitter:image"),
    keywords: extractKeywords(document),
    robots: meta("robots"),
    themeColor: meta("theme-color"),
    openGraph: extractOpenGraph(document),
    twitter: extractTwitterCard(document)
  };
}
|
|
1259
|
+
/**
 * Read the page title.
 *
 * @param {Document} document - Parsed document.
 * @returns {string|null} Trimmed `<title>` text when non-empty, otherwise the
 *   `og:title` meta value (or `null`).
 */
function extractTitle(document) {
  const titleText = document.querySelector("title")?.textContent;
  return titleText ? titleText.trim() : extractMetaContent(document, "og:title");
}
|
|
1266
|
+
/**
 * Look up a `<meta>` tag's `content` by `name`, then by `property`.
 *
 * @param {Document} document - Parsed document.
 * @param {string} name - Value of the `name` / `property` attribute; it is
 *   interpolated into the selector as-is.
 * @returns {string|null} Trimmed content of the first tag with a non-empty
 *   `content`, or `null` when neither lookup yields one.
 */
function extractMetaContent(document, name) {
  for (const attribute of ["name", "property"]) {
    const tag = document.querySelector(`meta[${attribute}="${name}"]`);
    const content = tag ? tag.getAttribute("content") : null;
    if (content) {
      return content.trim();
    }
  }
  return null;
}
|
|
1279
|
+
/**
 * Read the document language from the root element's `lang` attribute.
 *
 * @param {Document} document - Parsed document.
 * @returns {string|null} Trimmed `lang` value, or `null` when there is no
 *   root element, no attribute, or only whitespace.
 */
function extractLanguage(document) {
  const root = document.documentElement;
  if (!root) {
    return null;
  }
  const rawLang = root.getAttribute("lang");
  if (rawLang == null) {
    return null;
  }
  const trimmed = rawLang.trim();
  return trimmed ? trimmed : null;
}
|
|
1283
|
+
/**
 * Determine the declared character set of a document.
 *
 * Checks `<meta charset="...">` first, then the `charset=` parameter inside
 * `<meta http-equiv="Content-Type" content="...">`.
 *
 * @param {Document} document - Parsed document.
 * @returns {string|null} Trimmed charset name, or `null` when none is declared.
 */
function extractCharset(document) {
  const declared = document.querySelector("meta[charset]")?.getAttribute("charset");
  if (declared) {
    return declared.trim();
  }

  const contentType = document.querySelector('meta[http-equiv="Content-Type"]')?.getAttribute("content");
  if (!contentType) {
    return null;
  }
  const found = contentType.match(/charset=([^\s;]+)/i);
  return found ? found[1].trim() : null;
}
|
|
1299
|
+
/**
 * Find the page's favicon URL.
 *
 * Tries the icon `<link>` selectors in priority order and returns the first
 * one whose `href` normalizes successfully. If none does, falls back to
 * `/favicon.ico` resolved against `baseUrl`.
 *
 * @param {Document} document - Parsed document.
 * @param {string} baseUrl - Base URL used to absolutize the icon href.
 * @returns {string|null} Absolute favicon URL, or `null` when even the
 *   fallback cannot be resolved.
 */
function extractFavicon(document, baseUrl) {
  const iconSelectors = [
    'link[rel="icon"]',
    'link[rel="shortcut icon"]',
    'link[rel="apple-touch-icon"]',
    // Catch-all for any other rel value containing "icon".
    'link[rel*="icon"]'
  ];
  for (const selector of iconSelectors) {
    const href = document.querySelector(selector)?.getAttribute("href");
    if (!href) {
      continue;
    }
    try {
      return normalizeUrl(href, baseUrl);
    } catch {
      // normalizeUrl throws on a malformed href. Treat that like a missing
      // icon and try the next selector instead of failing the whole lookup.
    }
  }
  try {
    return normalizeUrl("/favicon.ico", baseUrl);
  } catch {
    return null;
  }
}
|
|
1321
|
+
/**
 * Read the canonical URL from `<link rel="canonical">`.
 *
 * @param {Document} document - Parsed document.
 * @param {string} baseUrl - Base URL used to absolutize a relative href.
 * @returns {string|null} Normalized canonical URL, or `null` when the link is
 *   absent, has no href, or its href cannot be parsed.
 */
function extractCanonical(document, baseUrl) {
  const href = document.querySelector('link[rel="canonical"]')?.getAttribute("href");
  if (!href) {
    return null;
  }
  try {
    return normalizeUrl(href, baseUrl);
  } catch {
    // normalizeUrl throws on a malformed href; report "no canonical" rather
    // than letting one bad tag abort metadata extraction.
    return null;
  }
}
|
|
1331
|
+
/**
 * Read the `keywords` meta tag as a list.
 *
 * @param {Document} document - Parsed document.
 * @returns {string[]|null} Comma-separated entries, trimmed and with empty
 *   ones dropped; `null` when the tag is missing or empty.
 */
function extractKeywords(document) {
  const rawKeywords = extractMetaContent(document, "keywords");
  if (!rawKeywords) {
    return null;
  }
  const keywords = [];
  for (const piece of rawKeywords.split(",")) {
    const keyword = piece.trim();
    if (keyword.length > 0) {
      keywords.push(keyword);
    }
  }
  return keywords;
}
|
|
1338
|
+
/**
 * Collect Open Graph meta values.
 *
 * @param {Document} document - Parsed document.
 * @returns {{title, description, type, url, image, siteName, locale}|null}
 *   Object whose fields are the matching `og:*` values (or `null` each);
 *   `null` overall when every field is empty.
 */
function extractOpenGraph(document) {
  // [result key, og: property] pairs, in result-key order.
  const fieldMap = [
    ["title", "og:title"],
    ["description", "og:description"],
    ["type", "og:type"],
    ["url", "og:url"],
    ["image", "og:image"],
    ["siteName", "og:site_name"],
    ["locale", "og:locale"]
  ];
  const openGraph = Object.fromEntries(
    fieldMap.map(([key, property]) => [key, extractMetaContent(document, property)])
  );
  const hasAnyValue = Object.values(openGraph).some((value) => Boolean(value));
  return hasAnyValue ? openGraph : null;
}
|
|
1360
|
+
/**
 * Collect Twitter Card meta values.
 *
 * @param {Document} document - Parsed document.
 * @returns {{card, site, creator, title, description, image}|null}
 *   Object whose fields are the matching `twitter:*` values (or `null` each);
 *   `null` overall when every field is empty.
 */
function extractTwitterCard(document) {
  const fields = ["card", "site", "creator", "title", "description", "image"];
  const twitter = {};
  for (const field of fields) {
    twitter[field] = extractMetaContent(document, `twitter:${field}`);
  }
  const hasAnyValue = fields.some((field) => Boolean(twitter[field]));
  return hasAnyValue ? twitter : null;
}
|
|
1380
|
+
|
|
1381
|
+
// src/utils/logger.ts
|
|
1382
|
+
import pino from "pino";
|
|
1383
|
+
/**
 * Create a pino logger.
 *
 * Unless `NODE_ENV` is exactly `"production"`, output goes through the
 * `pino-pretty` transport (colorized, system-time timestamps, `pid` and
 * `hostname` omitted). In production no transport is configured.
 *
 * @param {string} [name="reader"] - Logger name.
 * @param {string} [level] - Log level; defaults to `LOG_LEVEL` from the
 *   environment, or `"info"`.
 * @returns {object} The logger returned by `pino()`.
 */
function createLogger(name = "reader", level = process.env.LOG_LEVEL || "info") {
  const prettyTransport = {
    target: "pino-pretty",
    options: {
      colorize: true,
      translateTime: "SYS:standard",
      ignore: "pid,hostname"
    }
  };
  const isProduction = process.env.NODE_ENV === "production";
  return pino({
    name,
    level,
    transport: isProduction ? void 0 : prettyTransport
  });
}
|
|
1397
|
+
// Module-level default logger, built with createLogger()'s default name and level.
var logger = createLogger();
|
|
1398
|
+
|
|
1399
|
+
// src/utils/robots-parser.ts
|
|
1400
|
+
/**
 * Parse robots.txt text into the rules that apply to one user agent.
 *
 * Rules are collected from every group whose `User-agent` is `*` or equals
 * `userAgent` (case-insensitive); matching groups are merged. Consecutive
 * `User-agent` lines form a single group, so a group applies when ANY of its
 * agent lines matches. Blank lines, `#` comments (whole-line or trailing) and
 * lines without a colon are ignored, as are directives other than
 * `Disallow`, `Allow` and `Crawl-delay`.
 *
 * @param {string} content - Raw robots.txt body.
 * @param {string} [userAgent="*"] - Product token to match.
 * @returns {{disallowedPaths: string[], allowedPaths: string[], crawlDelay: number|null}}
 *   Path patterns in file order; `crawlDelay` in milliseconds (last value
 *   wins) or `null`.
 */
function parseRobotsTxt(content, userAgent = "*") {
  const rules = {
    disallowedPaths: [],
    allowedPaths: [],
    crawlDelay: null
  };
  const wantedAgent = userAgent.toLowerCase();
  let groupApplies = false;
  // True while reading the run of User-agent lines that opens a group.
  let readingAgentLines = false;

  for (const rawLine of content.split("\n")) {
    // Everything from "#" to end of line is a comment.
    const hashIndex = rawLine.indexOf("#");
    const line = (hashIndex === -1 ? rawLine : rawLine.slice(0, hashIndex)).trim();
    if (!line) {
      continue;
    }
    const colonIndex = line.indexOf(":");
    if (colonIndex === -1) {
      continue;
    }
    const directive = line.slice(0, colonIndex).trim().toLowerCase();
    const value = line.slice(colonIndex + 1).trim();

    if (directive === "user-agent") {
      const agent = value.toLowerCase();
      const agentMatches = agent === "*" || agent === wantedAgent;
      // A new group starts fresh; further agent lines in the same run can
      // only add a match, never cancel an earlier one.
      groupApplies = readingAgentLines ? groupApplies || agentMatches : agentMatches;
      readingAgentLines = true;
      continue;
    }

    const isRule = directive === "disallow" || directive === "allow" || directive === "crawl-delay";
    if (!isRule) {
      // Unknown records (e.g. Sitemap) must not affect group parsing.
      continue;
    }
    readingAgentLines = false;
    if (!groupApplies) {
      continue;
    }

    if (directive === "disallow") {
      if (value) {
        rules.disallowedPaths.push(value);
      }
    } else if (directive === "allow") {
      if (value) {
        rules.allowedPaths.push(value);
      }
    } else {
      const delaySeconds = Number.parseFloat(value);
      if (!Number.isNaN(delaySeconds)) {
        rules.crawlDelay = delaySeconds * 1e3;
      }
    }
  }
  return rules;
}
|
|
1437
|
+
/**
 * Decide whether a path may be fetched under parsed robots.txt rules.
 *
 * The most specific matching rule wins, where specificity is the length of
 * the rule pattern. When an Allow and a Disallow rule of equal length both
 * match, Allow wins. A path matched by no Disallow rule is allowed.
 *
 * @param {string} path - URL path (a leading `/` is added when missing).
 * @param {{allowedPaths: string[], disallowedPaths: string[]}} rules - Parsed rules.
 * @returns {boolean} `true` when the path is allowed.
 */
function isPathAllowed(path, rules) {
  const normalizedPath = path.startsWith("/") ? path : `/${path}`;

  // Length of the longest pattern in `patterns` matching the path, or -1.
  const longestMatch = (patterns) => {
    let best = -1;
    for (const pattern of patterns) {
      if (pattern.length > best && pathMatches(normalizedPath, pattern)) {
        best = pattern.length;
      }
    }
    return best;
  };

  const disallowLength = longestMatch(rules.disallowedPaths);
  if (disallowLength === -1) {
    return true;
  }
  // Letting any Allow match win outright would make a blanket `Allow: /`
  // cancel every Disallow rule in the file.
  return longestMatch(rules.allowedPaths) >= disallowLength;
}
|
|
1451
|
+
/**
 * Test a URL path against one robots.txt path pattern.
 *
 * The pattern is always anchored at the start of the path. `*` matches any
 * run of characters, a trailing `$` anchors the pattern at the end of the
 * path, and every other character is literal.
 *
 * @param {string} path - Path to test (expected to start with `/`).
 * @param {string} pattern - robots.txt pattern.
 * @returns {boolean} `true` on match; `false` for an empty pattern.
 */
function pathMatches(path, pattern) {
  if (!pattern) {
    return false;
  }
  const anchoredAtEnd = pattern.endsWith("$");
  const prefix = anchoredAtEnd ? pattern.slice(0, -1) : pattern;
  const escaped = prefix.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
  // "^" is applied in both cases; without it an end-anchored pattern such as
  // "/foo$" would also match "/bar/foo".
  const source = `^${escaped}${anchoredAtEnd ? "$" : ""}`;
  try {
    return new RegExp(source).test(path);
  } catch {
    // Defensive fallback: plain prefix comparison.
    return path.startsWith(pattern);
  }
}
|
|
1468
|
+
/**
 * Download and parse `/robots.txt` for a site.
 *
 * The request is sent with `User-Agent: ReaderEngine/1.0`; the body is
 * parsed for the `ReaderEngine` agent.
 *
 * @param {string} baseUrl - Any URL on the site; `/robots.txt` is resolved against it.
 * @returns {Promise<object|null>} Parsed rules, or `null` when the response
 *   is not OK or anything fails (bad URL, network error, parse error).
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl).toString();
    const response = await fetch(robotsUrl, {
      headers: { "User-Agent": "ReaderEngine/1.0" }
    });
    if (!response.ok) {
      return null;
    }
    const body = await response.text();
    return parseRobotsTxt(body, "ReaderEngine");
  } catch {
    return null;
  }
}
|
|
1485
|
+
/**
 * Check a full URL against parsed robots.txt rules.
 *
 * The path plus query string is passed to isPathAllowed(). The check fails
 * open: missing rules, an unparseable URL, or an error during matching all
 * yield `true`.
 *
 * @param {string} url - Absolute URL to check.
 * @param {object|null} rules - Parsed rules, or a falsy value for "no rules".
 * @returns {boolean} Whether fetching the URL is permitted.
 */
function isUrlAllowed(url, rules) {
  if (!rules) {
    return true;
  }
  try {
    const { pathname, search } = new URL(url);
    return isPathAllowed(pathname + search, rules);
  } catch {
    return true;
  }
}
|
|
1496
|
+
|
|
1497
|
+
// src/types.ts
|
|
1498
|
+
// Baseline scrape options. The Scraper constructor spreads caller options
// over these, so any key supplied by the caller replaces the default.
var DEFAULT_OPTIONS = {
  urls: [],
  formats: ["markdown"],
  includeMetadata: true,
  timeoutMs: 30_000,
  includePatterns: [],
  excludePatterns: [],

  // Content cleaning defaults
  removeAds: true,
  removeBase64Images: true,
  // NOTE(review): presumably disables TLS certificate verification; confirm
  // that `true` is the intended default.
  skipTLSVerification: true,

  // Batch defaults
  batchConcurrency: 1,
  batchTimeoutMs: 300_000,
  maxRetries: 2,
  // Default no-op progress callback
  onProgress: () => {
  },

  // Hero-specific defaults
  verbose: false,
  showChrome: false
};
|
|
1520
|
+
/**
 * Check whether a value names a supported output format.
 *
 * @param {*} format - Candidate format.
 * @returns {boolean} `true` for exactly `"markdown"`, `"html"`, `"json"` or `"text"`.
 */
function isValidFormat(format) {
  const supportedFormats = ["markdown", "html", "json", "text"];
  return supportedFormats.includes(format);
}
|
|
1523
|
+
/**
 * Check whether a parsed URL is on `baseDomain` or one of its subdomains.
 *
 * @param {{hostname: string}} url - Parsed URL (only `hostname` is read).
 * @param {string} baseDomain - Domain to stay within.
 * @returns {boolean} `true` when the hostname equals `baseDomain` or ends
 *   with `.` + `baseDomain`.
 */
function shouldCrawlUrl2(url, baseDomain) {
  const { hostname } = url;
  if (hostname === baseDomain) {
    return true;
  }
  return hostname.endsWith(`.${baseDomain}`);
}
|
|
1526
|
+
|
|
1527
|
+
// src/scraper.ts
|
|
1528
|
+
var Scraper = class {
|
|
1529
|
+
options;
|
|
1530
|
+
pool;
|
|
1531
|
+
logger = createLogger("scraper");
|
|
1532
|
+
robotsCache = /* @__PURE__ */ new Map();
|
|
1533
|
+
constructor(options) {
|
|
1534
|
+
this.options = {
|
|
1535
|
+
...DEFAULT_OPTIONS,
|
|
1536
|
+
...options
|
|
1537
|
+
};
|
|
1538
|
+
if (!options.pool) {
|
|
1539
|
+
throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
|
|
1540
|
+
}
|
|
1541
|
+
this.pool = options.pool;
|
|
1542
|
+
}
|
|
1543
|
+
/**
|
|
1544
|
+
* Get robots.txt rules for a URL, cached per domain
|
|
1545
|
+
*/
|
|
1546
|
+
async getRobotsRules(url) {
|
|
1547
|
+
const origin = new URL(url).origin;
|
|
1548
|
+
if (!this.robotsCache.has(origin)) {
|
|
1549
|
+
const rules = await fetchRobotsTxt(origin);
|
|
1550
|
+
this.robotsCache.set(origin, rules);
|
|
1551
|
+
}
|
|
1552
|
+
return this.robotsCache.get(origin) ?? null;
|
|
1553
|
+
}
|
|
1554
|
+
/**
|
|
1555
|
+
* Scrape all URLs
|
|
1556
|
+
*
|
|
1557
|
+
* @returns Scrape result with pages and metadata
|
|
1558
|
+
*/
|
|
1559
|
+
async scrape() {
|
|
1560
|
+
const startTime = Date.now();
|
|
1561
|
+
const results = await this.scrapeWithConcurrency();
|
|
1562
|
+
return this.buildScrapeResult(results, startTime);
|
|
1563
|
+
}
|
|
1564
|
+
/**
|
|
1565
|
+
* Scrape URLs with concurrency control
|
|
1566
|
+
*/
|
|
1567
|
+
async scrapeWithConcurrency() {
|
|
1568
|
+
const limit = pLimit(this.options.batchConcurrency || 1);
|
|
1569
|
+
const tasks = this.options.urls.map(
|
|
1570
|
+
(url, index) => limit(() => this.scrapeSingleUrlWithRetry(url, index))
|
|
1571
|
+
);
|
|
1572
|
+
const batchPromise = Promise.all(tasks);
|
|
1573
|
+
if (this.options.batchTimeoutMs && this.options.batchTimeoutMs > 0) {
|
|
1574
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
1575
|
+
setTimeout(() => {
|
|
1576
|
+
reject(new Error(`Batch operation timed out after ${this.options.batchTimeoutMs}ms`));
|
|
1577
|
+
}, this.options.batchTimeoutMs);
|
|
1578
|
+
});
|
|
1579
|
+
return Promise.race([batchPromise, timeoutPromise]);
|
|
1580
|
+
}
|
|
1581
|
+
return batchPromise;
|
|
1582
|
+
}
|
|
1583
|
+
/**
|
|
1584
|
+
* Scrape a single URL with retry logic
|
|
1585
|
+
*/
|
|
1586
|
+
async scrapeSingleUrlWithRetry(url, index) {
|
|
1587
|
+
const maxRetries = this.options.maxRetries || 2;
|
|
1588
|
+
let lastError;
|
|
1589
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
1590
|
+
try {
|
|
1591
|
+
const result = await this.scrapeSingleUrl(url, index);
|
|
1592
|
+
if (result) {
|
|
1593
|
+
return { result };
|
|
1594
|
+
}
|
|
1595
|
+
lastError = `Failed to scrape ${url}: No content returned`;
|
|
1596
|
+
} catch (error) {
|
|
1597
|
+
lastError = error.message;
|
|
1598
|
+
if (attempt < maxRetries) {
|
|
1599
|
+
const delay = Math.pow(2, attempt) * 1e3;
|
|
1600
|
+
this.logger.warn(`Retry ${attempt + 1}/${maxRetries} for ${url} in ${delay}ms`);
|
|
1601
|
+
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
1602
|
+
}
|
|
1603
|
+
}
|
|
1604
|
+
}
|
|
1605
|
+
this.logger.error(`Failed to scrape ${url} after ${maxRetries + 1} attempts: ${lastError}`);
|
|
1606
|
+
return { result: null, error: lastError };
|
|
1607
|
+
}
|
|
1608
|
+
/**
|
|
1609
|
+
* Wait for the final page to load after any Cloudflare redirects
|
|
1610
|
+
* Cloudflare often does silent redirects even when bypassed, we need to ensure
|
|
1611
|
+
* we're on the actual content page before scraping.
|
|
1612
|
+
*/
|
|
1613
|
+
async waitForFinalPage(hero, originalUrl, verbose) {
|
|
1614
|
+
const maxWaitMs = 15e3;
|
|
1615
|
+
const startTime = Date.now();
|
|
1616
|
+
const log = (msg) => verbose && this.logger.info(msg);
|
|
1617
|
+
try {
|
|
1618
|
+
await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
|
|
1619
|
+
} catch {
|
|
1620
|
+
}
|
|
1621
|
+
let currentUrl = await hero.url;
|
|
1622
|
+
const normalizeUrl2 = (url) => url.replace(/\/+$/, "");
|
|
1623
|
+
const urlChanged = normalizeUrl2(currentUrl) !== normalizeUrl2(originalUrl);
|
|
1624
|
+
if (urlChanged || currentUrl.includes("__cf_chl")) {
|
|
1625
|
+
log(`Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
|
|
1626
|
+
let lastUrl = currentUrl;
|
|
1627
|
+
let stableCount = 0;
|
|
1628
|
+
while (Date.now() - startTime < maxWaitMs) {
|
|
1629
|
+
await new Promise((resolve) => setTimeout(resolve, 500));
|
|
1630
|
+
try {
|
|
1631
|
+
currentUrl = await hero.url;
|
|
1632
|
+
if (currentUrl === lastUrl) {
|
|
1633
|
+
stableCount++;
|
|
1634
|
+
if (stableCount >= 2) {
|
|
1635
|
+
break;
|
|
1636
|
+
}
|
|
1637
|
+
} else {
|
|
1638
|
+
stableCount = 0;
|
|
1639
|
+
lastUrl = currentUrl;
|
|
1640
|
+
log(`URL changed to: ${currentUrl}`);
|
|
1641
|
+
}
|
|
1642
|
+
} catch {
|
|
1643
|
+
}
|
|
1644
|
+
}
|
|
1645
|
+
try {
|
|
1646
|
+
await hero.waitForLoad("AllContentLoaded", { timeoutMs: 1e4 });
|
|
1647
|
+
} catch {
|
|
1648
|
+
}
|
|
1649
|
+
}
|
|
1650
|
+
await hero.waitForPaintingStable();
|
|
1651
|
+
await new Promise((resolve) => setTimeout(resolve, 2e3));
|
|
1652
|
+
}
|
|
1653
|
+
/**
|
|
1654
|
+
* Scrape a single URL
|
|
1655
|
+
*/
|
|
1656
|
+
async scrapeSingleUrl(url, index) {
|
|
1657
|
+
const startTime = Date.now();
|
|
1658
|
+
const robotsRules = await this.getRobotsRules(url);
|
|
1659
|
+
if (!isUrlAllowed(url, robotsRules)) {
|
|
1660
|
+
throw new Error(`URL blocked by robots.txt: ${url}`);
|
|
1661
|
+
}
|
|
1662
|
+
try {
|
|
1663
|
+
return await this.pool.withBrowser(async (hero) => {
|
|
1664
|
+
await hero.goto(url, { timeoutMs: this.options.timeoutMs });
|
|
1665
|
+
try {
|
|
1666
|
+
await hero.waitForLoad("DomContentLoaded", { timeoutMs: this.options.timeoutMs });
|
|
1667
|
+
} catch {
|
|
1668
|
+
}
|
|
1669
|
+
await hero.waitForPaintingStable();
|
|
1670
|
+
let hadChallenge = false;
|
|
1671
|
+
let challengeType = "none";
|
|
1672
|
+
let waitTimeMs = 0;
|
|
1673
|
+
const initialUrl = await hero.url;
|
|
1674
|
+
const detection = await detectChallenge(hero);
|
|
1675
|
+
if (detection.isChallenge) {
|
|
1676
|
+
hadChallenge = true;
|
|
1677
|
+
challengeType = detection.type;
|
|
1678
|
+
if (this.options.verbose) {
|
|
1679
|
+
this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
|
|
1680
|
+
}
|
|
1681
|
+
const result2 = await waitForChallengeResolution(hero, {
|
|
1682
|
+
maxWaitMs: 45e3,
|
|
1683
|
+
pollIntervalMs: 500,
|
|
1684
|
+
verbose: this.options.verbose,
|
|
1685
|
+
initialUrl
|
|
1686
|
+
});
|
|
1687
|
+
waitTimeMs = result2.waitedMs;
|
|
1688
|
+
if (!result2.resolved) {
|
|
1689
|
+
throw new Error(`Challenge not resolved: ${detection.type}`);
|
|
1690
|
+
}
|
|
1691
|
+
if (this.options.verbose) {
|
|
1692
|
+
this.logger.info(`Challenge resolved via ${result2.method} in ${waitTimeMs}ms`);
|
|
1693
|
+
}
|
|
1694
|
+
}
|
|
1695
|
+
await this.waitForFinalPage(hero, url, this.options.verbose);
|
|
1696
|
+
if (this.options.waitForSelector) {
|
|
1697
|
+
try {
|
|
1698
|
+
await hero.waitForElement(hero.document.querySelector(this.options.waitForSelector), {
|
|
1699
|
+
timeoutMs: this.options.timeoutMs
|
|
1700
|
+
});
|
|
1701
|
+
} catch (error) {
|
|
1702
|
+
this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
|
|
1703
|
+
}
|
|
1704
|
+
}
|
|
1705
|
+
const pageTitle = await hero.document.title;
|
|
1706
|
+
const html = await hero.document.documentElement.outerHTML;
|
|
1707
|
+
const cleanedHtml = cleanContent(html, url, {
|
|
1708
|
+
removeAds: this.options.removeAds,
|
|
1709
|
+
removeBase64Images: this.options.removeBase64Images
|
|
1710
|
+
});
|
|
1711
|
+
const websiteMetadata = extractMetadata(cleanedHtml, url);
|
|
1712
|
+
const duration = Date.now() - startTime;
|
|
1713
|
+
const scrapedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
1714
|
+
const page = {
|
|
1715
|
+
url,
|
|
1716
|
+
title: pageTitle,
|
|
1717
|
+
markdown: "",
|
|
1718
|
+
// Will be set by formatter
|
|
1719
|
+
html: cleanedHtml,
|
|
1720
|
+
fetchedAt: scrapedAt,
|
|
1721
|
+
depth: 0,
|
|
1722
|
+
hadChallenge,
|
|
1723
|
+
challengeType,
|
|
1724
|
+
waitTimeMs
|
|
1725
|
+
};
|
|
1726
|
+
const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
|
|
1727
|
+
[page],
|
|
1728
|
+
url,
|
|
1729
|
+
scrapedAt,
|
|
1730
|
+
duration,
|
|
1731
|
+
websiteMetadata,
|
|
1732
|
+
this.options.includeMetadata
|
|
1733
|
+
) : void 0;
|
|
1734
|
+
const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1735
|
+
const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1736
|
+
const text = this.options.formats.includes("text") ? formatToText(
|
|
1737
|
+
[page],
|
|
1738
|
+
url,
|
|
1739
|
+
scrapedAt,
|
|
1740
|
+
duration,
|
|
1741
|
+
websiteMetadata,
|
|
1742
|
+
this.options.includeMetadata
|
|
1743
|
+
) : void 0;
|
|
1744
|
+
if (this.options.onProgress) {
|
|
1745
|
+
this.options.onProgress({
|
|
1746
|
+
completed: index + 1,
|
|
1747
|
+
total: this.options.urls.length,
|
|
1748
|
+
currentUrl: url
|
|
1749
|
+
});
|
|
1750
|
+
}
|
|
1751
|
+
let proxyMetadata;
|
|
1752
|
+
if (this.options.proxy) {
|
|
1753
|
+
const proxy = this.options.proxy;
|
|
1754
|
+
if (proxy.url) {
|
|
1755
|
+
try {
|
|
1756
|
+
const proxyUrl = new URL(proxy.url);
|
|
1757
|
+
proxyMetadata = {
|
|
1758
|
+
host: proxyUrl.hostname,
|
|
1759
|
+
port: parseInt(proxyUrl.port, 10) || 80,
|
|
1760
|
+
country: proxy.country
|
|
1761
|
+
};
|
|
1762
|
+
} catch {
|
|
1763
|
+
}
|
|
1764
|
+
} else if (proxy.host && proxy.port) {
|
|
1765
|
+
proxyMetadata = {
|
|
1766
|
+
host: proxy.host,
|
|
1767
|
+
port: proxy.port,
|
|
1768
|
+
country: proxy.country
|
|
1769
|
+
};
|
|
1770
|
+
}
|
|
1771
|
+
}
|
|
1772
|
+
const result = {
|
|
1773
|
+
markdown,
|
|
1774
|
+
html: htmlOutput,
|
|
1775
|
+
json,
|
|
1776
|
+
text,
|
|
1777
|
+
metadata: {
|
|
1778
|
+
baseUrl: url,
|
|
1779
|
+
totalPages: 1,
|
|
1780
|
+
scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1781
|
+
duration,
|
|
1782
|
+
website: websiteMetadata,
|
|
1783
|
+
proxy: proxyMetadata
|
|
1784
|
+
}
|
|
1785
|
+
};
|
|
1786
|
+
return result;
|
|
1787
|
+
});
|
|
1788
|
+
} catch (error) {
|
|
1789
|
+
this.logger.error(`Failed to scrape ${url}: ${error.message}`);
|
|
1790
|
+
if (this.options.onProgress) {
|
|
1791
|
+
this.options.onProgress({
|
|
1792
|
+
completed: index + 1,
|
|
1793
|
+
total: this.options.urls.length,
|
|
1794
|
+
currentUrl: url
|
|
1795
|
+
});
|
|
1796
|
+
}
|
|
1797
|
+
return null;
|
|
1798
|
+
}
|
|
1799
|
+
}
|
|
1800
|
+
/**
|
|
1801
|
+
* Build final scrape result
|
|
1802
|
+
*/
|
|
1803
|
+
buildScrapeResult(results, startTime) {
|
|
1804
|
+
const successful = results.filter((r) => r.result !== null).map((r) => r.result);
|
|
1805
|
+
const errors = [];
|
|
1806
|
+
results.forEach((r, index) => {
|
|
1807
|
+
if (r.result === null && r.error) {
|
|
1808
|
+
errors.push({ url: this.options.urls[index], error: r.error });
|
|
1809
|
+
}
|
|
1810
|
+
});
|
|
1811
|
+
const batchMetadata = {
|
|
1812
|
+
totalUrls: this.options.urls.length,
|
|
1813
|
+
successfulUrls: successful.length,
|
|
1814
|
+
failedUrls: results.filter((r) => r.result === null).length,
|
|
1815
|
+
scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1816
|
+
totalDuration: Date.now() - startTime,
|
|
1817
|
+
errors
|
|
1818
|
+
};
|
|
1819
|
+
return {
|
|
1820
|
+
data: successful,
|
|
1821
|
+
batchMetadata
|
|
1822
|
+
};
|
|
1823
|
+
}
|
|
1824
|
+
};
|
|
1825
|
+
/**
 * Convenience wrapper: build a Scraper from `options` and run it.
 *
 * @param {object} options - Passed unchanged to the Scraper constructor.
 * @returns {Promise<object>} Whatever `Scraper#scrape()` resolves to.
 */
async function scrape(options) {
  return new Scraper(options).scrape();
}
|
|
1829
|
+
|
|
1830
|
+
// src/crawler.ts
|
|
1831
|
+
import { parseHTML as parseHTML4 } from "linkedom";
|
|
1832
|
+
|
|
1833
|
+
// src/utils/rate-limiter.ts
|
|
1834
|
+
import pLimit2 from "p-limit";
|
|
1835
|
+
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
async function rateLimit(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
1838
|
+
|
|
1839
|
+
// src/crawler.ts
|
|
1840
|
+
/**
 * Same-domain link crawler.
 *
 * Starting from `options.url`, pages are taken from a FIFO queue, fetched with
 * a pooled browser, and their links are queued until `depth` or `maxPages` is
 * reached. When `options.scrape` is set, the discovered URLs are then scraped.
 */
var Crawler = class {
  options;
  // URL keys already attempted; failures are included so they are not retried.
  visited = /* @__PURE__ */ new Set();
  // Pending `{ url, depth }` work items, processed first-in first-out.
  queue = [];
  // Successfully fetched pages as `{ url, title, description }`.
  urls = [];
  pool;
  logger = createLogger("crawler");
  robotsRules = null;

  /**
   * @param {object} options - Crawl settings; `options.pool` is mandatory.
   * @throws {Error} When no browser pool is supplied.
   */
  constructor(options) {
    if (!options.pool) {
      throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
    }
    this.pool = options.pool;
    this.options = {
      url: options.url,
      depth: options.depth || 1,
      maxPages: options.maxPages || 20,
      scrape: options.scrape || false,
      delayMs: options.delayMs || 1e3,
      timeoutMs: options.timeoutMs,
      includePatterns: options.includePatterns,
      excludePatterns: options.excludePatterns,
      formats: options.formats || ["markdown", "html"],
      scrapeConcurrency: options.scrapeConcurrency || 2,
      proxy: options.proxy,
      userAgent: options.userAgent,
      verbose: options.verbose || false,
      showChrome: options.showChrome || false,
      connectionToCore: options.connectionToCore,
      // Content cleaning options
      removeAds: options.removeAds,
      removeBase64Images: options.removeBase64Images
    };
  }

  /**
   * Start crawling.
   *
   * @returns {Promise<{urls: object[], scraped: object|undefined, metadata: object}>}
   *   Discovered pages, the scrape result when `options.scrape` is set, and
   *   crawl statistics.
   */
  async crawl() {
    const startTime = Date.now();
    this.robotsRules = await fetchRobotsTxt(this.options.url);
    if (this.robotsRules) {
      this.logger.info("Loaded robots.txt rules");
    }
    if (isUrlAllowed(this.options.url, this.robotsRules)) {
      this.queue.push({ url: this.options.url, depth: 0 });
    } else {
      this.logger.warn(`Seed URL blocked by robots.txt: ${this.options.url}`);
    }

    // NOTE(review): crawlDelay is handed to rateLimit() as milliseconds; confirm
    // that fetchRobotsTxt converts robots.txt Crawl-delay values accordingly.
    const delay = this.robotsRules?.crawlDelay || this.options.delayMs;

    while (this.queue.length > 0 && this.urls.length < this.options.maxPages) {
      // `timeoutMs` bounds the crawl as a whole, not a single page.
      if (this.options.timeoutMs && Date.now() - startTime > this.options.timeoutMs) {
        this.logger.warn(`Crawl timed out after ${this.options.timeoutMs}ms`);
        break;
      }
      const item = this.queue.shift();
      const urlKey = getUrlKey(item.url);
      if (this.visited.has(urlKey)) {
        continue;
      }
      // Mark the URL before fetching. A page that fails to load must not be
      // re-queued and re-fetched each time another page links to it.
      this.visited.add(urlKey);

      const result = await this.fetchPage(item.url);
      if (result) {
        this.urls.push(result.crawlUrl);
        if (item.depth < this.options.depth) {
          this.queue.push(...this.extractLinks(result.html, item.url, item.depth + 1));
        }
      }

      // Only pause when another page will actually be fetched.
      if (this.queue.length > 0 && this.urls.length < this.options.maxPages) {
        await rateLimit(delay);
      }
    }

    const metadata = {
      totalUrls: this.urls.length,
      maxDepth: this.options.depth,
      totalDuration: Date.now() - startTime,
      seedUrl: this.options.url
    };
    let scraped;
    if (this.options.scrape) {
      scraped = await this.scrapeDiscoveredUrls();
    }
    return {
      urls: this.urls,
      scraped,
      metadata
    };
  }

  /**
   * Fetch a single page and extract basic info.
   *
   * @param {string} url - Page to load.
   * @returns {Promise<{crawlUrl: object, html: string}|null>} Page summary and
   *   raw HTML, or `null` when loading failed (the error is logged).
   */
  async fetchPage(url) {
    try {
      return await this.pool.withBrowser(async (hero) => {
        await hero.goto(url, { timeoutMs: 3e4 });
        await hero.waitForPaintingStable();
        const initialUrl = await hero.url;
        const detection = await detectChallenge(hero);
        if (detection.isChallenge) {
          if (this.options.verbose) {
            this.logger.info(`Challenge detected on ${url}`);
          }
          const result = await waitForChallengeResolution(hero, {
            maxWaitMs: 45e3,
            pollIntervalMs: 500,
            verbose: this.options.verbose,
            initialUrl
          });
          if (!result.resolved) {
            throw new Error(`Challenge not resolved`);
          }
        }
        const title = await hero.document.title;
        const html = await hero.document.documentElement.outerHTML;
        let description = null;
        try {
          const metaDesc = await hero.document.querySelector('meta[name="description"]');
          if (metaDesc) {
            description = await metaDesc.getAttribute("content");
          }
        } catch {
          // The description is optional; keep null when the lookup fails.
        }
        return {
          crawlUrl: {
            url,
            title: title || "Untitled",
            description
          },
          html
        };
      });
    } catch (error) {
      this.logger.error(`Failed to fetch ${url}: ${error.message}`);
      return null;
    }
  }

  /**
   * Extract crawlable links from HTML content using DOM parsing.
   *
   * A link is kept only if it resolves to a valid same-domain content URL,
   * passes the include/exclude patterns and robots.txt, and is not already
   * visited, queued, or seen earlier on this same page.
   *
   * @param {string} html - Page markup.
   * @param {string} baseUrl - URL used to resolve relative hrefs.
   * @param {number} depth - Depth assigned to every returned link.
   * @returns {Array<{url: string, depth: number}>} New work items.
   */
  extractLinks(html, baseUrl, depth) {
    const links = [];
    const { document } = parseHTML4(html);
    // Build the set of pending keys once instead of scanning the queue per
    // anchor. It also dedupes repeated anchors within this page.
    const pendingKeys = new Set(this.queue.map((q) => getUrlKey(q.url)));
    document.querySelectorAll("a[href]").forEach((anchor) => {
      const href = anchor.getAttribute("href");
      if (!href) return;
      const resolved = resolveUrl(href, baseUrl);
      if (!resolved || !isValidUrl(resolved)) return;
      if (!isSameDomain(resolved, this.options.url)) return;
      if (!isContentUrl(resolved)) return;
      if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
      if (!isUrlAllowed(resolved, this.robotsRules)) return;
      const urlKey = getUrlKey(resolved);
      if (this.visited.has(urlKey) || pendingKeys.has(urlKey)) {
        return;
      }
      pendingKeys.add(urlKey);
      links.push({ url: resolved, depth });
    });
    return links;
  }

  /**
   * Scrape all discovered URLs, reusing this crawler's pool and options.
   *
   * @returns {Promise<object>} The result of `scrape()`.
   */
  async scrapeDiscoveredUrls() {
    const urls = this.urls.map((u) => u.url);
    return scrape({
      urls,
      formats: this.options.formats,
      batchConcurrency: this.options.scrapeConcurrency,
      proxy: this.options.proxy,
      userAgent: this.options.userAgent,
      verbose: this.options.verbose,
      showChrome: this.options.showChrome,
      pool: this.pool,
      // Content cleaning options
      removeAds: this.options.removeAds,
      removeBase64Images: this.options.removeBase64Images
    });
  }
};
|
|
2018
|
+
/**
 * Convenience wrapper: build a Crawler from `options` and run it.
 *
 * @param {object} options - Passed unchanged to the Crawler constructor.
 * @returns {Promise<object>} Whatever `Crawler#crawl()` resolves to.
 */
async function crawl(options) {
  return new Crawler(options).crawl();
}
|
|
2022
|
+
|
|
2023
|
+
// src/browser/pool.ts
|
|
2024
|
+
import Hero from "@ulixee/hero";
|
|
2025
|
+
|
|
2026
|
+
// src/proxy/config.ts
|
|
2027
|
+
/**
 * Build the upstream proxy URL for a proxy configuration.
 *
 * - `config.url` is returned untouched when present.
 * - `type: "residential"` wraps the username as
 *   `customer-<user>_session-<id>_country-<cc>` with a fresh session id per
 *   call (country defaults to "us").
 * - Credentials are percent-encoded so characters such as `@`, `:` or `/`
 *   cannot corrupt the URL, and are left out when no username is configured.
 *
 * @param {{url?: string, type?: string, username?: string, password?: string,
 *   host?: string, port?: number|string, country?: string}} config
 * @returns {string} An `http://` proxy URL (or `config.url` verbatim).
 */
function createProxyUrl(config) {
  if (config.url) {
    return config.url;
  }

  let username = config.username;
  if (config.type === "residential") {
    const sessionId = `hero_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
    username = `customer-${config.username}_session-${sessionId}_country-${config.country || "us"}`;
  }

  const hasValue = (value) => value != null && value !== "";
  let auth = "";
  if (hasValue(username)) {
    auth = encodeURIComponent(String(username));
    if (hasValue(config.password)) {
      auth += `:${encodeURIComponent(String(config.password))}`;
    }
    auth += "@";
  }
  return `http://${auth}${config.host}:${config.port}`;
}
|
|
2037
|
+
/**
 * Split a proxy URL into its components.
 *
 * `username` and `password` are returned exactly as the URL parser exposes
 * them, i.e. still percent-encoded. `port` is `undefined` when the URL has no
 * explicit port or uses the scheme's default port.
 *
 * @param {string} url - Proxy URL, e.g. `http://user:pass@host:8080`.
 * @returns {{url: string, username: string, password: string, host: string,
 *   port: number|undefined}}
 * @throws {Error} "Invalid proxy URL: <url>" when parsing fails; the original
 *   parser error is attached as `cause`.
 */
function parseProxyUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch (error) {
    // Keep the parser's error reachable for debugging instead of dropping it.
    throw new Error(`Invalid proxy URL: ${url}`, { cause: error });
  }
  return {
    url,
    username: parsed.username,
    password: parsed.password,
    host: parsed.hostname,
    port: parsed.port ? Number.parseInt(parsed.port, 10) : void 0
  };
}
|
|
2051
|
+
|
|
2052
|
+
// src/browser/hero-config.ts
|
|
2053
|
+
/**
 * Produce the options object handed to `new Hero(...)`.
 *
 * Fixed settings come first; `userAgent`, `connectionToCore` and the upstream
 * proxy fields are added only when the caller supplied them.
 *
 * @param {{showChrome?: boolean, userAgent?: string, connectionToCore?: object,
 *   proxy?: object}} [options]
 * @returns {object} Hero configuration.
 */
function createHeroConfig(options = {}) {
  const { showChrome, userAgent, connectionToCore, proxy } = options;

  const config = {
    // Chrome window visibility; hidden unless requested.
    showChrome: showChrome ?? false,
    // MITM stays enabled. Per the original authors this is what turns on
    // TLS/TCP fingerprint emulation — TODO confirm against the Hero docs.
    disableMitm: false,
    // Incognito stays enabled, for a clean session state.
    disableIncognito: false,
    // Run Chrome without its sandbox (intended for containerized environments).
    noChromeSandbox: true,
    // Resolve DNS over TLS via 1.1.1.1 / cloudflare-dns.com.
    dnsOverTlsProvider: {
      host: "1.1.1.1",
      servername: "cloudflare-dns.com"
    },
    // IP lookup service used for masking the public IP in WebRTC.
    upstreamProxyIpMask: {
      ipLookupService: "https://api.ipify.org?format=json"
    },
    locale: "en-US",
    timezoneId: "America/New_York",
    // Standard desktop viewport.
    viewport: {
      width: 1920,
      height: 1080
    }
  };

  if (userAgent) {
    config.userAgent = userAgent;
  }
  if (connectionToCore) {
    config.connectionToCore = connectionToCore;
  }
  if (proxy) {
    config.upstreamProxyUrl = createProxyUrl(proxy);
    config.upstreamProxyUseSystemDns = false;
  }
  return config;
}
|
|
2117
|
+
|
|
2118
|
+
// src/browser/pool.ts
|
|
2119
|
+
// Baseline pool settings; all durations are in milliseconds.
var DEFAULT_POOL_CONFIG = {
  size: 2,
  retireAfterPageCount: 100,
  retireAfterAgeMs: 18e5, // 30 minutes
  recycleCheckInterval: 6e4, // 1 minute
  healthCheckInterval: 3e5, // 5 minutes
  maxConsecutiveFailures: 3,
  maxQueueSize: 100,
  queueTimeout: 6e4 // 1 minute
};
|
|
2133
|
+
/**
 * Create an identifier for a pooled browser instance.
 *
 * @returns {string} `browser_<epoch ms>_<up to 7 base-36 chars>`; built from
 *   Math.random, so it is not suitable for security purposes.
 */
function generateId() {
  const timestamp = Date.now();
  const suffix = Math.random().toString(36).slice(2, 9);
  return ["browser", timestamp, suffix].join("_");
}
|
|
2136
|
+
/**
 * Fixed-size pool of Hero browser instances.
 *
 * Callers borrow a browser with `acquire()` / `withBrowser()`. When none is
 * idle the request waits in a bounded queue with a timeout. Instances are
 * replaced once they exceed a page count or age limit.
 */
var BrowserPool = class {
  // Every instance currently owned by the pool.
  instances = [];
  // Idle instances ready to be handed out.
  available = [];
  // Instances currently borrowed by a caller.
  inUse = /* @__PURE__ */ new Set();
  // Waiting acquire requests: `{ resolve, reject, queuedAt, timer }`.
  queue = [];
  config;
  proxy;
  recycleTimer;
  healthTimer;
  totalRequests = 0;
  totalRequestDuration = 0;
  showChrome;
  connectionToCore;
  userAgent;
  verbose;
  logger = createLogger("pool");

  /**
   * @param {object} [config] - Overrides merged over DEFAULT_POOL_CONFIG.
   * @param {object} [proxy] - Proxy configuration applied to every browser.
   * @param {boolean} [showChrome=false] - Show the Chrome window.
   * @param {object} [connectionToCore] - Connection passed to each Hero; it is
   *   disconnected by `shutdown()`.
   * @param {string} [userAgent] - User agent override.
   * @param {boolean} [verbose=false] - Log pool activity.
   */
  constructor(config = {}, proxy, showChrome = false, connectionToCore, userAgent, verbose = false) {
    this.config = { ...DEFAULT_POOL_CONFIG, ...config };
    this.proxy = proxy;
    this.showChrome = showChrome;
    this.connectionToCore = connectionToCore;
    this.userAgent = userAgent;
    this.verbose = verbose;
  }

  /**
   * Initialize the pool by pre-launching `config.size` browsers and starting
   * the background recycle and health timers.
   *
   * @returns {Promise<void>}
   */
  async initialize() {
    if (this.verbose) {
      this.logger.info(`Initializing pool with ${this.config.size} browsers...`);
    }
    const launchPromises = [];
    for (let i = 0; i < this.config.size; i++) {
      launchPromises.push(this.createInstance());
    }
    this.instances = await Promise.all(launchPromises);
    this.available = [...this.instances];
    this.startRecycling();
    this.startHealthChecks();
    if (this.verbose) {
      this.logger.info(`Pool ready: ${this.instances.length} browsers available`);
    }
  }

  /**
   * Shut down the pool: stop timers, reject queued requests, close every
   * browser and disconnect the core connection.
   *
   * @returns {Promise<void>}
   */
  async shutdown() {
    if (this.verbose) {
      const stats = this.getStats();
      this.logger.info(
        `Shutting down pool: ${stats.totalRequests} total requests processed, ${Math.round(stats.avgRequestDuration)}ms avg duration`
      );
    }
    if (this.recycleTimer) clearInterval(this.recycleTimer);
    if (this.healthTimer) clearInterval(this.healthTimer);
    for (const item of this.queue) {
      // Cancel the pending timeout so it cannot keep the process alive.
      clearTimeout(item.timer);
      item.reject(new Error("Pool shutting down"));
    }
    this.queue = [];
    const closePromises = this.instances.map((instance) => instance.hero.close().catch(() => {
      // Best effort: a browser that fails to close must not block shutdown.
    }));
    await Promise.all(closePromises);
    if (this.connectionToCore) {
      try {
        await this.connectionToCore.disconnect();
      } catch {
        // Ignore disconnect failures during teardown.
      }
      this.connectionToCore = void 0;
    }
    this.instances = [];
    this.available = [];
    this.inUse.clear();
  }

  /**
   * Acquire a browser from the pool, waiting in the queue when none is idle.
   *
   * @returns {Promise<object>} The Hero instance.
   * @throws {Error} "Queue full", "Queue timeout" or "Pool shutting down".
   */
  async acquire() {
    const instance = this.available.shift();
    if (!instance) {
      if (this.verbose) {
        this.logger.info(`No browsers available, queuing request (queue: ${this.queue.length + 1})`);
      }
      return this.queueRequest();
    }
    instance.status = "busy";
    instance.lastUsed = Date.now();
    this.inUse.add(instance);
    if (this.verbose) {
      this.logger.info(
        `Acquired browser ${instance.id} (available: ${this.available.length}, busy: ${this.inUse.size})`
      );
    }
    return instance.hero;
  }

  /**
   * Release a browser back to the pool (or recycle it when it hit a limit).
   * Unknown browsers and repeated releases are ignored.
   *
   * @param {object} hero - A Hero previously returned by `acquire()`.
   */
  release(hero) {
    const instance = this.instances.find((i) => i.hero === hero);
    if (!instance) return;
    // A second release of the same browser would list it twice in `available`
    // (or revive one that is being recycled), so only act on borrowed ones.
    if (!this.inUse.delete(instance)) return;
    instance.status = "idle";
    instance.requestCount++;
    if (this.verbose) {
      this.logger.info(
        `Released browser ${instance.id} (requests: ${instance.requestCount}, available: ${this.available.length + 1})`
      );
    }
    if (this.shouldRecycle(instance)) {
      if (this.verbose) {
        this.logger.info(`Recycling browser ${instance.id} (age or request limit reached)`);
      }
      this.recycleInstance(instance).catch(() => {
        // recycleInstance handles its own failures; this guards the floating promise.
      });
    } else {
      this.available.push(instance);
      this.processQueue();
    }
  }

  /**
   * Execute a callback with an auto-managed browser; the browser is released
   * even when the callback throws. Only successful calls count in the stats.
   *
   * @param {(hero: object) => Promise<*>} callback
   * @returns {Promise<*>} The callback's result.
   */
  async withBrowser(callback) {
    const startTime = Date.now();
    const hero = await this.acquire();
    try {
      const result = await callback(hero);
      this.totalRequests++;
      this.totalRequestDuration += Date.now() - startTime;
      return result;
    } finally {
      this.release(hero);
    }
  }

  /**
   * Get pool statistics.
   *
   * @returns {object} Counts per state, queue length and request timing.
   */
  getStats() {
    const recycling = this.instances.filter((i) => i.status === "recycling").length;
    const unhealthy = this.instances.filter((i) => i.status === "unhealthy").length;
    return {
      total: this.instances.length,
      available: this.available.length,
      busy: this.inUse.size,
      recycling,
      unhealthy,
      queueLength: this.queue.length,
      totalRequests: this.totalRequests,
      avgRequestDuration: this.totalRequests > 0 ? this.totalRequestDuration / this.totalRequests : 0
    };
  }

  /**
   * Run a health check based on the current statistics.
   *
   * @returns {Promise<{healthy: boolean, issues: string[], stats: object}>}
   */
  async healthCheck() {
    const issues = [];
    const stats = this.getStats();
    if (stats.unhealthy > 0) {
      issues.push(`${stats.unhealthy} unhealthy instances`);
    }
    if (stats.queueLength > this.config.maxQueueSize * 0.8) {
      issues.push(`Queue near capacity: ${stats.queueLength}/${this.config.maxQueueSize}`);
    }
    if (stats.available === 0 && stats.queueLength > 0) {
      issues.push("Pool saturated - all browsers busy with pending requests");
    }
    return {
      healthy: issues.length === 0,
      issues,
      stats
    };
  }

  // =========================================================================
  // Private methods
  // =========================================================================

  /**
   * Create a new browser instance record in the "idle" state.
   *
   * @returns {Promise<object>} `{ hero, id, createdAt, lastUsed, requestCount, status }`.
   */
  async createInstance() {
    const heroConfig = createHeroConfig({
      proxy: this.proxy,
      showChrome: this.showChrome,
      connectionToCore: this.connectionToCore,
      userAgent: this.userAgent
    });
    const hero = new Hero(heroConfig);
    return {
      hero,
      id: generateId(),
      createdAt: Date.now(),
      lastUsed: Date.now(),
      requestCount: 0,
      status: "idle"
    };
  }

  /**
   * Check if an instance reached its page-count or age limit.
   *
   * @param {object} instance
   * @returns {boolean}
   */
  shouldRecycle(instance) {
    const age = Date.now() - instance.createdAt;
    return instance.requestCount >= this.config.retireAfterPageCount || age >= this.config.retireAfterAgeMs;
  }

  /**
   * Recycle an instance: close the old browser and put a new one in its slot.
   * On failure the old instance is marked "unhealthy".
   *
   * @param {object} instance - Instance to replace.
   * @returns {Promise<void>}
   */
  async recycleInstance(instance) {
    instance.status = "recycling";
    // Background recycling targets idle instances, which are still listed in
    // `available`. Remove it so a closed browser is never handed out.
    const idleIndex = this.available.indexOf(instance);
    if (idleIndex !== -1) {
      this.available.splice(idleIndex, 1);
    }
    try {
      await instance.hero.close().catch(() => {
        // The old browser may already be gone; replacing it is what matters.
      });
      const newInstance = await this.createInstance();
      const index = this.instances.indexOf(instance);
      if (index === -1) {
        // The pool no longer owns this slot (e.g. shutdown ran meanwhile):
        // discard the replacement instead of leaking it into `available`.
        await newInstance.hero.close().catch(() => {
        });
        return;
      }
      this.instances[index] = newInstance;
      this.available.push(newInstance);
      if (this.verbose) {
        this.logger.info(`Recycled browser: ${instance.id} \u2192 ${newInstance.id}`);
      }
      this.processQueue();
    } catch (error) {
      instance.status = "unhealthy";
      if (this.verbose) {
        this.logger.warn(`Failed to recycle browser ${instance.id}`);
      }
    }
  }

  /**
   * Queue a request when no browsers are available.
   *
   * @returns {Promise<object>} Resolves with a Hero once one is free; rejects
   *   with "Queue full" or, after `config.queueTimeout` ms, "Queue timeout".
   */
  queueRequest() {
    return new Promise((resolve, reject) => {
      if (this.queue.length >= this.config.maxQueueSize) {
        reject(new Error("Queue full"));
        return;
      }
      const item = {
        resolve,
        reject,
        queuedAt: Date.now(),
        timer: void 0
      };
      // The handle is stored on the item so it can be cancelled when served.
      item.timer = setTimeout(() => {
        const index = this.queue.indexOf(item);
        if (index !== -1) {
          this.queue.splice(index, 1);
          reject(new Error("Queue timeout"));
        }
      }, this.config.queueTimeout);
      this.queue.push(item);
    });
  }

  /**
   * Hand idle browsers to queued requests, oldest request first.
   */
  processQueue() {
    while (this.queue.length > 0 && this.available.length > 0) {
      const item = this.queue.shift();
      clearTimeout(item.timer);
      const age = Date.now() - item.queuedAt;
      if (age > this.config.queueTimeout) {
        item.reject(new Error("Queue timeout"));
        continue;
      }
      // acquire() takes the idle browser synchronously, so the loop condition
      // sees the updated `available` list on the next iteration.
      this.acquire().then(item.resolve, item.reject);
    }
  }

  /**
   * Start the background recycling task (unref'd interval).
   */
  startRecycling() {
    this.recycleTimer = setInterval(() => {
      // Iterate over a snapshot: recycling replaces entries of `instances`.
      for (const instance of [...this.instances]) {
        if (instance.status === "idle" && this.shouldRecycle(instance)) {
          this.recycleInstance(instance).catch(() => {
          });
        }
      }
    }, this.config.recycleCheckInterval);
    this.recycleTimer.unref();
  }

  /**
   * Start background health checks (unref'd interval); issues are reported
   * through the pool logger.
   */
  startHealthChecks() {
    this.healthTimer = setInterval(async () => {
      const health = await this.healthCheck();
      if (!health.healthy && health.issues.length > 0) {
        this.logger.warn(`Health issues: ${health.issues.join("; ")}`);
      }
    }, this.config.healthCheckInterval);
    this.healthTimer.unref();
  }
};
|
|
2428
|
+
|
|
2429
|
+
// src/client.ts
|
|
2430
|
+
// Module-level logger created with the "client" label; ReaderClient's verbose startup messages go through it.
var logger2 = createLogger("client");
|
|
2431
|
+
var ReaderClient = class {
|
|
2432
|
+
heroCore = null;
|
|
2433
|
+
pool = null;
|
|
2434
|
+
initialized = false;
|
|
2435
|
+
initializing = null;
|
|
2436
|
+
closed = false;
|
|
2437
|
+
options;
|
|
2438
|
+
proxyIndex = 0;
|
|
2439
|
+
cleanupHandler = null;
|
|
2440
|
+
constructor(options = {}) {
|
|
2441
|
+
this.options = options;
|
|
2442
|
+
const skipTLS = options.skipTLSVerification ?? true;
|
|
2443
|
+
if (skipTLS) {
|
|
2444
|
+
process.env.MITM_ALLOW_INSECURE = "true";
|
|
2445
|
+
}
|
|
2446
|
+
this.registerCleanup();
|
|
2447
|
+
}
|
|
2448
|
+
/**
|
|
2449
|
+
* Get the next proxy from the rotation pool
|
|
2450
|
+
*/
|
|
2451
|
+
getNextProxy() {
|
|
2452
|
+
const { proxies, proxyRotation = "round-robin" } = this.options;
|
|
2453
|
+
if (!proxies || proxies.length === 0) {
|
|
2454
|
+
return void 0;
|
|
2455
|
+
}
|
|
2456
|
+
if (proxyRotation === "random") {
|
|
2457
|
+
return proxies[Math.floor(Math.random() * proxies.length)];
|
|
2458
|
+
}
|
|
2459
|
+
const proxy = proxies[this.proxyIndex % proxies.length];
|
|
2460
|
+
this.proxyIndex++;
|
|
2461
|
+
return proxy;
|
|
2462
|
+
}
|
|
2463
|
+
/**
|
|
2464
|
+
* Initialize HeroCore. Called automatically on first scrape/crawl.
|
|
2465
|
+
* Can be called explicitly if you want to pre-warm the client.
|
|
2466
|
+
*/
|
|
2467
|
+
async start() {
|
|
2468
|
+
if (this.closed) {
|
|
2469
|
+
throw new Error("ReaderClient has been closed. Create a new instance.");
|
|
2470
|
+
}
|
|
2471
|
+
if (this.initialized) {
|
|
2472
|
+
return;
|
|
2473
|
+
}
|
|
2474
|
+
if (this.initializing) {
|
|
2475
|
+
await this.initializing;
|
|
2476
|
+
return;
|
|
2477
|
+
}
|
|
2478
|
+
this.initializing = this.initializeCore();
|
|
2479
|
+
await this.initializing;
|
|
2480
|
+
this.initializing = null;
|
|
2481
|
+
}
|
|
2482
|
+
/**
|
|
2483
|
+
* Internal initialization logic
|
|
2484
|
+
*/
|
|
2485
|
+
async initializeCore() {
|
|
2486
|
+
try {
|
|
2487
|
+
if (this.options.verbose) {
|
|
2488
|
+
logger2.info("Starting HeroCore...");
|
|
2489
|
+
}
|
|
2490
|
+
this.heroCore = new HeroCore();
|
|
2491
|
+
await this.heroCore.start();
|
|
2492
|
+
if (this.options.verbose) {
|
|
2493
|
+
logger2.info("HeroCore started successfully");
|
|
2494
|
+
}
|
|
2495
|
+
if (this.options.verbose) {
|
|
2496
|
+
logger2.info("Initializing browser pool...");
|
|
2497
|
+
}
|
|
2498
|
+
const browserPoolConfig = this.options.browserPool;
|
|
2499
|
+
const poolConfig = {
|
|
2500
|
+
size: browserPoolConfig?.size ?? 2,
|
|
2501
|
+
retireAfterPageCount: browserPoolConfig?.retireAfterPages ?? 100,
|
|
2502
|
+
retireAfterAgeMs: (browserPoolConfig?.retireAfterMinutes ?? 30) * 60 * 1e3,
|
|
2503
|
+
maxQueueSize: browserPoolConfig?.maxQueueSize ?? 100
|
|
2504
|
+
};
|
|
2505
|
+
this.pool = new BrowserPool(
|
|
2506
|
+
poolConfig,
|
|
2507
|
+
void 0,
|
|
2508
|
+
// proxy set per-request
|
|
2509
|
+
this.options.showChrome,
|
|
2510
|
+
this.createConnection(),
|
|
2511
|
+
void 0,
|
|
2512
|
+
// userAgent
|
|
2513
|
+
this.options.verbose
|
|
2514
|
+
);
|
|
2515
|
+
await this.pool.initialize();
|
|
2516
|
+
this.initialized = true;
|
|
2517
|
+
if (this.options.verbose) {
|
|
2518
|
+
logger2.info("Browser pool initialized successfully");
|
|
2519
|
+
}
|
|
2520
|
+
} catch (error) {
|
|
2521
|
+
if (this.pool) {
|
|
2522
|
+
await this.pool.shutdown().catch(() => {
|
|
2523
|
+
});
|
|
2524
|
+
this.pool = null;
|
|
2525
|
+
}
|
|
2526
|
+
if (this.heroCore) {
|
|
2527
|
+
await this.heroCore.close().catch(() => {
|
|
2528
|
+
});
|
|
2529
|
+
this.heroCore = null;
|
|
2530
|
+
}
|
|
2531
|
+
this.initialized = false;
|
|
2532
|
+
const message = error.message || String(error);
|
|
2533
|
+
if (message.includes("EADDRINUSE")) {
|
|
2534
|
+
throw new Error(
|
|
2535
|
+
"Failed to start HeroCore: Port already in use. Another instance may be running. Close it or use a different port."
|
|
2536
|
+
);
|
|
2537
|
+
}
|
|
2538
|
+
if (message.includes("chrome") || message.includes("Chrome")) {
|
|
2539
|
+
throw new Error(
|
|
2540
|
+
"Failed to start HeroCore: Chrome/Chromium not found. Please install Chrome or set CHROME_PATH environment variable."
|
|
2541
|
+
);
|
|
2542
|
+
}
|
|
2543
|
+
throw new Error(`Failed to start HeroCore: ${message}`);
|
|
2544
|
+
}
|
|
2545
|
+
}
|
|
2546
|
+
/**
|
|
2547
|
+
* Create a connection to the HeroCore instance
|
|
2548
|
+
*/
|
|
2549
|
+
createConnection() {
|
|
2550
|
+
if (!this.heroCore) {
|
|
2551
|
+
throw new Error("HeroCore not initialized. This should not happen.");
|
|
2552
|
+
}
|
|
2553
|
+
const bridge = new TransportBridge();
|
|
2554
|
+
this.heroCore.addConnection(bridge.transportToClient);
|
|
2555
|
+
return new ConnectionToHeroCore(bridge.transportToCore);
|
|
2556
|
+
}
|
|
2557
|
+
/**
|
|
2558
|
+
* Ensure client is initialized before operation
|
|
2559
|
+
*/
|
|
2560
|
+
async ensureInitialized() {
|
|
2561
|
+
if (this.closed) {
|
|
2562
|
+
throw new Error("ReaderClient has been closed. Create a new instance.");
|
|
2563
|
+
}
|
|
2564
|
+
if (!this.initialized) {
|
|
2565
|
+
await this.start();
|
|
2566
|
+
}
|
|
2567
|
+
}
|
|
2568
|
+
/**
|
|
2569
|
+
* Scrape one or more URLs
|
|
2570
|
+
*
|
|
2571
|
+
* @param options - Scrape options (urls, formats, etc.)
|
|
2572
|
+
* @returns Scrape result with data and metadata
|
|
2573
|
+
*
|
|
2574
|
+
* @example
|
|
2575
|
+
* const result = await reader.scrape({
|
|
2576
|
+
* urls: ['https://example.com'],
|
|
2577
|
+
* formats: ['markdown', 'html'],
|
|
2578
|
+
* });
|
|
2579
|
+
*/
|
|
2580
|
+
async scrape(options) {
|
|
2581
|
+
await this.ensureInitialized();
|
|
2582
|
+
if (!this.pool) {
|
|
2583
|
+
throw new Error("Browser pool not initialized. This should not happen.");
|
|
2584
|
+
}
|
|
2585
|
+
const proxy = options.proxy ?? this.getNextProxy();
|
|
2586
|
+
return await scrape({
|
|
2587
|
+
...options,
|
|
2588
|
+
proxy,
|
|
2589
|
+
showChrome: options.showChrome ?? this.options.showChrome,
|
|
2590
|
+
verbose: options.verbose ?? this.options.verbose,
|
|
2591
|
+
pool: this.pool
|
|
2592
|
+
});
|
|
2593
|
+
}
|
|
2594
|
+
/**
|
|
2595
|
+
* Crawl a website to discover URLs
|
|
2596
|
+
*
|
|
2597
|
+
* @param options - Crawl options (url, depth, maxPages, etc.)
|
|
2598
|
+
* @returns Crawl result with discovered URLs and optional scraped content
|
|
2599
|
+
*
|
|
2600
|
+
* @example
|
|
2601
|
+
* const result = await reader.crawl({
|
|
2602
|
+
* url: 'https://example.com',
|
|
2603
|
+
* depth: 2,
|
|
2604
|
+
* maxPages: 50,
|
|
2605
|
+
* scrape: true,
|
|
2606
|
+
* });
|
|
2607
|
+
*/
|
|
2608
|
+
async crawl(options) {
|
|
2609
|
+
await this.ensureInitialized();
|
|
2610
|
+
if (!this.pool) {
|
|
2611
|
+
throw new Error("Browser pool not initialized. This should not happen.");
|
|
2612
|
+
}
|
|
2613
|
+
const proxy = options.proxy ?? this.getNextProxy();
|
|
2614
|
+
return await crawl({
|
|
2615
|
+
...options,
|
|
2616
|
+
proxy,
|
|
2617
|
+
pool: this.pool
|
|
2618
|
+
});
|
|
2619
|
+
}
|
|
2620
|
+
/**
|
|
2621
|
+
* Check if the client is initialized and ready
|
|
2622
|
+
*/
|
|
2623
|
+
isReady() {
|
|
2624
|
+
return this.initialized && !this.closed;
|
|
2625
|
+
}
|
|
2626
|
+
/**
|
|
2627
|
+
* Close the client and release resources
|
|
2628
|
+
*
|
|
2629
|
+
* Note: This is optional - the client will auto-close on process exit.
|
|
2630
|
+
*/
|
|
2631
|
+
async close() {
|
|
2632
|
+
if (this.closed) {
|
|
2633
|
+
return;
|
|
2634
|
+
}
|
|
2635
|
+
this.closed = true;
|
|
2636
|
+
this.removeCleanupHandlers();
|
|
2637
|
+
if (this.pool) {
|
|
2638
|
+
if (this.options.verbose) {
|
|
2639
|
+
logger2.info("Shutting down browser pool...");
|
|
2640
|
+
}
|
|
2641
|
+
try {
|
|
2642
|
+
await this.pool.shutdown();
|
|
2643
|
+
} catch (error) {
|
|
2644
|
+
if (this.options.verbose) {
|
|
2645
|
+
logger2.warn(`Error shutting down pool: ${error.message}`);
|
|
2646
|
+
}
|
|
2647
|
+
}
|
|
2648
|
+
this.pool = null;
|
|
2649
|
+
}
|
|
2650
|
+
if (this.heroCore) {
|
|
2651
|
+
if (this.options.verbose) {
|
|
2652
|
+
logger2.info("Closing HeroCore...");
|
|
2653
|
+
}
|
|
2654
|
+
try {
|
|
2655
|
+
await this.heroCore.close();
|
|
2656
|
+
await HeroCore.shutdown();
|
|
2657
|
+
} catch (error) {
|
|
2658
|
+
if (this.options.verbose) {
|
|
2659
|
+
logger2.warn(`Error closing HeroCore: ${error.message}`);
|
|
2660
|
+
}
|
|
2661
|
+
}
|
|
2662
|
+
this.heroCore = null;
|
|
2663
|
+
}
|
|
2664
|
+
this.initialized = false;
|
|
2665
|
+
if (this.options.verbose) {
|
|
2666
|
+
logger2.info("ReaderClient closed");
|
|
2667
|
+
}
|
|
2668
|
+
}
|
|
2669
|
+
/**
|
|
2670
|
+
* Register cleanup handlers for process exit
|
|
2671
|
+
*/
|
|
2672
|
+
registerCleanup() {
|
|
2673
|
+
this.cleanupHandler = async () => {
|
|
2674
|
+
await this.close();
|
|
2675
|
+
};
|
|
2676
|
+
process.once("beforeExit", this.cleanupHandler);
|
|
2677
|
+
process.once("SIGINT", async () => {
|
|
2678
|
+
await this.cleanupHandler?.();
|
|
2679
|
+
process.exit(0);
|
|
2680
|
+
});
|
|
2681
|
+
process.once("SIGTERM", async () => {
|
|
2682
|
+
await this.cleanupHandler?.();
|
|
2683
|
+
process.exit(0);
|
|
2684
|
+
});
|
|
2685
|
+
}
|
|
2686
|
+
/**
|
|
2687
|
+
* Remove process cleanup handlers
|
|
2688
|
+
*/
|
|
2689
|
+
removeCleanupHandlers() {
|
|
2690
|
+
if (this.cleanupHandler) {
|
|
2691
|
+
process.removeListener("beforeExit", this.cleanupHandler);
|
|
2692
|
+
this.cleanupHandler = null;
|
|
2693
|
+
}
|
|
2694
|
+
}
|
|
2695
|
+
};
|
|
2696
|
+
|
|
2697
|
+
// src/daemon/server.ts
|
|
2698
|
+
import http from "http";
|
|
2699
|
+
// Logger used by the daemon server, obtained by calling createLogger("daemon").
var logger3 = createLogger("daemon");
|
|
2700
|
+
// Port used by DaemonServer, DaemonClient and isDaemonRunning when the caller supplies none.
var DEFAULT_DAEMON_PORT = 3847;
|
|
2701
|
+
// File name of the daemon PID file; it is joined onto os.tmpdir() to build the full path.
var PID_FILE_NAME = ".reader-daemon.pid";
|
|
2702
|
+
/**
 * HTTP daemon that keeps one ReaderClient (and its browser pool) alive and
 * serves JSON commands sent as `POST /` with a body of the form
 * `{ "action": "scrape" | "crawl" | "status" | "shutdown", "options": ... }`.
 * Every reply is JSON: `{ success: true, data }` or `{ success: false, error }`.
 */
var DaemonServer = class {
  server = null;
  client = null;
  options;
  startTime = 0;
  /**
   * @param options - Optional overrides: `port` (default DEFAULT_DAEMON_PORT),
   *   `poolSize` (default 5), `verbose` (default false), `showChrome` (default false).
   */
  constructor(options = {}) {
    this.options = {
      port: options.port ?? DEFAULT_DAEMON_PORT,
      poolSize: options.poolSize ?? 5,
      verbose: options.verbose ?? false,
      showChrome: options.showChrome ?? false
    };
  }
  /**
   * Start the daemon server: start the ReaderClient, listen on the configured
   * port and write the PID file.
   *
   * If any step fails, everything started so far is rolled back (server
   * closed, client closed, fields reset) so nothing is leaked and start() can
   * be called again.
   *
   * @throws {Error} If the daemon is already running, the port is in use, or
   *   any startup step fails.
   */
  async start() {
    if (this.server) {
      throw new Error("Daemon is already running");
    }
    const { port, poolSize, verbose, showChrome } = this.options;
    const client = new ReaderClient({
      verbose,
      showChrome,
      browserPool: {
        size: poolSize
      }
    });
    this.client = client;
    let server = null;
    try {
      await client.start();
      server = http.createServer(this.handleRequest.bind(this));
      this.server = server;
      await new Promise((resolve, reject) => {
        // Stays attached after startup; reject() is then a no-op, which keeps
        // a late server "error" event from becoming an uncaught exception.
        server.on("error", (error) => {
          if (error.code === "EADDRINUSE") {
            reject(new Error(`Port ${port} is already in use. Is another daemon running?`, { cause: error }));
          } else {
            reject(error);
          }
        });
        server.listen(port, () => {
          this.startTime = Date.now();
          if (verbose) {
            logger3.info(`Daemon started on port ${port} with pool size ${poolSize}`);
          }
          resolve();
        });
      });
      await this.writePidFile();
    } catch (error) {
      // Roll back. The PID file is deliberately left alone: on EADDRINUSE it
      // may belong to the daemon that already owns the port.
      this.server = null;
      this.client = null;
      if (server?.listening) {
        await new Promise((done) => server.close(() => done()));
      }
      // Best-effort: the startup error is the one worth reporting.
      await client.close().catch(() => {
      });
      throw error;
    }
  }
  /**
   * Stop the daemon server: close the HTTP server, close the client and
   * remove the PID file.
   */
  async stop() {
    if (this.server) {
      await new Promise((resolve) => {
        this.server.close(() => resolve());
      });
      this.server = null;
    }
    if (this.client) {
      await this.client.close();
      this.client = null;
    }
    await this.removePidFile();
    if (this.options.verbose) {
      logger3.info("Daemon stopped");
    }
  }
  /**
   * Get the port the daemon is configured to run on
   */
  getPort() {
    return this.options.port;
  }
  /**
   * Read the whole request body as UTF-8 text.
   *
   * Chunks are joined as bytes before decoding so that a multi-byte character
   * split across two chunks is decoded correctly.
   *
   * @param req - Async-iterable request stream yielding Buffers or strings.
   * @returns The decoded body.
   */
  async readBody(req) {
    const chunks = [];
    for await (const chunk of req) {
      chunks.push(typeof chunk === "string" ? Buffer.from(chunk) : chunk);
    }
    return Buffer.concat(chunks).toString("utf8");
  }
  /**
   * Handle incoming HTTP requests.
   *
   * Replies 404 for anything other than `POST /`, 400 for an unreadable body,
   * invalid JSON or an unknown action, and 500 when an action handler throws.
   * This method never rejects: it is invoked by the HTTP server, which does
   * not observe the returned promise.
   */
  async handleRequest(req, res) {
    if (req.method !== "POST" || req.url !== "/") {
      this.sendResponse(res, 404, { success: false, error: "Not found" });
      return;
    }
    let body;
    try {
      body = await this.readBody(req);
    } catch {
      // The request stream failed (for example the caller disconnected).
      try {
        this.sendResponse(res, 400, { success: false, error: "Failed to read request body" });
      } catch {
        // Nothing left to do: the connection is most likely gone already.
      }
      return;
    }
    let request;
    try {
      request = JSON.parse(body);
    } catch {
      this.sendResponse(res, 400, { success: false, error: "Invalid JSON" });
      return;
    }
    try {
      // `?.` covers valid JSON that is not an object (e.g. `null`), which
      // then falls through to the "Unknown action" reply.
      switch (request?.action) {
        case "scrape":
          await this.handleScrape(res, request.options);
          break;
        case "crawl":
          await this.handleCrawl(res, request.options);
          break;
        case "status":
          this.handleStatus(res);
          break;
        case "shutdown":
          await this.handleShutdown(res);
          break;
        default:
          this.sendResponse(res, 400, { success: false, error: "Unknown action" });
      }
    } catch (error) {
      if (res.headersSent) {
        // A reply is already on the wire; a second one cannot be sent.
        return;
      }
      this.sendResponse(res, 500, { success: false, error: error?.message ?? String(error) });
    }
  }
  /**
   * Handle scrape request
   */
  async handleScrape(res, options) {
    if (!this.client) {
      this.sendResponse(res, 500, { success: false, error: "Client not initialized" });
      return;
    }
    const result = await this.client.scrape(options);
    this.sendResponse(res, 200, { success: true, data: result });
  }
  /**
   * Handle crawl request
   */
  async handleCrawl(res, options) {
    if (!this.client) {
      this.sendResponse(res, 500, { success: false, error: "Client not initialized" });
      return;
    }
    const result = await this.client.crawl(options);
    this.sendResponse(res, 200, { success: true, data: result });
  }
  /**
   * Handle status request: reports port, pool size, uptime (ms since the
   * server started listening) and the process id.
   */
  handleStatus(res) {
    const status = {
      running: true,
      port: this.options.port,
      poolSize: this.options.poolSize,
      uptime: Date.now() - this.startTime,
      pid: process.pid
    };
    this.sendResponse(res, 200, { success: true, data: status });
  }
  /**
   * Handle shutdown request: acknowledge first, then stop the daemon and exit
   * the process shortly afterwards so the reply can be delivered.
   */
  async handleShutdown(res) {
    this.sendResponse(res, 200, { success: true, data: { message: "Shutting down" } });
    setTimeout(() => {
      // Exit even when stop() fails, with a non-zero code in that case.
      this.stop().then(
        () => process.exit(0),
        () => process.exit(1)
      );
    }, 100);
  }
  /**
   * Send JSON response
   */
  sendResponse(res, statusCode, data) {
    // Serialize before touching the response: if this throws, no headers have
    // been written yet and the caller can still send an error reply.
    const payload = JSON.stringify(data);
    res.writeHead(statusCode, { "Content-Type": "application/json" });
    res.end(payload);
  }
  /**
   * Write PID file (pid, port and start timestamp as JSON)
   */
  async writePidFile() {
    const fs = await import("fs/promises");
    const pidFile = await getPidFilePath();
    const data = JSON.stringify({
      pid: process.pid,
      port: this.options.port,
      startedAt: (/* @__PURE__ */ new Date()).toISOString()
    });
    await fs.writeFile(pidFile, data);
  }
  /**
   * Remove PID file
   */
  async removePidFile() {
    const fs = await import("fs/promises");
    const pidFile = await getPidFilePath();
    try {
      await fs.unlink(pidFile);
    } catch {
      // Best-effort: the file may not exist, and a leftover PID file must not
      // make stop() fail.
    }
  }
};
|
|
2896
|
+
/**
 * Resolve the absolute location of the daemon PID file: PID_FILE_NAME inside
 * the directory reported by os.tmpdir().
 *
 * @returns The PID file path.
 */
async function getPidFilePath() {
  const { join } = await import("path");
  const { tmpdir } = await import("os");
  const tempDir = tmpdir();
  return join(tempDir, PID_FILE_NAME);
}
|
|
2901
|
+
/**
 * Read the daemon PID file and return its contents if the recorded process
 * still exists.
 *
 * Liveness is probed with `process.kill(pid, 0)`. An EPERM failure means the
 * process exists but belongs to another user, so it still counts as running.
 * Only positive integer pids are probed: 0 and negative values address
 * process groups rather than a single process.
 *
 * @returns The parsed PID-file object, or `null` when the file is missing,
 *   unreadable or not valid JSON, or when it is stale (no usable pid, or the
 *   process is gone). A stale file is deleted.
 */
async function getDaemonInfo() {
  const fs = await import("fs/promises");
  const pidFile = await getPidFilePath();
  let info;
  try {
    info = JSON.parse(await fs.readFile(pidFile, "utf-8"));
  } catch {
    // Missing, unreadable or corrupt PID file: report "no daemon".
    return null;
  }
  const pid = info?.pid;
  if (Number.isInteger(pid) && pid > 0) {
    try {
      process.kill(pid, 0);
      return info;
    } catch (error) {
      if (error.code === "EPERM") {
        return info;
      }
    }
  }
  // Stale entry: remove it (best-effort) so later lookups are cheap.
  await fs.unlink(pidFile).catch(() => {
  });
  return null;
}
|
|
2919
|
+
|
|
2920
|
+
// src/daemon/client.ts
|
|
2921
|
+
import http2 from "http";
|
|
2922
|
+
/**
 * Client for the local daemon. Each call sends one JSON command as
 * `POST /` to 127.0.0.1 on the configured port and resolves with the `data`
 * field of the daemon's reply.
 */
var DaemonClient = class {
  options;
  /**
   * @param options - Optional overrides: `port` (default DEFAULT_DAEMON_PORT)
   *   and `timeoutMs` (default 600000, i.e. 10 minutes).
   */
  constructor(options = {}) {
    this.options = {
      port: options.port ?? DEFAULT_DAEMON_PORT,
      timeoutMs: options.timeoutMs ?? 6e5
      // 10 minutes default
    };
  }
  /**
   * Scrape URLs via daemon
   */
  async scrape(options) {
    return this.request({
      action: "scrape",
      options
    });
  }
  /**
   * Crawl URL via daemon
   */
  async crawl(options) {
    return this.request({
      action: "crawl",
      options
    });
  }
  /**
   * Get daemon status
   */
  async status() {
    return this.request({
      action: "status"
    });
  }
  /**
   * Request daemon shutdown
   */
  async shutdown() {
    await this.request({
      action: "shutdown"
    });
  }
  /**
   * Check if daemon is reachable: true when a status request succeeds,
   * false on any failure.
   */
  async isRunning() {
    try {
      await this.status();
      return true;
    } catch {
      return false;
    }
  }
  /**
   * Make HTTP request to daemon.
   *
   * @param body - Command object; sent JSON-encoded.
   * @returns Promise resolving with `data` from a `{ success: true, data }`
   *   reply. It rejects when the daemon reports an error, the reply is not
   *   valid JSON, the connection fails, or the request times out.
   */
  request(body) {
    return new Promise((resolve, reject) => {
      const data = JSON.stringify(body);
      const req = http2.request(
        {
          hostname: "127.0.0.1",
          port: this.options.port,
          path: "/",
          method: "POST",
          headers: {
            "Content-Type": "application/json",
            "Content-Length": Buffer.byteLength(data)
          },
          timeout: this.options.timeoutMs
        },
        (res) => {
          // Collect raw bytes and decode once at the end: decoding chunk by
          // chunk corrupts multi-byte characters split across chunks.
          const chunks = [];
          res.on("data", (chunk) => {
            chunks.push(typeof chunk === "string" ? Buffer.from(chunk) : chunk);
          });
          // Without this, a reply that breaks off midway never settles.
          res.on("error", reject);
          res.on("end", () => {
            const responseBody = Buffer.concat(chunks).toString("utf8");
            let response;
            try {
              response = JSON.parse(responseBody);
            } catch {
              reject(new Error(`Failed to parse daemon response: ${responseBody}`));
              return;
            }
            if (response?.success) {
              resolve(response.data);
            } else {
              reject(new Error(response?.error || "Unknown daemon error"));
            }
          });
        }
      );
      req.on("error", (error) => {
        if (error.code === "ECONNREFUSED") {
          reject(new Error(`Cannot connect to daemon on port ${this.options.port}. Is it running?`));
        } else {
          reject(error);
        }
      });
      req.on("timeout", () => {
        // Reject with the timeout message first; the error event that
        // destroy() triggers afterwards finds the promise already settled.
        reject(new Error(`Request to daemon timed out after ${this.options.timeoutMs}ms`));
        req.destroy();
      });
      req.write(data);
      req.end();
    });
  }
};
|
|
3029
|
+
/**
 * Probe whether a daemon answers on the given port, using a short-lived
 * DaemonClient with a 5 second timeout.
 *
 * @param port - Port to probe; defaults to DEFAULT_DAEMON_PORT.
 * @returns Whatever the client's isRunning() resolves to.
 */
async function isDaemonRunning(port = DEFAULT_DAEMON_PORT) {
  const probeOptions = { port, timeoutMs: 5e3 };
  const probe = new DaemonClient(probeOptions);
  const reachable = await probe.isRunning();
  return reachable;
}
|
|
3033
|
+
export {
|
|
3034
|
+
BrowserPool,
|
|
3035
|
+
Crawler,
|
|
3036
|
+
DEFAULT_DAEMON_PORT,
|
|
3037
|
+
DEFAULT_OPTIONS,
|
|
3038
|
+
DaemonClient,
|
|
3039
|
+
DaemonServer,
|
|
3040
|
+
BrowserPool as HeroBrowserPool,
|
|
3041
|
+
ReaderClient,
|
|
3042
|
+
Scraper,
|
|
3043
|
+
cleanContent,
|
|
3044
|
+
crawl,
|
|
3045
|
+
createHeroConfig,
|
|
3046
|
+
createProxyUrl,
|
|
3047
|
+
detectChallenge,
|
|
3048
|
+
extractMetadata,
|
|
3049
|
+
formatToHTML,
|
|
3050
|
+
formatToJson,
|
|
3051
|
+
formatToJsonLite,
|
|
3052
|
+
formatToMarkdown,
|
|
3053
|
+
formatToText,
|
|
3054
|
+
getDaemonInfo,
|
|
3055
|
+
getPidFilePath,
|
|
3056
|
+
getUrlKey,
|
|
3057
|
+
handleChallenge,
|
|
3058
|
+
isChallengePage,
|
|
3059
|
+
isDaemonRunning,
|
|
3060
|
+
isSameDomain,
|
|
3061
|
+
isValidFormat,
|
|
3062
|
+
isValidUrl,
|
|
3063
|
+
parseProxyUrl,
|
|
3064
|
+
rateLimit,
|
|
3065
|
+
resolveUrl,
|
|
3066
|
+
scrape,
|
|
3067
|
+
shouldCrawlUrl,
|
|
3068
|
+
shouldCrawlUrl2 as shouldCrawlUrlFn,
|
|
3069
|
+
validateUrls,
|
|
3070
|
+
waitForChallengeResolution,
|
|
3071
|
+
waitForSelector
|
|
3072
|
+
};
|
|
3073
|
+
//# sourceMappingURL=index.js.map
|