@vakra-dev/reader 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +29 -26
- package/dist/cli/index.js +1356 -1039
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +233 -50
- package/dist/index.js +1591 -1042
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/dist/index.js
CHANGED
|
@@ -13,166 +13,6 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
|
13
13
|
// src/scraper.ts
|
|
14
14
|
import pLimit from "p-limit";
|
|
15
15
|
|
|
16
|
-
// src/cloudflare/detector.ts
|
|
17
|
-
var CHALLENGE_DOM_SELECTORS = [
|
|
18
|
-
"#challenge-running",
|
|
19
|
-
"#challenge-stage",
|
|
20
|
-
"#challenge-form",
|
|
21
|
-
".cf-browser-verification"
|
|
22
|
-
];
|
|
23
|
-
var CHALLENGE_TEXT_PATTERNS = [
|
|
24
|
-
"verifying you are human",
|
|
25
|
-
"checking if the site connection is secure",
|
|
26
|
-
"this process is automatic. your browser will redirect"
|
|
27
|
-
];
|
|
28
|
-
var BLOCKED_SIGNALS = [
|
|
29
|
-
"you have been blocked",
|
|
30
|
-
"access to this page has been denied",
|
|
31
|
-
"sorry, you have been blocked",
|
|
32
|
-
"access denied",
|
|
33
|
-
"403 forbidden"
|
|
34
|
-
];
|
|
35
|
-
async function detectChallenge(hero) {
|
|
36
|
-
const signals = [];
|
|
37
|
-
let type = "none";
|
|
38
|
-
try {
|
|
39
|
-
if (!hero.document) {
|
|
40
|
-
return {
|
|
41
|
-
isChallenge: false,
|
|
42
|
-
type: "none",
|
|
43
|
-
confidence: 0,
|
|
44
|
-
signals: ["No document available"]
|
|
45
|
-
};
|
|
46
|
-
}
|
|
47
|
-
const html = await hero.document.documentElement.outerHTML;
|
|
48
|
-
const htmlLower = html.toLowerCase();
|
|
49
|
-
for (const selector of CHALLENGE_DOM_SELECTORS) {
|
|
50
|
-
if (htmlLower.includes(selector.toLowerCase())) {
|
|
51
|
-
signals.push(`Challenge element: ${selector}`);
|
|
52
|
-
type = "js_challenge";
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
for (const pattern of CHALLENGE_TEXT_PATTERNS) {
|
|
56
|
-
if (htmlLower.includes(pattern)) {
|
|
57
|
-
signals.push(`Challenge text: "${pattern}"`);
|
|
58
|
-
type = type === "none" ? "js_challenge" : type;
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
|
|
62
|
-
signals.push('Challenge text: "waiting for...to respond"');
|
|
63
|
-
type = type === "none" ? "js_challenge" : type;
|
|
64
|
-
}
|
|
65
|
-
for (const pattern of BLOCKED_SIGNALS) {
|
|
66
|
-
if (htmlLower.includes(pattern)) {
|
|
67
|
-
signals.push(`Blocked: "${pattern}"`);
|
|
68
|
-
type = "blocked";
|
|
69
|
-
break;
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
const isChallenge = signals.length > 0;
|
|
73
|
-
const confidence = isChallenge ? 100 : 0;
|
|
74
|
-
return {
|
|
75
|
-
isChallenge,
|
|
76
|
-
type: isChallenge ? type : "none",
|
|
77
|
-
confidence,
|
|
78
|
-
signals
|
|
79
|
-
};
|
|
80
|
-
} catch (error) {
|
|
81
|
-
return {
|
|
82
|
-
isChallenge: false,
|
|
83
|
-
type: "none",
|
|
84
|
-
confidence: 0,
|
|
85
|
-
signals: [`Error during detection: ${error.message}`]
|
|
86
|
-
};
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
async function isChallengePage(hero) {
|
|
90
|
-
const detection = await detectChallenge(hero);
|
|
91
|
-
return detection.isChallenge;
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
// src/cloudflare/handler.ts
|
|
95
|
-
async function waitForChallengeResolution(hero, options) {
|
|
96
|
-
const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
|
|
97
|
-
const startTime = Date.now();
|
|
98
|
-
const log = (msg) => verbose && console.log(` ${msg}`);
|
|
99
|
-
while (Date.now() - startTime < maxWaitMs) {
|
|
100
|
-
const elapsed = Date.now() - startTime;
|
|
101
|
-
try {
|
|
102
|
-
const currentUrl = await hero.url;
|
|
103
|
-
if (currentUrl !== initialUrl) {
|
|
104
|
-
log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
|
|
105
|
-
log(` Waiting for new page to load...`);
|
|
106
|
-
try {
|
|
107
|
-
await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
|
|
108
|
-
log(` DOMContentLoaded`);
|
|
109
|
-
} catch {
|
|
110
|
-
log(` DOMContentLoaded timeout, continuing...`);
|
|
111
|
-
}
|
|
112
|
-
await hero.waitForPaintingStable().catch(() => {
|
|
113
|
-
});
|
|
114
|
-
log(` Page stabilized`);
|
|
115
|
-
return { resolved: true, method: "url_redirect", waitedMs: elapsed };
|
|
116
|
-
}
|
|
117
|
-
} catch {
|
|
118
|
-
}
|
|
119
|
-
const detection = await detectChallenge(hero);
|
|
120
|
-
if (!detection.isChallenge) {
|
|
121
|
-
log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
|
|
122
|
-
log(` Waiting for page to load...`);
|
|
123
|
-
try {
|
|
124
|
-
await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
|
|
125
|
-
log(` DOMContentLoaded`);
|
|
126
|
-
} catch {
|
|
127
|
-
log(` DOMContentLoaded timeout, continuing...`);
|
|
128
|
-
}
|
|
129
|
-
await hero.waitForPaintingStable().catch(() => {
|
|
130
|
-
});
|
|
131
|
-
log(` Page stabilized`);
|
|
132
|
-
return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
|
|
133
|
-
}
|
|
134
|
-
log(
|
|
135
|
-
`\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
|
|
136
|
-
);
|
|
137
|
-
await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
|
|
138
|
-
}
|
|
139
|
-
return {
|
|
140
|
-
resolved: false,
|
|
141
|
-
method: "timeout",
|
|
142
|
-
waitedMs: Date.now() - startTime
|
|
143
|
-
};
|
|
144
|
-
}
|
|
145
|
-
async function waitForSelector(hero, selector, maxWaitMs, verbose = false) {
|
|
146
|
-
const startTime = Date.now();
|
|
147
|
-
const log = (msg) => verbose && console.log(` ${msg}`);
|
|
148
|
-
log(`Waiting for selector: "${selector}"`);
|
|
149
|
-
while (Date.now() - startTime < maxWaitMs) {
|
|
150
|
-
try {
|
|
151
|
-
const element = await hero.document.querySelector(selector);
|
|
152
|
-
if (element) {
|
|
153
|
-
const elapsed = Date.now() - startTime;
|
|
154
|
-
log(`\u2713 Selector found after ${(elapsed / 1e3).toFixed(1)}s`);
|
|
155
|
-
return { found: true, waitedMs: elapsed };
|
|
156
|
-
}
|
|
157
|
-
} catch {
|
|
158
|
-
}
|
|
159
|
-
await new Promise((resolve) => setTimeout(resolve, 300));
|
|
160
|
-
}
|
|
161
|
-
log(`\u2717 Selector not found within timeout`);
|
|
162
|
-
return { found: false, waitedMs: Date.now() - startTime };
|
|
163
|
-
}
|
|
164
|
-
async function handleChallenge(hero, options = {}) {
|
|
165
|
-
const initialUrl = await hero.url;
|
|
166
|
-
const detection = await detectChallenge(hero);
|
|
167
|
-
if (!detection.isChallenge) {
|
|
168
|
-
return { resolved: true, method: "signals_cleared", waitedMs: 0 };
|
|
169
|
-
}
|
|
170
|
-
return waitForChallengeResolution(hero, {
|
|
171
|
-
...options,
|
|
172
|
-
initialUrl
|
|
173
|
-
});
|
|
174
|
-
}
|
|
175
|
-
|
|
176
16
|
// src/formatters/markdown.ts
|
|
177
17
|
import TurndownService from "turndown";
|
|
178
18
|
var turndownService = new TurndownService({
|
|
@@ -186,84 +26,6 @@ var turndownService = new TurndownService({
|
|
|
186
26
|
linkStyle: "inlined",
|
|
187
27
|
linkReferenceStyle: "full"
|
|
188
28
|
});
|
|
189
|
-
function formatToMarkdown(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
|
|
190
|
-
const sections = [];
|
|
191
|
-
if (includeMetadata) {
|
|
192
|
-
sections.push(createMarkdownHeader(baseUrl, scrapedAt, duration, website, pages.length));
|
|
193
|
-
}
|
|
194
|
-
if (pages.length > 1) {
|
|
195
|
-
sections.push(createMarkdownTOC(pages));
|
|
196
|
-
}
|
|
197
|
-
sections.push(...pages.map((page, index) => createMarkdownPage(page, index + 1)));
|
|
198
|
-
return sections.join("\n\n");
|
|
199
|
-
}
|
|
200
|
-
function createMarkdownHeader(baseUrl, scrapedAt, duration, website, totalPages) {
|
|
201
|
-
const title = website.title || extractDomainFromUrl(baseUrl);
|
|
202
|
-
const description = website.description || "";
|
|
203
|
-
let header = `# Website Scrape: ${title}
|
|
204
|
-
|
|
205
|
-
`;
|
|
206
|
-
header += `**Base URL:** ${baseUrl}
|
|
207
|
-
`;
|
|
208
|
-
header += `**Scraped at:** ${new Date(scrapedAt).toLocaleString()}
|
|
209
|
-
`;
|
|
210
|
-
header += `**Duration:** ${duration}ms
|
|
211
|
-
`;
|
|
212
|
-
header += `**Total pages:** ${totalPages}
|
|
213
|
-
`;
|
|
214
|
-
if (description) {
|
|
215
|
-
header += `**Description:** ${description}
|
|
216
|
-
`;
|
|
217
|
-
}
|
|
218
|
-
if (website.author) {
|
|
219
|
-
header += `**Author:** ${website.author}
|
|
220
|
-
`;
|
|
221
|
-
}
|
|
222
|
-
if (website.language) {
|
|
223
|
-
header += `**Language:** ${website.language}
|
|
224
|
-
`;
|
|
225
|
-
}
|
|
226
|
-
return header;
|
|
227
|
-
}
|
|
228
|
-
function createMarkdownTOC(pages) {
|
|
229
|
-
let toc = "## Table of Contents\n\n";
|
|
230
|
-
pages.forEach((page, index) => {
|
|
231
|
-
const depth = " ".repeat(page.depth);
|
|
232
|
-
const pageNumber = index + 1;
|
|
233
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
234
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
235
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
236
|
-
toc += `${depth}${pageNumber}. [${title}](#page-${pageNumber}-${anchor})
|
|
237
|
-
`;
|
|
238
|
-
});
|
|
239
|
-
return toc;
|
|
240
|
-
}
|
|
241
|
-
function createMarkdownPage(page, pageNumber) {
|
|
242
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
243
|
-
const cleanTitle = title.replace(/[#[\]/\\:*?"<>|]/g, "").trim();
|
|
244
|
-
const anchor = cleanTitle.toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, "");
|
|
245
|
-
let pageContent = `---
|
|
246
|
-
|
|
247
|
-
`;
|
|
248
|
-
pageContent += `## Page ${pageNumber}: ${title} {#page-${pageNumber}-${anchor}}
|
|
249
|
-
|
|
250
|
-
`;
|
|
251
|
-
pageContent += `**URL:** ${page.url}
|
|
252
|
-
`;
|
|
253
|
-
pageContent += `**Title:** ${page.title}
|
|
254
|
-
`;
|
|
255
|
-
pageContent += `**Depth:** ${page.depth}
|
|
256
|
-
`;
|
|
257
|
-
pageContent += `**Fetched at:** ${new Date(page.fetchedAt).toLocaleString()}
|
|
258
|
-
|
|
259
|
-
`;
|
|
260
|
-
pageContent += `---
|
|
261
|
-
|
|
262
|
-
`;
|
|
263
|
-
const markdown = htmlToMarkdown(page.html);
|
|
264
|
-
pageContent += markdown;
|
|
265
|
-
return pageContent;
|
|
266
|
-
}
|
|
267
29
|
function htmlToMarkdown(html) {
|
|
268
30
|
try {
|
|
269
31
|
return turndownService.turndown(html);
|
|
@@ -272,596 +34,340 @@ function htmlToMarkdown(html) {
|
|
|
272
34
|
return html.replace(/<[^>]*>/g, "").trim();
|
|
273
35
|
}
|
|
274
36
|
}
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
37
|
+
var formatToMarkdown = htmlToMarkdown;
|
|
38
|
+
|
|
39
|
+
// src/utils/content-cleaner.ts
|
|
40
|
+
import { parseHTML } from "linkedom";
|
|
41
|
+
var ALWAYS_REMOVE_SELECTORS = [
|
|
42
|
+
// Scripts and styles
|
|
43
|
+
"script",
|
|
44
|
+
"style",
|
|
45
|
+
"noscript",
|
|
46
|
+
"link[rel='stylesheet']",
|
|
47
|
+
// Hidden elements
|
|
48
|
+
"[hidden]",
|
|
49
|
+
"[aria-hidden='true']",
|
|
50
|
+
"[style*='display: none']",
|
|
51
|
+
"[style*='display:none']",
|
|
52
|
+
"[style*='visibility: hidden']",
|
|
53
|
+
"[style*='visibility:hidden']",
|
|
54
|
+
// SVG icons and decorative elements
|
|
55
|
+
"svg[aria-hidden='true']",
|
|
56
|
+
"svg.icon",
|
|
57
|
+
"svg[class*='icon']",
|
|
58
|
+
// Template and metadata
|
|
59
|
+
"template",
|
|
60
|
+
"meta",
|
|
61
|
+
// Embeds that don't convert to text
|
|
62
|
+
"iframe",
|
|
63
|
+
"canvas",
|
|
64
|
+
"object",
|
|
65
|
+
"embed",
|
|
66
|
+
// Forms (usually not main content)
|
|
67
|
+
"form",
|
|
68
|
+
"input",
|
|
69
|
+
"select",
|
|
70
|
+
"textarea",
|
|
71
|
+
"button"
|
|
72
|
+
];
|
|
73
|
+
var OVERLAY_SELECTORS = [
|
|
74
|
+
"[class*='modal']",
|
|
75
|
+
"[class*='popup']",
|
|
76
|
+
"[class*='overlay']",
|
|
77
|
+
"[class*='dialog']",
|
|
78
|
+
"[role='dialog']",
|
|
79
|
+
"[role='alertdialog']",
|
|
80
|
+
"[class*='cookie']",
|
|
81
|
+
"[class*='consent']",
|
|
82
|
+
"[class*='gdpr']",
|
|
83
|
+
"[class*='privacy-banner']",
|
|
84
|
+
"[class*='notification-bar']",
|
|
85
|
+
"[id*='cookie']",
|
|
86
|
+
"[id*='consent']",
|
|
87
|
+
"[id*='gdpr']",
|
|
88
|
+
// Fixed/sticky positioned elements
|
|
89
|
+
"[style*='position: fixed']",
|
|
90
|
+
"[style*='position:fixed']",
|
|
91
|
+
"[style*='position: sticky']",
|
|
92
|
+
"[style*='position:sticky']"
|
|
93
|
+
];
|
|
94
|
+
var NAVIGATION_SELECTORS = [
|
|
95
|
+
// Semantic elements
|
|
96
|
+
"header",
|
|
97
|
+
"footer",
|
|
98
|
+
"nav",
|
|
99
|
+
"aside",
|
|
100
|
+
// Header variations
|
|
101
|
+
".header",
|
|
102
|
+
".top",
|
|
103
|
+
".navbar",
|
|
104
|
+
"#header",
|
|
105
|
+
// Footer variations
|
|
106
|
+
".footer",
|
|
107
|
+
".bottom",
|
|
108
|
+
"#footer",
|
|
109
|
+
// Sidebars
|
|
110
|
+
".sidebar",
|
|
111
|
+
".side",
|
|
112
|
+
".aside",
|
|
113
|
+
"#sidebar",
|
|
114
|
+
// Modals/popups (backup if not caught by OVERLAY_SELECTORS)
|
|
115
|
+
".modal",
|
|
116
|
+
".popup",
|
|
117
|
+
"#modal",
|
|
118
|
+
".overlay",
|
|
119
|
+
// Ads
|
|
120
|
+
".ad",
|
|
121
|
+
".ads",
|
|
122
|
+
".advert",
|
|
123
|
+
"#ad",
|
|
124
|
+
// Language selectors
|
|
125
|
+
".lang-selector",
|
|
126
|
+
".language",
|
|
127
|
+
"#language-selector",
|
|
128
|
+
// Social
|
|
129
|
+
".social",
|
|
130
|
+
".social-media",
|
|
131
|
+
".social-links",
|
|
132
|
+
"#social",
|
|
133
|
+
// Navigation/menus
|
|
134
|
+
".menu",
|
|
135
|
+
".navigation",
|
|
136
|
+
"#nav",
|
|
137
|
+
// Breadcrumbs
|
|
138
|
+
".breadcrumbs",
|
|
139
|
+
"#breadcrumbs",
|
|
140
|
+
// Share buttons
|
|
141
|
+
".share",
|
|
142
|
+
"#share",
|
|
143
|
+
// Widgets
|
|
144
|
+
".widget",
|
|
145
|
+
"#widget",
|
|
146
|
+
// Cookie notices (backup)
|
|
147
|
+
".cookie",
|
|
148
|
+
"#cookie"
|
|
149
|
+
];
|
|
150
|
+
var FORCE_INCLUDE_SELECTORS = [
|
|
151
|
+
// IDs
|
|
152
|
+
"#main",
|
|
153
|
+
"#content",
|
|
154
|
+
"#main-content",
|
|
155
|
+
"#article",
|
|
156
|
+
"#post",
|
|
157
|
+
"#page-content",
|
|
158
|
+
// Semantic elements
|
|
159
|
+
"main",
|
|
160
|
+
"article",
|
|
161
|
+
"[role='main']",
|
|
162
|
+
// Classes
|
|
163
|
+
".main-content",
|
|
164
|
+
".content",
|
|
165
|
+
".post-content",
|
|
166
|
+
".article-content",
|
|
167
|
+
".entry-content",
|
|
168
|
+
".page-content",
|
|
169
|
+
".article-body",
|
|
170
|
+
".post-body",
|
|
171
|
+
".story-content",
|
|
172
|
+
".blog-content"
|
|
173
|
+
];
|
|
174
|
+
var AD_SELECTORS = [
|
|
175
|
+
// Google ads
|
|
176
|
+
"ins.adsbygoogle",
|
|
177
|
+
".google-ad",
|
|
178
|
+
".adsense",
|
|
179
|
+
// Generic ad containers
|
|
180
|
+
"[data-ad]",
|
|
181
|
+
"[data-ads]",
|
|
182
|
+
"[data-ad-slot]",
|
|
183
|
+
"[data-ad-client]",
|
|
184
|
+
// Common ad class patterns
|
|
185
|
+
".ad-container",
|
|
186
|
+
".ad-wrapper",
|
|
187
|
+
".advertisement",
|
|
188
|
+
".sponsored-content",
|
|
189
|
+
// Tracking pixels
|
|
190
|
+
"img[width='1'][height='1']",
|
|
191
|
+
"img[src*='pixel']",
|
|
192
|
+
"img[src*='tracking']",
|
|
193
|
+
"img[src*='analytics']"
|
|
194
|
+
];
|
|
195
|
+
function getLinkDensity(element) {
|
|
196
|
+
const text = element.textContent || "";
|
|
197
|
+
const textLength = text.trim().length;
|
|
198
|
+
if (textLength === 0) return 1;
|
|
199
|
+
let linkLength = 0;
|
|
200
|
+
element.querySelectorAll("a").forEach((link) => {
|
|
201
|
+
linkLength += (link.textContent || "").trim().length;
|
|
202
|
+
});
|
|
203
|
+
return linkLength / textLength;
|
|
204
|
+
}
|
|
205
|
+
function getContentScore(element) {
|
|
206
|
+
let score = 0;
|
|
207
|
+
const text = element.textContent || "";
|
|
208
|
+
const textLength = text.trim().length;
|
|
209
|
+
score += Math.min(textLength / 100, 50);
|
|
210
|
+
score += element.querySelectorAll("p").length * 3;
|
|
211
|
+
score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
|
|
212
|
+
score += element.querySelectorAll("img").length * 1;
|
|
213
|
+
score -= element.querySelectorAll("a").length * 0.5;
|
|
214
|
+
score -= element.querySelectorAll("li").length * 0.2;
|
|
215
|
+
const linkDensity = getLinkDensity(element);
|
|
216
|
+
if (linkDensity > 0.5) score -= 30;
|
|
217
|
+
else if (linkDensity > 0.3) score -= 15;
|
|
218
|
+
const classAndId = (element.className || "") + " " + (element.id || "");
|
|
219
|
+
if (/article|content|post|body|main|entry/i.test(classAndId)) score += 25;
|
|
220
|
+
if (/comment|sidebar|footer|nav|menu|header|widget|ad/i.test(classAndId)) score -= 25;
|
|
221
|
+
return score;
|
|
222
|
+
}
|
|
223
|
+
function looksLikeNavigation(element) {
|
|
224
|
+
const linkDensity = getLinkDensity(element);
|
|
225
|
+
if (linkDensity > 0.5) return true;
|
|
226
|
+
const listItems = element.querySelectorAll("li");
|
|
227
|
+
const links = element.querySelectorAll("a");
|
|
228
|
+
if (listItems.length > 5 && links.length > listItems.length * 0.8) return true;
|
|
229
|
+
return false;
|
|
230
|
+
}
|
|
231
|
+
function removeElements(document, selectors) {
|
|
232
|
+
for (const selector of selectors) {
|
|
233
|
+
try {
|
|
234
|
+
document.querySelectorAll(selector).forEach((el) => el.remove());
|
|
235
|
+
} catch {
|
|
236
|
+
}
|
|
280
237
|
}
|
|
281
238
|
}
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
${website.author ? `<p><strong>Author:</strong> ${escapeHtml(website.author)}</p>` : ""}
|
|
308
|
-
${website.language ? `<p><strong>Language:</strong> ${escapeHtml(website.language)}</p>` : ""}
|
|
309
|
-
</div>
|
|
310
|
-
</header>
|
|
311
|
-
|
|
312
|
-
${pages.length > 1 ? generateTOC(pages) : ""}
|
|
313
|
-
|
|
314
|
-
<main class="content">
|
|
315
|
-
${pages.map((page, index) => generatePageHTML(page, index + 1)).join("\n")}
|
|
316
|
-
</main>
|
|
317
|
-
|
|
318
|
-
<footer class="footer">
|
|
319
|
-
<p>Generated by Reader JS/TS SDK</p>
|
|
320
|
-
</footer>
|
|
321
|
-
|
|
322
|
-
<script>
|
|
323
|
-
${generateJavaScript()}
|
|
324
|
-
</script>
|
|
325
|
-
</body>
|
|
326
|
-
</html>`;
|
|
327
|
-
return html;
|
|
239
|
+
function removeWithProtection(document, selectorsToRemove, protectedSelectors) {
|
|
240
|
+
for (const selector of selectorsToRemove) {
|
|
241
|
+
try {
|
|
242
|
+
document.querySelectorAll(selector).forEach((element) => {
|
|
243
|
+
const isProtected = protectedSelectors.some((ps) => {
|
|
244
|
+
try {
|
|
245
|
+
return element.matches(ps);
|
|
246
|
+
} catch {
|
|
247
|
+
return false;
|
|
248
|
+
}
|
|
249
|
+
});
|
|
250
|
+
if (isProtected) return;
|
|
251
|
+
const containsProtected = protectedSelectors.some((ps) => {
|
|
252
|
+
try {
|
|
253
|
+
return element.querySelector(ps) !== null;
|
|
254
|
+
} catch {
|
|
255
|
+
return false;
|
|
256
|
+
}
|
|
257
|
+
});
|
|
258
|
+
if (containsProtected) return;
|
|
259
|
+
element.remove();
|
|
260
|
+
});
|
|
261
|
+
} catch {
|
|
262
|
+
}
|
|
263
|
+
}
|
|
328
264
|
}
|
|
329
|
-
function
|
|
330
|
-
const
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
}
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
if (
|
|
347
|
-
|
|
348
|
-
}
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
265
|
+
function findMainContent(document) {
|
|
266
|
+
const isValidContent = (el) => {
|
|
267
|
+
if (!el) return false;
|
|
268
|
+
const text = el.textContent || "";
|
|
269
|
+
if (text.trim().length < 100) return false;
|
|
270
|
+
if (looksLikeNavigation(el)) return false;
|
|
271
|
+
return true;
|
|
272
|
+
};
|
|
273
|
+
const main = document.querySelector("main");
|
|
274
|
+
if (isValidContent(main) && getLinkDensity(main) < 0.4) {
|
|
275
|
+
return main;
|
|
276
|
+
}
|
|
277
|
+
const roleMain = document.querySelector('[role="main"]');
|
|
278
|
+
if (isValidContent(roleMain) && getLinkDensity(roleMain) < 0.4) {
|
|
279
|
+
return roleMain;
|
|
280
|
+
}
|
|
281
|
+
const articles = document.querySelectorAll("article");
|
|
282
|
+
if (articles.length === 1 && isValidContent(articles[0])) {
|
|
283
|
+
return articles[0];
|
|
284
|
+
}
|
|
285
|
+
const contentSelectors = [
|
|
286
|
+
"#content",
|
|
287
|
+
"#main-content",
|
|
288
|
+
"#main",
|
|
289
|
+
".content",
|
|
290
|
+
".main-content",
|
|
291
|
+
".post-content",
|
|
292
|
+
".article-content",
|
|
293
|
+
".entry-content",
|
|
294
|
+
".page-content",
|
|
295
|
+
".article-body",
|
|
296
|
+
".post-body",
|
|
297
|
+
".story-content",
|
|
298
|
+
".blog-content"
|
|
299
|
+
];
|
|
300
|
+
for (const selector of contentSelectors) {
|
|
301
|
+
try {
|
|
302
|
+
const el = document.querySelector(selector);
|
|
303
|
+
if (isValidContent(el) && getLinkDensity(el) < 0.4) {
|
|
304
|
+
return el;
|
|
305
|
+
}
|
|
306
|
+
} catch {
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
const candidates = [];
|
|
310
|
+
const containers = document.querySelectorAll("div, section, article");
|
|
311
|
+
containers.forEach((el) => {
|
|
312
|
+
const text = el.textContent || "";
|
|
313
|
+
if (text.trim().length < 200) return;
|
|
314
|
+
const score = getContentScore(el);
|
|
315
|
+
if (score > 0) {
|
|
316
|
+
candidates.push({ el, score });
|
|
317
|
+
}
|
|
318
|
+
});
|
|
319
|
+
candidates.sort((a, b) => b.score - a.score);
|
|
320
|
+
if (candidates.length > 0 && candidates[0].score > 20) {
|
|
321
|
+
return candidates[0].el;
|
|
322
|
+
}
|
|
323
|
+
return null;
|
|
378
324
|
}
|
|
379
|
-
function generateCSS() {
|
|
380
|
-
return `
|
|
381
|
-
* {
|
|
382
|
-
margin: 0;
|
|
383
|
-
padding: 0;
|
|
384
|
-
box-sizing: border-box;
|
|
385
|
-
}
|
|
386
|
-
|
|
387
|
-
body {
|
|
388
|
-
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
|
389
|
-
line-height: 1.6;
|
|
390
|
-
color: #333;
|
|
391
|
-
background-color: #f8f9fa;
|
|
392
|
-
}
|
|
393
|
-
|
|
394
|
-
.header {
|
|
395
|
-
background: white;
|
|
396
|
-
padding: 2rem;
|
|
397
|
-
border-bottom: 1px solid #e9ecef;
|
|
398
|
-
margin-bottom: 2rem;
|
|
399
|
-
}
|
|
400
|
-
|
|
401
|
-
.header h1 {
|
|
402
|
-
color: #2c3e50;
|
|
403
|
-
margin-bottom: 1rem;
|
|
404
|
-
font-size: 2rem;
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
.meta-info {
|
|
408
|
-
display: grid;
|
|
409
|
-
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
|
410
|
-
gap: 0.5rem;
|
|
411
|
-
}
|
|
412
|
-
|
|
413
|
-
.meta-info p {
|
|
414
|
-
margin: 0.25rem 0;
|
|
415
|
-
font-size: 0.9rem;
|
|
416
|
-
color: #6c757d;
|
|
417
|
-
}
|
|
418
|
-
|
|
419
|
-
.toc {
|
|
420
|
-
background: white;
|
|
421
|
-
padding: 1.5rem;
|
|
422
|
-
margin: 2rem 0;
|
|
423
|
-
border-radius: 8px;
|
|
424
|
-
border: 1px solid #e9ecef;
|
|
425
|
-
}
|
|
426
|
-
|
|
427
|
-
.toc h2 {
|
|
428
|
-
color: #2c3e50;
|
|
429
|
-
margin-bottom: 1rem;
|
|
430
|
-
font-size: 1.25rem;
|
|
431
|
-
}
|
|
432
|
-
|
|
433
|
-
.toc ul {
|
|
434
|
-
list-style: none;
|
|
435
|
-
}
|
|
436
|
-
|
|
437
|
-
.toc li {
|
|
438
|
-
margin: 0.5rem 0;
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
.toc a {
|
|
442
|
-
color: #007bff;
|
|
443
|
-
text-decoration: none;
|
|
444
|
-
transition: color 0.2s;
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
.toc a:hover {
|
|
448
|
-
color: #0056b3;
|
|
449
|
-
text-decoration: underline;
|
|
450
|
-
}
|
|
451
|
-
|
|
452
|
-
.content {
|
|
453
|
-
max-width: 800px;
|
|
454
|
-
margin: 0 auto;
|
|
455
|
-
padding: 0 1rem;
|
|
456
|
-
}
|
|
457
|
-
|
|
458
|
-
.page {
|
|
459
|
-
background: white;
|
|
460
|
-
margin: 2rem 0;
|
|
461
|
-
padding: 2rem;
|
|
462
|
-
border-radius: 8px;
|
|
463
|
-
border: 1px solid #e9ecef;
|
|
464
|
-
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
.page-header {
|
|
468
|
-
border-bottom: 2px solid #e9ecef;
|
|
469
|
-
padding-bottom: 1rem;
|
|
470
|
-
margin-bottom: 2rem;
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
.page-header h2 {
|
|
474
|
-
color: #2c3e50;
|
|
475
|
-
margin-bottom: 0.5rem;
|
|
476
|
-
font-size: 1.5rem;
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
.page-meta {
|
|
480
|
-
display: flex;
|
|
481
|
-
flex-wrap: wrap;
|
|
482
|
-
gap: 1rem;
|
|
483
|
-
font-size: 0.9rem;
|
|
484
|
-
color: #6c757d;
|
|
485
|
-
}
|
|
486
|
-
|
|
487
|
-
.page-content {
|
|
488
|
-
line-height: 1.8;
|
|
489
|
-
}
|
|
490
|
-
|
|
491
|
-
.page-content h1, .page-content h2, .page-content h3,
|
|
492
|
-
.page-content h4, .page-content h5, .page-content h6 {
|
|
493
|
-
color: #2c3e50;
|
|
494
|
-
margin: 1.5rem 0 0.5rem 0;
|
|
495
|
-
}
|
|
496
|
-
|
|
497
|
-
.page-content p {
|
|
498
|
-
margin: 1rem 0;
|
|
499
|
-
}
|
|
500
|
-
|
|
501
|
-
.page-content a {
|
|
502
|
-
color: #007bff;
|
|
503
|
-
text-decoration: none;
|
|
504
|
-
}
|
|
505
|
-
|
|
506
|
-
.page-content a:hover {
|
|
507
|
-
text-decoration: underline;
|
|
508
|
-
}
|
|
509
|
-
|
|
510
|
-
.page-content code {
|
|
511
|
-
background: #f8f9fa;
|
|
512
|
-
padding: 0.2rem 0.4rem;
|
|
513
|
-
border-radius: 4px;
|
|
514
|
-
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
|
515
|
-
font-size: 0.9em;
|
|
516
|
-
}
|
|
517
|
-
|
|
518
|
-
.page-content pre {
|
|
519
|
-
background: #f8f9fa;
|
|
520
|
-
padding: 1rem;
|
|
521
|
-
border-radius: 4px;
|
|
522
|
-
overflow-x: auto;
|
|
523
|
-
margin: 1rem 0;
|
|
524
|
-
}
|
|
525
|
-
|
|
526
|
-
.page-content blockquote {
|
|
527
|
-
border-left: 4px solid #007bff;
|
|
528
|
-
padding-left: 1rem;
|
|
529
|
-
margin: 1rem 0;
|
|
530
|
-
color: #6c757d;
|
|
531
|
-
}
|
|
532
|
-
|
|
533
|
-
.footer {
|
|
534
|
-
text-align: center;
|
|
535
|
-
padding: 2rem;
|
|
536
|
-
margin-top: 3rem;
|
|
537
|
-
border-top: 1px solid #e9ecef;
|
|
538
|
-
color: #6c757d;
|
|
539
|
-
font-size: 0.9rem;
|
|
540
|
-
}
|
|
541
|
-
|
|
542
|
-
@media (max-width: 768px) {
|
|
543
|
-
.header {
|
|
544
|
-
padding: 1rem;
|
|
545
|
-
}
|
|
546
|
-
|
|
547
|
-
.header h1 {
|
|
548
|
-
font-size: 1.5rem;
|
|
549
|
-
}
|
|
550
|
-
|
|
551
|
-
.page {
|
|
552
|
-
padding: 1rem;
|
|
553
|
-
}
|
|
554
|
-
|
|
555
|
-
.page-meta {
|
|
556
|
-
flex-direction: column;
|
|
557
|
-
gap: 0.5rem;
|
|
558
|
-
}
|
|
559
|
-
}
|
|
560
|
-
`.trim();
|
|
561
|
-
}
|
|
562
|
-
function generateTOC(pages) {
|
|
563
|
-
const tocItems = pages.map((page, index) => {
|
|
564
|
-
const pageNumber = index + 1;
|
|
565
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
566
|
-
const id = `page-${pageNumber}`;
|
|
567
|
-
return `<li><a href="#${id}">${pageNumber}. ${escapeHtml(title)}</a></li>`;
|
|
568
|
-
}).join("\n");
|
|
569
|
-
return `
|
|
570
|
-
<nav class="toc">
|
|
571
|
-
<h2>Table of Contents</h2>
|
|
572
|
-
<ul>
|
|
573
|
-
${tocItems}
|
|
574
|
-
</ul>
|
|
575
|
-
</nav>`;
|
|
576
|
-
}
|
|
577
|
-
function generatePageHTML(page, pageNumber) {
|
|
578
|
-
const id = `page-${pageNumber}`;
|
|
579
|
-
const title = page.title || `Page ${pageNumber}`;
|
|
580
|
-
return `
|
|
581
|
-
<article class="page" id="${id}">
|
|
582
|
-
<div class="page-header">
|
|
583
|
-
<h2>${pageNumber}. ${escapeHtml(title)}</h2>
|
|
584
|
-
<div class="page-meta">
|
|
585
|
-
<span><strong>URL:</strong> <a href="${escapeHtml(
|
|
586
|
-
page.url
|
|
587
|
-
)}" target="_blank">${escapeHtml(page.url)}</a></span>
|
|
588
|
-
<span><strong>Depth:</strong> ${page.depth}</span>
|
|
589
|
-
<span><strong>Fetched:</strong> ${new Date(page.fetchedAt).toLocaleString()}</span>
|
|
590
|
-
</div>
|
|
591
|
-
</div>
|
|
592
|
-
<div class="page-content">
|
|
593
|
-
${page.html}
|
|
594
|
-
</div>
|
|
595
|
-
</article>`;
|
|
596
|
-
}
|
|
597
|
-
function generateJavaScript() {
|
|
598
|
-
return `
|
|
599
|
-
// Smooth scrolling for TOC links
|
|
600
|
-
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
|
|
601
|
-
anchor.addEventListener('click', function (e) {
|
|
602
|
-
e.preventDefault();
|
|
603
|
-
const target = document.querySelector(this.getAttribute('href'));
|
|
604
|
-
if (target) {
|
|
605
|
-
target.scrollIntoView({
|
|
606
|
-
behavior: 'smooth',
|
|
607
|
-
block: 'start'
|
|
608
|
-
});
|
|
609
|
-
}
|
|
610
|
-
});
|
|
611
|
-
});
|
|
612
|
-
|
|
613
|
-
// Highlight current section in TOC
|
|
614
|
-
window.addEventListener('scroll', function() {
|
|
615
|
-
const pages = document.querySelectorAll('.page');
|
|
616
|
-
const tocLinks = document.querySelectorAll('.toc a');
|
|
617
|
-
|
|
618
|
-
let currentPage = null;
|
|
619
|
-
pages.forEach(page => {
|
|
620
|
-
const rect = page.getBoundingClientRect();
|
|
621
|
-
if (rect.top <= 100) {
|
|
622
|
-
currentPage = page;
|
|
623
|
-
}
|
|
624
|
-
});
|
|
625
|
-
|
|
626
|
-
tocLinks.forEach(link => {
|
|
627
|
-
link.style.fontWeight = 'normal';
|
|
628
|
-
const target = document.querySelector(link.getAttribute('href'));
|
|
629
|
-
if (target === currentPage) {
|
|
630
|
-
link.style.fontWeight = 'bold';
|
|
631
|
-
}
|
|
632
|
-
});
|
|
633
|
-
});
|
|
634
|
-
`;
|
|
635
|
-
}
|
|
636
|
-
function escapeHtml(text) {
|
|
637
|
-
return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#x27;").replace(/\//g, "&#x2F;");
|
|
638
|
-
}
|
|
639
|
-
function extractDomainFromUrl2(url) {
|
|
640
|
-
try {
|
|
641
|
-
return new URL(url).hostname;
|
|
642
|
-
} catch {
|
|
643
|
-
return "Unknown";
|
|
644
|
-
}
|
|
645
|
-
}
|
|
646
|
-
|
|
647
|
-
// src/formatters/json.ts
|
|
648
|
-
function formatToJson(pages, baseUrl, scrapedAt, duration, website) {
|
|
649
|
-
const jsonResult = {
|
|
650
|
-
metadata: {
|
|
651
|
-
baseUrl,
|
|
652
|
-
totalPages: pages.length,
|
|
653
|
-
scrapedAt,
|
|
654
|
-
duration,
|
|
655
|
-
website
|
|
656
|
-
},
|
|
657
|
-
pages: pages.map((page, index) => ({
|
|
658
|
-
index: index + 1,
|
|
659
|
-
url: page.url,
|
|
660
|
-
title: page.title,
|
|
661
|
-
markdown: page.markdown,
|
|
662
|
-
html: page.html,
|
|
663
|
-
fetchedAt: page.fetchedAt,
|
|
664
|
-
depth: page.depth,
|
|
665
|
-
wordCount: countWords(page.markdown),
|
|
666
|
-
readingTime: estimateReadingTime(page.markdown)
|
|
667
|
-
}))
|
|
668
|
-
};
|
|
669
|
-
return JSON.stringify(jsonResult, null, 2);
|
|
670
|
-
}
|
|
671
|
-
function formatToJsonLite(pages, baseUrl, scrapedAt, duration, website) {
|
|
672
|
-
const jsonResult = {
|
|
673
|
-
metadata: {
|
|
674
|
-
baseUrl,
|
|
675
|
-
totalPages: pages.length,
|
|
676
|
-
scrapedAt,
|
|
677
|
-
duration,
|
|
678
|
-
website
|
|
679
|
-
},
|
|
680
|
-
pages: pages.map((page, index) => ({
|
|
681
|
-
index: index + 1,
|
|
682
|
-
url: page.url,
|
|
683
|
-
title: page.title,
|
|
684
|
-
markdown: page.markdown,
|
|
685
|
-
fetchedAt: page.fetchedAt,
|
|
686
|
-
depth: page.depth,
|
|
687
|
-
wordCount: countWords(page.markdown),
|
|
688
|
-
readingTime: estimateReadingTime(page.markdown)
|
|
689
|
-
}))
|
|
690
|
-
};
|
|
691
|
-
return JSON.stringify(jsonResult, null, 2);
|
|
692
|
-
}
|
|
693
|
-
/**
 * Count the words in a markdown document after stripping common markup.
 *
 * Order matters in the stripping pipeline:
 *  - fenced code blocks are removed first; if inline-code stripping ran
 *    before it, the ``` fences would be partially consumed and the block's
 *    contents would be counted as prose;
 *  - images are unwrapped before links, because the link pattern also
 *    matches the `[alt](src)` tail of an image and would leave a stray `!`.
 *
 * @param {string} markdown - Markdown source.
 * @returns {number} Number of whitespace-separated words left after stripping.
 */
function countWords(markdown) {
  const plainText = markdown
    // Fenced code blocks are dropped entirely.
    .replace(/```[\s\S]*?```/g, "")
    // Heading markers.
    .replace(/#{1,6}\s+/g, "")
    // Bold, then italic (bold first so `**` is not read as two italics).
    .replace(/\*\*(.*?)\*\*/g, "$1")
    .replace(/\*(.*?)\*/g, "$1")
    // Inline code keeps its content.
    .replace(/`(.*?)`/g, "$1")
    // Images keep their alt text, links keep their label.
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1")
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
    // List bullets, ordered-list numbers, and blockquote markers.
    .replace(/^\s*[-*+]\s+/gm, "")
    .replace(/^\s*\d+\.\s+/gm, "")
    .replace(/^\s*>\s+/gm, "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
  return plainText.split(/\s+/).filter((word) => word.length > 0).length;
}
|
|
697
|
-
/**
 * Estimate reading time as the markdown's word count divided by 200,
 * rounded up to a whole number.
 *
 * @param {string} markdown - Markdown source.
 * @returns {number} Estimated reading time (0 for empty content).
 */
function estimateReadingTime(markdown) {
  return Math.ceil(countWords(markdown) / 200);
}
|
|
701
|
-
|
|
702
|
-
// src/formatters/text.ts
|
|
703
|
-
import { parseHTML } from "linkedom";
|
|
704
|
-
/**
 * Render a scrape result as plain text.
 *
 * An optional metadata header is followed by one section per page, and the
 * sections are separated by a blank line. Per-page separators are requested
 * only when there is more than one page.
 *
 * @param {Array<object>} pages - Scraped pages.
 * @param {string} baseUrl - URL the scrape started from.
 * @param {*} scrapedAt - Timestamp of the scrape.
 * @param {*} duration - Scrape duration.
 * @param {object} website - Website-level metadata.
 * @param {boolean} [includeMetadata=true] - Whether to emit the header section.
 * @returns {string} Plain-text document.
 */
function formatToText(pages, baseUrl, scrapedAt, duration, website, includeMetadata = true) {
  const headerSections = includeMetadata
    ? [createTextHeader(baseUrl, scrapedAt, duration, website, pages.length)]
    : [];
  const pageSections = pages.map((page, position) => {
    return createTextPage(page, position + 1, pages.length > 1);
  });
  return [...headerSections, ...pageSections].join("\n\n");
}
|
|
712
|
-
/**
 * Build the plain-text header for a scrape result.
 *
 * The heading uses the website title, falling back to the base URL's
 * hostname. Description, author, and language lines appear only when the
 * corresponding value is truthy.
 *
 * @param {string} baseUrl - URL the scrape started from.
 * @param {*} scrapedAt - Timestamp accepted by `new Date(...)`; shown in the local format.
 * @param {*} duration - Scrape duration, printed with an "ms" suffix.
 * @param {object} website - Website metadata (title, description, author, language).
 * @param {number} totalPages - Number of pages scraped.
 * @returns {string} Header lines joined with "\n".
 */
function createTextHeader(baseUrl, scrapedAt, duration, website, totalPages) {
  const heading = website.title || extractDomainFromUrl3(baseUrl);
  const optionalFields = [
    ["Description", website.description],
    ["Author", website.author],
    ["Language", website.language]
  ];
  return [
    `=== ${heading} ===`,
    "",
    `URL: ${baseUrl}`,
    `Scraped: ${new Date(scrapedAt).toLocaleString()}`,
    `Duration: ${duration}ms`,
    `Pages: ${totalPages}`,
    ...optionalFields
      .filter(([, value]) => value)
      .map(([label, value]) => `${label}: ${value}`)
  ].join("\n");
}
|
|
732
|
-
/**
 * Render one page as plain text.
 *
 * With `showSeparator` set, the page text is preceded by a banner: a
 * 60-character rule, the page number and title ("Untitled" when missing),
 * the page URL, and a closing rule.
 *
 * @param {object} page - Page with `title`, `url`, and `html`.
 * @param {number} pageNumber - 1-based page number shown in the banner.
 * @param {boolean} showSeparator - Whether to emit the banner.
 * @returns {string} Lines joined with "\n".
 */
function createTextPage(page, pageNumber, showSeparator) {
  let banner = [];
  if (showSeparator) {
    const rule = "\u2500".repeat(60);
    banner = [
      rule,
      `Page ${pageNumber}: ${page.title || "Untitled"}`,
      `URL: ${page.url}`,
      rule
    ];
  }
  return [...banner, htmlToPlainText(page.html)].join("\n");
}
|
|
744
|
-
/**
 * Convert an HTML string to plain text.
 *
 * The HTML is parsed with linkedom, non-content elements are dropped, and
 * the text content of `<body>` is used, falling back to the document
 * element and then to an empty string. Whitespace is then normalized.
 *
 * @param {string} html - HTML source.
 * @returns {string} Normalized, trimmed plain text.
 */
function htmlToPlainText(html) {
  const { document } = parseHTML(html);
  for (const tag of ["script", "style", "noscript", "svg", "canvas", "template"]) {
    for (const node of document.querySelectorAll(tag)) {
      node.remove();
    }
  }
  const rawText = document.body?.textContent || document.documentElement?.textContent || "";
  return rawText
    // Collapse runs of spaces/tabs.
    .replace(/[ \t]+/g, " ")
    // Strip indentation after, and trailing blanks before, each newline.
    .replace(/\n[ \t]+/g, "\n")
    .replace(/[ \t]+\n/g, "\n")
    // Allow at most one blank line in a row.
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
|
758
|
-
/**
 * Hostname of `url`, or "Unknown" if `url` is not a parseable absolute URL.
 *
 * @param {string} url - Absolute URL to inspect.
 * @returns {string} Hostname or the "Unknown" placeholder.
 */
function extractDomainFromUrl3(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return "Unknown";
  }
  return parsed.hostname;
}
|
|
765
|
-
|
|
766
|
-
// src/utils/content-cleaner.ts
|
|
767
|
-
import { parseHTML as parseHTML2 } from "linkedom";
|
|
768
|
-
// CSS selectors for page chrome that cleanHtml passes to removeElements
// unconditionally, regardless of the cleaning options.
var ALWAYS_REMOVE_SELECTORS = [
  // Navigation and menus
  "nav",
  "header nav",
  "footer nav",
  ".nav",
  ".navigation",
  ".menu",
  ".navbar",
  ".sidebar",
  ".aside",
  // Header and footer elements
  "header",
  "footer",
  ".site-header",
  ".page-header",
  ".site-footer",
  ".page-footer",
  // Social media and sharing
  ".social",
  ".share",
  ".sharing",
  ".twitter",
  ".facebook",
  ".linkedin",
  ".instagram",
  // Comments and discussions
  ".comments",
  ".comment",
  ".discussion",
  ".disqus",
  // Forms and interactive elements
  "form",
  "input",
  "button:not([type='submit'])",
  "select",
  "textarea",
  // Scripts and styles
  "script",
  "style",
  "noscript",
  // Hidden elements (attribute or inline style)
  "[hidden]",
  "[style*='display: none']",
  "[style*='display:none']",
  // Common utility classes
  ".cookie",
  ".cookie-banner",
  ".popup",
  ".modal",
  ".overlay",
  ".notification",
  // Breadcrumbs
  ".breadcrumb",
  ".breadcrumbs",
  ".breadcrumb-trail"
];
// CSS selectors for advertising and tracking elements; cleanHtml removes
// these only when its `removeAds` option is true.
// NOTE(review): the substring attribute selectors (e.g. [class*='ad-'],
// [class*='banner']) also match unrelated names such as "head-line" or
// "hero-banner" - confirm this breadth is intended.
var AD_SELECTORS = [
  // Ads and promotions
  ".ad",
  ".ads",
  ".advertisement",
  ".promotion",
  ".sponsored",
  "[class*='ad-']",
  "[id*='ad-']",
  "[class*='advert']",
  "[id*='advert']",
  "[class*='banner']",
  "[id*='banner']",
  ".google-ad",
  ".adsense",
  "[data-ad]",
  "[data-ads]",
  "ins.adsbygoogle",
  // Tracking
  "[class*='tracking']",
  "[id*='tracking']",
  "[class*='analytics']",
  "[id*='analytics']"
];
|
|
849
325
|
function cleanHtml(html, baseUrl, options = {}) {
|
|
850
|
-
const {
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
326
|
+
const {
|
|
327
|
+
removeAds = true,
|
|
328
|
+
removeBase64Images = true,
|
|
329
|
+
onlyMainContent = true,
|
|
330
|
+
includeTags,
|
|
331
|
+
excludeTags
|
|
332
|
+
} = options;
|
|
333
|
+
const { document } = parseHTML(html);
|
|
334
|
+
removeElements(document, ALWAYS_REMOVE_SELECTORS);
|
|
335
|
+
removeElements(document, OVERLAY_SELECTORS);
|
|
336
|
+
if (removeAds) {
|
|
337
|
+
removeElements(document, AD_SELECTORS);
|
|
338
|
+
}
|
|
339
|
+
if (excludeTags && excludeTags.length > 0) {
|
|
340
|
+
removeElements(document, excludeTags);
|
|
341
|
+
}
|
|
342
|
+
if (onlyMainContent) {
|
|
343
|
+
removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
|
|
344
|
+
const mainContent = findMainContent(document);
|
|
345
|
+
if (mainContent) {
|
|
346
|
+
const body = document.body;
|
|
347
|
+
if (body) {
|
|
348
|
+
const clone = mainContent.cloneNode(true);
|
|
349
|
+
body.innerHTML = "";
|
|
350
|
+
body.appendChild(clone);
|
|
351
|
+
}
|
|
856
352
|
}
|
|
857
353
|
}
|
|
858
|
-
if (
|
|
859
|
-
|
|
354
|
+
if (includeTags && includeTags.length > 0) {
|
|
355
|
+
const matchedElements = [];
|
|
356
|
+
for (const selector of includeTags) {
|
|
860
357
|
try {
|
|
861
|
-
document.querySelectorAll(selector).forEach((el) =>
|
|
358
|
+
document.querySelectorAll(selector).forEach((el) => {
|
|
359
|
+
matchedElements.push(el.cloneNode(true));
|
|
360
|
+
});
|
|
862
361
|
} catch {
|
|
863
362
|
}
|
|
864
363
|
}
|
|
364
|
+
if (matchedElements.length > 0) {
|
|
365
|
+
const body = document.body;
|
|
366
|
+
if (body) {
|
|
367
|
+
body.innerHTML = "";
|
|
368
|
+
matchedElements.forEach((el) => body.appendChild(el));
|
|
369
|
+
}
|
|
370
|
+
}
|
|
865
371
|
}
|
|
866
372
|
if (removeBase64Images) {
|
|
867
373
|
removeBase64ImagesFromDocument(document);
|
|
@@ -886,7 +392,10 @@ function removeBase64ImagesFromDocument(document) {
|
|
|
886
392
|
document.querySelectorAll("[style*='data:image']").forEach((el) => {
|
|
887
393
|
const style = el.getAttribute("style");
|
|
888
394
|
if (style) {
|
|
889
|
-
const cleanedStyle = style.replace(
|
|
395
|
+
const cleanedStyle = style.replace(
|
|
396
|
+
/background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
|
|
397
|
+
""
|
|
398
|
+
);
|
|
890
399
|
if (cleanedStyle.trim()) {
|
|
891
400
|
el.setAttribute("style", cleanedStyle);
|
|
892
401
|
} else {
|
|
@@ -923,7 +432,7 @@ function cleanContent(html, baseUrl, options = {}) {
|
|
|
923
432
|
}
|
|
924
433
|
|
|
925
434
|
// src/utils/metadata-extractor.ts
|
|
926
|
-
import { parseHTML as
|
|
435
|
+
import { parseHTML as parseHTML2 } from "linkedom";
|
|
927
436
|
|
|
928
437
|
// src/utils/url-helpers.ts
|
|
929
438
|
import { URL as URL2 } from "url";
|
|
@@ -996,8 +505,26 @@ function isSameDomain(url, baseUrl) {
|
|
|
996
505
|
function getUrlKey(url) {
|
|
997
506
|
try {
|
|
998
507
|
const parsedUrl = new URL2(url);
|
|
508
|
+
parsedUrl.hash = "";
|
|
999
509
|
parsedUrl.search = "";
|
|
1000
|
-
|
|
510
|
+
if (parsedUrl.hostname.startsWith("www.")) {
|
|
511
|
+
parsedUrl.hostname = parsedUrl.hostname.slice(4);
|
|
512
|
+
}
|
|
513
|
+
if (parsedUrl.protocol === "http:" && parsedUrl.port === "80" || parsedUrl.protocol === "https:" && parsedUrl.port === "443") {
|
|
514
|
+
parsedUrl.port = "";
|
|
515
|
+
}
|
|
516
|
+
const indexFiles = ["index.html", "index.htm", "default.html", "default.htm", "index.php"];
|
|
517
|
+
for (const indexFile of indexFiles) {
|
|
518
|
+
if (parsedUrl.pathname.endsWith(`/${indexFile}`)) {
|
|
519
|
+
parsedUrl.pathname = parsedUrl.pathname.slice(0, -indexFile.length);
|
|
520
|
+
break;
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
let normalized = parsedUrl.toString().toLowerCase();
|
|
524
|
+
if (normalized.endsWith("/") && parsedUrl.pathname !== "/") {
|
|
525
|
+
normalized = normalized.slice(0, -1);
|
|
526
|
+
}
|
|
527
|
+
return normalized;
|
|
1001
528
|
} catch {
|
|
1002
529
|
return url.toLowerCase();
|
|
1003
530
|
}
|
|
@@ -1232,7 +759,7 @@ function extractMetadata(html, baseUrl) {
|
|
|
1232
759
|
return extractWebsiteMetadata(html, baseUrl);
|
|
1233
760
|
}
|
|
1234
761
|
function extractWebsiteMetadata(html, baseUrl) {
|
|
1235
|
-
const { document } =
|
|
762
|
+
const { document } = parseHTML2(html);
|
|
1236
763
|
const metadata = {
|
|
1237
764
|
title: null,
|
|
1238
765
|
description: null,
|
|
@@ -1492,58 +1019,930 @@ async function fetchRobotsTxt(baseUrl) {
|
|
|
1492
1019
|
if (!response.ok) {
|
|
1493
1020
|
return null;
|
|
1494
1021
|
}
|
|
1495
|
-
const content = await response.text();
|
|
1496
|
-
return parseRobotsTxt(content, "ReaderEngine");
|
|
1497
|
-
} catch {
|
|
1498
|
-
return null;
|
|
1022
|
+
const content = await response.text();
|
|
1023
|
+
return parseRobotsTxt(content, "ReaderEngine");
|
|
1024
|
+
} catch {
|
|
1025
|
+
return null;
|
|
1026
|
+
}
|
|
1027
|
+
}
|
|
1028
|
+
/**
 * Decide whether `url` may be fetched under the given robots.txt rules.
 *
 * The check is permissive: missing rules, an unparseable URL, or any error
 * raised while evaluating the rules all result in `true`.
 *
 * @param {string} url - Absolute URL to check.
 * @param {object|null|undefined} rules - Parsed robots.txt rules, if any.
 * @returns {boolean} Result of isPathAllowed for the URL's path plus query
 *   string, or `true` when the check cannot be performed.
 */
function isUrlAllowed(url, rules) {
  if (rules) {
    try {
      const { pathname, search } = new URL(url);
      return isPathAllowed(pathname + search, rules);
    } catch {
      // Fall through to the permissive default.
    }
  }
  return true;
}
|
|
1039
|
+
|
|
1040
|
+
// src/types.ts
|
|
1041
|
+
// Default scrape options.
// NOTE(review): how these are merged with caller-supplied options is not
// visible in this part of the file.
var DEFAULT_OPTIONS = {
  urls: [],
  formats: ["markdown"],
  // Numeric literal is 30000. Presumably a per-page timeout in milliseconds,
  // going by the `Ms` suffix - TODO confirm against the consumer.
  timeoutMs: 3e4,
  includePatterns: [],
  excludePatterns: [],
  // Content cleaning defaults
  removeAds: true,
  removeBase64Images: true,
  onlyMainContent: true,
  includeTags: [],
  excludeTags: [],
  // NOTE(review): TLS verification is skipped by default; confirm this is an
  // intentional trade-off.
  skipTLSVerification: true,
  // Batch defaults
  batchConcurrency: 1,
  // Numeric literal is 300000.
  batchTimeoutMs: 3e5,
  maxRetries: 2,
  // Default no-op progress callback
  onProgress: () => {
  },
  // Hero-specific defaults
  verbose: false,
  showChrome: false
};
|
|
1065
|
+
/**
 * Report whether `format` is one of the supported output formats.
 *
 * @param {*} format - Candidate format name.
 * @returns {boolean} `true` only for the exact strings "markdown" and "html".
 */
function isValidFormat(format) {
  switch (format) {
    case "markdown":
    case "html":
      return true;
    default:
      return false;
  }
}
|
|
1068
|
+
/**
 * Report whether `url` is on `baseDomain` or one of its subdomains.
 *
 * @param {{hostname: string}} url - Parsed URL (only `hostname` is read).
 * @param {string} baseDomain - Domain the crawl is restricted to.
 * @returns {boolean} `true` when the hostname equals `baseDomain` or ends
 *   with `.${baseDomain}`.
 */
function shouldCrawlUrl2(url, baseDomain) {
  const { hostname } = url;
  if (hostname === baseDomain) {
    return true;
  }
  return hostname.endsWith(`.${baseDomain}`);
}
|
|
1071
|
+
|
|
1072
|
+
// src/engines/types.ts
|
|
1073
|
+
// Static configuration for each scraping engine, keyed by engine name.
// `maxTimeout` is used by HttpEngine and TlsClientEngine as the delay, in
// milliseconds, of their abort timers (and as got's request timeout).
// NOTE(review): how `timeout`, `quality`, and `features` are consumed is not
// visible in this part of the file - confirm against the engine orchestrator.
var ENGINE_CONFIGS = {
  http: {
    name: "http",
    timeout: 3e3,
    maxTimeout: 1e4,
    quality: 100,
    features: {
      javascript: false,
      cloudflare: false,
      tlsFingerprint: false,
      waitFor: false,
      screenshots: false
    }
  },
  tlsclient: {
    name: "tlsclient",
    timeout: 5e3,
    maxTimeout: 15e3,
    quality: 80,
    features: {
      javascript: false,
      cloudflare: false,
      tlsFingerprint: true,
      waitFor: false,
      screenshots: false
    }
  },
  hero: {
    name: "hero",
    timeout: 3e4,
    maxTimeout: 6e4,
    quality: 50,
    features: {
      javascript: true,
      cloudflare: true,
      tlsFingerprint: true,
      waitFor: true,
      screenshots: true
    }
  }
};
// Engine names in a fixed order (http, tlsclient, hero).
// Presumably the order in which engines are attempted - TODO confirm.
var DEFAULT_ENGINE_ORDER = ["http", "tlsclient", "hero"];
|
|
1115
|
+
|
|
1116
|
+
// src/engines/errors.ts
|
|
1117
|
+
/**
 * Base error for failures raised by a scraping engine.
 *
 * The message is prefixed with the engine name in square brackets.
 * `retryable` defaults to `true` unless `options.retryable` is provided, and
 * `cause` is copied from `options.cause`.
 */
var EngineError = class extends Error {
  engine;
  retryable;
  /**
   * @param {string} engine - Name of the engine that failed.
   * @param {string} message - Description of the failure.
   * @param {{retryable?: boolean, cause?: unknown}} [options] - Extra details.
   */
  constructor(engine, message, options) {
    super(`[${engine}] ${message}`);
    const settings = options ?? {};
    this.name = "EngineError";
    this.engine = engine;
    this.retryable = settings.retryable ?? true;
    this.cause = settings.cause;
    // captureStackTrace is V8-specific, so it is guarded.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, this.constructor);
    }
  }
};
|
|
1131
|
+
/**
 * Raised when an engine receives a bot-challenge or block page instead of
 * content. Always marked retryable.
 */
var ChallengeDetectedError = class extends EngineError {
  challengeType;
  /**
   * @param {string} engine - Name of the engine that hit the challenge.
   * @param {string} [challengeType] - Challenge label; a falsy value is
   *   recorded as "unknown".
   */
  constructor(engine, challengeType) {
    const detected = challengeType || "unknown";
    super(engine, `Challenge detected: ${detected}`, { retryable: true });
    this.name = "ChallengeDetectedError";
    this.challengeType = detected;
  }
};
|
|
1139
|
+
/**
 * Raised when a fetched page contains less visible text than required.
 * Always marked retryable.
 */
var InsufficientContentError = class extends EngineError {
  contentLength;
  threshold;
  /**
   * @param {string} engine - Name of the engine that fetched the page.
   * @param {number} contentLength - Number of text characters found.
   * @param {number} [threshold=100] - Minimum number of characters required.
   */
  constructor(engine, contentLength, threshold = 100) {
    const detail = `Insufficient content: ${contentLength} chars (threshold: ${threshold})`;
    super(engine, detail, { retryable: true });
    this.name = "InsufficientContentError";
    this.contentLength = contentLength;
    this.threshold = threshold;
  }
};
|
|
1149
|
+
/**
 * Raised for HTTP error responses. Marked retryable only for 5xx statuses
 * and 429.
 */
var HttpError = class extends EngineError {
  statusCode;
  /**
   * @param {string} engine - Name of the engine that made the request.
   * @param {number} statusCode - HTTP status code.
   * @param {string} [statusText] - Reason phrase, appended to the message when truthy.
   */
  constructor(engine, statusCode, statusText) {
    const isServerError = statusCode >= 500;
    const isRateLimited = statusCode === 429;
    const reason = statusText ? `: ${statusText}` : "";
    super(engine, `HTTP ${statusCode}${reason}`, { retryable: isServerError || isRateLimited });
    this.name = "HttpError";
    this.statusCode = statusCode;
  }
};
|
|
1158
|
+
/**
 * Raised when an engine request is aborted after its time budget elapses.
 * Always marked retryable.
 */
var EngineTimeoutError = class extends EngineError {
  timeoutMs;
  /**
   * @param {string} engine - Name of the engine that timed out.
   * @param {number} timeoutMs - Time budget that was exceeded, in milliseconds.
   */
  constructor(engine, timeoutMs) {
    const detail = `Timeout after ${timeoutMs}ms`;
    super(engine, detail, { retryable: true });
    this.name = "EngineTimeoutError";
    this.timeoutMs = timeoutMs;
  }
};
|
|
1166
|
+
/**
 * Raised when an engine cannot be used at all. Never retryable.
 */
var EngineUnavailableError = class extends EngineError {
  /**
   * @param {string} engine - Name of the unavailable engine.
   * @param {string} [reason] - Explanation; a falsy value is replaced by
   *   "Engine not available".
   */
  constructor(engine, reason) {
    const detail = reason || "Engine not available";
    super(engine, detail, { retryable: false });
    this.name = "EngineUnavailableError";
  }
};
|
|
1172
|
+
/**
 * Raised when every attempted engine failed.
 *
 * The message lists each attempted engine with the message of its recorded
 * error, or "unknown" when there is no error or its message is empty.
 */
var AllEnginesFailedError = class extends Error {
  attemptedEngines;
  errors;
  /**
   * @param {string[]} attemptedEngines - Engine names, in the order given.
   * @param {Map<string, Error>} errors - Error recorded per engine name.
   */
  constructor(attemptedEngines, errors) {
    const describe = (engineName) => {
      const reason = errors.get(engineName)?.message || "unknown";
      return `${engineName}: ${reason}`;
    };
    super(`All engines failed: ${attemptedEngines.map(describe).join("; ")}`);
    this.name = "AllEnginesFailedError";
    this.attemptedEngines = attemptedEngines;
    this.errors = errors;
  }
};
|
|
1183
|
+
|
|
1184
|
+
// src/engines/http/index.ts
|
|
1185
|
+
// Request headers sent by HttpEngine on every fetch. Headers from
// `options.headers` are spread after these, so callers can override them.
var DEFAULT_HEADERS = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
  "Accept-Encoding": "gzip, deflate, br",
  "Cache-Control": "no-cache",
  Pragma: "no-cache",
  "Sec-Fetch-Dest": "document",
  "Sec-Fetch-Mode": "navigate",
  "Sec-Fetch-Site": "none",
  "Sec-Fetch-User": "?1",
  "Upgrade-Insecure-Requests": "1"
};
// Substrings whose presence in the response HTML makes
// HttpEngine.detectChallenge report a challenge page. The comparison is
// case-insensitive (both sides are lower-cased).
var CHALLENGE_PATTERNS = [
  // Cloudflare
  "cf-browser-verification",
  "cf_chl_opt",
  "challenge-platform",
  "cf-spinner",
  "Just a moment",
  "Checking your browser",
  "checking if the site connection is secure",
  "Enable JavaScript and cookies",
  "Attention Required",
  "_cf_chl_tk",
  "Verifying you are human",
  "cf-turnstile",
  "/cdn-cgi/challenge-platform/",
  // Generic bot detection
  "Please Wait...",
  "DDoS protection by",
  "Access denied",
  "bot detection",
  "are you a robot",
  "complete the security check"
];
// Substrings indicating the page is served through Cloudflare. When one is
// present, HttpEngine.detectChallenge labels a challenge hit "cloudflare"
// rather than "bot-detection".
var CLOUDFLARE_INFRA_PATTERNS = ["/cdn-cgi/", "cloudflare", "__cf_bm", "cf-ray"];
// Minimum number of extracted text characters HttpEngine requires before it
// accepts a response.
var MIN_CONTENT_LENGTH = 100;
|
|
1223
|
+
/**
 * Scraping engine that fetches a page with the global `fetch`, with no
 * JavaScript execution. A response is rejected when it has an error status,
 * looks like a bot-challenge page, or contains too little visible text.
 */
var HttpEngine = class {
  config = ENGINE_CONFIGS.http;
  /**
   * Fetch `meta.url` and return its HTML together with response details.
   *
   * The whole request, including reading the body, is bounded by
   * `config.maxTimeout` and by the optional `meta.abortSignal`.
   *
   * @param {object} meta - Scrape request: `url`, `options` (may carry
   *   `headers`), optional `logger`, and optional `abortSignal`.
   * @returns {Promise<object>} `{ html, url, statusCode, contentType,
   *   headers, engine, duration }`.
   * @throws {HttpError} When the response status is 400 or above.
   * @throws {ChallengeDetectedError} When the HTML matches a challenge pattern.
   * @throws {InsufficientContentError} When the visible text is shorter than
   *   MIN_CONTENT_LENGTH.
   * @throws {EngineTimeoutError} When the request is aborted, either by the
   *   timeout or by `meta.abortSignal`.
   * @throws {EngineError} For any other failure.
   */
  async scrape(meta) {
    const startTime = Date.now();
    const { url, options, logger: logger4, abortSignal } = meta;
    const controller = new AbortController();
    const abortRequest = () => controller.abort();
    const timeoutId = setTimeout(abortRequest, this.config.maxTimeout);
    if (abortSignal) {
      if (abortSignal.aborted) {
        // No "abort" event fires for a signal that is already aborted.
        abortRequest();
      } else {
        abortSignal.addEventListener("abort", abortRequest, { once: true });
      }
    }
    try {
      logger4?.debug(`[http] Fetching ${url}`);
      const response = await fetch(url, {
        method: "GET",
        headers: {
          ...DEFAULT_HEADERS,
          ...options.headers || {}
        },
        redirect: "follow",
        signal: controller.signal
      });
      const duration = Date.now() - startTime;
      // The abort timer stays armed while the body is read so a stalled
      // body stream cannot hang the engine.
      const html = await response.text();
      logger4?.debug(`[http] Got response: ${response.status} (${html.length} chars) in ${duration}ms`);
      if (response.status >= 400) {
        throw new HttpError("http", response.status, response.statusText);
      }
      const challengeType = this.detectChallenge(html);
      if (challengeType) {
        logger4?.debug(`[http] Challenge detected: ${challengeType}`);
        throw new ChallengeDetectedError("http", challengeType);
      }
      const textContent = this.extractText(html);
      if (textContent.length < MIN_CONTENT_LENGTH) {
        logger4?.debug(`[http] Insufficient content: ${textContent.length} chars`);
        throw new InsufficientContentError("http", textContent.length, MIN_CONTENT_LENGTH);
      }
      return {
        html,
        url: response.url,
        statusCode: response.status,
        contentType: response.headers.get("content-type") || void 0,
        headers: this.headersToRecord(response.headers),
        engine: "http",
        duration
      };
    } catch (error) {
      if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError) {
        throw error;
      }
      if (error instanceof Error) {
        if (error.name === "AbortError") {
          throw new EngineTimeoutError("http", this.config.maxTimeout);
        }
        throw new EngineError("http", error.message, { cause: error });
      }
      throw new EngineError("http", String(error));
    } finally {
      // Release the timer and listener on every path, not only on success.
      clearTimeout(timeoutId);
      abortSignal?.removeEventListener("abort", abortRequest);
    }
  }
  /**
   * Detect challenge patterns in HTML (case-insensitive substring match).
   * @param {string} html - Response body.
   * @returns {string|null} "cloudflare" when the matched pattern contains
   *   "cf" or the page shows Cloudflare infrastructure markers,
   *   "bot-detection" for any other match, or null if nothing matched.
   */
  detectChallenge(html) {
    const htmlLower = html.toLowerCase();
    const hasCloudflare = CLOUDFLARE_INFRA_PATTERNS.some((p) => htmlLower.includes(p.toLowerCase()));
    for (const pattern of CHALLENGE_PATTERNS) {
      if (htmlLower.includes(pattern.toLowerCase())) {
        if (hasCloudflare || pattern.includes("cf")) {
          return "cloudflare";
        }
        return "bot-detection";
      }
    }
    return null;
  }
  /**
   * Convert Headers to Record<string, string>
   * @param {Headers} headers - Response headers.
   * @returns {Record<string, string>} Plain object of header name to value.
   */
  headersToRecord(headers) {
    const record = {};
    headers.forEach((value, key) => {
      record[key] = value;
    });
    return record;
  }
  /**
   * Extract visible text from HTML (rough extraction): drops script and
   * style elements, replaces remaining tags with spaces, and collapses
   * whitespace.
   * @param {string} html - HTML source.
   * @returns {string} Approximate visible text.
   */
  extractText(html) {
    return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
  }
  /**
   * @returns {boolean} Always true; this engine has no optional dependency.
   */
  isAvailable() {
    return true;
  }
};
|
|
1320
|
+
// Module-level HttpEngine instance.
var httpEngine = new HttpEngine();
|
|
1321
|
+
|
|
1322
|
+
// src/engines/tlsclient/index.ts
|
|
1323
|
+
import { gotScraping } from "got-scraping";
|
|
1324
|
+
// Substrings whose presence in the response HTML makes
// TlsClientEngine.detectJsRequired report that the page needs JavaScript.
// The comparison is case-insensitive.
var JS_REQUIRED_PATTERNS = [
  // Cloudflare JS challenge
  "cf-browser-verification",
  "challenge-platform",
  "_cf_chl_tk",
  "/cdn-cgi/challenge-platform/",
  // Generic JS requirements
  "Enable JavaScript",
  "JavaScript is required",
  "Please enable JavaScript",
  "requires JavaScript",
  // NOTE(review): this matches any page that contains a <noscript> element,
  // including pages that render fully without JavaScript - confirm that
  // rejecting those pages from this engine is intended.
  "noscript"
];
// Substrings whose presence in the response HTML makes
// TlsClientEngine.detectBlocked report a block page (case-insensitive).
var BLOCKED_PATTERNS = [
  "Access denied",
  "Sorry, you have been blocked",
  "bot detected",
  "suspicious activity",
  "too many requests"
];
// Minimum number of extracted text characters TlsClientEngine requires
// before it accepts a response.
var MIN_CONTENT_LENGTH2 = 100;
|
|
1345
|
+
/**
 * Scraping engine that fetches a page with got-scraping, with no JavaScript
 * execution. A response is rejected when it has an error status, needs
 * JavaScript, looks like a block page, or contains too little visible text.
 */
var TlsClientEngine = class {
  config = ENGINE_CONFIGS.tlsclient;
  available = true;
  constructor() {
    try {
      if (!gotScraping) {
        this.available = false;
      }
    } catch {
      this.available = false;
    }
  }
  /**
   * Fetch `meta.url` and return its HTML together with response details.
   *
   * The request is bounded by `config.maxTimeout` and can be cancelled
   * through the optional `meta.abortSignal`.
   *
   * @param {object} meta - Scrape request: `url`, `options` (may carry
   *   `headers`), optional `logger`, and optional `abortSignal`.
   * @returns {Promise<object>} `{ html, url, statusCode, contentType,
   *   headers, engine, duration }`.
   * @throws {EngineUnavailableError} When got-scraping is not available.
   * @throws {HttpError} When the response status is 400 or above.
   * @throws {ChallengeDetectedError} When the page needs JavaScript or is a
   *   block page.
   * @throws {InsufficientContentError} When the visible text is shorter than
   *   MIN_CONTENT_LENGTH2.
   * @throws {EngineTimeoutError} When the request times out or is cancelled.
   * @throws {EngineError} For any other failure.
   */
  async scrape(meta) {
    if (!this.available) {
      throw new EngineUnavailableError("tlsclient", "got-scraping not available");
    }
    const startTime = Date.now();
    const { url, options, logger: logger4, abortSignal } = meta;
    const controller = new AbortController();
    const abortRequest = () => controller.abort();
    const timeoutId = setTimeout(abortRequest, this.config.maxTimeout);
    if (abortSignal) {
      if (abortSignal.aborted) {
        // No "abort" event fires for a signal that is already aborted.
        abortRequest();
      } else {
        abortSignal.addEventListener("abort", abortRequest, { once: true });
      }
    }
    try {
      logger4?.debug(`[tlsclient] Fetching ${url}`);
      const request = gotScraping({
        url,
        timeout: {
          request: this.config.maxTimeout
        },
        headers: options.headers,
        followRedirect: true
        // got-scraping handles browser fingerprinting automatically
        // It uses header generators and proper TLS settings
      });
      // Connect the controller to the in-flight request; without this the
      // controller (and the caller's abortSignal) would have no effect.
      const cancelRequest = () => request.cancel();
      if (controller.signal.aborted) {
        cancelRequest();
      } else {
        controller.signal.addEventListener("abort", cancelRequest, { once: true });
      }
      const response = await request;
      const duration = Date.now() - startTime;
      const html = response.body;
      logger4?.debug(`[tlsclient] Got response: ${response.statusCode} (${html.length} chars) in ${duration}ms`);
      if (response.statusCode >= 400) {
        throw new HttpError("tlsclient", response.statusCode, response.statusMessage);
      }
      const challengeType = this.detectJsRequired(html);
      if (challengeType) {
        logger4?.debug(`[tlsclient] JS required: ${challengeType}`);
        throw new ChallengeDetectedError("tlsclient", challengeType);
      }
      const blockedReason = this.detectBlocked(html);
      if (blockedReason) {
        logger4?.debug(`[tlsclient] Blocked: ${blockedReason}`);
        throw new ChallengeDetectedError("tlsclient", `blocked: ${blockedReason}`);
      }
      const textContent = this.extractText(html);
      if (textContent.length < MIN_CONTENT_LENGTH2) {
        logger4?.debug(`[tlsclient] Insufficient content: ${textContent.length} chars`);
        throw new InsufficientContentError("tlsclient", textContent.length, MIN_CONTENT_LENGTH2);
      }
      return {
        html,
        url: response.url,
        statusCode: response.statusCode,
        contentType: response.headers["content-type"],
        headers: response.headers,
        engine: "tlsclient",
        duration
      };
    } catch (error) {
      if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError || error instanceof EngineUnavailableError) {
        throw error;
      }
      if (error instanceof Error) {
        if (error.name === "TimeoutError" || error.message.includes("timeout")) {
          throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
        }
        // "CancelError" is what a cancelled got request rejects with.
        if (error.name === "AbortError" || error.name === "CancelError") {
          throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
        }
        throw new EngineError("tlsclient", error.message, { cause: error });
      }
      throw new EngineError("tlsclient", String(error));
    } finally {
      // Release the timer and listener on every path, not only on success.
      clearTimeout(timeoutId);
      abortSignal?.removeEventListener("abort", abortRequest);
    }
  }
  /**
   * Detect patterns that require JS execution (case-insensitive substring
   * match against JS_REQUIRED_PATTERNS).
   * @param {string} html - Response body.
   * @returns {string|null} "cloudflare-js" when the matched pattern contains
   *   "cf" or "cloudflare", "js-required" for any other match, or null.
   */
  detectJsRequired(html) {
    const htmlLower = html.toLowerCase();
    for (const pattern of JS_REQUIRED_PATTERNS) {
      if (htmlLower.includes(pattern.toLowerCase())) {
        if (pattern.includes("cf") || pattern.includes("cloudflare")) {
          return "cloudflare-js";
        }
        return "js-required";
      }
    }
    return null;
  }
  /**
   * Detect blocked/denied patterns (case-insensitive substring match).
   * @param {string} html - Response body.
   * @returns {string|null} The first matching entry of BLOCKED_PATTERNS, or null.
   */
  detectBlocked(html) {
    const htmlLower = html.toLowerCase();
    for (const pattern of BLOCKED_PATTERNS) {
      if (htmlLower.includes(pattern.toLowerCase())) {
        return pattern;
      }
    }
    return null;
  }
  /**
   * Extract visible text from HTML: drops script and style elements,
   * replaces remaining tags with spaces, and collapses whitespace.
   * @param {string} html - HTML source.
   * @returns {string} Approximate visible text.
   */
  extractText(html) {
    return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
  }
  /**
   * @returns {boolean} Whether got-scraping was found at construction time.
   */
  isAvailable() {
    return this.available;
  }
};
|
|
1464
|
+
// Module-level TlsClientEngine instance.
var tlsClientEngine = new TlsClientEngine();
|
|
1465
|
+
|
|
1466
|
+
// src/cloudflare/detector.ts
|
|
1467
|
+
// DOM selectors that detectChallenge queries one by one; a match marks the
// page as a challenge of type "js_challenge".
var CLOUDFLARE_CHALLENGE_SELECTORS = [
  "#challenge-running",
  "#challenge-stage",
  "#challenge-form",
  ".cf-browser-verification",
  "#cf-wrapper",
  "#cf-hcaptcha-container",
  "#turnstile-wrapper"
];
// Lower-case substrings searched in the lower-cased page HTML; any match
// counts as a challenge indicator.
var CLOUDFLARE_TEXT_PATTERNS = [
  "checking if the site connection is secure",
  "this process is automatic. your browser will redirect",
  "ray id:",
  "performance & security by cloudflare"
];
// Lower-case substrings indicating the page is served through Cloudflare.
// detectChallenge returns "not a challenge" immediately when none is present.
var CLOUDFLARE_INFRA_PATTERNS2 = [
  "/cdn-cgi/",
  "cloudflare",
  "__cf_bm",
  "cf-ray"
];
// Lower-case substrings that must ALL be present (detectChallenge uses
// `every`) for the page to be classified as type "blocked".
var CLOUDFLARE_BLOCKED_PATTERNS = [
  "sorry, you have been blocked",
  "ray id:"
];
|
|
1492
|
+
/**
 * Inspect the page currently loaded in `hero` for a Cloudflare challenge.
 *
 * A challenge is reported only when the HTML shows Cloudflare
 * infrastructure markers AND at least one challenge indicator (a known
 * challenge element, known challenge text, a "waiting for ... to respond"
 * message, or a block page). Any error during inspection is reported as
 * "not a challenge", with the error message in `signals`.
 *
 * @param {object} hero - Hero browser instance; `hero.document` is read.
 * @returns {Promise<{isChallenge: boolean, type: string, confidence: number, signals: string[]}>}
 *   `type` is "none", "js_challenge", or "blocked"; `confidence` is 100 for
 *   a challenge and 0 otherwise.
 */
async function detectChallenge(hero) {
  const notAChallenge = (reason) => ({
    isChallenge: false,
    type: "none",
    confidence: 0,
    signals: [reason]
  });
  try {
    if (!hero.document) {
      return notAChallenge("No document available");
    }
    const html = await hero.document.documentElement.outerHTML;
    const htmlLower = html.toLowerCase();

    // Without Cloudflare infrastructure markers nothing else is checked.
    const infraPattern = CLOUDFLARE_INFRA_PATTERNS2.find((p) => htmlLower.includes(p));
    if (infraPattern === undefined) {
      return notAChallenge("No Cloudflare infrastructure detected");
    }
    const signals = [`Cloudflare infra: "${infraPattern}"`];
    let type = "none";
    let hasChallengeIndicator = false;

    // Known challenge elements; selector lookups that throw are ignored.
    for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
      let element;
      try {
        element = await hero.document.querySelector(selector);
      } catch {
        continue;
      }
      if (!element) {
        continue;
      }
      hasChallengeIndicator = true;
      signals.push(`Challenge element: ${selector}`);
      type = "js_challenge";
    }

    // Known challenge text.
    const matchedTexts = CLOUDFLARE_TEXT_PATTERNS.filter((p) => htmlLower.includes(p));
    for (const pattern of matchedTexts) {
      signals.push(`Challenge text: "${pattern}"`);
    }
    if (matchedTexts.length > 0) {
      hasChallengeIndicator = true;
      if (type === "none") {
        type = "js_challenge";
      }
    }

    if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
      hasChallengeIndicator = true;
      signals.push('Challenge text: "waiting for...to respond"');
      if (type === "none") {
        type = "js_challenge";
      }
    }

    // A block page requires every blocked pattern and overrides the type.
    if (CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p))) {
      hasChallengeIndicator = true;
      signals.push("Cloudflare block page detected");
      type = "blocked";
    }

    // Infrastructure presence is already established at this point.
    const isChallenge = hasChallengeIndicator;
    return {
      isChallenge,
      type: isChallenge ? type : "none",
      confidence: isChallenge ? 100 : 0,
      signals
    };
  } catch (error) {
    return notAChallenge(`Error during detection: ${error.message}`);
  }
}
|
|
1569
|
+
/**
 * Report whether the page currently loaded in `hero` is a challenge page.
 *
 * @param {object} hero - Browser instance passed straight to `detectChallenge`.
 * @returns {Promise<boolean>} The `isChallenge` flag of the detection result.
 */
async function isChallengePage(hero) {
  const { isChallenge } = await detectChallenge(hero);
  return isChallenge;
}
|
|
1573
|
+
|
|
1574
|
+
// src/cloudflare/handler.ts
|
|
1575
|
+
/**
 * Poll the page until a previously detected challenge is gone or the wait
 * budget is exhausted.
 *
 * On every poll the function first compares the current URL with
 * `initialUrl` (a change counts as resolution via redirect) and then re-runs
 * `detectChallenge` (a negative result counts as resolution via cleared
 * signals). After either kind of resolution it waits for DOMContentLoaded
 * (30 s cap, timeout tolerated) and for painting to stabilize before
 * returning.
 *
 * @param {object} hero - Browser instance exposing `url`, `waitForLoad` and
 *   `waitForPaintingStable`.
 * @param {object} [options]
 * @param {number} [options.maxWaitMs=45000] - Total polling budget.
 * @param {number} [options.pollIntervalMs=500] - Delay between polls.
 * @param {boolean} [options.verbose=false] - Print progress with `console.log`.
 * @param {string} [options.initialUrl] - URL the challenge was detected on.
 *   When omitted, the URL comparison is skipped so a missing value is not
 *   mistaken for a redirect.
 * @returns {Promise<{resolved: boolean, method: string, waitedMs: number}>}
 *   `method` is `"url_redirect"`, `"signals_cleared"` or `"timeout"`. For the
 *   two resolved outcomes `waitedMs` is the time at which resolution was
 *   noticed; it does not include the post-resolution load wait.
 */
async function waitForChallengeResolution(hero, options = {}) {
  const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
  const startTime = Date.now();
  const log = (msg) => verbose && console.log(` ${msg}`);

  // Shared post-resolution wait: give the real page a chance to load and paint.
  const settle = async (waitingMessage) => {
    log(waitingMessage);
    try {
      await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
      log(` DOMContentLoaded`);
    } catch {
      log(` DOMContentLoaded timeout, continuing...`);
    }
    await hero.waitForPaintingStable().catch(() => {
    });
    log(` Page stabilized`);
  };

  while (Date.now() - startTime < maxWaitMs) {
    const elapsed = Date.now() - startTime;
    try {
      const currentUrl = await hero.url;
      // Only infer a redirect when the caller told us where we started.
      if (initialUrl != null && currentUrl !== initialUrl) {
        log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
        await settle(` Waiting for new page to load...`);
        return { resolved: true, method: "url_redirect", waitedMs: elapsed };
      }
    } catch {
      // Best effort: a failed URL read falls through to signal detection.
    }
    const detection = await detectChallenge(hero);
    if (!detection.isChallenge) {
      log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
      await settle(` Waiting for page to load...`);
      return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
    }
    log(
      `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
    );
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  return {
    resolved: false,
    method: "timeout",
    waitedMs: Date.now() - startTime
  };
}
|
|
1625
|
+
/**
 * Poll the document every 300 ms until `selector` matches an element or
 * `maxWaitMs` has elapsed. Errors thrown by the lookup are treated as
 * "not found yet".
 *
 * @param {object} hero - Browser instance exposing `document.querySelector`.
 * @param {string} selector - CSS selector to look for.
 * @param {number} maxWaitMs - Polling budget in milliseconds.
 * @param {boolean} [verbose=false] - Print progress with `console.log`.
 * @returns {Promise<{found: boolean, waitedMs: number}>}
 */
async function waitForSelector(hero, selector, maxWaitMs, verbose = false) {
  const began = Date.now();
  const say = (msg) => verbose && console.log(` ${msg}`);
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  say(`Waiting for selector: "${selector}"`);
  while (Date.now() - began < maxWaitMs) {
    let match = null;
    try {
      match = await hero.document.querySelector(selector);
    } catch {
      match = null;
    }
    if (match) {
      const tookMs = Date.now() - began;
      say(`\u2713 Selector found after ${(tookMs / 1e3).toFixed(1)}s`);
      return { found: true, waitedMs: tookMs };
    }
    await pause(300);
  }
  say(`\u2717 Selector not found within timeout`);
  return { found: false, waitedMs: Date.now() - began };
}
|
|
1644
|
+
/**
 * Detect a challenge on the current page and, if one is present, wait for it
 * to resolve.
 *
 * @param {object} hero - Browser instance.
 * @param {object} [options={}] - Forwarded to `waitForChallengeResolution`;
 *   `initialUrl` is always replaced with the URL read here.
 * @returns {Promise<{resolved: boolean, method: string, waitedMs: number}>}
 *   An immediate `signals_cleared` result with `waitedMs: 0` when no
 *   challenge is detected, otherwise the outcome of the wait.
 */
async function handleChallenge(hero, options = {}) {
  const initialUrl = await hero.url;
  const { isChallenge } = await detectChallenge(hero);
  return isChallenge
    ? waitForChallengeResolution(hero, { ...options, initialUrl })
    : { resolved: true, method: "signals_cleared", waitedMs: 0 };
}
|
|
1655
|
+
|
|
1656
|
+
// src/engines/hero/index.ts
|
|
1657
|
+
// Minimum number of visible-text characters a page must contain to count as scraped.
var MIN_CONTENT_LENGTH3 = 100;

/**
 * Scrape engine that drives a pooled browser (`options.pool`) to load a page,
 * wait out challenges and redirects, and return the rendered HTML.
 */
var HeroEngine = class {
  config = ENGINE_CONFIGS.hero;

  /**
   * Load `meta.url` in a pooled browser and return its HTML.
   *
   * @param {object} meta
   * @param {string} meta.url - Page to load.
   * @param {object} meta.options - Must carry `pool`; `timeoutMs`,
   *   `waitForSelector` and `verbose` are also read.
   * @param {object} [meta.logger] - Logger with a `debug` method.
   * @param {AbortSignal} [meta.abortSignal] - Checked between steps.
   * @returns {Promise<{html: string, url: string, statusCode: number, engine: string, duration: number}>}
   * @throws {EngineUnavailableError} When no browser pool was supplied.
   * @throws {EngineTimeoutError} When aborted, or on a timeout-like failure.
   * @throws {ChallengeDetectedError} When the page is blocked or a challenge does not resolve.
   * @throws {InsufficientContentError} When the extracted text is shorter than the minimum.
   * @throws {EngineError} For any other failure.
   */
  async scrape(meta) {
    const startTime = Date.now();
    const { url, options, logger: logger4, abortSignal } = meta;
    const pool = options.pool;
    if (!pool) {
      throw new EngineUnavailableError("hero", "Browser pool not available");
    }
    if (abortSignal?.aborted) {
      throw new EngineTimeoutError("hero", 0);
    }
    // Read the signal's state directly rather than mirroring it through an
    // "abort" listener: a listener attached after the signal has already
    // fired (e.g. while waiting for a browser from the pool) never runs.
    const throwIfAborted = () => {
      if (abortSignal?.aborted) {
        throw new EngineTimeoutError("hero", Date.now() - startTime);
      }
    };
    logger4?.debug(`[hero] Starting browser scrape of ${url}`);
    try {
      const result = await pool.withBrowser(async (hero) => {
        // Do not start navigating if the caller gave up while we were queued.
        throwIfAborted();
        const timeoutMs = options.timeoutMs || this.config.maxTimeout;
        await hero.goto(url, { timeoutMs });
        throwIfAborted();
        try {
          await hero.waitForLoad("DomContentLoaded", { timeoutMs });
        } catch {
          // A load-event timeout is tolerated; continue with whatever loaded.
        }
        await hero.waitForPaintingStable();
        throwIfAborted();
        const initialUrl = await hero.url;
        const detection = await detectChallenge(hero);
        if (detection.isChallenge) {
          logger4?.debug(`[hero] Challenge detected: ${detection.type}`);
          if (detection.type === "blocked") {
            throw new ChallengeDetectedError("hero", "blocked");
          }
          const resolution = await waitForChallengeResolution(hero, {
            maxWaitMs: 45e3,
            pollIntervalMs: 500,
            verbose: options.verbose,
            initialUrl
          });
          if (!resolution.resolved) {
            throw new ChallengeDetectedError("hero", `unresolved: ${detection.type}`);
          }
          logger4?.debug(`[hero] Challenge resolved via ${resolution.method} in ${resolution.waitedMs}ms`);
        }
        throwIfAborted();
        await this.waitForFinalPage(hero, url, logger4);
        throwIfAborted();
        if (options.waitForSelector) {
          try {
            await hero.waitForElement(hero.document.querySelector(options.waitForSelector), {
              timeoutMs
            });
          } catch {
            logger4?.debug(`[hero] Selector not found: ${options.waitForSelector}`);
          }
        }
        const html = await hero.document.documentElement.outerHTML;
        const finalUrl = await hero.url;
        const textContent = this.extractText(html);
        if (textContent.length < MIN_CONTENT_LENGTH3) {
          logger4?.debug(`[hero] Insufficient content: ${textContent.length} chars`);
          throw new InsufficientContentError("hero", textContent.length, MIN_CONTENT_LENGTH3);
        }
        const duration = Date.now() - startTime;
        logger4?.debug(`[hero] Success: ${html.length} chars in ${duration}ms`);
        return {
          html,
          url: finalUrl,
          statusCode: 200,
          // Hero doesn't expose status code directly
          engine: "hero",
          duration
        };
      });
      return result;
    } catch (error) {
      // Engine errors raised above are already in their final form.
      if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError || error instanceof EngineUnavailableError) {
        throw error;
      }
      if (error instanceof Error) {
        if (error.name === "TimeoutError" || error.message.includes("timeout")) {
          throw new EngineTimeoutError("hero", this.config.maxTimeout);
        }
        if (error.message.includes("Navigation") || error.message.includes("ERR_")) {
          throw new EngineError("hero", `Navigation failed: ${error.message}`, { cause: error });
        }
        throw new EngineError("hero", error.message, { cause: error });
      }
      throw new EngineError("hero", String(error));
    }
  }

  /**
   * Wait for the final page to load after any Cloudflare redirects.
   *
   * Waits for "AllContentLoaded" (15 s cap). If the URL differs from
   * `originalUrl` (ignoring trailing slashes) or contains "__cf_chl", polls
   * every 500 ms until the URL is unchanged on two consecutive polls or the
   * 15 s budget is used up, then waits for "AllContentLoaded" again (10 s
   * cap). Always finishes with a paint-stable wait plus a fixed 2 s pause.
   *
   * @param {object} hero - Browser instance.
   * @param {string} originalUrl - URL that was requested.
   * @param {object} [logger4] - Logger with a `debug` method.
   * @returns {Promise<void>}
   */
  async waitForFinalPage(hero, originalUrl, logger4) {
    const maxWaitMs = 15e3;
    const startTime = Date.now();
    try {
      await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
    } catch {
      // Load timeout tolerated.
    }
    let currentUrl = await hero.url;
    const normalizeUrl2 = (url) => url.replace(/\/+$/, "");
    const urlChanged = normalizeUrl2(currentUrl) !== normalizeUrl2(originalUrl);
    if (urlChanged || currentUrl.includes("__cf_chl")) {
      logger4?.debug(`[hero] Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
      let lastUrl = currentUrl;
      let stableCount = 0;
      while (Date.now() - startTime < maxWaitMs) {
        await new Promise((resolve) => setTimeout(resolve, 500));
        try {
          currentUrl = await hero.url;
          if (currentUrl === lastUrl) {
            stableCount++;
            if (stableCount >= 2) {
              break;
            }
          } else {
            stableCount = 0;
            lastUrl = currentUrl;
            logger4?.debug(`[hero] URL changed to: ${currentUrl}`);
          }
        } catch {
          // A failed URL read is ignored; the next poll retries.
        }
      }
      try {
        await hero.waitForLoad("AllContentLoaded", { timeoutMs: 1e4 });
      } catch {
        // Load timeout tolerated.
      }
    }
    await hero.waitForPaintingStable();
    await new Promise((resolve) => setTimeout(resolve, 2e3));
  }

  /**
   * Extract visible text from HTML: drops <script> and <style> elements,
   * replaces remaining tags with spaces, collapses whitespace and trims.
   *
   * @param {string} html
   * @returns {string}
   */
  extractText(html) {
    return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
  }

  /** @returns {boolean} Always `true`. */
  isAvailable() {
    return true;
  }
};
|
|
1813
|
+
// Single HeroEngine instance, registered below under the "hero" key.
var heroEngine = new HeroEngine();

// src/engines/orchestrator.ts
// Engine name -> engine instance; the orchestrator indexes this by name.
var ENGINE_REGISTRY = { http: httpEngine, tlsclient: tlsClientEngine, hero: heroEngine };
|
|
1821
|
+
/**
 * Runs a scrape through an ordered cascade of engines, falling back to the
 * next engine when the current one fails with a retryable error.
 */
var EngineOrchestrator = class {
  options;
  engines;
  engineOrder;

  /**
   * @param {object} [options={}] - `engines`, `skipEngines`, `forceEngine`,
   *   `logger` and `verbose` are read.
   */
  constructor(options = {}) {
    this.options = options;
    this.engineOrder = this.resolveEngineOrder();
    this.engines = this.engineOrder.map((name) => ENGINE_REGISTRY[name]).filter((engine) => engine.isAvailable());
  }

  /**
   * Resolve the engine order based on options: `forceEngine` wins outright;
   * otherwise `engines` (or the default order) minus `skipEngines`.
   *
   * @returns {string[]} Engine names in the order they will be tried.
   */
  resolveEngineOrder() {
    if (this.options.forceEngine) {
      return [this.options.forceEngine];
    }
    let order = this.options.engines || [...DEFAULT_ENGINE_ORDER];
    if (this.options.skipEngines) {
      order = order.filter((e) => !this.options.skipEngines.includes(e));
    }
    return order;
  }

  /**
   * Get available engines.
   *
   * @returns {string[]} Names of the engines that reported themselves available.
   */
  getAvailableEngines() {
    return this.engines.map((e) => e.config.name);
  }

  /**
   * Scrape a URL using the engine cascade.
   *
   * Each engine gets its own AbortController that fires after the engine's
   * `config.maxTimeout` or when the caller's `meta.abortSignal` aborts. The
   * timer and the listener on the caller's signal are released after every
   * attempt, and no further engine is started once the caller has aborted.
   *
   * @param meta - Engine metadata (url, options, logger, abortSignal)
   * @returns Scrape result with engine metadata
   * @throws AllEnginesFailedError if all engines fail
   */
  async scrape(meta) {
    const attemptedEngines = [];
    const engineErrors = /* @__PURE__ */ new Map();
    const logger4 = meta.logger || this.options.logger;
    const verbose = this.options.verbose || meta.options.verbose;
    const callerSignal = meta.abortSignal;
    if (this.engines.length === 0) {
      throw new AllEnginesFailedError([], engineErrors);
    }
    const log = (msg) => {
      if (verbose) {
        logger4?.info(msg);
      } else {
        logger4?.debug(msg);
      }
    };
    log(`[orchestrator] Starting scrape of ${meta.url} with engines: ${this.engineOrder.join(" \u2192 ")}`);
    for (const engine of this.engines) {
      // An "abort" listener added after the signal fired would never run, so
      // check the state explicitly and stop instead of starting more engines.
      if (callerSignal?.aborted) {
        log(`[orchestrator] Aborted by caller, stopping cascade`);
        break;
      }
      const engineName = engine.config.name;
      attemptedEngines.push(engineName);
      const controller = new AbortController();
      const forwardAbort = () => controller.abort();
      const timeoutId = setTimeout(forwardAbort, engine.config.maxTimeout);
      callerSignal?.addEventListener("abort", forwardAbort, { once: true });
      try {
        log(`[orchestrator] Trying ${engineName} engine...`);
        const result = await engine.scrape({
          ...meta,
          abortSignal: controller.signal
        });
        log(`[orchestrator] \u2713 ${engineName} succeeded in ${result.duration}ms`);
        return {
          ...result,
          attemptedEngines,
          engineErrors
        };
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error));
        engineErrors.set(engineName, err);
        if (error instanceof ChallengeDetectedError) {
          log(`[orchestrator] ${engineName} detected challenge: ${error.challengeType}`);
        } else if (error instanceof InsufficientContentError) {
          log(`[orchestrator] ${engineName} insufficient content: ${error.contentLength} chars`);
        } else if (error instanceof HttpError) {
          log(`[orchestrator] ${engineName} HTTP error: ${error.statusCode}`);
        } else if (error instanceof EngineTimeoutError) {
          log(`[orchestrator] ${engineName} timed out after ${error.timeoutMs}ms`);
        } else if (error instanceof EngineUnavailableError) {
          log(`[orchestrator] ${engineName} unavailable: ${err.message}`);
        } else {
          log(`[orchestrator] ${engineName} failed: ${err.message}`);
        }
        if (!this.shouldRetry(error)) {
          log(`[orchestrator] Non-retryable error, stopping cascade`);
          break;
        }
        log(`[orchestrator] Falling back to next engine...`);
      } finally {
        // Runs on success, failure and break alike: release the per-attempt
        // timer and detach from the caller's (possibly long-lived) signal.
        clearTimeout(timeoutId);
        callerSignal?.removeEventListener("abort", forwardAbort);
      }
    }
    log(`[orchestrator] All engines failed for ${meta.url}`);
    throw new AllEnginesFailedError(attemptedEngines, engineErrors);
  }

  /**
   * Determine if we should retry with next engine.
   *
   * Challenge, insufficient-content, timeout and unavailable errors always
   * fall through to the next engine; HTTP errors only for 403, 429 and 5xx;
   * other EngineErrors follow their own `retryable` flag; anything else is
   * retried.
   *
   * @param {unknown} error
   * @returns {boolean}
   */
  shouldRetry(error) {
    if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError) {
      return true;
    }
    if (error instanceof HttpError) {
      return error.statusCode === 403 || error.statusCode === 429 || error.statusCode >= 500;
    }
    if (error instanceof EngineUnavailableError) {
      return true;
    }
    if (error instanceof EngineError) {
      return error.retryable;
    }
    return true;
  }
};
|
|
1536
|
-
function isValidFormat(format) {
|
|
1537
|
-
return format === "markdown" || format === "html" || format === "json" || format === "text";
|
|
1538
|
-
}
|
|
1539
|
-
function shouldCrawlUrl2(url, baseDomain) {
|
|
1540
|
-
return url.hostname === baseDomain || url.hostname.endsWith(`.${baseDomain}`);
|
|
1541
|
-
}
|
|
1542
1942
|
|
|
1543
1943
|
// src/scraper.ts
|
|
1544
1944
|
var Scraper = class {
|
|
1545
1945
|
options;
|
|
1546
|
-
pool;
|
|
1547
1946
|
logger = createLogger("scraper");
|
|
1548
1947
|
robotsCache = /* @__PURE__ */ new Map();
|
|
1549
1948
|
constructor(options) {
|
|
@@ -1551,10 +1950,6 @@ var Scraper = class {
|
|
|
1551
1950
|
...DEFAULT_OPTIONS,
|
|
1552
1951
|
...options
|
|
1553
1952
|
};
|
|
1554
|
-
if (!options.pool) {
|
|
1555
|
-
throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
|
|
1556
|
-
}
|
|
1557
|
-
this.pool = options.pool;
|
|
1558
1953
|
}
|
|
1559
1954
|
/**
|
|
1560
1955
|
* Get robots.txt rules for a URL, cached per domain
|
|
@@ -1622,52 +2017,7 @@ var Scraper = class {
|
|
|
1622
2017
|
return { result: null, error: lastError };
|
|
1623
2018
|
}
|
|
1624
2019
|
/**
|
|
1625
|
-
*
|
|
1626
|
-
* Cloudflare often does silent redirects even when bypassed, we need to ensure
|
|
1627
|
-
* we're on the actual content page before scraping.
|
|
1628
|
-
*/
|
|
1629
|
-
async waitForFinalPage(hero, originalUrl, verbose) {
|
|
1630
|
-
const maxWaitMs = 15e3;
|
|
1631
|
-
const startTime = Date.now();
|
|
1632
|
-
const log = (msg) => verbose && this.logger.info(msg);
|
|
1633
|
-
try {
|
|
1634
|
-
await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
|
|
1635
|
-
} catch {
|
|
1636
|
-
}
|
|
1637
|
-
let currentUrl = await hero.url;
|
|
1638
|
-
const normalizeUrl2 = (url) => url.replace(/\/+$/, "");
|
|
1639
|
-
const urlChanged = normalizeUrl2(currentUrl) !== normalizeUrl2(originalUrl);
|
|
1640
|
-
if (urlChanged || currentUrl.includes("__cf_chl")) {
|
|
1641
|
-
log(`Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
|
|
1642
|
-
let lastUrl = currentUrl;
|
|
1643
|
-
let stableCount = 0;
|
|
1644
|
-
while (Date.now() - startTime < maxWaitMs) {
|
|
1645
|
-
await new Promise((resolve) => setTimeout(resolve, 500));
|
|
1646
|
-
try {
|
|
1647
|
-
currentUrl = await hero.url;
|
|
1648
|
-
if (currentUrl === lastUrl) {
|
|
1649
|
-
stableCount++;
|
|
1650
|
-
if (stableCount >= 2) {
|
|
1651
|
-
break;
|
|
1652
|
-
}
|
|
1653
|
-
} else {
|
|
1654
|
-
stableCount = 0;
|
|
1655
|
-
lastUrl = currentUrl;
|
|
1656
|
-
log(`URL changed to: ${currentUrl}`);
|
|
1657
|
-
}
|
|
1658
|
-
} catch {
|
|
1659
|
-
}
|
|
1660
|
-
}
|
|
1661
|
-
try {
|
|
1662
|
-
await hero.waitForLoad("AllContentLoaded", { timeoutMs: 1e4 });
|
|
1663
|
-
} catch {
|
|
1664
|
-
}
|
|
1665
|
-
}
|
|
1666
|
-
await hero.waitForPaintingStable();
|
|
1667
|
-
await new Promise((resolve) => setTimeout(resolve, 2e3));
|
|
1668
|
-
}
|
|
1669
|
-
/**
|
|
1670
|
-
* Scrape a single URL
|
|
2020
|
+
* Scrape a single URL using the engine orchestrator
|
|
1671
2021
|
*/
|
|
1672
2022
|
async scrapeSingleUrl(url, index) {
|
|
1673
2023
|
const startTime = Date.now();
|
|
@@ -1676,133 +2026,84 @@ var Scraper = class {
|
|
|
1676
2026
|
throw new Error(`URL blocked by robots.txt: ${url}`);
|
|
1677
2027
|
}
|
|
1678
2028
|
try {
|
|
1679
|
-
|
|
1680
|
-
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
if (this.options.waitForSelector) {
|
|
1713
|
-
try {
|
|
1714
|
-
await hero.waitForElement(hero.document.querySelector(this.options.waitForSelector), {
|
|
1715
|
-
timeoutMs: this.options.timeoutMs
|
|
1716
|
-
});
|
|
1717
|
-
} catch (error) {
|
|
1718
|
-
this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
|
|
1719
|
-
}
|
|
1720
|
-
}
|
|
1721
|
-
const pageTitle = await hero.document.title;
|
|
1722
|
-
const html = await hero.document.documentElement.outerHTML;
|
|
1723
|
-
const cleanedHtml = cleanContent(html, url, {
|
|
1724
|
-
removeAds: this.options.removeAds,
|
|
1725
|
-
removeBase64Images: this.options.removeBase64Images
|
|
2029
|
+
const orchestrator = new EngineOrchestrator({
|
|
2030
|
+
engines: this.options.engines,
|
|
2031
|
+
skipEngines: this.options.skipEngines,
|
|
2032
|
+
forceEngine: this.options.forceEngine,
|
|
2033
|
+
logger: this.logger,
|
|
2034
|
+
verbose: this.options.verbose
|
|
2035
|
+
});
|
|
2036
|
+
const engineResult = await orchestrator.scrape({
|
|
2037
|
+
url,
|
|
2038
|
+
options: this.options,
|
|
2039
|
+
logger: this.logger
|
|
2040
|
+
});
|
|
2041
|
+
if (this.options.verbose) {
|
|
2042
|
+
this.logger.info(
|
|
2043
|
+
`[scraper] ${url} scraped with ${engineResult.engine} engine in ${engineResult.duration}ms (attempted: ${engineResult.attemptedEngines.join(" \u2192 ")})`
|
|
2044
|
+
);
|
|
2045
|
+
}
|
|
2046
|
+
const cleanedHtml = cleanContent(engineResult.html, engineResult.url, {
|
|
2047
|
+
removeAds: this.options.removeAds,
|
|
2048
|
+
removeBase64Images: this.options.removeBase64Images,
|
|
2049
|
+
onlyMainContent: this.options.onlyMainContent,
|
|
2050
|
+
includeTags: this.options.includeTags,
|
|
2051
|
+
excludeTags: this.options.excludeTags
|
|
2052
|
+
});
|
|
2053
|
+
const websiteMetadata = extractMetadata(cleanedHtml, engineResult.url);
|
|
2054
|
+
const duration = Date.now() - startTime;
|
|
2055
|
+
const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
|
|
2056
|
+
const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
|
|
2057
|
+
if (this.options.onProgress) {
|
|
2058
|
+
this.options.onProgress({
|
|
2059
|
+
completed: index + 1,
|
|
2060
|
+
total: this.options.urls.length,
|
|
2061
|
+
currentUrl: url
|
|
1726
2062
|
});
|
|
1727
|
-
|
|
1728
|
-
|
|
1729
|
-
|
|
1730
|
-
const
|
|
1731
|
-
|
|
1732
|
-
|
|
1733
|
-
|
|
1734
|
-
// Will be set by formatter
|
|
1735
|
-
html: cleanedHtml,
|
|
1736
|
-
fetchedAt: scrapedAt,
|
|
1737
|
-
depth: 0,
|
|
1738
|
-
hadChallenge,
|
|
1739
|
-
challengeType,
|
|
1740
|
-
waitTimeMs
|
|
1741
|
-
};
|
|
1742
|
-
const markdown = this.options.formats.includes("markdown") ? formatToMarkdown(
|
|
1743
|
-
[page],
|
|
1744
|
-
url,
|
|
1745
|
-
scrapedAt,
|
|
1746
|
-
duration,
|
|
1747
|
-
websiteMetadata,
|
|
1748
|
-
this.options.includeMetadata
|
|
1749
|
-
) : void 0;
|
|
1750
|
-
const htmlOutput = this.options.formats.includes("html") ? formatToHTML([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1751
|
-
const json = this.options.formats.includes("json") ? formatToJson([page], url, scrapedAt, duration, websiteMetadata) : void 0;
|
|
1752
|
-
const text = this.options.formats.includes("text") ? formatToText(
|
|
1753
|
-
[page],
|
|
1754
|
-
url,
|
|
1755
|
-
scrapedAt,
|
|
1756
|
-
duration,
|
|
1757
|
-
websiteMetadata,
|
|
1758
|
-
this.options.includeMetadata
|
|
1759
|
-
) : void 0;
|
|
1760
|
-
if (this.options.onProgress) {
|
|
1761
|
-
this.options.onProgress({
|
|
1762
|
-
completed: index + 1,
|
|
1763
|
-
total: this.options.urls.length,
|
|
1764
|
-
currentUrl: url
|
|
1765
|
-
});
|
|
1766
|
-
}
|
|
1767
|
-
let proxyMetadata;
|
|
1768
|
-
if (this.options.proxy) {
|
|
1769
|
-
const proxy = this.options.proxy;
|
|
1770
|
-
if (proxy.url) {
|
|
1771
|
-
try {
|
|
1772
|
-
const proxyUrl = new URL(proxy.url);
|
|
1773
|
-
proxyMetadata = {
|
|
1774
|
-
host: proxyUrl.hostname,
|
|
1775
|
-
port: parseInt(proxyUrl.port, 10) || 80,
|
|
1776
|
-
country: proxy.country
|
|
1777
|
-
};
|
|
1778
|
-
} catch {
|
|
1779
|
-
}
|
|
1780
|
-
} else if (proxy.host && proxy.port) {
|
|
2063
|
+
}
|
|
2064
|
+
let proxyMetadata;
|
|
2065
|
+
if (this.options.proxy) {
|
|
2066
|
+
const proxy = this.options.proxy;
|
|
2067
|
+
if (proxy.url) {
|
|
2068
|
+
try {
|
|
2069
|
+
const proxyUrl = new URL(proxy.url);
|
|
1781
2070
|
proxyMetadata = {
|
|
1782
|
-
host:
|
|
1783
|
-
port:
|
|
2071
|
+
host: proxyUrl.hostname,
|
|
2072
|
+
port: parseInt(proxyUrl.port, 10) || 80,
|
|
1784
2073
|
country: proxy.country
|
|
1785
2074
|
};
|
|
2075
|
+
} catch {
|
|
1786
2076
|
}
|
|
2077
|
+
} else if (proxy.host && proxy.port) {
|
|
2078
|
+
proxyMetadata = {
|
|
2079
|
+
host: proxy.host,
|
|
2080
|
+
port: proxy.port,
|
|
2081
|
+
country: proxy.country
|
|
2082
|
+
};
|
|
1787
2083
|
}
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
return result;
|
|
1803
|
-
});
|
|
2084
|
+
}
|
|
2085
|
+
const result = {
|
|
2086
|
+
markdown,
|
|
2087
|
+
html: htmlOutput,
|
|
2088
|
+
metadata: {
|
|
2089
|
+
baseUrl: url,
|
|
2090
|
+
totalPages: 1,
|
|
2091
|
+
scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2092
|
+
duration,
|
|
2093
|
+
website: websiteMetadata,
|
|
2094
|
+
proxy: proxyMetadata
|
|
2095
|
+
}
|
|
2096
|
+
};
|
|
2097
|
+
return result;
|
|
1804
2098
|
} catch (error) {
|
|
1805
|
-
|
|
2099
|
+
if (error instanceof AllEnginesFailedError) {
|
|
2100
|
+
const engineSummary = error.attemptedEngines.map((e) => `${e}: ${error.errors.get(e)?.message || "unknown"}`).join("; ");
|
|
2101
|
+
this.logger.error(`Failed to scrape ${url}: All engines failed - ${engineSummary}`);
|
|
2102
|
+
} else if (error instanceof Error) {
|
|
2103
|
+
this.logger.error(`Failed to scrape ${url}: ${error.message}`);
|
|
2104
|
+
} else {
|
|
2105
|
+
this.logger.error(`Failed to scrape ${url}: ${String(error)}`);
|
|
2106
|
+
}
|
|
1806
2107
|
if (this.options.onProgress) {
|
|
1807
2108
|
this.options.onProgress({
|
|
1808
2109
|
completed: index + 1,
|
|
@@ -1844,7 +2145,7 @@ async function scrape(options) {
|
|
|
1844
2145
|
}
|
|
1845
2146
|
|
|
1846
2147
|
// src/crawler.ts
|
|
1847
|
-
import { parseHTML as
|
|
2148
|
+
import { parseHTML as parseHTML3 } from "linkedom";
|
|
1848
2149
|
|
|
1849
2150
|
// src/utils/rate-limiter.ts
|
|
1850
2151
|
import pLimit2 from "p-limit";
|
|
@@ -1993,12 +2294,26 @@ var Crawler = class {
|
|
|
1993
2294
|
*/
|
|
1994
2295
|
extractLinks(html, baseUrl, depth) {
|
|
1995
2296
|
const links = [];
|
|
1996
|
-
const { document } =
|
|
2297
|
+
const { document } = parseHTML3(html);
|
|
1997
2298
|
document.querySelectorAll("a[href]").forEach((anchor) => {
|
|
1998
|
-
const
|
|
2299
|
+
const rawHref = anchor.getAttribute("href");
|
|
2300
|
+
if (!rawHref) return;
|
|
2301
|
+
const href = rawHref.trim();
|
|
1999
2302
|
if (!href) return;
|
|
2000
|
-
|
|
2303
|
+
if (href.startsWith("#")) return;
|
|
2304
|
+
const lowerHref = href.toLowerCase();
|
|
2305
|
+
if (lowerHref.startsWith("javascript:") || lowerHref.startsWith("mailto:") || lowerHref.startsWith("tel:") || lowerHref.startsWith("data:") || lowerHref.startsWith("blob:") || lowerHref.startsWith("ftp:")) {
|
|
2306
|
+
return;
|
|
2307
|
+
}
|
|
2308
|
+
let resolved = resolveUrl(href, baseUrl);
|
|
2001
2309
|
if (!resolved || !isValidUrl(resolved)) return;
|
|
2310
|
+
try {
|
|
2311
|
+
const parsed = new URL(resolved);
|
|
2312
|
+
parsed.hash = "";
|
|
2313
|
+
resolved = parsed.toString();
|
|
2314
|
+
} catch {
|
|
2315
|
+
return;
|
|
2316
|
+
}
|
|
2002
2317
|
if (!isSameDomain(resolved, this.options.url)) return;
|
|
2003
2318
|
if (!isContentUrl(resolved)) return;
|
|
2004
2319
|
if (!shouldIncludeUrl(resolved, this.options.includePatterns, this.options.excludePatterns)) return;
|
|
@@ -3046,16 +3361,251 @@ async function isDaemonRunning(port = DEFAULT_DAEMON_PORT) {
|
|
|
3046
3361
|
const client = new DaemonClient({ port, timeoutMs: 5e3 });
|
|
3047
3362
|
return client.isRunning();
|
|
3048
3363
|
}
|
|
3364
|
+
|
|
3365
|
+
// src/formatters/html.ts
|
|
3366
|
+
/**
 * HTML "formatter": hands the given HTML back unchanged.
 *
 * @param {string} html
 * @returns {string} The same value that was passed in.
 */
function formatToHTML(html) {
  const output = html;
  return output;
}
|
|
3369
|
+
|
|
3370
|
+
// src/errors.ts
|
|
3371
|
+
// String enum of error codes: every key maps to an identical string value.
var ReaderErrorCode = /* @__PURE__ */ ((codes) => {
  const names = [
    "NETWORK_ERROR",
    "TIMEOUT",
    "CONNECTION_REFUSED",
    "CLOUDFLARE_CHALLENGE",
    "BOT_DETECTED",
    "ACCESS_DENIED",
    "CONTENT_EXTRACTION_FAILED",
    "EMPTY_CONTENT",
    "INVALID_URL",
    "INVALID_OPTIONS",
    "ROBOTS_BLOCKED",
    "BROWSER_ERROR",
    "POOL_EXHAUSTED",
    "CLIENT_CLOSED",
    "NOT_INITIALIZED",
    "UNKNOWN"
  ];
  for (const name of names) {
    codes[name] = name;
  }
  return codes;
})(ReaderErrorCode || {});
|
|
3390
|
+
/**
 * Base error carrying a code plus optional URL, cause and a retryable flag.
 * The creation time is recorded as an ISO-8601 string.
 */
var ReaderError = class extends Error {
  code;
  url;
  cause;
  timestamp;
  retryable;

  /**
   * @param {string} message - Human-readable description.
   * @param {string} code - Error code.
   * @param {object} [options]
   * @param {string} [options.url] - URL the error relates to.
   * @param {Error} [options.cause] - Underlying error.
   * @param {boolean} [options.retryable] - Defaults to `false` when nullish.
   */
  constructor(message, code, options) {
    super(message);
    const details = options ?? {};
    this.name = "ReaderError";
    this.code = code;
    this.url = details.url;
    this.cause = details.cause;
    this.timestamp = (/* @__PURE__ */ new Date()).toISOString();
    this.retryable = details.retryable ?? false;
    // captureStackTrace is not available on every runtime.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, this.constructor);
    }
  }

  /**
   * Convert to a plain object for serialization. Only the cause's message
   * is included, not the cause object itself.
   */
  toJSON() {
    const { name, code, message, url, timestamp, retryable, cause, stack } = this;
    return {
      name,
      code,
      message,
      url,
      timestamp,
      retryable,
      cause: cause?.message,
      stack
    };
  }
};
|
|
3424
|
+
/**
 * Error with code "NETWORK_ERROR". It is always marked retryable, regardless
 * of any `retryable` value in `options`.
 */
var NetworkError = class extends ReaderError {
  /**
   * @param {string} message
   * @param {object} [options] - Forwarded to ReaderError (`retryable` is overridden).
   */
  constructor(message, options) {
    const forwarded = { ...options, retryable: true };
    super(message, "NETWORK_ERROR" /* NETWORK_ERROR */, forwarded);
    this.name = "NetworkError";
  }
};
|
|
3433
|
+
/**
 * Raised when an operation exceeds its time budget. Uses the TIMEOUT code,
 * is always marked retryable, and records the budget in `timeoutMs`.
 */
var TimeoutError = class extends ReaderError {
  timeoutMs;
  /**
   * @param {string} message - Human-readable description.
   * @param {number} timeoutMs - Value stored on `this.timeoutMs`.
   * @param {object} [options] - Forwarded to ReaderError (e.g. `url`, `cause`).
   */
  constructor(message, timeoutMs, options) {
    // `retryable` is written last so it overrides whatever the caller passed.
    const settings = { ...options, retryable: true };
    super(message, "TIMEOUT" /* TIMEOUT */, settings);
    this.name = "TimeoutError";
    this.timeoutMs = timeoutMs;
  }
  /**
   * @returns {object} The base serialization plus `timeoutMs`.
   */
  toJSON() {
    const base = super.toJSON();
    return { ...base, timeoutMs: this.timeoutMs };
  }
};
|
|
3450
|
+
/**
 * Raised when a Cloudflare challenge was not resolved. Uses the
 * CLOUDFLARE_CHALLENGE code, is always marked retryable, and records the
 * challenge type in `challengeType`.
 */
var CloudflareError = class extends ReaderError {
  challengeType;
  /**
   * @param {string} challengeType - Interpolated into the message and stored on the instance.
   * @param {object} [options] - Forwarded to ReaderError (e.g. `url`, `cause`).
   */
  constructor(challengeType, options) {
    const text = `Cloudflare ${challengeType} challenge not resolved. Consider using a residential proxy or increasing timeout.`;
    // `retryable` is written last so it overrides whatever the caller passed.
    const settings = { ...options, retryable: true };
    super(text, "CLOUDFLARE_CHALLENGE" /* CLOUDFLARE_CHALLENGE */, settings);
    this.name = "CloudflareError";
    this.challengeType = challengeType;
  }
  /**
   * @returns {object} The base serialization plus `challengeType`.
   */
  toJSON() {
    const base = super.toJSON();
    return { ...base, challengeType: this.challengeType };
  }
};
|
|
3471
|
+
/**
 * Raised when access to a resource is denied. Uses the ACCESS_DENIED code,
 * is never retryable, and records `options.statusCode` when provided.
 */
var AccessDeniedError = class extends ReaderError {
  statusCode;
  /**
   * @param {string} message - Human-readable description.
   * @param {object} [options] - Forwarded to ReaderError (e.g. `url`, `cause`).
   * @param {*} [options.statusCode] - Copied onto `this.statusCode`.
   */
  constructor(message, options) {
    // `retryable` is written last so it overrides whatever the caller passed.
    const settings = { ...options, retryable: false };
    super(message, "ACCESS_DENIED" /* ACCESS_DENIED */, settings);
    this.name = "AccessDeniedError";
    this.statusCode = options?.statusCode;
  }
  /**
   * @returns {object} The base serialization plus `statusCode`.
   */
  toJSON() {
    const base = super.toJSON();
    return { ...base, statusCode: this.statusCode };
  }
};
|
|
3488
|
+
/**
 * Raised when content extraction fails. Uses the CONTENT_EXTRACTION_FAILED
 * code and is never retryable.
 */
var ContentExtractionError = class extends ReaderError {
  /**
   * @param {string} message - Human-readable description.
   * @param {object} [options] - Forwarded to ReaderError (e.g. `url`, `cause`).
   */
  constructor(message, options) {
    // `retryable` is written last so it overrides whatever the caller passed.
    const settings = { ...options, retryable: false };
    super(message, "CONTENT_EXTRACTION_FAILED" /* CONTENT_EXTRACTION_FAILED */, settings);
    this.name = "ContentExtractionError";
  }
};
|
|
3497
|
+
/**
 * Raised for invalid options. Uses the INVALID_OPTIONS code, is never
 * retryable, and records the offending `field` when provided.
 */
var ValidationError = class extends ReaderError {
  field;
  /**
   * @param {string} message - Human-readable description.
   * @param {object} [options]
   * @param {string} [options.url] - The only option forwarded to ReaderError.
   * @param {*} [options.field] - Copied onto `this.field`.
   */
  constructor(message, options) {
    // Only `url` is passed through to the base class; other keys are not forwarded.
    const settings = { url: options?.url, retryable: false };
    super(message, "INVALID_OPTIONS" /* INVALID_OPTIONS */, settings);
    this.name = "ValidationError";
    this.field = options?.field;
  }
  /**
   * @returns {object} The base serialization plus `field`.
   */
  toJSON() {
    const base = super.toJSON();
    return { ...base, field: this.field };
  }
};
|
|
3514
|
+
/**
 * Raised for an invalid URL. Uses the INVALID_URL code and is never
 * retryable.
 */
var InvalidUrlError = class extends ReaderError {
  /**
   * @param {string} url - The rejected URL; included in the message and stored as `url`.
   * @param {string} [reason] - When truthy, appended to the message.
   */
  constructor(url, reason) {
    let text;
    if (reason) {
      text = `Invalid URL "${url}": ${reason}`;
    } else {
      text = `Invalid URL: ${url}`;
    }
    super(text, "INVALID_URL" /* INVALID_URL */, { url, retryable: false });
    this.name = "InvalidUrlError";
  }
};
|
|
3523
|
+
/**
 * Raised when a URL is blocked by robots.txt. Uses the ROBOTS_BLOCKED code
 * and is never retryable.
 */
var RobotsBlockedError = class extends ReaderError {
  /**
   * @param {string} url - The blocked URL; included in the message and stored as `url`.
   */
  constructor(url) {
    const text = `URL blocked by robots.txt: ${url}. Set respectRobotsTxt: false to override.`;
    super(text, "ROBOTS_BLOCKED" /* ROBOTS_BLOCKED */, { url, retryable: false });
    this.name = "RobotsBlockedError";
  }
};
|
|
3532
|
+
/**
 * Browser pool failure. Uses the BROWSER_ERROR code and is always marked
 * retryable, regardless of any `retryable` value supplied in `options`.
 */
var BrowserPoolError = class extends ReaderError {
  /**
   * @param {string} message - Human-readable description.
   * @param {object} [options] - Forwarded to ReaderError (e.g. `url`, `cause`).
   */
  constructor(message, options) {
    // `retryable` is written last so it overrides whatever the caller passed.
    const settings = { ...options, retryable: true };
    super(message, "BROWSER_ERROR" /* BROWSER_ERROR */, settings);
    this.name = "BrowserPoolError";
  }
};
|
|
3541
|
+
/**
 * Raised with a fixed message telling the caller that the ReaderClient has
 * been closed. Uses the CLIENT_CLOSED code and is never retryable.
 */
var ClientClosedError = class extends ReaderError {
  constructor() {
    const text = "ReaderClient has been closed. Create a new instance to continue.";
    super(text, "CLIENT_CLOSED" /* CLIENT_CLOSED */, { retryable: false });
    this.name = "ClientClosedError";
  }
};
|
|
3549
|
+
/**
 * Raised when a component is reported as not initialized. Uses the
 * NOT_INITIALIZED code and is never retryable.
 */
var NotInitializedError = class extends ReaderError {
  /**
   * @param {string} component - Name interpolated at the start of the message.
   */
  constructor(component) {
    const text = `${component} not initialized. This should not happen - please report this bug.`;
    super(text, "NOT_INITIALIZED" /* NOT_INITIALIZED */, { retryable: false });
    this.name = "NotInitializedError";
  }
};
|
|
3557
|
+
/**
 * Normalize any thrown value into a ReaderError.
 *
 * - ReaderError instances are returned unchanged.
 * - Other Error instances are classified by their system `code` (when it is a
 *   string) and by case-insensitive substrings of their message:
 *   timeout -> TimeoutError, connection refused / DNS -> NetworkError,
 *   cloudflare / challenge -> CloudflareError, anything else -> UNKNOWN.
 * - Non-Error values are stringified into an UNKNOWN ReaderError.
 *
 * Substring matching is heuristic: an unrelated message that happens to
 * contain e.g. "dns" or "challenge" is classified accordingly.
 *
 * @param {*} error - The caught value.
 * @param {string} [url] - URL being processed, attached to the result.
 * @returns {ReaderError} The original ReaderError or a newly built wrapper.
 */
function wrapError(error, url) {
  if (error instanceof ReaderError) {
    return error;
  }
  if (!(error instanceof Error)) {
    return new ReaderError(String(error), "UNKNOWN" /* UNKNOWN */, {
      url,
      retryable: false
    });
  }
  // Coerce defensively: a hand-built error may carry a non-string message,
  // and calling toLowerCase() on it would throw from inside the error path.
  const rawMessage = String(error.message ?? "");
  const message = rawMessage.toLowerCase();
  // Node system errors expose a stable `code` (ETIMEDOUT, ECONNREFUSED,
  // ENOTFOUND, ...) that is more reliable than the message wording.
  const sysCode = typeof error.code === "string" ? error.code.toUpperCase() : "";
  const mentions = (...needles) => needles.some((needle) => message.includes(needle));

  if (sysCode === "ETIMEDOUT" || mentions("timeout", "timed out", "etimedout")) {
    // The real timeout budget is not known here; 30000 ms is a placeholder value.
    return new TimeoutError(rawMessage, 3e4, { url, cause: error });
  }
  if (sysCode === "ECONNREFUSED" || mentions("econnrefused", "connection refused")) {
    return new NetworkError(`Connection refused: ${rawMessage}`, { url, cause: error });
  }
  if (sysCode === "ENOTFOUND" || mentions("enotfound", "dns")) {
    return new NetworkError(`DNS lookup failed: ${rawMessage}`, { url, cause: error });
  }
  if (mentions("cloudflare", "challenge")) {
    return new CloudflareError("unknown", { url, cause: error });
  }
  return new ReaderError(rawMessage, "UNKNOWN" /* UNKNOWN */, {
    url,
    cause: error,
    retryable: false
  });
}
|
|
3049
3586
|
export {
|
|
3587
|
+
AccessDeniedError,
|
|
3050
3588
|
BrowserPool,
|
|
3589
|
+
BrowserPoolError,
|
|
3590
|
+
ClientClosedError,
|
|
3591
|
+
CloudflareError,
|
|
3592
|
+
ContentExtractionError,
|
|
3051
3593
|
Crawler,
|
|
3052
3594
|
DEFAULT_DAEMON_PORT,
|
|
3053
3595
|
DEFAULT_OPTIONS,
|
|
3054
3596
|
DaemonClient,
|
|
3055
3597
|
DaemonServer,
|
|
3056
3598
|
BrowserPool as HeroBrowserPool,
|
|
3599
|
+
InvalidUrlError,
|
|
3600
|
+
NetworkError,
|
|
3601
|
+
NotInitializedError,
|
|
3057
3602
|
ReaderClient,
|
|
3603
|
+
ReaderError,
|
|
3604
|
+
ReaderErrorCode,
|
|
3605
|
+
RobotsBlockedError,
|
|
3058
3606
|
Scraper,
|
|
3607
|
+
TimeoutError,
|
|
3608
|
+
ValidationError,
|
|
3059
3609
|
cleanContent,
|
|
3060
3610
|
crawl,
|
|
3061
3611
|
createHeroConfig,
|
|
@@ -3063,14 +3613,12 @@ export {
|
|
|
3063
3613
|
detectChallenge,
|
|
3064
3614
|
extractMetadata,
|
|
3065
3615
|
formatToHTML,
|
|
3066
|
-
formatToJson,
|
|
3067
|
-
formatToJsonLite,
|
|
3068
3616
|
formatToMarkdown,
|
|
3069
|
-
formatToText,
|
|
3070
3617
|
getDaemonInfo,
|
|
3071
3618
|
getPidFilePath,
|
|
3072
3619
|
getUrlKey,
|
|
3073
3620
|
handleChallenge,
|
|
3621
|
+
htmlToMarkdown,
|
|
3074
3622
|
isChallengePage,
|
|
3075
3623
|
isDaemonRunning,
|
|
3076
3624
|
isSameDomain,
|
|
@@ -3084,6 +3632,7 @@ export {
|
|
|
3084
3632
|
shouldCrawlUrl2 as shouldCrawlUrlFn,
|
|
3085
3633
|
validateUrls,
|
|
3086
3634
|
waitForChallengeResolution,
|
|
3087
|
-
waitForSelector
|
|
3635
|
+
waitForSelector,
|
|
3636
|
+
wrapError
|
|
3088
3637
|
};
|
|
3089
3638
|
//# sourceMappingURL=index.js.map
|