@vakra-dev/reader 0.0.3 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -13,212 +13,16 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
13
13
  // src/scraper.ts
14
14
  import pLimit from "p-limit";
15
15
 
16
- // src/cloudflare/detector.ts
17
- var CLOUDFLARE_CHALLENGE_SELECTORS = [
18
- "#challenge-running",
19
- "#challenge-stage",
20
- "#challenge-form",
21
- ".cf-browser-verification",
22
- "#cf-wrapper",
23
- "#cf-hcaptcha-container",
24
- "#turnstile-wrapper"
25
- ];
26
- var CLOUDFLARE_TEXT_PATTERNS = [
27
- "checking if the site connection is secure",
28
- "this process is automatic. your browser will redirect",
29
- "ray id:",
30
- "performance & security by cloudflare"
31
- ];
32
- var CLOUDFLARE_INFRA_PATTERNS = [
33
- "/cdn-cgi/",
34
- "cloudflare",
35
- "__cf_bm",
36
- "cf-ray"
37
- ];
38
- var CLOUDFLARE_BLOCKED_PATTERNS = [
39
- "sorry, you have been blocked",
40
- "ray id:"
41
- ];
42
- async function detectChallenge(hero) {
43
- const signals = [];
44
- let type = "none";
45
- let hasCloudflareInfra = false;
46
- let hasChallengeIndicator = false;
47
- try {
48
- if (!hero.document) {
49
- return {
50
- isChallenge: false,
51
- type: "none",
52
- confidence: 0,
53
- signals: ["No document available"]
54
- };
55
- }
56
- const html = await hero.document.documentElement.outerHTML;
57
- const htmlLower = html.toLowerCase();
58
- for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
59
- if (htmlLower.includes(pattern)) {
60
- hasCloudflareInfra = true;
61
- signals.push(`Cloudflare infra: "${pattern}"`);
62
- break;
63
- }
64
- }
65
- if (!hasCloudflareInfra) {
66
- return {
67
- isChallenge: false,
68
- type: "none",
69
- confidence: 0,
70
- signals: ["No Cloudflare infrastructure detected"]
71
- };
72
- }
73
- for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
74
- try {
75
- const element = await hero.document.querySelector(selector);
76
- if (element) {
77
- hasChallengeIndicator = true;
78
- signals.push(`Challenge element: ${selector}`);
79
- type = "js_challenge";
80
- }
81
- } catch {
82
- }
83
- }
84
- for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
85
- if (htmlLower.includes(pattern)) {
86
- hasChallengeIndicator = true;
87
- signals.push(`Challenge text: "${pattern}"`);
88
- type = type === "none" ? "js_challenge" : type;
89
- }
90
- }
91
- if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
92
- hasChallengeIndicator = true;
93
- signals.push('Challenge text: "waiting for...to respond"');
94
- type = type === "none" ? "js_challenge" : type;
95
- }
96
- const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
97
- if (hasBlocked) {
98
- hasChallengeIndicator = true;
99
- signals.push("Cloudflare block page detected");
100
- type = "blocked";
101
- }
102
- const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
103
- const confidence = isChallenge ? 100 : 0;
104
- return {
105
- isChallenge,
106
- type: isChallenge ? type : "none",
107
- confidence,
108
- signals
109
- };
110
- } catch (error) {
111
- return {
112
- isChallenge: false,
113
- type: "none",
114
- confidence: 0,
115
- signals: [`Error during detection: ${error.message}`]
116
- };
117
- }
118
- }
119
- async function isChallengePage(hero) {
120
- const detection = await detectChallenge(hero);
121
- return detection.isChallenge;
122
- }
123
-
124
- // src/cloudflare/handler.ts
125
- async function waitForChallengeResolution(hero, options) {
126
- const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
127
- const startTime = Date.now();
128
- const log = (msg) => verbose && console.log(` ${msg}`);
129
- while (Date.now() - startTime < maxWaitMs) {
130
- const elapsed = Date.now() - startTime;
131
- try {
132
- const currentUrl = await hero.url;
133
- if (currentUrl !== initialUrl) {
134
- log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
135
- log(` Waiting for new page to load...`);
136
- try {
137
- await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
138
- log(` DOMContentLoaded`);
139
- } catch {
140
- log(` DOMContentLoaded timeout, continuing...`);
141
- }
142
- await hero.waitForPaintingStable().catch(() => {
143
- });
144
- log(` Page stabilized`);
145
- return { resolved: true, method: "url_redirect", waitedMs: elapsed };
146
- }
147
- } catch {
148
- }
149
- const detection = await detectChallenge(hero);
150
- if (!detection.isChallenge) {
151
- log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
152
- log(` Waiting for page to load...`);
153
- try {
154
- await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
155
- log(` DOMContentLoaded`);
156
- } catch {
157
- log(` DOMContentLoaded timeout, continuing...`);
158
- }
159
- await hero.waitForPaintingStable().catch(() => {
160
- });
161
- log(` Page stabilized`);
162
- return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
163
- }
164
- log(
165
- `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
166
- );
167
- await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
168
- }
169
- return {
170
- resolved: false,
171
- method: "timeout",
172
- waitedMs: Date.now() - startTime
173
- };
174
- }
175
- async function waitForSelector(hero, selector, maxWaitMs, verbose = false) {
176
- const startTime = Date.now();
177
- const log = (msg) => verbose && console.log(` ${msg}`);
178
- log(`Waiting for selector: "${selector}"`);
179
- while (Date.now() - startTime < maxWaitMs) {
180
- try {
181
- const element = await hero.document.querySelector(selector);
182
- if (element) {
183
- const elapsed = Date.now() - startTime;
184
- log(`\u2713 Selector found after ${(elapsed / 1e3).toFixed(1)}s`);
185
- return { found: true, waitedMs: elapsed };
186
- }
187
- } catch {
188
- }
189
- await new Promise((resolve) => setTimeout(resolve, 300));
190
- }
191
- log(`\u2717 Selector not found within timeout`);
192
- return { found: false, waitedMs: Date.now() - startTime };
193
- }
194
- async function handleChallenge(hero, options = {}) {
195
- const initialUrl = await hero.url;
196
- const detection = await detectChallenge(hero);
197
- if (!detection.isChallenge) {
198
- return { resolved: true, method: "signals_cleared", waitedMs: 0 };
199
- }
200
- return waitForChallengeResolution(hero, {
201
- ...options,
202
- initialUrl
203
- });
204
- }
205
-
206
16
  // src/formatters/markdown.ts
207
- import TurndownService from "turndown";
208
- var turndownService = new TurndownService({
209
- headingStyle: "atx",
210
- hr: "---",
211
- bulletListMarker: "-",
212
- codeBlockStyle: "fenced",
213
- fence: "```",
214
- emDelimiter: "*",
215
- strongDelimiter: "**",
216
- linkStyle: "inlined",
217
- linkReferenceStyle: "full"
218
- });
17
+ import { convert } from "@vakra-dev/supermarkdown";
219
18
  function htmlToMarkdown(html) {
220
19
  try {
221
- return turndownService.turndown(html);
20
+ return convert(html, {
21
+ headingStyle: "atx",
22
+ bulletMarker: "-",
23
+ codeFence: "`",
24
+ linkStyle: "inline"
25
+ });
222
26
  } catch (error) {
223
27
  console.warn("Error converting HTML to Markdown:", error);
224
28
  return html.replace(/<[^>]*>/g, "").trim();
@@ -1259,96 +1063,701 @@ function shouldCrawlUrl2(url, baseDomain) {
1259
1063
  return url.hostname === baseDomain || url.hostname.endsWith(`.${baseDomain}`);
1260
1064
  }
1261
1065
 
1262
- // src/scraper.ts
1263
- var Scraper = class {
1264
- options;
1265
- pool;
1266
- logger = createLogger("scraper");
1267
- robotsCache = /* @__PURE__ */ new Map();
1268
- constructor(options) {
1269
- this.options = {
1270
- ...DEFAULT_OPTIONS,
1271
- ...options
1272
- };
1273
- if (!options.pool) {
1274
- throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
1066
+ // src/engines/types.ts
1067
+ var ENGINE_CONFIGS = {
1068
+ http: {
1069
+ name: "http",
1070
+ timeout: 3e3,
1071
+ maxTimeout: 1e4,
1072
+ quality: 100,
1073
+ features: {
1074
+ javascript: false,
1075
+ cloudflare: false,
1076
+ tlsFingerprint: false,
1077
+ waitFor: false,
1078
+ screenshots: false
1079
+ }
1080
+ },
1081
+ tlsclient: {
1082
+ name: "tlsclient",
1083
+ timeout: 5e3,
1084
+ maxTimeout: 15e3,
1085
+ quality: 80,
1086
+ features: {
1087
+ javascript: false,
1088
+ cloudflare: false,
1089
+ tlsFingerprint: true,
1090
+ waitFor: false,
1091
+ screenshots: false
1092
+ }
1093
+ },
1094
+ hero: {
1095
+ name: "hero",
1096
+ timeout: 3e4,
1097
+ maxTimeout: 6e4,
1098
+ quality: 50,
1099
+ features: {
1100
+ javascript: true,
1101
+ cloudflare: true,
1102
+ tlsFingerprint: true,
1103
+ waitFor: true,
1104
+ screenshots: true
1275
1105
  }
1276
- this.pool = options.pool;
1277
1106
  }
1278
- /**
1279
- * Get robots.txt rules for a URL, cached per domain
1280
- */
1281
- async getRobotsRules(url) {
1282
- const origin = new URL(url).origin;
1283
- if (!this.robotsCache.has(origin)) {
1284
- const rules = await fetchRobotsTxt(origin);
1285
- this.robotsCache.set(origin, rules);
1107
+ };
1108
+ var DEFAULT_ENGINE_ORDER = ["http", "tlsclient", "hero"];
1109
+
1110
+ // src/engines/errors.ts
1111
+ var EngineError = class extends Error {
1112
+ engine;
1113
+ retryable;
1114
+ constructor(engine, message, options) {
1115
+ super(`[${engine}] ${message}`);
1116
+ this.name = "EngineError";
1117
+ this.engine = engine;
1118
+ this.retryable = options?.retryable ?? true;
1119
+ this.cause = options?.cause;
1120
+ if (Error.captureStackTrace) {
1121
+ Error.captureStackTrace(this, this.constructor);
1286
1122
  }
1287
- return this.robotsCache.get(origin) ?? null;
1288
1123
  }
1289
- /**
1290
- * Scrape all URLs
1291
- *
1292
- * @returns Scrape result with pages and metadata
1293
- */
1294
- async scrape() {
1295
- const startTime = Date.now();
1296
- const results = await this.scrapeWithConcurrency();
1297
- return this.buildScrapeResult(results, startTime);
1124
+ };
1125
+ var ChallengeDetectedError = class extends EngineError {
1126
+ challengeType;
1127
+ constructor(engine, challengeType) {
1128
+ super(engine, `Challenge detected: ${challengeType || "unknown"}`, { retryable: true });
1129
+ this.name = "ChallengeDetectedError";
1130
+ this.challengeType = challengeType || "unknown";
1298
1131
  }
1299
- /**
1300
- * Scrape URLs with concurrency control
1301
- */
1302
- async scrapeWithConcurrency() {
1303
- const limit = pLimit(this.options.batchConcurrency || 1);
1304
- const tasks = this.options.urls.map(
1305
- (url, index) => limit(() => this.scrapeSingleUrlWithRetry(url, index))
1306
- );
1307
- const batchPromise = Promise.all(tasks);
1308
- if (this.options.batchTimeoutMs && this.options.batchTimeoutMs > 0) {
1309
- const timeoutPromise = new Promise((_, reject) => {
1310
- setTimeout(() => {
1311
- reject(new Error(`Batch operation timed out after ${this.options.batchTimeoutMs}ms`));
1312
- }, this.options.batchTimeoutMs);
1313
- });
1314
- return Promise.race([batchPromise, timeoutPromise]);
1315
- }
1316
- return batchPromise;
1132
+ };
1133
+ var InsufficientContentError = class extends EngineError {
1134
+ contentLength;
1135
+ threshold;
1136
+ constructor(engine, contentLength, threshold = 100) {
1137
+ super(engine, `Insufficient content: ${contentLength} chars (threshold: ${threshold})`, { retryable: true });
1138
+ this.name = "InsufficientContentError";
1139
+ this.contentLength = contentLength;
1140
+ this.threshold = threshold;
1317
1141
  }
1318
- /**
1319
- * Scrape a single URL with retry logic
1320
- */
1321
- async scrapeSingleUrlWithRetry(url, index) {
1322
- const maxRetries = this.options.maxRetries || 2;
1323
- let lastError;
1324
- for (let attempt = 0; attempt <= maxRetries; attempt++) {
1142
+ };
1143
+ var HttpError = class extends EngineError {
1144
+ statusCode;
1145
+ constructor(engine, statusCode, statusText) {
1146
+ const retryable = statusCode >= 500 || statusCode === 429;
1147
+ super(engine, `HTTP ${statusCode}${statusText ? `: ${statusText}` : ""}`, { retryable });
1148
+ this.name = "HttpError";
1149
+ this.statusCode = statusCode;
1150
+ }
1151
+ };
1152
+ var EngineTimeoutError = class extends EngineError {
1153
+ timeoutMs;
1154
+ constructor(engine, timeoutMs) {
1155
+ super(engine, `Timeout after ${timeoutMs}ms`, { retryable: true });
1156
+ this.name = "EngineTimeoutError";
1157
+ this.timeoutMs = timeoutMs;
1158
+ }
1159
+ };
1160
+ var EngineUnavailableError = class extends EngineError {
1161
+ constructor(engine, reason) {
1162
+ super(engine, reason || "Engine not available", { retryable: false });
1163
+ this.name = "EngineUnavailableError";
1164
+ }
1165
+ };
1166
+ var AllEnginesFailedError = class extends Error {
1167
+ attemptedEngines;
1168
+ errors;
1169
+ constructor(attemptedEngines, errors) {
1170
+ const summary = attemptedEngines.map((e) => `${e}: ${errors.get(e)?.message || "unknown"}`).join("; ");
1171
+ super(`All engines failed: ${summary}`);
1172
+ this.name = "AllEnginesFailedError";
1173
+ this.attemptedEngines = attemptedEngines;
1174
+ this.errors = errors;
1175
+ }
1176
+ };
1177
+
1178
+ // src/engines/http/index.ts
1179
+ var DEFAULT_HEADERS = {
1180
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
1181
+ Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
1182
+ "Accept-Language": "en-US,en;q=0.9",
1183
+ "Accept-Encoding": "gzip, deflate, br",
1184
+ "Cache-Control": "no-cache",
1185
+ Pragma: "no-cache",
1186
+ "Sec-Fetch-Dest": "document",
1187
+ "Sec-Fetch-Mode": "navigate",
1188
+ "Sec-Fetch-Site": "none",
1189
+ "Sec-Fetch-User": "?1",
1190
+ "Upgrade-Insecure-Requests": "1"
1191
+ };
1192
+ var CHALLENGE_PATTERNS = [
1193
+ // Cloudflare
1194
+ "cf-browser-verification",
1195
+ "cf_chl_opt",
1196
+ "challenge-platform",
1197
+ "cf-spinner",
1198
+ "Just a moment",
1199
+ "Checking your browser",
1200
+ "checking if the site connection is secure",
1201
+ "Enable JavaScript and cookies",
1202
+ "Attention Required",
1203
+ "_cf_chl_tk",
1204
+ "Verifying you are human",
1205
+ "cf-turnstile",
1206
+ "/cdn-cgi/challenge-platform/",
1207
+ // Generic bot detection
1208
+ "Please Wait...",
1209
+ "DDoS protection by",
1210
+ "Access denied",
1211
+ "bot detection",
1212
+ "are you a robot",
1213
+ "complete the security check"
1214
+ ];
1215
+ var CLOUDFLARE_INFRA_PATTERNS = ["/cdn-cgi/", "cloudflare", "__cf_bm", "cf-ray"];
1216
+ var MIN_CONTENT_LENGTH = 100;
1217
+ var HttpEngine = class {
1218
+ config = ENGINE_CONFIGS.http;
1219
+ async scrape(meta) {
1220
+ const startTime = Date.now();
1221
+ const { url, options, logger: logger4, abortSignal } = meta;
1222
+ try {
1223
+ const controller = new AbortController();
1224
+ const timeoutId = setTimeout(() => controller.abort(), this.config.maxTimeout);
1225
+ if (abortSignal) {
1226
+ abortSignal.addEventListener("abort", () => controller.abort(), { once: true });
1227
+ }
1228
+ logger4?.debug(`[http] Fetching ${url}`);
1229
+ const response = await fetch(url, {
1230
+ method: "GET",
1231
+ headers: {
1232
+ ...DEFAULT_HEADERS,
1233
+ ...options.headers || {}
1234
+ },
1235
+ redirect: "follow",
1236
+ signal: controller.signal
1237
+ });
1238
+ clearTimeout(timeoutId);
1239
+ const duration = Date.now() - startTime;
1240
+ const html = await response.text();
1241
+ logger4?.debug(`[http] Got response: ${response.status} (${html.length} chars) in ${duration}ms`);
1242
+ if (response.status >= 400) {
1243
+ throw new HttpError("http", response.status, response.statusText);
1244
+ }
1245
+ const challengeType = this.detectChallenge(html);
1246
+ if (challengeType) {
1247
+ logger4?.debug(`[http] Challenge detected: ${challengeType}`);
1248
+ throw new ChallengeDetectedError("http", challengeType);
1249
+ }
1250
+ const textContent = this.extractText(html);
1251
+ if (textContent.length < MIN_CONTENT_LENGTH) {
1252
+ logger4?.debug(`[http] Insufficient content: ${textContent.length} chars`);
1253
+ throw new InsufficientContentError("http", textContent.length, MIN_CONTENT_LENGTH);
1254
+ }
1255
+ return {
1256
+ html,
1257
+ url: response.url,
1258
+ statusCode: response.status,
1259
+ contentType: response.headers.get("content-type") || void 0,
1260
+ headers: this.headersToRecord(response.headers),
1261
+ engine: "http",
1262
+ duration
1263
+ };
1264
+ } catch (error) {
1265
+ if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError) {
1266
+ throw error;
1267
+ }
1268
+ if (error instanceof Error) {
1269
+ if (error.name === "AbortError") {
1270
+ throw new EngineTimeoutError("http", this.config.maxTimeout);
1271
+ }
1272
+ throw new EngineError("http", error.message, { cause: error });
1273
+ }
1274
+ throw new EngineError("http", String(error));
1275
+ }
1276
+ }
1277
+ /**
1278
+ * Detect challenge patterns in HTML
1279
+ * @returns Challenge type or null if no challenge detected
1280
+ */
1281
+ detectChallenge(html) {
1282
+ const htmlLower = html.toLowerCase();
1283
+ const hasCloudflare = CLOUDFLARE_INFRA_PATTERNS.some((p) => htmlLower.includes(p.toLowerCase()));
1284
+ for (const pattern of CHALLENGE_PATTERNS) {
1285
+ if (htmlLower.includes(pattern.toLowerCase())) {
1286
+ if (hasCloudflare || pattern.includes("cf")) {
1287
+ return "cloudflare";
1288
+ }
1289
+ return "bot-detection";
1290
+ }
1291
+ }
1292
+ return null;
1293
+ }
1294
+ /**
1295
+ * Convert Headers to Record<string, string>
1296
+ */
1297
+ headersToRecord(headers) {
1298
+ const record = {};
1299
+ headers.forEach((value, key) => {
1300
+ record[key] = value;
1301
+ });
1302
+ return record;
1303
+ }
1304
+ /**
1305
+ * Extract visible text from HTML (rough extraction)
1306
+ */
1307
+ extractText(html) {
1308
+ return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
1309
+ }
1310
+ isAvailable() {
1311
+ return true;
1312
+ }
1313
+ };
1314
+ var httpEngine = new HttpEngine();
1315
+
1316
+ // src/engines/tlsclient/index.ts
1317
+ import { gotScraping } from "got-scraping";
1318
+ var JS_REQUIRED_PATTERNS = [
1319
+ // Cloudflare JS challenge
1320
+ "cf-browser-verification",
1321
+ "challenge-platform",
1322
+ "_cf_chl_tk",
1323
+ "/cdn-cgi/challenge-platform/",
1324
+ // Generic JS requirements
1325
+ "Enable JavaScript",
1326
+ "JavaScript is required",
1327
+ "Please enable JavaScript",
1328
+ "requires JavaScript",
1329
+ "noscript"
1330
+ ];
1331
+ var BLOCKED_PATTERNS = [
1332
+ "Access denied",
1333
+ "Sorry, you have been blocked",
1334
+ "bot detected",
1335
+ "suspicious activity",
1336
+ "too many requests"
1337
+ ];
1338
+ var MIN_CONTENT_LENGTH2 = 100;
1339
+ var TlsClientEngine = class {
1340
+ config = ENGINE_CONFIGS.tlsclient;
1341
+ available = true;
1342
+ constructor() {
1343
+ try {
1344
+ if (!gotScraping) {
1345
+ this.available = false;
1346
+ }
1347
+ } catch {
1348
+ this.available = false;
1349
+ }
1350
+ }
1351
+ async scrape(meta) {
1352
+ if (!this.available) {
1353
+ throw new EngineUnavailableError("tlsclient", "got-scraping not available");
1354
+ }
1355
+ const startTime = Date.now();
1356
+ const { url, options, logger: logger4, abortSignal } = meta;
1357
+ try {
1358
+ const controller = new AbortController();
1359
+ const timeoutId = setTimeout(() => controller.abort(), this.config.maxTimeout);
1360
+ if (abortSignal) {
1361
+ abortSignal.addEventListener("abort", () => controller.abort(), { once: true });
1362
+ }
1363
+ logger4?.debug(`[tlsclient] Fetching ${url}`);
1364
+ const response = await gotScraping({
1365
+ url,
1366
+ timeout: {
1367
+ request: this.config.maxTimeout
1368
+ },
1369
+ headers: options.headers,
1370
+ followRedirect: true
1371
+ // got-scraping handles browser fingerprinting automatically
1372
+ // It uses header generators and proper TLS settings
1373
+ });
1374
+ clearTimeout(timeoutId);
1375
+ const duration = Date.now() - startTime;
1376
+ const html = response.body;
1377
+ logger4?.debug(`[tlsclient] Got response: ${response.statusCode} (${html.length} chars) in ${duration}ms`);
1378
+ if (response.statusCode >= 400) {
1379
+ throw new HttpError("tlsclient", response.statusCode, response.statusMessage);
1380
+ }
1381
+ const challengeType = this.detectJsRequired(html);
1382
+ if (challengeType) {
1383
+ logger4?.debug(`[tlsclient] JS required: ${challengeType}`);
1384
+ throw new ChallengeDetectedError("tlsclient", challengeType);
1385
+ }
1386
+ const blockedReason = this.detectBlocked(html);
1387
+ if (blockedReason) {
1388
+ logger4?.debug(`[tlsclient] Blocked: ${blockedReason}`);
1389
+ throw new ChallengeDetectedError("tlsclient", `blocked: ${blockedReason}`);
1390
+ }
1391
+ const textContent = this.extractText(html);
1392
+ if (textContent.length < MIN_CONTENT_LENGTH2) {
1393
+ logger4?.debug(`[tlsclient] Insufficient content: ${textContent.length} chars`);
1394
+ throw new InsufficientContentError("tlsclient", textContent.length, MIN_CONTENT_LENGTH2);
1395
+ }
1396
+ return {
1397
+ html,
1398
+ url: response.url,
1399
+ statusCode: response.statusCode,
1400
+ contentType: response.headers["content-type"],
1401
+ headers: response.headers,
1402
+ engine: "tlsclient",
1403
+ duration
1404
+ };
1405
+ } catch (error) {
1406
+ if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError || error instanceof EngineUnavailableError) {
1407
+ throw error;
1408
+ }
1409
+ if (error instanceof Error) {
1410
+ if (error.name === "TimeoutError" || error.message.includes("timeout")) {
1411
+ throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
1412
+ }
1413
+ if (error.name === "AbortError") {
1414
+ throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
1415
+ }
1416
+ throw new EngineError("tlsclient", error.message, { cause: error });
1417
+ }
1418
+ throw new EngineError("tlsclient", String(error));
1419
+ }
1420
+ }
1421
+ /**
1422
+ * Detect patterns that require JS execution
1423
+ */
1424
+ detectJsRequired(html) {
1425
+ const htmlLower = html.toLowerCase();
1426
+ for (const pattern of JS_REQUIRED_PATTERNS) {
1427
+ if (htmlLower.includes(pattern.toLowerCase())) {
1428
+ if (pattern.includes("cf") || pattern.includes("cloudflare")) {
1429
+ return "cloudflare-js";
1430
+ }
1431
+ return "js-required";
1432
+ }
1433
+ }
1434
+ return null;
1435
+ }
1436
+ /**
1437
+ * Detect blocked/denied patterns
1438
+ */
1439
+ detectBlocked(html) {
1440
+ const htmlLower = html.toLowerCase();
1441
+ for (const pattern of BLOCKED_PATTERNS) {
1442
+ if (htmlLower.includes(pattern.toLowerCase())) {
1443
+ return pattern;
1444
+ }
1445
+ }
1446
+ return null;
1447
+ }
1448
+ /**
1449
+ * Extract visible text from HTML
1450
+ */
1451
+ extractText(html) {
1452
+ return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
1453
+ }
1454
+ isAvailable() {
1455
+ return this.available;
1456
+ }
1457
+ };
1458
+ var tlsClientEngine = new TlsClientEngine();
1459
+
1460
+ // src/cloudflare/detector.ts
1461
+ var CLOUDFLARE_CHALLENGE_SELECTORS = [
1462
+ "#challenge-running",
1463
+ "#challenge-stage",
1464
+ "#challenge-form",
1465
+ ".cf-browser-verification",
1466
+ "#cf-wrapper",
1467
+ "#cf-hcaptcha-container",
1468
+ "#turnstile-wrapper"
1469
+ ];
1470
+ var CLOUDFLARE_TEXT_PATTERNS = [
1471
+ "checking if the site connection is secure",
1472
+ "this process is automatic. your browser will redirect",
1473
+ "ray id:",
1474
+ "performance & security by cloudflare"
1475
+ ];
1476
+ var CLOUDFLARE_INFRA_PATTERNS2 = [
1477
+ "/cdn-cgi/",
1478
+ "cloudflare",
1479
+ "__cf_bm",
1480
+ "cf-ray"
1481
+ ];
1482
+ var CLOUDFLARE_BLOCKED_PATTERNS = [
1483
+ "sorry, you have been blocked",
1484
+ "ray id:"
1485
+ ];
1486
+ async function detectChallenge(hero) {
1487
+ const signals = [];
1488
+ let type = "none";
1489
+ let hasCloudflareInfra = false;
1490
+ let hasChallengeIndicator = false;
1491
+ try {
1492
+ if (!hero.document) {
1493
+ return {
1494
+ isChallenge: false,
1495
+ type: "none",
1496
+ confidence: 0,
1497
+ signals: ["No document available"]
1498
+ };
1499
+ }
1500
+ const html = await hero.document.documentElement.outerHTML;
1501
+ const htmlLower = html.toLowerCase();
1502
+ for (const pattern of CLOUDFLARE_INFRA_PATTERNS2) {
1503
+ if (htmlLower.includes(pattern)) {
1504
+ hasCloudflareInfra = true;
1505
+ signals.push(`Cloudflare infra: "${pattern}"`);
1506
+ break;
1507
+ }
1508
+ }
1509
+ if (!hasCloudflareInfra) {
1510
+ return {
1511
+ isChallenge: false,
1512
+ type: "none",
1513
+ confidence: 0,
1514
+ signals: ["No Cloudflare infrastructure detected"]
1515
+ };
1516
+ }
1517
+ for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
1325
1518
  try {
1326
- const result = await this.scrapeSingleUrl(url, index);
1327
- if (result) {
1328
- return { result };
1519
+ const element = await hero.document.querySelector(selector);
1520
+ if (element) {
1521
+ hasChallengeIndicator = true;
1522
+ signals.push(`Challenge element: ${selector}`);
1523
+ type = "js_challenge";
1524
+ }
1525
+ } catch {
1526
+ }
1527
+ }
1528
+ for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
1529
+ if (htmlLower.includes(pattern)) {
1530
+ hasChallengeIndicator = true;
1531
+ signals.push(`Challenge text: "${pattern}"`);
1532
+ type = type === "none" ? "js_challenge" : type;
1533
+ }
1534
+ }
1535
+ if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
1536
+ hasChallengeIndicator = true;
1537
+ signals.push('Challenge text: "waiting for...to respond"');
1538
+ type = type === "none" ? "js_challenge" : type;
1539
+ }
1540
+ const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
1541
+ if (hasBlocked) {
1542
+ hasChallengeIndicator = true;
1543
+ signals.push("Cloudflare block page detected");
1544
+ type = "blocked";
1545
+ }
1546
+ const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
1547
+ const confidence = isChallenge ? 100 : 0;
1548
+ return {
1549
+ isChallenge,
1550
+ type: isChallenge ? type : "none",
1551
+ confidence,
1552
+ signals
1553
+ };
1554
+ } catch (error) {
1555
+ return {
1556
+ isChallenge: false,
1557
+ type: "none",
1558
+ confidence: 0,
1559
+ signals: [`Error during detection: ${error.message}`]
1560
+ };
1561
+ }
1562
+ }
1563
+ async function isChallengePage(hero) {
1564
+ const detection = await detectChallenge(hero);
1565
+ return detection.isChallenge;
1566
+ }
1567
+
1568
+ // src/cloudflare/handler.ts
1569
+ async function waitForChallengeResolution(hero, options) {
1570
+ const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
1571
+ const startTime = Date.now();
1572
+ const log = (msg) => verbose && console.log(` ${msg}`);
1573
+ while (Date.now() - startTime < maxWaitMs) {
1574
+ const elapsed = Date.now() - startTime;
1575
+ try {
1576
+ const currentUrl = await hero.url;
1577
+ if (currentUrl !== initialUrl) {
1578
+ log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
1579
+ log(` Waiting for new page to load...`);
1580
+ try {
1581
+ await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
1582
+ log(` DOMContentLoaded`);
1583
+ } catch {
1584
+ log(` DOMContentLoaded timeout, continuing...`);
1585
+ }
1586
+ await hero.waitForPaintingStable().catch(() => {
1587
+ });
1588
+ log(` Page stabilized`);
1589
+ return { resolved: true, method: "url_redirect", waitedMs: elapsed };
1590
+ }
1591
+ } catch {
1592
+ }
1593
+ const detection = await detectChallenge(hero);
1594
+ if (!detection.isChallenge) {
1595
+ log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
1596
+ log(` Waiting for page to load...`);
1597
+ try {
1598
+ await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
1599
+ log(` DOMContentLoaded`);
1600
+ } catch {
1601
+ log(` DOMContentLoaded timeout, continuing...`);
1602
+ }
1603
+ await hero.waitForPaintingStable().catch(() => {
1604
+ });
1605
+ log(` Page stabilized`);
1606
+ return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
1607
+ }
1608
+ log(
1609
+ `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
1610
+ );
1611
+ await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
1612
+ }
1613
+ return {
1614
+ resolved: false,
1615
+ method: "timeout",
1616
+ waitedMs: Date.now() - startTime
1617
+ };
1618
+ }
1619
+ async function waitForSelector(hero, selector, maxWaitMs, verbose = false) {
1620
+ const startTime = Date.now();
1621
+ const log = (msg) => verbose && console.log(` ${msg}`);
1622
+ log(`Waiting for selector: "${selector}"`);
1623
+ while (Date.now() - startTime < maxWaitMs) {
1624
+ try {
1625
+ const element = await hero.document.querySelector(selector);
1626
+ if (element) {
1627
+ const elapsed = Date.now() - startTime;
1628
+ log(`\u2713 Selector found after ${(elapsed / 1e3).toFixed(1)}s`);
1629
+ return { found: true, waitedMs: elapsed };
1630
+ }
1631
+ } catch {
1632
+ }
1633
+ await new Promise((resolve) => setTimeout(resolve, 300));
1634
+ }
1635
+ log(`\u2717 Selector not found within timeout`);
1636
+ return { found: false, waitedMs: Date.now() - startTime };
1637
+ }
1638
+ async function handleChallenge(hero, options = {}) {
1639
+ const initialUrl = await hero.url;
1640
+ const detection = await detectChallenge(hero);
1641
+ if (!detection.isChallenge) {
1642
+ return { resolved: true, method: "signals_cleared", waitedMs: 0 };
1643
+ }
1644
+ return waitForChallengeResolution(hero, {
1645
+ ...options,
1646
+ initialUrl
1647
+ });
1648
+ }
1649
+
1650
+ // src/engines/hero/index.ts
1651
+ var MIN_CONTENT_LENGTH3 = 100;
1652
+ var HeroEngine = class {
1653
+ config = ENGINE_CONFIGS.hero;
1654
+ async scrape(meta) {
1655
+ const startTime = Date.now();
1656
+ const { url, options, logger: logger4, abortSignal } = meta;
1657
+ const pool = options.pool;
1658
+ if (!pool) {
1659
+ throw new EngineUnavailableError("hero", "Browser pool not available");
1660
+ }
1661
+ if (abortSignal?.aborted) {
1662
+ throw new EngineTimeoutError("hero", 0);
1663
+ }
1664
+ logger4?.debug(`[hero] Starting browser scrape of ${url}`);
1665
+ try {
1666
+ const result = await pool.withBrowser(async (hero) => {
1667
+ let aborted = false;
1668
+ if (abortSignal) {
1669
+ abortSignal.addEventListener("abort", () => {
1670
+ aborted = true;
1671
+ }, { once: true });
1329
1672
  }
1330
- lastError = `Failed to scrape ${url}: No content returned`;
1331
- } catch (error) {
1332
- lastError = error.message;
1333
- if (attempt < maxRetries) {
1334
- const delay = Math.pow(2, attempt) * 1e3;
1335
- this.logger.warn(`Retry ${attempt + 1}/${maxRetries} for ${url} in ${delay}ms`);
1336
- await new Promise((resolve) => setTimeout(resolve, delay));
1673
+ const timeoutMs = options.timeoutMs || this.config.maxTimeout;
1674
+ await hero.goto(url, { timeoutMs });
1675
+ if (aborted) {
1676
+ throw new EngineTimeoutError("hero", Date.now() - startTime);
1677
+ }
1678
+ try {
1679
+ await hero.waitForLoad("DomContentLoaded", { timeoutMs });
1680
+ } catch {
1681
+ }
1682
+ await hero.waitForPaintingStable();
1683
+ if (aborted) {
1684
+ throw new EngineTimeoutError("hero", Date.now() - startTime);
1685
+ }
1686
+ const initialUrl = await hero.url;
1687
+ const detection = await detectChallenge(hero);
1688
+ if (detection.isChallenge) {
1689
+ logger4?.debug(`[hero] Challenge detected: ${detection.type}`);
1690
+ if (detection.type === "blocked") {
1691
+ throw new ChallengeDetectedError("hero", "blocked");
1692
+ }
1693
+ const resolution = await waitForChallengeResolution(hero, {
1694
+ maxWaitMs: 45e3,
1695
+ pollIntervalMs: 500,
1696
+ verbose: options.verbose,
1697
+ initialUrl
1698
+ });
1699
+ if (!resolution.resolved) {
1700
+ throw new ChallengeDetectedError("hero", `unresolved: ${detection.type}`);
1701
+ }
1702
+ logger4?.debug(`[hero] Challenge resolved via ${resolution.method} in ${resolution.waitedMs}ms`);
1703
+ }
1704
+ if (aborted) {
1705
+ throw new EngineTimeoutError("hero", Date.now() - startTime);
1706
+ }
1707
+ await this.waitForFinalPage(hero, url, logger4);
1708
+ if (aborted) {
1709
+ throw new EngineTimeoutError("hero", Date.now() - startTime);
1710
+ }
1711
+ if (options.waitForSelector) {
1712
+ try {
1713
+ await hero.waitForElement(hero.document.querySelector(options.waitForSelector), {
1714
+ timeoutMs
1715
+ });
1716
+ } catch {
1717
+ logger4?.debug(`[hero] Selector not found: ${options.waitForSelector}`);
1718
+ }
1337
1719
  }
1720
+ const html = await hero.document.documentElement.outerHTML;
1721
+ const finalUrl = await hero.url;
1722
+ const textContent = this.extractText(html);
1723
+ if (textContent.length < MIN_CONTENT_LENGTH3) {
1724
+ logger4?.debug(`[hero] Insufficient content: ${textContent.length} chars`);
1725
+ throw new InsufficientContentError("hero", textContent.length, MIN_CONTENT_LENGTH3);
1726
+ }
1727
+ const duration = Date.now() - startTime;
1728
+ logger4?.debug(`[hero] Success: ${html.length} chars in ${duration}ms`);
1729
+ return {
1730
+ html,
1731
+ url: finalUrl,
1732
+ statusCode: 200,
1733
+ // Hero doesn't expose status code directly
1734
+ engine: "hero",
1735
+ duration
1736
+ };
1737
+ });
1738
+ return result;
1739
+ } catch (error) {
1740
+ if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError || error instanceof EngineUnavailableError) {
1741
+ throw error;
1338
1742
  }
1743
+ if (error instanceof Error) {
1744
+ if (error.name === "TimeoutError" || error.message.includes("timeout")) {
1745
+ throw new EngineTimeoutError("hero", this.config.maxTimeout);
1746
+ }
1747
+ if (error.message.includes("Navigation") || error.message.includes("ERR_")) {
1748
+ throw new EngineError("hero", `Navigation failed: ${error.message}`, { cause: error });
1749
+ }
1750
+ throw new EngineError("hero", error.message, { cause: error });
1751
+ }
1752
+ throw new EngineError("hero", String(error));
1339
1753
  }
1340
- this.logger.error(`Failed to scrape ${url} after ${maxRetries + 1} attempts: ${lastError}`);
1341
- return { result: null, error: lastError };
1342
1754
  }
1343
1755
  /**
1344
1756
  * Wait for the final page to load after any Cloudflare redirects
1345
- * Cloudflare often does silent redirects even when bypassed, we need to ensure
1346
- * we're on the actual content page before scraping.
1347
1757
  */
1348
- async waitForFinalPage(hero, originalUrl, verbose) {
1758
+ async waitForFinalPage(hero, originalUrl, logger4) {
1349
1759
  const maxWaitMs = 15e3;
1350
1760
  const startTime = Date.now();
1351
- const log = (msg) => verbose && this.logger.info(msg);
1352
1761
  try {
1353
1762
  await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
1354
1763
  } catch {
@@ -1357,7 +1766,7 @@ var Scraper = class {
1357
1766
  const normalizeUrl2 = (url) => url.replace(/\/+$/, "");
1358
1767
  const urlChanged = normalizeUrl2(currentUrl) !== normalizeUrl2(originalUrl);
1359
1768
  if (urlChanged || currentUrl.includes("__cf_chl")) {
1360
- log(`Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
1769
+ logger4?.debug(`[hero] Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
1361
1770
  let lastUrl = currentUrl;
1362
1771
  let stableCount = 0;
1363
1772
  while (Date.now() - startTime < maxWaitMs) {
@@ -1372,7 +1781,7 @@ var Scraper = class {
1372
1781
  } else {
1373
1782
  stableCount = 0;
1374
1783
  lastUrl = currentUrl;
1375
- log(`URL changed to: ${currentUrl}`);
1784
+ logger4?.debug(`[hero] URL changed to: ${currentUrl}`);
1376
1785
  }
1377
1786
  } catch {
1378
1787
  }
@@ -1386,7 +1795,223 @@ var Scraper = class {
1386
1795
  await new Promise((resolve) => setTimeout(resolve, 2e3));
1387
1796
  }
1388
1797
  /**
1389
- * Scrape a single URL
1798
+ * Extract visible text from HTML
1799
+ */
1800
+ extractText(html) {
1801
+ return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
1802
+ }
1803
// Availability probe: this engine unconditionally reports itself as available.
// EngineOrchestrator's constructor calls isAvailable() to decide which engines to keep.
+ isAvailable() {
1804
+ return true;
1805
+ }
1806
+ };
1807
// Module-level HeroEngine instance; it is the value stored under the `hero` key of ENGINE_REGISTRY.
+ var heroEngine = new HeroEngine();
1808
+
1809
+ // src/engines/orchestrator.ts
1810
// Lookup table from engine name to engine instance. EngineOrchestrator maps the
// names in its resolved engine order through this table to obtain the engines it runs.
+ var ENGINE_REGISTRY = {
1811
+ http: httpEngine,
1812
+ tlsclient: tlsClientEngine,
1813
+ hero: heroEngine
1814
+ };
1815
/**
 * Runs a scrape through an ordered cascade of engines, moving on to the next
 * engine whenever the current one fails with an error that `shouldRetry`
 * considers worth retrying.
 *
 * Engine names are resolved through ENGINE_REGISTRY; names that are not
 * registered are skipped instead of crashing the constructor.
 */
var EngineOrchestrator = class {
  options;
  engines;
  engineOrder;
  /**
   * @param options - Optional settings. The fields read by this class are
   *   `engines`, `skipEngines`, `forceEngine`, `logger` and `verbose`.
   */
  constructor(options = {}) {
    this.options = options;
    this.engineOrder = this.resolveEngineOrder();
    this.engines = [];
    for (const name of this.engineOrder) {
      // Own-property check: a name such as "toString" must not resolve to
      // something inherited from Object.prototype.
      if (!Object.prototype.hasOwnProperty.call(ENGINE_REGISTRY, name)) {
        this.options.logger?.warn?.(`[orchestrator] Unknown engine "${name}", skipping`);
        continue;
      }
      const engine = ENGINE_REGISTRY[name];
      if (engine.isAvailable()) {
        this.engines.push(engine);
      }
    }
  }
  /**
   * Resolve the engine order based on options.
   *
   * `forceEngine` wins outright; otherwise `engines` (or a copy of
   * DEFAULT_ENGINE_ORDER) is used with any `skipEngines` entries removed.
   *
   * @returns Ordered list of engine names
   */
  resolveEngineOrder() {
    if (this.options.forceEngine) {
      return [this.options.forceEngine];
    }
    let order = this.options.engines || [...DEFAULT_ENGINE_ORDER];
    if (this.options.skipEngines) {
      order = order.filter((e) => !this.options.skipEngines.includes(e));
    }
    return order;
  }
  /**
   * Get available engines
   *
   * @returns Names (`config.name`) of the engines that will actually be tried
   */
  getAvailableEngines() {
    return this.engines.map((e) => e.config.name);
  }
  /**
   * Scrape a URL using the engine cascade
   *
   * Each engine gets its own AbortController that fires after the engine's
   * `config.maxTimeout` or when the caller's `meta.abortSignal` aborts.
   * Once the caller's signal is aborted no further engine is started.
   *
   * @param meta - Engine metadata (url, options, logger, abortSignal)
   * @returns Scrape result with engine metadata
   * @throws AllEnginesFailedError if all engines fail, no engine is
   *   available, or the caller aborted before any engine succeeded
   */
  async scrape(meta) {
    const attemptedEngines = [];
    const engineErrors = /* @__PURE__ */ new Map();
    const logger4 = meta.logger || this.options.logger;
    const verbose = this.options.verbose || meta.options?.verbose;
    if (this.engines.length === 0) {
      throw new AllEnginesFailedError([], engineErrors);
    }
    // Verbose mode promotes orchestrator messages from debug to info.
    const log = (msg) => {
      if (verbose) {
        logger4?.info(msg);
      } else {
        logger4?.debug(msg);
      }
    };
    log(`[orchestrator] Starting scrape of ${meta.url} with engines: ${this.engineOrder.join(" \u2192 ")}`);
    for (const engine of this.engines) {
      const engineName = engine.config.name;
      // A listener added to an already-aborted signal never fires, so the
      // aborted state has to be checked explicitly before each engine.
      if (meta.abortSignal?.aborted) {
        log(`[orchestrator] Aborted by caller, stopping cascade`);
        break;
      }
      attemptedEngines.push(engineName);
      log(`[orchestrator] Trying ${engineName} engine...`);
      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), engine.config.maxTimeout);
      const forwardAbort = () => controller.abort();
      meta.abortSignal?.addEventListener("abort", forwardAbort, { once: true });
      try {
        const result = await engine.scrape({
          ...meta,
          abortSignal: controller.signal
        });
        log(`[orchestrator] \u2713 ${engineName} succeeded in ${result.duration}ms`);
        return {
          ...result,
          attemptedEngines,
          engineErrors
        };
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error));
        engineErrors.set(engineName, err);
        if (error instanceof ChallengeDetectedError) {
          log(`[orchestrator] ${engineName} detected challenge: ${error.challengeType}`);
        } else if (error instanceof InsufficientContentError) {
          log(`[orchestrator] ${engineName} insufficient content: ${error.contentLength} chars`);
        } else if (error instanceof HttpError) {
          log(`[orchestrator] ${engineName} HTTP error: ${error.statusCode}`);
        } else if (error instanceof EngineTimeoutError) {
          log(`[orchestrator] ${engineName} timed out after ${error.timeoutMs}ms`);
        } else if (error instanceof EngineUnavailableError) {
          log(`[orchestrator] ${engineName} unavailable: ${err.message}`);
        } else {
          log(`[orchestrator] ${engineName} failed: ${err.message}`);
        }
        if (meta.abortSignal?.aborted) {
          log(`[orchestrator] Aborted by caller, stopping cascade`);
          break;
        }
        if (!this.shouldRetry(error)) {
          log(`[orchestrator] Non-retryable error, stopping cascade`);
          break;
        }
        log(`[orchestrator] Falling back to next engine...`);
      } finally {
        // Runs on success, failure and `break`: release the timer and detach
        // from the caller's signal so listeners do not pile up on it.
        clearTimeout(timeoutId);
        meta.abortSignal?.removeEventListener("abort", forwardAbort);
      }
    }
    log(`[orchestrator] All engines failed for ${meta.url}`);
    throw new AllEnginesFailedError(attemptedEngines, engineErrors);
  }
  /**
   * Determine if we should retry with next engine
   *
   * @param error - The value thrown by the engine that just failed
   * @returns true when the cascade should continue with the next engine
   */
  shouldRetry(error) {
    if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError) {
      return true;
    }
    if (error instanceof HttpError) {
      return error.statusCode === 403 || error.statusCode === 404 || error.statusCode === 429 || error.statusCode >= 500;
    }
    if (error instanceof EngineUnavailableError) {
      return true;
    }
    if (error instanceof EngineError) {
      return error.retryable;
    }
    // Anything unrecognised is treated as retryable.
    return true;
  }
};
1936
+
1937
+ // src/scraper.ts
1938
+ var Scraper = class {
1939
+ options;
1940
+ logger = createLogger("scraper");
1941
+ robotsCache = /* @__PURE__ */ new Map();
1942
// Builds the effective options by spreading DEFAULT_OPTIONS first and the
// caller's `options` second, so keys supplied by the caller take precedence.
+ constructor(options) {
1943
+ this.options = {
1944
+ ...DEFAULT_OPTIONS,
1945
+ ...options
1946
+ };
1947
+ }
1948
+ /**
1949
+ * Get robots.txt rules for a URL, cached per domain
1950
+ */
1951
+ async getRobotsRules(url) {
1952
+ const origin = new URL(url).origin;
1953
+ if (!this.robotsCache.has(origin)) {
1954
+ const rules = await fetchRobotsTxt(origin);
1955
+ this.robotsCache.set(origin, rules);
1956
+ }
1957
+ return this.robotsCache.get(origin) ?? null;
1958
+ }
1959
+ /**
1960
+ * Scrape all URLs
1961
+ *
1962
+ * @returns Scrape result with pages and metadata
1963
+ */
1964
+ async scrape() {
1965
+ const startTime = Date.now();
1966
+ const results = await this.scrapeWithConcurrency();
1967
+ return this.buildScrapeResult(results, startTime);
1968
+ }
1969
+ /**
1970
+ * Scrape URLs with concurrency control
1971
+ */
1972
+ async scrapeWithConcurrency() {
1973
+ const limit = pLimit(this.options.batchConcurrency || 1);
1974
+ const tasks = this.options.urls.map(
1975
+ (url, index) => limit(() => this.scrapeSingleUrlWithRetry(url, index))
1976
+ );
1977
+ const batchPromise = Promise.all(tasks);
1978
+ if (this.options.batchTimeoutMs && this.options.batchTimeoutMs > 0) {
1979
+ const timeoutPromise = new Promise((_, reject) => {
1980
+ setTimeout(() => {
1981
+ reject(new Error(`Batch operation timed out after ${this.options.batchTimeoutMs}ms`));
1982
+ }, this.options.batchTimeoutMs);
1983
+ });
1984
+ return Promise.race([batchPromise, timeoutPromise]);
1985
+ }
1986
+ return batchPromise;
1987
+ }
1988
+ /**
1989
+ * Scrape a single URL with retry logic
1990
+ */
1991
+ async scrapeSingleUrlWithRetry(url, index) {
1992
+ const maxRetries = this.options.maxRetries || 2;
1993
+ let lastError;
1994
+ for (let attempt = 0; attempt <= maxRetries; attempt++) {
1995
+ try {
1996
+ const result = await this.scrapeSingleUrl(url, index);
1997
+ if (result) {
1998
+ return { result };
1999
+ }
2000
+ lastError = `Failed to scrape ${url}: No content returned`;
2001
+ } catch (error) {
2002
+ lastError = error.message;
2003
+ if (attempt < maxRetries) {
2004
+ const delay = Math.pow(2, attempt) * 1e3;
2005
+ this.logger.warn(`Retry ${attempt + 1}/${maxRetries} for ${url} in ${delay}ms`);
2006
+ await new Promise((resolve) => setTimeout(resolve, delay));
2007
+ }
2008
+ }
2009
+ }
2010
+ this.logger.error(`Failed to scrape ${url} after ${maxRetries + 1} attempts: ${lastError}`);
2011
+ return { result: null, error: lastError };
2012
+ }
2013
+ /**
2014
+ * Scrape a single URL using the engine orchestrator
1390
2015
  */
1391
2016
  async scrapeSingleUrl(url, index) {
1392
2017
  const startTime = Date.now();
@@ -1395,98 +2020,84 @@ var Scraper = class {
1395
2020
  throw new Error(`URL blocked by robots.txt: ${url}`);
1396
2021
  }
1397
2022
  try {
1398
- return await this.pool.withBrowser(async (hero) => {
1399
- await hero.goto(url, { timeoutMs: this.options.timeoutMs });
1400
- try {
1401
- await hero.waitForLoad("DomContentLoaded", { timeoutMs: this.options.timeoutMs });
1402
- } catch {
1403
- }
1404
- await hero.waitForPaintingStable();
1405
- const initialUrl = await hero.url;
1406
- const detection = await detectChallenge(hero);
1407
- if (detection.isChallenge) {
1408
- if (this.options.verbose) {
1409
- this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
1410
- }
1411
- const result2 = await waitForChallengeResolution(hero, {
1412
- maxWaitMs: 45e3,
1413
- pollIntervalMs: 500,
1414
- verbose: this.options.verbose,
1415
- initialUrl
1416
- });
1417
- if (!result2.resolved) {
1418
- throw new Error(`Challenge not resolved: ${detection.type}`);
1419
- }
1420
- if (this.options.verbose) {
1421
- this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
1422
- }
1423
- }
1424
- await this.waitForFinalPage(hero, url, this.options.verbose);
1425
- if (this.options.waitForSelector) {
1426
- try {
1427
- await hero.waitForElement(hero.document.querySelector(this.options.waitForSelector), {
1428
- timeoutMs: this.options.timeoutMs
1429
- });
1430
- } catch (error) {
1431
- this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
1432
- }
1433
- }
1434
- const html = await hero.document.documentElement.outerHTML;
1435
- const cleanedHtml = cleanContent(html, url, {
1436
- removeAds: this.options.removeAds,
1437
- removeBase64Images: this.options.removeBase64Images,
1438
- onlyMainContent: this.options.onlyMainContent,
1439
- includeTags: this.options.includeTags,
1440
- excludeTags: this.options.excludeTags
2023
+ const orchestrator = new EngineOrchestrator({
2024
+ engines: this.options.engines,
2025
+ skipEngines: this.options.skipEngines,
2026
+ forceEngine: this.options.forceEngine,
2027
+ logger: this.logger,
2028
+ verbose: this.options.verbose
2029
+ });
2030
+ const engineResult = await orchestrator.scrape({
2031
+ url,
2032
+ options: this.options,
2033
+ logger: this.logger
2034
+ });
2035
+ if (this.options.verbose) {
2036
+ this.logger.info(
2037
+ `[scraper] ${url} scraped with ${engineResult.engine} engine in ${engineResult.duration}ms (attempted: ${engineResult.attemptedEngines.join(" \u2192 ")})`
2038
+ );
2039
+ }
2040
+ const cleanedHtml = cleanContent(engineResult.html, engineResult.url, {
2041
+ removeAds: this.options.removeAds,
2042
+ removeBase64Images: this.options.removeBase64Images,
2043
+ onlyMainContent: this.options.onlyMainContent,
2044
+ includeTags: this.options.includeTags,
2045
+ excludeTags: this.options.excludeTags
2046
+ });
2047
+ const websiteMetadata = extractMetadata(cleanedHtml, engineResult.url);
2048
+ const duration = Date.now() - startTime;
2049
+ const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
2050
+ const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
2051
+ if (this.options.onProgress) {
2052
+ this.options.onProgress({
2053
+ completed: index + 1,
2054
+ total: this.options.urls.length,
2055
+ currentUrl: url
1441
2056
  });
1442
- const websiteMetadata = extractMetadata(cleanedHtml, url);
1443
- const duration = Date.now() - startTime;
1444
- const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
1445
- const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
1446
- if (this.options.onProgress) {
1447
- this.options.onProgress({
1448
- completed: index + 1,
1449
- total: this.options.urls.length,
1450
- currentUrl: url
1451
- });
1452
- }
1453
- let proxyMetadata;
1454
- if (this.options.proxy) {
1455
- const proxy = this.options.proxy;
1456
- if (proxy.url) {
1457
- try {
1458
- const proxyUrl = new URL(proxy.url);
1459
- proxyMetadata = {
1460
- host: proxyUrl.hostname,
1461
- port: parseInt(proxyUrl.port, 10) || 80,
1462
- country: proxy.country
1463
- };
1464
- } catch {
1465
- }
1466
- } else if (proxy.host && proxy.port) {
2057
+ }
2058
+ let proxyMetadata;
2059
+ if (this.options.proxy) {
2060
+ const proxy = this.options.proxy;
2061
+ if (proxy.url) {
2062
+ try {
2063
+ const proxyUrl = new URL(proxy.url);
1467
2064
  proxyMetadata = {
1468
- host: proxy.host,
1469
- port: proxy.port,
2065
+ host: proxyUrl.hostname,
2066
+ port: parseInt(proxyUrl.port, 10) || 80,
1470
2067
  country: proxy.country
1471
2068
  };
2069
+ } catch {
1472
2070
  }
2071
+ } else if (proxy.host && proxy.port) {
2072
+ proxyMetadata = {
2073
+ host: proxy.host,
2074
+ port: proxy.port,
2075
+ country: proxy.country
2076
+ };
1473
2077
  }
1474
- const result = {
1475
- markdown,
1476
- html: htmlOutput,
1477
- metadata: {
1478
- baseUrl: url,
1479
- totalPages: 1,
1480
- scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
1481
- duration,
1482
- website: websiteMetadata,
1483
- proxy: proxyMetadata
1484
- }
1485
- };
1486
- return result;
1487
- });
2078
+ }
2079
+ const result = {
2080
+ markdown,
2081
+ html: htmlOutput,
2082
+ metadata: {
2083
+ baseUrl: url,
2084
+ totalPages: 1,
2085
+ scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
2086
+ duration,
2087
+ website: websiteMetadata,
2088
+ proxy: proxyMetadata
2089
+ }
2090
+ };
2091
+ return result;
1488
2092
  } catch (error) {
1489
- this.logger.error(`Failed to scrape ${url}: ${error.message}`);
2093
+ if (error instanceof AllEnginesFailedError) {
2094
+ const engineSummary = error.attemptedEngines.map((e) => `${e}: ${error.errors.get(e)?.message || "unknown"}`).join("; ");
2095
+ this.logger.error(`Failed to scrape ${url}: All engines failed - ${engineSummary}`);
2096
+ } else if (error instanceof Error) {
2097
+ this.logger.error(`Failed to scrape ${url}: ${error.message}`);
2098
+ } else {
2099
+ this.logger.error(`Failed to scrape ${url}: ${String(error)}`);
2100
+ }
1490
2101
  if (this.options.onProgress) {
1491
2102
  this.options.onProgress({
1492
2103
  completed: index + 1,