@vakra-dev/reader 0.0.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/index.js CHANGED
@@ -17,162 +17,6 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
17
17
  // src/scraper.ts
18
18
  import pLimit from "p-limit";
19
19
 
20
- // src/cloudflare/detector.ts
21
- var CLOUDFLARE_CHALLENGE_SELECTORS = [
22
- "#challenge-running",
23
- "#challenge-stage",
24
- "#challenge-form",
25
- ".cf-browser-verification",
26
- "#cf-wrapper",
27
- "#cf-hcaptcha-container",
28
- "#turnstile-wrapper"
29
- ];
30
- var CLOUDFLARE_TEXT_PATTERNS = [
31
- "checking if the site connection is secure",
32
- "this process is automatic. your browser will redirect",
33
- "ray id:",
34
- "performance & security by cloudflare"
35
- ];
36
- var CLOUDFLARE_INFRA_PATTERNS = [
37
- "/cdn-cgi/",
38
- "cloudflare",
39
- "__cf_bm",
40
- "cf-ray"
41
- ];
42
- var CLOUDFLARE_BLOCKED_PATTERNS = [
43
- "sorry, you have been blocked",
44
- "ray id:"
45
- ];
46
- async function detectChallenge(hero) {
47
- const signals = [];
48
- let type = "none";
49
- let hasCloudflareInfra = false;
50
- let hasChallengeIndicator = false;
51
- try {
52
- if (!hero.document) {
53
- return {
54
- isChallenge: false,
55
- type: "none",
56
- confidence: 0,
57
- signals: ["No document available"]
58
- };
59
- }
60
- const html = await hero.document.documentElement.outerHTML;
61
- const htmlLower = html.toLowerCase();
62
- for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
63
- if (htmlLower.includes(pattern)) {
64
- hasCloudflareInfra = true;
65
- signals.push(`Cloudflare infra: "${pattern}"`);
66
- break;
67
- }
68
- }
69
- if (!hasCloudflareInfra) {
70
- return {
71
- isChallenge: false,
72
- type: "none",
73
- confidence: 0,
74
- signals: ["No Cloudflare infrastructure detected"]
75
- };
76
- }
77
- for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
78
- try {
79
- const element = await hero.document.querySelector(selector);
80
- if (element) {
81
- hasChallengeIndicator = true;
82
- signals.push(`Challenge element: ${selector}`);
83
- type = "js_challenge";
84
- }
85
- } catch {
86
- }
87
- }
88
- for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
89
- if (htmlLower.includes(pattern)) {
90
- hasChallengeIndicator = true;
91
- signals.push(`Challenge text: "${pattern}"`);
92
- type = type === "none" ? "js_challenge" : type;
93
- }
94
- }
95
- if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
96
- hasChallengeIndicator = true;
97
- signals.push('Challenge text: "waiting for...to respond"');
98
- type = type === "none" ? "js_challenge" : type;
99
- }
100
- const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
101
- if (hasBlocked) {
102
- hasChallengeIndicator = true;
103
- signals.push("Cloudflare block page detected");
104
- type = "blocked";
105
- }
106
- const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
107
- const confidence = isChallenge ? 100 : 0;
108
- return {
109
- isChallenge,
110
- type: isChallenge ? type : "none",
111
- confidence,
112
- signals
113
- };
114
- } catch (error) {
115
- return {
116
- isChallenge: false,
117
- type: "none",
118
- confidence: 0,
119
- signals: [`Error during detection: ${error.message}`]
120
- };
121
- }
122
- }
123
-
124
- // src/cloudflare/handler.ts
125
- async function waitForChallengeResolution(hero, options) {
126
- const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
127
- const startTime = Date.now();
128
- const log = (msg) => verbose && console.log(` ${msg}`);
129
- while (Date.now() - startTime < maxWaitMs) {
130
- const elapsed = Date.now() - startTime;
131
- try {
132
- const currentUrl = await hero.url;
133
- if (currentUrl !== initialUrl) {
134
- log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
135
- log(` Waiting for new page to load...`);
136
- try {
137
- await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
138
- log(` DOMContentLoaded`);
139
- } catch {
140
- log(` DOMContentLoaded timeout, continuing...`);
141
- }
142
- await hero.waitForPaintingStable().catch(() => {
143
- });
144
- log(` Page stabilized`);
145
- return { resolved: true, method: "url_redirect", waitedMs: elapsed };
146
- }
147
- } catch {
148
- }
149
- const detection = await detectChallenge(hero);
150
- if (!detection.isChallenge) {
151
- log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
152
- log(` Waiting for page to load...`);
153
- try {
154
- await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
155
- log(` DOMContentLoaded`);
156
- } catch {
157
- log(` DOMContentLoaded timeout, continuing...`);
158
- }
159
- await hero.waitForPaintingStable().catch(() => {
160
- });
161
- log(` Page stabilized`);
162
- return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
163
- }
164
- log(
165
- `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
166
- );
167
- await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
168
- }
169
- return {
170
- resolved: false,
171
- method: "timeout",
172
- waitedMs: Date.now() - startTime
173
- };
174
- }
175
-
176
20
  // src/formatters/markdown.ts
177
21
  import TurndownService from "turndown";
178
22
  var turndownService = new TurndownService({
@@ -1054,96 +898,667 @@ var DEFAULT_OPTIONS = {
1054
898
  showChrome: false
1055
899
  };
1056
900
 
1057
- // src/scraper.ts
1058
- var Scraper = class {
1059
- options;
1060
- pool;
1061
- logger = createLogger("scraper");
1062
- robotsCache = /* @__PURE__ */ new Map();
1063
- constructor(options) {
1064
- this.options = {
1065
- ...DEFAULT_OPTIONS,
1066
- ...options
1067
- };
1068
- if (!options.pool) {
1069
- throw new Error("Browser pool must be provided. Use ReaderClient for automatic pool management.");
901
+ // src/engines/types.ts
902
+ var ENGINE_CONFIGS = {
903
+ http: {
904
+ name: "http",
905
+ timeout: 3e3,
906
+ maxTimeout: 1e4,
907
+ quality: 100,
908
+ features: {
909
+ javascript: false,
910
+ cloudflare: false,
911
+ tlsFingerprint: false,
912
+ waitFor: false,
913
+ screenshots: false
914
+ }
915
+ },
916
+ tlsclient: {
917
+ name: "tlsclient",
918
+ timeout: 5e3,
919
+ maxTimeout: 15e3,
920
+ quality: 80,
921
+ features: {
922
+ javascript: false,
923
+ cloudflare: false,
924
+ tlsFingerprint: true,
925
+ waitFor: false,
926
+ screenshots: false
927
+ }
928
+ },
929
+ hero: {
930
+ name: "hero",
931
+ timeout: 3e4,
932
+ maxTimeout: 6e4,
933
+ quality: 50,
934
+ features: {
935
+ javascript: true,
936
+ cloudflare: true,
937
+ tlsFingerprint: true,
938
+ waitFor: true,
939
+ screenshots: true
1070
940
  }
1071
- this.pool = options.pool;
1072
941
  }
1073
- /**
1074
- * Get robots.txt rules for a URL, cached per domain
1075
- */
1076
- async getRobotsRules(url) {
1077
- const origin = new URL(url).origin;
1078
- if (!this.robotsCache.has(origin)) {
1079
- const rules = await fetchRobotsTxt(origin);
1080
- this.robotsCache.set(origin, rules);
942
+ };
943
+ var DEFAULT_ENGINE_ORDER = ["http", "tlsclient", "hero"];
944
+
945
+ // src/engines/errors.ts
946
+ var EngineError = class extends Error {
947
+ engine;
948
+ retryable;
949
+ constructor(engine, message, options) {
950
+ super(`[${engine}] ${message}`);
951
+ this.name = "EngineError";
952
+ this.engine = engine;
953
+ this.retryable = options?.retryable ?? true;
954
+ this.cause = options?.cause;
955
+ if (Error.captureStackTrace) {
956
+ Error.captureStackTrace(this, this.constructor);
1081
957
  }
1082
- return this.robotsCache.get(origin) ?? null;
1083
958
  }
1084
- /**
1085
- * Scrape all URLs
1086
- *
1087
- * @returns Scrape result with pages and metadata
1088
- */
1089
- async scrape() {
1090
- const startTime = Date.now();
1091
- const results = await this.scrapeWithConcurrency();
1092
- return this.buildScrapeResult(results, startTime);
959
+ };
960
+ var ChallengeDetectedError = class extends EngineError {
961
+ challengeType;
962
+ constructor(engine, challengeType) {
963
+ super(engine, `Challenge detected: ${challengeType || "unknown"}`, { retryable: true });
964
+ this.name = "ChallengeDetectedError";
965
+ this.challengeType = challengeType || "unknown";
1093
966
  }
1094
- /**
1095
- * Scrape URLs with concurrency control
1096
- */
1097
- async scrapeWithConcurrency() {
1098
- const limit = pLimit(this.options.batchConcurrency || 1);
1099
- const tasks = this.options.urls.map(
1100
- (url, index) => limit(() => this.scrapeSingleUrlWithRetry(url, index))
1101
- );
1102
- const batchPromise = Promise.all(tasks);
1103
- if (this.options.batchTimeoutMs && this.options.batchTimeoutMs > 0) {
1104
- const timeoutPromise = new Promise((_, reject) => {
1105
- setTimeout(() => {
1106
- reject(new Error(`Batch operation timed out after ${this.options.batchTimeoutMs}ms`));
1107
- }, this.options.batchTimeoutMs);
967
+ };
968
+ var InsufficientContentError = class extends EngineError {
969
+ contentLength;
970
+ threshold;
971
+ constructor(engine, contentLength, threshold = 100) {
972
+ super(engine, `Insufficient content: ${contentLength} chars (threshold: ${threshold})`, { retryable: true });
973
+ this.name = "InsufficientContentError";
974
+ this.contentLength = contentLength;
975
+ this.threshold = threshold;
976
+ }
977
+ };
978
+ var HttpError = class extends EngineError {
979
+ statusCode;
980
+ constructor(engine, statusCode, statusText) {
981
+ const retryable = statusCode >= 500 || statusCode === 429;
982
+ super(engine, `HTTP ${statusCode}${statusText ? `: ${statusText}` : ""}`, { retryable });
983
+ this.name = "HttpError";
984
+ this.statusCode = statusCode;
985
+ }
986
+ };
987
+ var EngineTimeoutError = class extends EngineError {
988
+ timeoutMs;
989
+ constructor(engine, timeoutMs) {
990
+ super(engine, `Timeout after ${timeoutMs}ms`, { retryable: true });
991
+ this.name = "EngineTimeoutError";
992
+ this.timeoutMs = timeoutMs;
993
+ }
994
+ };
995
+ var EngineUnavailableError = class extends EngineError {
996
+ constructor(engine, reason) {
997
+ super(engine, reason || "Engine not available", { retryable: false });
998
+ this.name = "EngineUnavailableError";
999
+ }
1000
+ };
1001
+ var AllEnginesFailedError = class extends Error {
1002
+ attemptedEngines;
1003
+ errors;
1004
+ constructor(attemptedEngines, errors) {
1005
+ const summary = attemptedEngines.map((e) => `${e}: ${errors.get(e)?.message || "unknown"}`).join("; ");
1006
+ super(`All engines failed: ${summary}`);
1007
+ this.name = "AllEnginesFailedError";
1008
+ this.attemptedEngines = attemptedEngines;
1009
+ this.errors = errors;
1010
+ }
1011
+ };
1012
+
1013
+ // src/engines/http/index.ts
1014
+ var DEFAULT_HEADERS = {
1015
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
1016
+ Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
1017
+ "Accept-Language": "en-US,en;q=0.9",
1018
+ "Accept-Encoding": "gzip, deflate, br",
1019
+ "Cache-Control": "no-cache",
1020
+ Pragma: "no-cache",
1021
+ "Sec-Fetch-Dest": "document",
1022
+ "Sec-Fetch-Mode": "navigate",
1023
+ "Sec-Fetch-Site": "none",
1024
+ "Sec-Fetch-User": "?1",
1025
+ "Upgrade-Insecure-Requests": "1"
1026
+ };
1027
+ var CHALLENGE_PATTERNS = [
1028
+ // Cloudflare
1029
+ "cf-browser-verification",
1030
+ "cf_chl_opt",
1031
+ "challenge-platform",
1032
+ "cf-spinner",
1033
+ "Just a moment",
1034
+ "Checking your browser",
1035
+ "checking if the site connection is secure",
1036
+ "Enable JavaScript and cookies",
1037
+ "Attention Required",
1038
+ "_cf_chl_tk",
1039
+ "Verifying you are human",
1040
+ "cf-turnstile",
1041
+ "/cdn-cgi/challenge-platform/",
1042
+ // Generic bot detection
1043
+ "Please Wait...",
1044
+ "DDoS protection by",
1045
+ "Access denied",
1046
+ "bot detection",
1047
+ "are you a robot",
1048
+ "complete the security check"
1049
+ ];
1050
+ var CLOUDFLARE_INFRA_PATTERNS = ["/cdn-cgi/", "cloudflare", "__cf_bm", "cf-ray"];
1051
+ var MIN_CONTENT_LENGTH = 100;
1052
+ var HttpEngine = class {
1053
+ config = ENGINE_CONFIGS.http;
1054
+ async scrape(meta) {
1055
+ const startTime = Date.now();
1056
+ const { url, options, logger: logger4, abortSignal } = meta;
1057
+ try {
1058
+ const controller = new AbortController();
1059
+ const timeoutId = setTimeout(() => controller.abort(), this.config.maxTimeout);
1060
+ if (abortSignal) {
1061
+ abortSignal.addEventListener("abort", () => controller.abort(), { once: true });
1062
+ }
1063
+ logger4?.debug(`[http] Fetching ${url}`);
1064
+ const response = await fetch(url, {
1065
+ method: "GET",
1066
+ headers: {
1067
+ ...DEFAULT_HEADERS,
1068
+ ...options.headers || {}
1069
+ },
1070
+ redirect: "follow",
1071
+ signal: controller.signal
1108
1072
  });
1109
- return Promise.race([batchPromise, timeoutPromise]);
1073
+ clearTimeout(timeoutId);
1074
+ const duration = Date.now() - startTime;
1075
+ const html = await response.text();
1076
+ logger4?.debug(`[http] Got response: ${response.status} (${html.length} chars) in ${duration}ms`);
1077
+ if (response.status >= 400) {
1078
+ throw new HttpError("http", response.status, response.statusText);
1079
+ }
1080
+ const challengeType = this.detectChallenge(html);
1081
+ if (challengeType) {
1082
+ logger4?.debug(`[http] Challenge detected: ${challengeType}`);
1083
+ throw new ChallengeDetectedError("http", challengeType);
1084
+ }
1085
+ const textContent = this.extractText(html);
1086
+ if (textContent.length < MIN_CONTENT_LENGTH) {
1087
+ logger4?.debug(`[http] Insufficient content: ${textContent.length} chars`);
1088
+ throw new InsufficientContentError("http", textContent.length, MIN_CONTENT_LENGTH);
1089
+ }
1090
+ return {
1091
+ html,
1092
+ url: response.url,
1093
+ statusCode: response.status,
1094
+ contentType: response.headers.get("content-type") || void 0,
1095
+ headers: this.headersToRecord(response.headers),
1096
+ engine: "http",
1097
+ duration
1098
+ };
1099
+ } catch (error) {
1100
+ if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError) {
1101
+ throw error;
1102
+ }
1103
+ if (error instanceof Error) {
1104
+ if (error.name === "AbortError") {
1105
+ throw new EngineTimeoutError("http", this.config.maxTimeout);
1106
+ }
1107
+ throw new EngineError("http", error.message, { cause: error });
1108
+ }
1109
+ throw new EngineError("http", String(error));
1110
1110
  }
1111
- return batchPromise;
1112
1111
  }
1113
1112
  /**
1114
- * Scrape a single URL with retry logic
1113
+ * Detect challenge patterns in HTML
1114
+ * @returns Challenge type or null if no challenge detected
1115
1115
  */
1116
- async scrapeSingleUrlWithRetry(url, index) {
1117
- const maxRetries = this.options.maxRetries || 2;
1118
- let lastError;
1119
- for (let attempt = 0; attempt <= maxRetries; attempt++) {
1120
- try {
1121
- const result = await this.scrapeSingleUrl(url, index);
1122
- if (result) {
1123
- return { result };
1124
- }
1125
- lastError = `Failed to scrape ${url}: No content returned`;
1126
- } catch (error) {
1127
- lastError = error.message;
1128
- if (attempt < maxRetries) {
1129
- const delay = Math.pow(2, attempt) * 1e3;
1130
- this.logger.warn(`Retry ${attempt + 1}/${maxRetries} for ${url} in ${delay}ms`);
1131
- await new Promise((resolve) => setTimeout(resolve, delay));
1116
+ detectChallenge(html) {
1117
+ const htmlLower = html.toLowerCase();
1118
+ const hasCloudflare = CLOUDFLARE_INFRA_PATTERNS.some((p) => htmlLower.includes(p.toLowerCase()));
1119
+ for (const pattern of CHALLENGE_PATTERNS) {
1120
+ if (htmlLower.includes(pattern.toLowerCase())) {
1121
+ if (hasCloudflare || pattern.includes("cf")) {
1122
+ return "cloudflare";
1132
1123
  }
1124
+ return "bot-detection";
1133
1125
  }
1134
1126
  }
1135
- this.logger.error(`Failed to scrape ${url} after ${maxRetries + 1} attempts: ${lastError}`);
1136
- return { result: null, error: lastError };
1127
+ return null;
1128
+ }
1129
+ /**
1130
+ * Convert Headers to Record<string, string>
1131
+ */
1132
+ headersToRecord(headers) {
1133
+ const record = {};
1134
+ headers.forEach((value, key) => {
1135
+ record[key] = value;
1136
+ });
1137
+ return record;
1138
+ }
1139
+ /**
1140
+ * Extract visible text from HTML (rough extraction)
1141
+ */
1142
+ extractText(html) {
1143
+ return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
1144
+ }
1145
+ isAvailable() {
1146
+ return true;
1147
+ }
1148
+ };
1149
+ var httpEngine = new HttpEngine();
1150
+
1151
+ // src/engines/tlsclient/index.ts
1152
+ import { gotScraping } from "got-scraping";
1153
+ var JS_REQUIRED_PATTERNS = [
1154
+ // Cloudflare JS challenge
1155
+ "cf-browser-verification",
1156
+ "challenge-platform",
1157
+ "_cf_chl_tk",
1158
+ "/cdn-cgi/challenge-platform/",
1159
+ // Generic JS requirements
1160
+ "Enable JavaScript",
1161
+ "JavaScript is required",
1162
+ "Please enable JavaScript",
1163
+ "requires JavaScript",
1164
+ "noscript"
1165
+ ];
1166
+ var BLOCKED_PATTERNS = [
1167
+ "Access denied",
1168
+ "Sorry, you have been blocked",
1169
+ "bot detected",
1170
+ "suspicious activity",
1171
+ "too many requests"
1172
+ ];
1173
+ var MIN_CONTENT_LENGTH2 = 100;
1174
+ var TlsClientEngine = class {
1175
+ config = ENGINE_CONFIGS.tlsclient;
1176
+ available = true;
1177
+ constructor() {
1178
+ try {
1179
+ if (!gotScraping) {
1180
+ this.available = false;
1181
+ }
1182
+ } catch {
1183
+ this.available = false;
1184
+ }
1185
+ }
1186
+ async scrape(meta) {
1187
+ if (!this.available) {
1188
+ throw new EngineUnavailableError("tlsclient", "got-scraping not available");
1189
+ }
1190
+ const startTime = Date.now();
1191
+ const { url, options, logger: logger4, abortSignal } = meta;
1192
+ try {
1193
+ const controller = new AbortController();
1194
+ const timeoutId = setTimeout(() => controller.abort(), this.config.maxTimeout);
1195
+ if (abortSignal) {
1196
+ abortSignal.addEventListener("abort", () => controller.abort(), { once: true });
1197
+ }
1198
+ logger4?.debug(`[tlsclient] Fetching ${url}`);
1199
+ const response = await gotScraping({
1200
+ url,
1201
+ timeout: {
1202
+ request: this.config.maxTimeout
1203
+ },
1204
+ headers: options.headers,
1205
+ followRedirect: true
1206
+ // got-scraping handles browser fingerprinting automatically
1207
+ // It uses header generators and proper TLS settings
1208
+ });
1209
+ clearTimeout(timeoutId);
1210
+ const duration = Date.now() - startTime;
1211
+ const html = response.body;
1212
+ logger4?.debug(`[tlsclient] Got response: ${response.statusCode} (${html.length} chars) in ${duration}ms`);
1213
+ if (response.statusCode >= 400) {
1214
+ throw new HttpError("tlsclient", response.statusCode, response.statusMessage);
1215
+ }
1216
+ const challengeType = this.detectJsRequired(html);
1217
+ if (challengeType) {
1218
+ logger4?.debug(`[tlsclient] JS required: ${challengeType}`);
1219
+ throw new ChallengeDetectedError("tlsclient", challengeType);
1220
+ }
1221
+ const blockedReason = this.detectBlocked(html);
1222
+ if (blockedReason) {
1223
+ logger4?.debug(`[tlsclient] Blocked: ${blockedReason}`);
1224
+ throw new ChallengeDetectedError("tlsclient", `blocked: ${blockedReason}`);
1225
+ }
1226
+ const textContent = this.extractText(html);
1227
+ if (textContent.length < MIN_CONTENT_LENGTH2) {
1228
+ logger4?.debug(`[tlsclient] Insufficient content: ${textContent.length} chars`);
1229
+ throw new InsufficientContentError("tlsclient", textContent.length, MIN_CONTENT_LENGTH2);
1230
+ }
1231
+ return {
1232
+ html,
1233
+ url: response.url,
1234
+ statusCode: response.statusCode,
1235
+ contentType: response.headers["content-type"],
1236
+ headers: response.headers,
1237
+ engine: "tlsclient",
1238
+ duration
1239
+ };
1240
+ } catch (error) {
1241
+ if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError || error instanceof EngineUnavailableError) {
1242
+ throw error;
1243
+ }
1244
+ if (error instanceof Error) {
1245
+ if (error.name === "TimeoutError" || error.message.includes("timeout")) {
1246
+ throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
1247
+ }
1248
+ if (error.name === "AbortError") {
1249
+ throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
1250
+ }
1251
+ throw new EngineError("tlsclient", error.message, { cause: error });
1252
+ }
1253
+ throw new EngineError("tlsclient", String(error));
1254
+ }
1255
+ }
1256
+ /**
1257
+ * Detect patterns that require JS execution
1258
+ */
1259
+ detectJsRequired(html) {
1260
+ const htmlLower = html.toLowerCase();
1261
+ for (const pattern of JS_REQUIRED_PATTERNS) {
1262
+ if (htmlLower.includes(pattern.toLowerCase())) {
1263
+ if (pattern.includes("cf") || pattern.includes("cloudflare")) {
1264
+ return "cloudflare-js";
1265
+ }
1266
+ return "js-required";
1267
+ }
1268
+ }
1269
+ return null;
1270
+ }
1271
+ /**
1272
+ * Detect blocked/denied patterns
1273
+ */
1274
+ detectBlocked(html) {
1275
+ const htmlLower = html.toLowerCase();
1276
+ for (const pattern of BLOCKED_PATTERNS) {
1277
+ if (htmlLower.includes(pattern.toLowerCase())) {
1278
+ return pattern;
1279
+ }
1280
+ }
1281
+ return null;
1282
+ }
1283
+ /**
1284
+ * Extract visible text from HTML
1285
+ */
1286
+ extractText(html) {
1287
+ return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
1288
+ }
1289
+ isAvailable() {
1290
+ return this.available;
1291
+ }
1292
+ };
1293
+ var tlsClientEngine = new TlsClientEngine();
1294
+
1295
+ // src/cloudflare/detector.ts
1296
+ var CLOUDFLARE_CHALLENGE_SELECTORS = [
1297
+ "#challenge-running",
1298
+ "#challenge-stage",
1299
+ "#challenge-form",
1300
+ ".cf-browser-verification",
1301
+ "#cf-wrapper",
1302
+ "#cf-hcaptcha-container",
1303
+ "#turnstile-wrapper"
1304
+ ];
1305
+ var CLOUDFLARE_TEXT_PATTERNS = [
1306
+ "checking if the site connection is secure",
1307
+ "this process is automatic. your browser will redirect",
1308
+ "ray id:",
1309
+ "performance & security by cloudflare"
1310
+ ];
1311
+ var CLOUDFLARE_INFRA_PATTERNS2 = [
1312
+ "/cdn-cgi/",
1313
+ "cloudflare",
1314
+ "__cf_bm",
1315
+ "cf-ray"
1316
+ ];
1317
+ var CLOUDFLARE_BLOCKED_PATTERNS = [
1318
+ "sorry, you have been blocked",
1319
+ "ray id:"
1320
+ ];
1321
+ async function detectChallenge(hero) {
1322
+ const signals = [];
1323
+ let type = "none";
1324
+ let hasCloudflareInfra = false;
1325
+ let hasChallengeIndicator = false;
1326
+ try {
1327
+ if (!hero.document) {
1328
+ return {
1329
+ isChallenge: false,
1330
+ type: "none",
1331
+ confidence: 0,
1332
+ signals: ["No document available"]
1333
+ };
1334
+ }
1335
+ const html = await hero.document.documentElement.outerHTML;
1336
+ const htmlLower = html.toLowerCase();
1337
+ for (const pattern of CLOUDFLARE_INFRA_PATTERNS2) {
1338
+ if (htmlLower.includes(pattern)) {
1339
+ hasCloudflareInfra = true;
1340
+ signals.push(`Cloudflare infra: "${pattern}"`);
1341
+ break;
1342
+ }
1343
+ }
1344
+ if (!hasCloudflareInfra) {
1345
+ return {
1346
+ isChallenge: false,
1347
+ type: "none",
1348
+ confidence: 0,
1349
+ signals: ["No Cloudflare infrastructure detected"]
1350
+ };
1351
+ }
1352
+ for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
1353
+ try {
1354
+ const element = await hero.document.querySelector(selector);
1355
+ if (element) {
1356
+ hasChallengeIndicator = true;
1357
+ signals.push(`Challenge element: ${selector}`);
1358
+ type = "js_challenge";
1359
+ }
1360
+ } catch {
1361
+ }
1362
+ }
1363
+ for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
1364
+ if (htmlLower.includes(pattern)) {
1365
+ hasChallengeIndicator = true;
1366
+ signals.push(`Challenge text: "${pattern}"`);
1367
+ type = type === "none" ? "js_challenge" : type;
1368
+ }
1369
+ }
1370
+ if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
1371
+ hasChallengeIndicator = true;
1372
+ signals.push('Challenge text: "waiting for...to respond"');
1373
+ type = type === "none" ? "js_challenge" : type;
1374
+ }
1375
+ const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
1376
+ if (hasBlocked) {
1377
+ hasChallengeIndicator = true;
1378
+ signals.push("Cloudflare block page detected");
1379
+ type = "blocked";
1380
+ }
1381
+ const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
1382
+ const confidence = isChallenge ? 100 : 0;
1383
+ return {
1384
+ isChallenge,
1385
+ type: isChallenge ? type : "none",
1386
+ confidence,
1387
+ signals
1388
+ };
1389
+ } catch (error) {
1390
+ return {
1391
+ isChallenge: false,
1392
+ type: "none",
1393
+ confidence: 0,
1394
+ signals: [`Error during detection: ${error.message}`]
1395
+ };
1396
+ }
1397
+ }
1398
+
1399
+ // src/cloudflare/handler.ts
1400
+ async function waitForChallengeResolution(hero, options) {
1401
+ const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
1402
+ const startTime = Date.now();
1403
+ const log = (msg) => verbose && console.log(` ${msg}`);
1404
+ while (Date.now() - startTime < maxWaitMs) {
1405
+ const elapsed = Date.now() - startTime;
1406
+ try {
1407
+ const currentUrl = await hero.url;
1408
+ if (currentUrl !== initialUrl) {
1409
+ log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
1410
+ log(` Waiting for new page to load...`);
1411
+ try {
1412
+ await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
1413
+ log(` DOMContentLoaded`);
1414
+ } catch {
1415
+ log(` DOMContentLoaded timeout, continuing...`);
1416
+ }
1417
+ await hero.waitForPaintingStable().catch(() => {
1418
+ });
1419
+ log(` Page stabilized`);
1420
+ return { resolved: true, method: "url_redirect", waitedMs: elapsed };
1421
+ }
1422
+ } catch {
1423
+ }
1424
+ const detection = await detectChallenge(hero);
1425
+ if (!detection.isChallenge) {
1426
+ log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
1427
+ log(` Waiting for page to load...`);
1428
+ try {
1429
+ await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
1430
+ log(` DOMContentLoaded`);
1431
+ } catch {
1432
+ log(` DOMContentLoaded timeout, continuing...`);
1433
+ }
1434
+ await hero.waitForPaintingStable().catch(() => {
1435
+ });
1436
+ log(` Page stabilized`);
1437
+ return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
1438
+ }
1439
+ log(
1440
+ `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
1441
+ );
1442
+ await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
1443
+ }
1444
+ return {
1445
+ resolved: false,
1446
+ method: "timeout",
1447
+ waitedMs: Date.now() - startTime
1448
+ };
1449
+ }
1450
+
1451
+ // src/engines/hero/index.ts
1452
+ var MIN_CONTENT_LENGTH3 = 100;
1453
+ var HeroEngine = class {
1454
+ config = ENGINE_CONFIGS.hero;
1455
+ async scrape(meta) {
1456
+ const startTime = Date.now();
1457
+ const { url, options, logger: logger4, abortSignal } = meta;
1458
+ const pool = options.pool;
1459
+ if (!pool) {
1460
+ throw new EngineUnavailableError("hero", "Browser pool not available");
1461
+ }
1462
+ if (abortSignal?.aborted) {
1463
+ throw new EngineTimeoutError("hero", 0);
1464
+ }
1465
+ logger4?.debug(`[hero] Starting browser scrape of ${url}`);
1466
+ try {
1467
+ const result = await pool.withBrowser(async (hero) => {
1468
+ let aborted = false;
1469
+ if (abortSignal) {
1470
+ abortSignal.addEventListener("abort", () => {
1471
+ aborted = true;
1472
+ }, { once: true });
1473
+ }
1474
+ const timeoutMs = options.timeoutMs || this.config.maxTimeout;
1475
+ await hero.goto(url, { timeoutMs });
1476
+ if (aborted) {
1477
+ throw new EngineTimeoutError("hero", Date.now() - startTime);
1478
+ }
1479
+ try {
1480
+ await hero.waitForLoad("DomContentLoaded", { timeoutMs });
1481
+ } catch {
1482
+ }
1483
+ await hero.waitForPaintingStable();
1484
+ if (aborted) {
1485
+ throw new EngineTimeoutError("hero", Date.now() - startTime);
1486
+ }
1487
+ const initialUrl = await hero.url;
1488
+ const detection = await detectChallenge(hero);
1489
+ if (detection.isChallenge) {
1490
+ logger4?.debug(`[hero] Challenge detected: ${detection.type}`);
1491
+ if (detection.type === "blocked") {
1492
+ throw new ChallengeDetectedError("hero", "blocked");
1493
+ }
1494
+ const resolution = await waitForChallengeResolution(hero, {
1495
+ maxWaitMs: 45e3,
1496
+ pollIntervalMs: 500,
1497
+ verbose: options.verbose,
1498
+ initialUrl
1499
+ });
1500
+ if (!resolution.resolved) {
1501
+ throw new ChallengeDetectedError("hero", `unresolved: ${detection.type}`);
1502
+ }
1503
+ logger4?.debug(`[hero] Challenge resolved via ${resolution.method} in ${resolution.waitedMs}ms`);
1504
+ }
1505
+ if (aborted) {
1506
+ throw new EngineTimeoutError("hero", Date.now() - startTime);
1507
+ }
1508
+ await this.waitForFinalPage(hero, url, logger4);
1509
+ if (aborted) {
1510
+ throw new EngineTimeoutError("hero", Date.now() - startTime);
1511
+ }
1512
+ if (options.waitForSelector) {
1513
+ try {
1514
+ await hero.waitForElement(hero.document.querySelector(options.waitForSelector), {
1515
+ timeoutMs
1516
+ });
1517
+ } catch {
1518
+ logger4?.debug(`[hero] Selector not found: ${options.waitForSelector}`);
1519
+ }
1520
+ }
1521
+ const html = await hero.document.documentElement.outerHTML;
1522
+ const finalUrl = await hero.url;
1523
+ const textContent = this.extractText(html);
1524
+ if (textContent.length < MIN_CONTENT_LENGTH3) {
1525
+ logger4?.debug(`[hero] Insufficient content: ${textContent.length} chars`);
1526
+ throw new InsufficientContentError("hero", textContent.length, MIN_CONTENT_LENGTH3);
1527
+ }
1528
+ const duration = Date.now() - startTime;
1529
+ logger4?.debug(`[hero] Success: ${html.length} chars in ${duration}ms`);
1530
+ return {
1531
+ html,
1532
+ url: finalUrl,
1533
+ statusCode: 200,
1534
+ // Hero doesn't expose status code directly
1535
+ engine: "hero",
1536
+ duration
1537
+ };
1538
+ });
1539
+ return result;
1540
+ } catch (error) {
1541
+ if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError || error instanceof EngineUnavailableError) {
1542
+ throw error;
1543
+ }
1544
+ if (error instanceof Error) {
1545
+ if (error.name === "TimeoutError" || error.message.includes("timeout")) {
1546
+ throw new EngineTimeoutError("hero", this.config.maxTimeout);
1547
+ }
1548
+ if (error.message.includes("Navigation") || error.message.includes("ERR_")) {
1549
+ throw new EngineError("hero", `Navigation failed: ${error.message}`, { cause: error });
1550
+ }
1551
+ throw new EngineError("hero", error.message, { cause: error });
1552
+ }
1553
+ throw new EngineError("hero", String(error));
1554
+ }
1137
1555
  }
1138
1556
  /**
1139
1557
  * Wait for the final page to load after any Cloudflare redirects
1140
- * Cloudflare often does silent redirects even when bypassed, we need to ensure
1141
- * we're on the actual content page before scraping.
1142
1558
  */
1143
- async waitForFinalPage(hero, originalUrl, verbose) {
1559
+ async waitForFinalPage(hero, originalUrl, logger4) {
1144
1560
  const maxWaitMs = 15e3;
1145
1561
  const startTime = Date.now();
1146
- const log = (msg) => verbose && this.logger.info(msg);
1147
1562
  try {
1148
1563
  await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
1149
1564
  } catch {
@@ -1152,7 +1567,7 @@ var Scraper = class {
1152
1567
  const normalizeUrl2 = (url) => url.replace(/\/+$/, "");
1153
1568
  const urlChanged = normalizeUrl2(currentUrl) !== normalizeUrl2(originalUrl);
1154
1569
  if (urlChanged || currentUrl.includes("__cf_chl")) {
1155
- log(`Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
1570
+ logger4?.debug(`[hero] Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
1156
1571
  let lastUrl = currentUrl;
1157
1572
  let stableCount = 0;
1158
1573
  while (Date.now() - startTime < maxWaitMs) {
@@ -1167,7 +1582,7 @@ var Scraper = class {
1167
1582
  } else {
1168
1583
  stableCount = 0;
1169
1584
  lastUrl = currentUrl;
1170
- log(`URL changed to: ${currentUrl}`);
1585
+ logger4?.debug(`[hero] URL changed to: ${currentUrl}`);
1171
1586
  }
1172
1587
  } catch {
1173
1588
  }
@@ -1181,7 +1596,223 @@ var Scraper = class {
1181
1596
  await new Promise((resolve) => setTimeout(resolve, 2e3));
1182
1597
  }
1183
1598
  /**
1184
- * Scrape a single URL
1599
+ * Extract visible text from HTML
1600
+ */
1601
+ extractText(html) {
1602
+ return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
1603
+ }
1604
+ isAvailable() {
1605
+ return true;
1606
+ }
1607
+ };
1608
// Single shared HeroEngine instance, referenced by the registry below.
var heroEngine = new HeroEngine();

// src/engines/orchestrator.ts
/**
 * Maps each engine name to the engine instance that handles it.
 * The orchestrator resolves requested engine names through this table.
 */
var ENGINE_REGISTRY = {
  http: httpEngine,
  tlsclient: tlsClientEngine,
  hero: heroEngine
};
1616
+ var EngineOrchestrator = class {
1617
+ options;
1618
+ engines;
1619
+ engineOrder;
1620
+ constructor(options = {}) {
1621
+ this.options = options;
1622
+ this.engineOrder = this.resolveEngineOrder();
1623
+ this.engines = this.engineOrder.map((name) => ENGINE_REGISTRY[name]).filter((engine) => engine.isAvailable());
1624
+ }
1625
+ /**
1626
+ * Resolve the engine order based on options
1627
+ */
1628
+ resolveEngineOrder() {
1629
+ if (this.options.forceEngine) {
1630
+ return [this.options.forceEngine];
1631
+ }
1632
+ let order = this.options.engines || [...DEFAULT_ENGINE_ORDER];
1633
+ if (this.options.skipEngines) {
1634
+ order = order.filter((e) => !this.options.skipEngines.includes(e));
1635
+ }
1636
+ return order;
1637
+ }
1638
+ /**
1639
+ * Get available engines
1640
+ */
1641
+ getAvailableEngines() {
1642
+ return this.engines.map((e) => e.config.name);
1643
+ }
1644
+ /**
1645
+ * Scrape a URL using the engine cascade
1646
+ *
1647
+ * @param meta - Engine metadata (url, options, logger, abortSignal)
1648
+ * @returns Scrape result with engine metadata
1649
+ * @throws AllEnginesFailedError if all engines fail
1650
+ */
1651
+ async scrape(meta) {
1652
+ const attemptedEngines = [];
1653
+ const engineErrors = /* @__PURE__ */ new Map();
1654
+ const logger4 = meta.logger || this.options.logger;
1655
+ const verbose = this.options.verbose || meta.options.verbose;
1656
+ if (this.engines.length === 0) {
1657
+ throw new AllEnginesFailedError([], engineErrors);
1658
+ }
1659
+ const log = (msg) => {
1660
+ if (verbose) {
1661
+ logger4?.info(msg);
1662
+ } else {
1663
+ logger4?.debug(msg);
1664
+ }
1665
+ };
1666
+ log(`[orchestrator] Starting scrape of ${meta.url} with engines: ${this.engineOrder.join(" \u2192 ")}`);
1667
+ for (const engine of this.engines) {
1668
+ const engineName = engine.config.name;
1669
+ attemptedEngines.push(engineName);
1670
+ try {
1671
+ log(`[orchestrator] Trying ${engineName} engine...`);
1672
+ const controller = new AbortController();
1673
+ const timeoutId = setTimeout(() => controller.abort(), engine.config.maxTimeout);
1674
+ if (meta.abortSignal) {
1675
+ meta.abortSignal.addEventListener("abort", () => controller.abort(), { once: true });
1676
+ }
1677
+ try {
1678
+ const result = await engine.scrape({
1679
+ ...meta,
1680
+ abortSignal: controller.signal
1681
+ });
1682
+ clearTimeout(timeoutId);
1683
+ log(`[orchestrator] \u2713 ${engineName} succeeded in ${result.duration}ms`);
1684
+ return {
1685
+ ...result,
1686
+ attemptedEngines,
1687
+ engineErrors
1688
+ };
1689
+ } finally {
1690
+ clearTimeout(timeoutId);
1691
+ }
1692
+ } catch (error) {
1693
+ const err = error instanceof Error ? error : new Error(String(error));
1694
+ engineErrors.set(engineName, err);
1695
+ if (error instanceof ChallengeDetectedError) {
1696
+ log(`[orchestrator] ${engineName} detected challenge: ${error.challengeType}`);
1697
+ } else if (error instanceof InsufficientContentError) {
1698
+ log(`[orchestrator] ${engineName} insufficient content: ${error.contentLength} chars`);
1699
+ } else if (error instanceof HttpError) {
1700
+ log(`[orchestrator] ${engineName} HTTP error: ${error.statusCode}`);
1701
+ } else if (error instanceof EngineTimeoutError) {
1702
+ log(`[orchestrator] ${engineName} timed out after ${error.timeoutMs}ms`);
1703
+ } else if (error instanceof EngineUnavailableError) {
1704
+ log(`[orchestrator] ${engineName} unavailable: ${err.message}`);
1705
+ } else {
1706
+ log(`[orchestrator] ${engineName} failed: ${err.message}`);
1707
+ }
1708
+ if (!this.shouldRetry(error)) {
1709
+ log(`[orchestrator] Non-retryable error, stopping cascade`);
1710
+ break;
1711
+ }
1712
+ log(`[orchestrator] Falling back to next engine...`);
1713
+ }
1714
+ }
1715
+ log(`[orchestrator] All engines failed for ${meta.url}`);
1716
+ throw new AllEnginesFailedError(attemptedEngines, engineErrors);
1717
+ }
1718
+ /**
1719
+ * Determine if we should retry with next engine
1720
+ */
1721
+ shouldRetry(error) {
1722
+ if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError) {
1723
+ return true;
1724
+ }
1725
+ if (error instanceof HttpError) {
1726
+ return error.statusCode === 403 || error.statusCode === 429 || error.statusCode >= 500;
1727
+ }
1728
+ if (error instanceof EngineUnavailableError) {
1729
+ return true;
1730
+ }
1731
+ if (error instanceof EngineError) {
1732
+ return error.retryable;
1733
+ }
1734
+ return true;
1735
+ }
1736
+ };
1737
+
1738
+ // src/scraper.ts
1739
+ var Scraper = class {
1740
+ options;
1741
+ logger = createLogger("scraper");
1742
+ robotsCache = /* @__PURE__ */ new Map();
1743
+ constructor(options) {
1744
+ this.options = {
1745
+ ...DEFAULT_OPTIONS,
1746
+ ...options
1747
+ };
1748
+ }
1749
+ /**
1750
+ * Get robots.txt rules for a URL, cached per domain
1751
+ */
1752
+ async getRobotsRules(url) {
1753
+ const origin = new URL(url).origin;
1754
+ if (!this.robotsCache.has(origin)) {
1755
+ const rules = await fetchRobotsTxt(origin);
1756
+ this.robotsCache.set(origin, rules);
1757
+ }
1758
+ return this.robotsCache.get(origin) ?? null;
1759
+ }
1760
+ /**
1761
+ * Scrape all URLs
1762
+ *
1763
+ * @returns Scrape result with pages and metadata
1764
+ */
1765
+ async scrape() {
1766
+ const startTime = Date.now();
1767
+ const results = await this.scrapeWithConcurrency();
1768
+ return this.buildScrapeResult(results, startTime);
1769
+ }
1770
+ /**
1771
+ * Scrape URLs with concurrency control
1772
+ */
1773
+ async scrapeWithConcurrency() {
1774
+ const limit = pLimit(this.options.batchConcurrency || 1);
1775
+ const tasks = this.options.urls.map(
1776
+ (url, index) => limit(() => this.scrapeSingleUrlWithRetry(url, index))
1777
+ );
1778
+ const batchPromise = Promise.all(tasks);
1779
+ if (this.options.batchTimeoutMs && this.options.batchTimeoutMs > 0) {
1780
+ const timeoutPromise = new Promise((_, reject) => {
1781
+ setTimeout(() => {
1782
+ reject(new Error(`Batch operation timed out after ${this.options.batchTimeoutMs}ms`));
1783
+ }, this.options.batchTimeoutMs);
1784
+ });
1785
+ return Promise.race([batchPromise, timeoutPromise]);
1786
+ }
1787
+ return batchPromise;
1788
+ }
1789
+ /**
1790
+ * Scrape a single URL with retry logic
1791
+ */
1792
+ async scrapeSingleUrlWithRetry(url, index) {
1793
+ const maxRetries = this.options.maxRetries || 2;
1794
+ let lastError;
1795
+ for (let attempt = 0; attempt <= maxRetries; attempt++) {
1796
+ try {
1797
+ const result = await this.scrapeSingleUrl(url, index);
1798
+ if (result) {
1799
+ return { result };
1800
+ }
1801
+ lastError = `Failed to scrape ${url}: No content returned`;
1802
+ } catch (error) {
1803
+ lastError = error.message;
1804
+ if (attempt < maxRetries) {
1805
+ const delay = Math.pow(2, attempt) * 1e3;
1806
+ this.logger.warn(`Retry ${attempt + 1}/${maxRetries} for ${url} in ${delay}ms`);
1807
+ await new Promise((resolve) => setTimeout(resolve, delay));
1808
+ }
1809
+ }
1810
+ }
1811
+ this.logger.error(`Failed to scrape ${url} after ${maxRetries + 1} attempts: ${lastError}`);
1812
+ return { result: null, error: lastError };
1813
+ }
1814
+ /**
1815
+ * Scrape a single URL using the engine orchestrator
1185
1816
  */
1186
1817
  async scrapeSingleUrl(url, index) {
1187
1818
  const startTime = Date.now();
@@ -1190,98 +1821,84 @@ var Scraper = class {
1190
1821
  throw new Error(`URL blocked by robots.txt: ${url}`);
1191
1822
  }
1192
1823
  try {
1193
- return await this.pool.withBrowser(async (hero) => {
1194
- await hero.goto(url, { timeoutMs: this.options.timeoutMs });
1195
- try {
1196
- await hero.waitForLoad("DomContentLoaded", { timeoutMs: this.options.timeoutMs });
1197
- } catch {
1198
- }
1199
- await hero.waitForPaintingStable();
1200
- const initialUrl = await hero.url;
1201
- const detection = await detectChallenge(hero);
1202
- if (detection.isChallenge) {
1203
- if (this.options.verbose) {
1204
- this.logger.info(`Challenge detected on ${url}: ${detection.type}`);
1205
- }
1206
- const result2 = await waitForChallengeResolution(hero, {
1207
- maxWaitMs: 45e3,
1208
- pollIntervalMs: 500,
1209
- verbose: this.options.verbose,
1210
- initialUrl
1211
- });
1212
- if (!result2.resolved) {
1213
- throw new Error(`Challenge not resolved: ${detection.type}`);
1214
- }
1215
- if (this.options.verbose) {
1216
- this.logger.info(`Challenge resolved via ${result2.method} in ${result2.waitedMs}ms`);
1217
- }
1218
- }
1219
- await this.waitForFinalPage(hero, url, this.options.verbose);
1220
- if (this.options.waitForSelector) {
1221
- try {
1222
- await hero.waitForElement(hero.document.querySelector(this.options.waitForSelector), {
1223
- timeoutMs: this.options.timeoutMs
1224
- });
1225
- } catch (error) {
1226
- this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
1227
- }
1228
- }
1229
- const html = await hero.document.documentElement.outerHTML;
1230
- const cleanedHtml = cleanContent(html, url, {
1231
- removeAds: this.options.removeAds,
1232
- removeBase64Images: this.options.removeBase64Images,
1233
- onlyMainContent: this.options.onlyMainContent,
1234
- includeTags: this.options.includeTags,
1235
- excludeTags: this.options.excludeTags
1824
+ const orchestrator = new EngineOrchestrator({
1825
+ engines: this.options.engines,
1826
+ skipEngines: this.options.skipEngines,
1827
+ forceEngine: this.options.forceEngine,
1828
+ logger: this.logger,
1829
+ verbose: this.options.verbose
1830
+ });
1831
+ const engineResult = await orchestrator.scrape({
1832
+ url,
1833
+ options: this.options,
1834
+ logger: this.logger
1835
+ });
1836
+ if (this.options.verbose) {
1837
+ this.logger.info(
1838
+ `[scraper] ${url} scraped with ${engineResult.engine} engine in ${engineResult.duration}ms (attempted: ${engineResult.attemptedEngines.join(" \u2192 ")})`
1839
+ );
1840
+ }
1841
+ const cleanedHtml = cleanContent(engineResult.html, engineResult.url, {
1842
+ removeAds: this.options.removeAds,
1843
+ removeBase64Images: this.options.removeBase64Images,
1844
+ onlyMainContent: this.options.onlyMainContent,
1845
+ includeTags: this.options.includeTags,
1846
+ excludeTags: this.options.excludeTags
1847
+ });
1848
+ const websiteMetadata = extractMetadata(cleanedHtml, engineResult.url);
1849
+ const duration = Date.now() - startTime;
1850
+ const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
1851
+ const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
1852
+ if (this.options.onProgress) {
1853
+ this.options.onProgress({
1854
+ completed: index + 1,
1855
+ total: this.options.urls.length,
1856
+ currentUrl: url
1236
1857
  });
1237
- const websiteMetadata = extractMetadata(cleanedHtml, url);
1238
- const duration = Date.now() - startTime;
1239
- const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
1240
- const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
1241
- if (this.options.onProgress) {
1242
- this.options.onProgress({
1243
- completed: index + 1,
1244
- total: this.options.urls.length,
1245
- currentUrl: url
1246
- });
1247
- }
1248
- let proxyMetadata;
1249
- if (this.options.proxy) {
1250
- const proxy = this.options.proxy;
1251
- if (proxy.url) {
1252
- try {
1253
- const proxyUrl = new URL(proxy.url);
1254
- proxyMetadata = {
1255
- host: proxyUrl.hostname,
1256
- port: parseInt(proxyUrl.port, 10) || 80,
1257
- country: proxy.country
1258
- };
1259
- } catch {
1260
- }
1261
- } else if (proxy.host && proxy.port) {
1858
+ }
1859
+ let proxyMetadata;
1860
+ if (this.options.proxy) {
1861
+ const proxy = this.options.proxy;
1862
+ if (proxy.url) {
1863
+ try {
1864
+ const proxyUrl = new URL(proxy.url);
1262
1865
  proxyMetadata = {
1263
- host: proxy.host,
1264
- port: proxy.port,
1866
+ host: proxyUrl.hostname,
1867
+ port: parseInt(proxyUrl.port, 10) || 80,
1265
1868
  country: proxy.country
1266
1869
  };
1870
+ } catch {
1267
1871
  }
1872
+ } else if (proxy.host && proxy.port) {
1873
+ proxyMetadata = {
1874
+ host: proxy.host,
1875
+ port: proxy.port,
1876
+ country: proxy.country
1877
+ };
1268
1878
  }
1269
- const result = {
1270
- markdown,
1271
- html: htmlOutput,
1272
- metadata: {
1273
- baseUrl: url,
1274
- totalPages: 1,
1275
- scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
1276
- duration,
1277
- website: websiteMetadata,
1278
- proxy: proxyMetadata
1279
- }
1280
- };
1281
- return result;
1282
- });
1879
+ }
1880
+ const result = {
1881
+ markdown,
1882
+ html: htmlOutput,
1883
+ metadata: {
1884
+ baseUrl: url,
1885
+ totalPages: 1,
1886
+ scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
1887
+ duration,
1888
+ website: websiteMetadata,
1889
+ proxy: proxyMetadata
1890
+ }
1891
+ };
1892
+ return result;
1283
1893
  } catch (error) {
1284
- this.logger.error(`Failed to scrape ${url}: ${error.message}`);
1894
+ if (error instanceof AllEnginesFailedError) {
1895
+ const engineSummary = error.attemptedEngines.map((e) => `${e}: ${error.errors.get(e)?.message || "unknown"}`).join("; ");
1896
+ this.logger.error(`Failed to scrape ${url}: All engines failed - ${engineSummary}`);
1897
+ } else if (error instanceof Error) {
1898
+ this.logger.error(`Failed to scrape ${url}: ${error.message}`);
1899
+ } else {
1900
+ this.logger.error(`Failed to scrape ${url}: ${String(error)}`);
1901
+ }
1285
1902
  if (this.options.onProgress) {
1286
1903
  this.options.onProgress({
1287
1904
  completed: index + 1,
@@ -2604,7 +3221,7 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
2604
3221
  "-f, --format <formats>",
2605
3222
  "Content formats to include (comma-separated: markdown,html)",
2606
3223
  "markdown"
2607
- ).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").option("--no-main-content", "Disable main content extraction (include full page)").option("--include-tags <selectors>", "CSS selectors for elements to include (comma-separated)").option("--exclude-tags <selectors>", "CSS selectors for elements to exclude (comma-separated)").action(async (urls, options) => {
3224
+ ).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").option("--no-main-content", "Disable main content extraction (include full page)").option("--include-tags <selectors>", "CSS selectors for elements to include (comma-separated)").option("--exclude-tags <selectors>", "CSS selectors for elements to exclude (comma-separated)").option("--engine <name>", "Force a specific engine (http, tlsclient, hero)").option("--skip-engine <names>", "Skip specific engines (comma-separated: http,tlsclient,hero)").action(async (urls, options) => {
2608
3225
  const port = parseInt(options.port, 10);
2609
3226
  const useStandalone = options.standalone || false;
2610
3227
  let useDaemon = false;
@@ -2636,6 +3253,7 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
2636
3253
  }
2637
3254
  const includeTags = options.includeTags ? options.includeTags.split(",").map((s) => s.trim()) : void 0;
2638
3255
  const excludeTags = options.excludeTags ? options.excludeTags.split(",").map((s) => s.trim()) : void 0;
3256
+ const skipEngines = options.skipEngine ? options.skipEngine.split(",").map((s) => s.trim()) : void 0;
2639
3257
  const scrapeOptions = {
2640
3258
  urls,
2641
3259
  formats,
@@ -2651,6 +3269,9 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
2651
3269
  // --no-main-content sets this to false
2652
3270
  includeTags,
2653
3271
  excludeTags,
3272
+ // Engine options
3273
+ forceEngine: options.engine,
3274
+ skipEngines,
2654
3275
  onProgress: options.verbose ? ({ completed, total, currentUrl }) => {
2655
3276
  console.error(`[${completed}/${total}] ${currentUrl}`);
2656
3277
  } : void 0