@vakra-dev/reader 0.0.3 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +30 -0
- package/dist/cli/index.js +938 -323
- package/dist/cli/index.js.map +1 -1
- package/dist/index.d.ts +30 -9
- package/dist/index.js +975 -364
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/cli/index.js
CHANGED
|
@@ -17,178 +17,16 @@ import { ConnectionToHeroCore } from "@ulixee/hero";
|
|
|
17
17
|
// src/scraper.ts
|
|
18
18
|
import pLimit from "p-limit";
|
|
19
19
|
|
|
20
|
-
// src/cloudflare/detector.ts
|
|
21
|
-
var CLOUDFLARE_CHALLENGE_SELECTORS = [
|
|
22
|
-
"#challenge-running",
|
|
23
|
-
"#challenge-stage",
|
|
24
|
-
"#challenge-form",
|
|
25
|
-
".cf-browser-verification",
|
|
26
|
-
"#cf-wrapper",
|
|
27
|
-
"#cf-hcaptcha-container",
|
|
28
|
-
"#turnstile-wrapper"
|
|
29
|
-
];
|
|
30
|
-
var CLOUDFLARE_TEXT_PATTERNS = [
|
|
31
|
-
"checking if the site connection is secure",
|
|
32
|
-
"this process is automatic. your browser will redirect",
|
|
33
|
-
"ray id:",
|
|
34
|
-
"performance & security by cloudflare"
|
|
35
|
-
];
|
|
36
|
-
var CLOUDFLARE_INFRA_PATTERNS = [
|
|
37
|
-
"/cdn-cgi/",
|
|
38
|
-
"cloudflare",
|
|
39
|
-
"__cf_bm",
|
|
40
|
-
"cf-ray"
|
|
41
|
-
];
|
|
42
|
-
var CLOUDFLARE_BLOCKED_PATTERNS = [
|
|
43
|
-
"sorry, you have been blocked",
|
|
44
|
-
"ray id:"
|
|
45
|
-
];
|
|
46
|
-
async function detectChallenge(hero) {
|
|
47
|
-
const signals = [];
|
|
48
|
-
let type = "none";
|
|
49
|
-
let hasCloudflareInfra = false;
|
|
50
|
-
let hasChallengeIndicator = false;
|
|
51
|
-
try {
|
|
52
|
-
if (!hero.document) {
|
|
53
|
-
return {
|
|
54
|
-
isChallenge: false,
|
|
55
|
-
type: "none",
|
|
56
|
-
confidence: 0,
|
|
57
|
-
signals: ["No document available"]
|
|
58
|
-
};
|
|
59
|
-
}
|
|
60
|
-
const html = await hero.document.documentElement.outerHTML;
|
|
61
|
-
const htmlLower = html.toLowerCase();
|
|
62
|
-
for (const pattern of CLOUDFLARE_INFRA_PATTERNS) {
|
|
63
|
-
if (htmlLower.includes(pattern)) {
|
|
64
|
-
hasCloudflareInfra = true;
|
|
65
|
-
signals.push(`Cloudflare infra: "${pattern}"`);
|
|
66
|
-
break;
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
if (!hasCloudflareInfra) {
|
|
70
|
-
return {
|
|
71
|
-
isChallenge: false,
|
|
72
|
-
type: "none",
|
|
73
|
-
confidence: 0,
|
|
74
|
-
signals: ["No Cloudflare infrastructure detected"]
|
|
75
|
-
};
|
|
76
|
-
}
|
|
77
|
-
for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
|
|
78
|
-
try {
|
|
79
|
-
const element = await hero.document.querySelector(selector);
|
|
80
|
-
if (element) {
|
|
81
|
-
hasChallengeIndicator = true;
|
|
82
|
-
signals.push(`Challenge element: ${selector}`);
|
|
83
|
-
type = "js_challenge";
|
|
84
|
-
}
|
|
85
|
-
} catch {
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
|
|
89
|
-
if (htmlLower.includes(pattern)) {
|
|
90
|
-
hasChallengeIndicator = true;
|
|
91
|
-
signals.push(`Challenge text: "${pattern}"`);
|
|
92
|
-
type = type === "none" ? "js_challenge" : type;
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
|
|
96
|
-
hasChallengeIndicator = true;
|
|
97
|
-
signals.push('Challenge text: "waiting for...to respond"');
|
|
98
|
-
type = type === "none" ? "js_challenge" : type;
|
|
99
|
-
}
|
|
100
|
-
const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
|
|
101
|
-
if (hasBlocked) {
|
|
102
|
-
hasChallengeIndicator = true;
|
|
103
|
-
signals.push("Cloudflare block page detected");
|
|
104
|
-
type = "blocked";
|
|
105
|
-
}
|
|
106
|
-
const isChallenge = hasCloudflareInfra && hasChallengeIndicator;
|
|
107
|
-
const confidence = isChallenge ? 100 : 0;
|
|
108
|
-
return {
|
|
109
|
-
isChallenge,
|
|
110
|
-
type: isChallenge ? type : "none",
|
|
111
|
-
confidence,
|
|
112
|
-
signals
|
|
113
|
-
};
|
|
114
|
-
} catch (error) {
|
|
115
|
-
return {
|
|
116
|
-
isChallenge: false,
|
|
117
|
-
type: "none",
|
|
118
|
-
confidence: 0,
|
|
119
|
-
signals: [`Error during detection: ${error.message}`]
|
|
120
|
-
};
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
// src/cloudflare/handler.ts
|
|
125
|
-
async function waitForChallengeResolution(hero, options) {
|
|
126
|
-
const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
|
|
127
|
-
const startTime = Date.now();
|
|
128
|
-
const log = (msg) => verbose && console.log(` ${msg}`);
|
|
129
|
-
while (Date.now() - startTime < maxWaitMs) {
|
|
130
|
-
const elapsed = Date.now() - startTime;
|
|
131
|
-
try {
|
|
132
|
-
const currentUrl = await hero.url;
|
|
133
|
-
if (currentUrl !== initialUrl) {
|
|
134
|
-
log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
|
|
135
|
-
log(` Waiting for new page to load...`);
|
|
136
|
-
try {
|
|
137
|
-
await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
|
|
138
|
-
log(` DOMContentLoaded`);
|
|
139
|
-
} catch {
|
|
140
|
-
log(` DOMContentLoaded timeout, continuing...`);
|
|
141
|
-
}
|
|
142
|
-
await hero.waitForPaintingStable().catch(() => {
|
|
143
|
-
});
|
|
144
|
-
log(` Page stabilized`);
|
|
145
|
-
return { resolved: true, method: "url_redirect", waitedMs: elapsed };
|
|
146
|
-
}
|
|
147
|
-
} catch {
|
|
148
|
-
}
|
|
149
|
-
const detection = await detectChallenge(hero);
|
|
150
|
-
if (!detection.isChallenge) {
|
|
151
|
-
log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
|
|
152
|
-
log(` Waiting for page to load...`);
|
|
153
|
-
try {
|
|
154
|
-
await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
|
|
155
|
-
log(` DOMContentLoaded`);
|
|
156
|
-
} catch {
|
|
157
|
-
log(` DOMContentLoaded timeout, continuing...`);
|
|
158
|
-
}
|
|
159
|
-
await hero.waitForPaintingStable().catch(() => {
|
|
160
|
-
});
|
|
161
|
-
log(` Page stabilized`);
|
|
162
|
-
return { resolved: true, method: "signals_cleared", waitedMs: elapsed };
|
|
163
|
-
}
|
|
164
|
-
log(
|
|
165
|
-
`\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
|
|
166
|
-
);
|
|
167
|
-
await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
|
|
168
|
-
}
|
|
169
|
-
return {
|
|
170
|
-
resolved: false,
|
|
171
|
-
method: "timeout",
|
|
172
|
-
waitedMs: Date.now() - startTime
|
|
173
|
-
};
|
|
174
|
-
}
|
|
175
|
-
|
|
176
20
|
// src/formatters/markdown.ts
|
|
177
|
-
import
|
|
178
|
-
var turndownService = new TurndownService({
|
|
179
|
-
headingStyle: "atx",
|
|
180
|
-
hr: "---",
|
|
181
|
-
bulletListMarker: "-",
|
|
182
|
-
codeBlockStyle: "fenced",
|
|
183
|
-
fence: "```",
|
|
184
|
-
emDelimiter: "*",
|
|
185
|
-
strongDelimiter: "**",
|
|
186
|
-
linkStyle: "inlined",
|
|
187
|
-
linkReferenceStyle: "full"
|
|
188
|
-
});
|
|
21
|
+
import { convert } from "@vakra-dev/supermarkdown";
|
|
189
22
|
function htmlToMarkdown(html) {
|
|
190
23
|
try {
|
|
191
|
-
return
|
|
24
|
+
return convert(html, {
|
|
25
|
+
headingStyle: "atx",
|
|
26
|
+
bulletMarker: "-",
|
|
27
|
+
codeFence: "`",
|
|
28
|
+
linkStyle: "inline"
|
|
29
|
+
});
|
|
192
30
|
} catch (error) {
|
|
193
31
|
console.warn("Error converting HTML to Markdown:", error);
|
|
194
32
|
return html.replace(/<[^>]*>/g, "").trim();
|
|
@@ -1054,96 +892,667 @@ var DEFAULT_OPTIONS = {
|
|
|
1054
892
|
showChrome: false
|
|
1055
893
|
};
|
|
1056
894
|
|
|
1057
|
-
// src/
|
|
1058
|
-
var
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
895
|
+
// src/engines/types.ts
|
|
896
|
+
/**
 * Static description of every scraping engine, keyed by engine name.
 *
 * Each entry carries:
 *  - `name`       – the engine identifier (same as its key),
 *  - `timeout`    – a per-engine time budget in ms (NOTE(review): not read by
 *                   the engine classes themselves; presumably consumed by an
 *                   orchestrator — confirm),
 *  - `maxTimeout` – upper time limit in ms that the engine classes read from
 *                   `this.config.maxTimeout`,
 *  - `quality`    – a numeric score (NOTE(review): semantics are not visible
 *                   here — confirm before relying on its ordering),
 *  - `features`   – boolean capability flags.
 */
var ENGINE_CONFIGS = (() => {
  // Canonical list of capability flags; every engine gets all of them,
  // set to true only for the ones named in `enabled`.
  const FEATURE_NAMES = ["javascript", "cloudflare", "tlsFingerprint", "waitFor", "screenshots"];
  const describeEngine = (name, timeout, maxTimeout, quality, enabled) => ({
    name,
    timeout,
    maxTimeout,
    quality,
    // A fresh object per engine so entries never share a `features` reference.
    features: Object.fromEntries(FEATURE_NAMES.map((flag) => [flag, enabled.includes(flag)]))
  });
  return {
    http: describeEngine("http", 3000, 10000, 100, []),
    tlsclient: describeEngine("tlsclient", 5000, 15000, 80, ["tlsFingerprint"]),
    hero: describeEngine("hero", 30000, 60000, 50, FEATURE_NAMES)
  };
})();
// Engine names in the default order.
var DEFAULT_ENGINE_ORDER = ["http", "tlsclient", "hero"];
|
|
938
|
+
|
|
939
|
+
// src/engines/errors.ts
|
|
940
|
+
/**
 * Base class for every failure raised by a scraping engine.
 *
 * The message is prefixed with the engine name (`[engine] message`).
 * `retryable` defaults to true unless `options.retryable` is given.
 *
 * @param {string} engine - Name of the engine that failed.
 * @param {string} message - Human-readable description of the failure.
 * @param {{ retryable?: boolean, cause?: unknown }} [options]
 */
var EngineError = class extends Error {
  engine;
  retryable;
  constructor(engine, message, options) {
    const cause = options?.cause;
    // Hand the cause to the native Error constructor so that `cause` only
    // exists when one was supplied (and is non-enumerable, like any native
    // error cause) instead of every instance carrying `cause: undefined`.
    super(`[${engine}] ${message}`, cause === undefined ? undefined : { cause });
    this.name = "EngineError";
    this.engine = engine;
    this.retryable = options?.retryable ?? true;
    // V8-only: trim the constructor frames from the captured stack.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, this.constructor);
    }
  }
};
/**
 * An engine received a challenge / bot-protection page instead of content.
 * Always retryable. `challengeType` falls back to "unknown".
 */
var ChallengeDetectedError = class extends EngineError {
  challengeType;
  constructor(engine, challengeType) {
    super(engine, `Challenge detected: ${challengeType || "unknown"}`, { retryable: true });
    this.name = "ChallengeDetectedError";
    this.challengeType = challengeType || "unknown";
  }
};
/**
 * The response contained fewer text characters than `threshold`
 * (default 100). Always retryable.
 */
var InsufficientContentError = class extends EngineError {
  contentLength;
  threshold;
  constructor(engine, contentLength, threshold = 100) {
    super(engine, `Insufficient content: ${contentLength} chars (threshold: ${threshold})`, { retryable: true });
    this.name = "InsufficientContentError";
    this.contentLength = contentLength;
    this.threshold = threshold;
  }
};
/**
 * The server answered with an error status code.
 * Retryable only for 5xx responses and 429.
 */
var HttpError = class extends EngineError {
  statusCode;
  constructor(engine, statusCode, statusText) {
    const retryable = statusCode >= 500 || statusCode === 429;
    super(engine, `HTTP ${statusCode}${statusText ? `: ${statusText}` : ""}`, { retryable });
    this.name = "HttpError";
    this.statusCode = statusCode;
  }
};
/** The engine did not finish within `timeoutMs` milliseconds. Always retryable. */
var EngineTimeoutError = class extends EngineError {
  timeoutMs;
  constructor(engine, timeoutMs) {
    super(engine, `Timeout after ${timeoutMs}ms`, { retryable: true });
    this.name = "EngineTimeoutError";
    this.timeoutMs = timeoutMs;
  }
};
/** The engine cannot be used at all. Never retryable. */
var EngineUnavailableError = class extends EngineError {
  constructor(engine, reason) {
    super(engine, reason || "Engine not available", { retryable: false });
    this.name = "EngineUnavailableError";
  }
};
/**
 * Aggregate error describing a run in which every attempted engine failed.
 *
 * @param {string[]} attemptedEngines - Engine names, in the order to report them.
 * @param {{ get(name: string): (Error | undefined) }} errors - Lookup from
 *   engine name to its error (only `.get()` is used here); engines without an
 *   entry are reported as "unknown".
 */
var AllEnginesFailedError = class extends Error {
  attemptedEngines;
  errors;
  constructor(attemptedEngines, errors) {
    const summary = attemptedEngines.map((e) => `${e}: ${errors.get(e)?.message || "unknown"}`).join("; ");
    super(`All engines failed: ${summary}`);
    this.name = "AllEnginesFailedError";
    this.attemptedEngines = attemptedEngines;
    this.errors = errors;
  }
};
|
|
1006
|
+
|
|
1007
|
+
// src/engines/http/index.ts
|
|
1008
|
+
// Browser-like request headers sent with every plain HTTP fetch.
// Caller-supplied `options.headers` are merged on top of these.
var DEFAULT_HEADERS = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
  "Accept-Language": "en-US,en;q=0.9",
  "Accept-Encoding": "gzip, deflate, br",
  "Cache-Control": "no-cache",
  Pragma: "no-cache",
  "Sec-Fetch-Dest": "document",
  "Sec-Fetch-Mode": "navigate",
  "Sec-Fetch-Site": "none",
  "Sec-Fetch-User": "?1",
  "Upgrade-Insecure-Requests": "1"
};
// Substrings (matched case-insensitively against the whole response body)
// that mark the response as a challenge / bot-protection page.
var CHALLENGE_PATTERNS = [
  // Cloudflare
  "cf-browser-verification",
  "cf_chl_opt",
  "challenge-platform",
  "cf-spinner",
  "Just a moment",
  "Checking your browser",
  "checking if the site connection is secure",
  "Enable JavaScript and cookies",
  "Attention Required",
  "_cf_chl_tk",
  "Verifying you are human",
  "cf-turnstile",
  "/cdn-cgi/challenge-platform/",
  // Generic bot detection
  "Please Wait...",
  "DDoS protection by",
  "Access denied",
  "bot detection",
  "are you a robot",
  "complete the security check"
];
// Substrings indicating that the page is served through Cloudflare.
var CLOUDFLARE_INFRA_PATTERNS = ["/cdn-cgi/", "cloudflare", "__cf_bm", "cf-ray"];
// Minimum number of extracted text characters for a response to count as content.
var MIN_CONTENT_LENGTH = 100;
/**
 * Engine that fetches a page with the global `fetch` — no JavaScript
 * execution and no TLS fingerprinting.
 */
var HttpEngine = class {
  config = ENGINE_CONFIGS.http;
  /**
   * Fetch `meta.url` and return its HTML when it looks like real content.
   *
   * @param {{ url: string, options: object, logger?: object, abortSignal?: AbortSignal }} meta
   * @returns {Promise<object>} `{ html, url, statusCode, contentType, headers, engine, duration }`
   * @throws {HttpError} status code >= 400
   * @throws {ChallengeDetectedError} body matches a challenge pattern
   * @throws {InsufficientContentError} fewer than MIN_CONTENT_LENGTH text chars
   * @throws {EngineTimeoutError} the request was aborted (timeout or caller abort)
   * @throws {EngineError} any other failure
   */
  async scrape(meta) {
    const startTime = Date.now();
    const { url, options, logger: logger4, abortSignal } = meta;
    const controller = new AbortController();
    const abortRequest = () => controller.abort();
    const timeoutId = setTimeout(abortRequest, this.config.maxTimeout);
    if (abortSignal?.aborted) {
      // An "abort" listener added after the fact never fires, so honour an
      // already-aborted signal explicitly.
      abortRequest();
    } else {
      abortSignal?.addEventListener("abort", abortRequest, { once: true });
    }
    try {
      logger4?.debug(`[http] Fetching ${url}`);
      const response = await fetch(url, {
        method: "GET",
        headers: {
          ...DEFAULT_HEADERS,
          ...options.headers || {}
        },
        redirect: "follow",
        signal: controller.signal
      });
      const duration = Date.now() - startTime;
      // The timer stays armed while the body is read, so a stalled body
      // download is bounded by maxTimeout as well.
      const html = await response.text();
      logger4?.debug(`[http] Got response: ${response.status} (${html.length} chars) in ${duration}ms`);
      if (response.status >= 400) {
        throw new HttpError("http", response.status, response.statusText);
      }
      const challengeType = this.detectChallenge(html);
      if (challengeType) {
        logger4?.debug(`[http] Challenge detected: ${challengeType}`);
        throw new ChallengeDetectedError("http", challengeType);
      }
      const textContent = this.extractText(html);
      if (textContent.length < MIN_CONTENT_LENGTH) {
        logger4?.debug(`[http] Insufficient content: ${textContent.length} chars`);
        throw new InsufficientContentError("http", textContent.length, MIN_CONTENT_LENGTH);
      }
      return {
        html,
        url: response.url,
        statusCode: response.status,
        contentType: response.headers.get("content-type") || void 0,
        headers: this.headersToRecord(response.headers),
        engine: "http",
        duration
      };
    } catch (error) {
      // Engine errors raised above already carry the right classification.
      if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError) {
        throw error;
      }
      if (error instanceof Error) {
        if (error.name === "AbortError") {
          throw new EngineTimeoutError("http", this.config.maxTimeout);
        }
        throw new EngineError("http", error.message, { cause: error });
      }
      throw new EngineError("http", String(error));
    } finally {
      // Release the timer and the listener on every exit path so a failed
      // request neither keeps the event loop alive nor leaks a listener on
      // a long-lived caller signal.
      clearTimeout(timeoutId);
      abortSignal?.removeEventListener("abort", abortRequest);
    }
  }
  /**
   * Detect challenge patterns in HTML
   * @returns Challenge type ("cloudflare" or "bot-detection") or null if no challenge detected
   */
  detectChallenge(html) {
    const htmlLower = html.toLowerCase();
    const matched = CHALLENGE_PATTERNS.find((pattern) => htmlLower.includes(pattern.toLowerCase()));
    if (matched === void 0) {
      return null;
    }
    const hasCloudflare = CLOUDFLARE_INFRA_PATTERNS.some((p) => htmlLower.includes(p.toLowerCase()));
    // Only the first matching pattern decides the label.
    return hasCloudflare || matched.includes("cf") ? "cloudflare" : "bot-detection";
  }
  /**
   * Convert Headers to Record<string, string>
   */
  headersToRecord(headers) {
    const record = {};
    headers.forEach((value, key) => {
      record[key] = value;
    });
    return record;
  }
  /**
   * Extract visible text from HTML (rough extraction): drops <script> and
   * <style> elements, replaces remaining tags with spaces and collapses
   * whitespace.
   */
  extractText(html) {
    return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
  }
  /** This engine has no optional dependency, so it always reports available. */
  isAvailable() {
    return true;
  }
};
|
|
1143
|
+
var httpEngine = new HttpEngine();
|
|
1144
|
+
|
|
1145
|
+
// src/engines/tlsclient/index.ts
|
|
1146
|
+
import { gotScraping } from "got-scraping";
|
|
1147
|
+
// Substrings (matched case-insensitively against the whole response body)
// that mean the page needs JavaScript execution to produce content.
// NOTE(review): "noscript" matches any page containing a <noscript> tag,
// which looks very broad — confirm this is intended.
var JS_REQUIRED_PATTERNS = [
  // Cloudflare JS challenge
  "cf-browser-verification",
  "challenge-platform",
  "_cf_chl_tk",
  "/cdn-cgi/challenge-platform/",
  // Generic JS requirements
  "Enable JavaScript",
  "JavaScript is required",
  "Please enable JavaScript",
  "requires JavaScript",
  "noscript"
];
// Substrings (matched case-insensitively) that mark a blocked/denied response.
var BLOCKED_PATTERNS = [
  "Access denied",
  "Sorry, you have been blocked",
  "bot detected",
  "suspicious activity",
  "too many requests"
];
// Minimum number of extracted text characters for a response to count as content.
var MIN_CONTENT_LENGTH2 = 100;
/**
 * Engine that fetches a page through `gotScraping` — no JavaScript execution.
 */
var TlsClientEngine = class {
  config = ENGINE_CONFIGS.tlsclient;
  available = true;
  constructor() {
    try {
      if (!gotScraping) {
        this.available = false;
      }
    } catch {
      this.available = false;
    }
  }
  /**
   * Fetch `meta.url` and return its HTML when it looks like real content.
   *
   * @param {{ url: string, options: object, logger?: object, abortSignal?: AbortSignal }} meta
   * @returns {Promise<object>} `{ html, url, statusCode, contentType, headers, engine, duration }`
   * @throws {EngineUnavailableError} got-scraping is not available
   * @throws {HttpError} status code >= 400
   * @throws {ChallengeDetectedError} body needs JavaScript or is a block page
   * @throws {InsufficientContentError} fewer than MIN_CONTENT_LENGTH2 text chars
   * @throws {EngineTimeoutError} the request timed out or was aborted
   * @throws {EngineError} any other failure
   */
  async scrape(meta) {
    if (!this.available) {
      throw new EngineUnavailableError("tlsclient", "got-scraping not available");
    }
    const startTime = Date.now();
    const { url, options, logger: logger4, abortSignal } = meta;
    const controller = new AbortController();
    const abortRequest = () => controller.abort();
    const timeoutId = setTimeout(abortRequest, this.config.maxTimeout);
    if (abortSignal?.aborted) {
      // An "abort" listener added after the fact never fires, so honour an
      // already-aborted signal explicitly.
      abortRequest();
    } else {
      abortSignal?.addEventListener("abort", abortRequest, { once: true });
    }
    try {
      logger4?.debug(`[tlsclient] Fetching ${url}`);
      const response = await gotScraping({
        url,
        timeout: {
          request: this.config.maxTimeout
        },
        headers: options.headers,
        followRedirect: true,
        // Without this the controller above is inert and a caller abort
        // would never cancel the in-flight request.
        signal: controller.signal
        // got-scraping handles browser fingerprinting automatically
        // It uses header generators and proper TLS settings
      });
      const duration = Date.now() - startTime;
      const html = response.body;
      logger4?.debug(`[tlsclient] Got response: ${response.statusCode} (${html.length} chars) in ${duration}ms`);
      if (response.statusCode >= 400) {
        throw new HttpError("tlsclient", response.statusCode, response.statusMessage);
      }
      const challengeType = this.detectJsRequired(html);
      if (challengeType) {
        logger4?.debug(`[tlsclient] JS required: ${challengeType}`);
        throw new ChallengeDetectedError("tlsclient", challengeType);
      }
      const blockedReason = this.detectBlocked(html);
      if (blockedReason) {
        logger4?.debug(`[tlsclient] Blocked: ${blockedReason}`);
        throw new ChallengeDetectedError("tlsclient", `blocked: ${blockedReason}`);
      }
      const textContent = this.extractText(html);
      if (textContent.length < MIN_CONTENT_LENGTH2) {
        logger4?.debug(`[tlsclient] Insufficient content: ${textContent.length} chars`);
        throw new InsufficientContentError("tlsclient", textContent.length, MIN_CONTENT_LENGTH2);
      }
      return {
        html,
        url: response.url,
        statusCode: response.statusCode,
        contentType: response.headers["content-type"],
        headers: response.headers,
        engine: "tlsclient",
        duration
      };
    } catch (error) {
      // Engine errors raised above already carry the right classification.
      if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof HttpError || error instanceof EngineUnavailableError) {
        throw error;
      }
      if (error instanceof Error) {
        if (error.name === "TimeoutError" || error.message.includes("timeout")) {
          throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
        }
        if (error.name === "AbortError") {
          throw new EngineTimeoutError("tlsclient", this.config.maxTimeout);
        }
        throw new EngineError("tlsclient", error.message, { cause: error });
      }
      throw new EngineError("tlsclient", String(error));
    } finally {
      // Release the timer and the listener on every exit path.
      clearTimeout(timeoutId);
      abortSignal?.removeEventListener("abort", abortRequest);
    }
  }
  /**
   * Detect patterns that require JS execution
   * @returns "cloudflare-js", "js-required", or null when nothing matched
   */
  detectJsRequired(html) {
    const htmlLower = html.toLowerCase();
    const matched = JS_REQUIRED_PATTERNS.find((pattern) => htmlLower.includes(pattern.toLowerCase()));
    if (matched === void 0) {
      return null;
    }
    // Only the first matching pattern decides the label.
    return matched.includes("cf") || matched.includes("cloudflare") ? "cloudflare-js" : "js-required";
  }
  /**
   * Detect blocked/denied patterns
   * @returns The first matching entry of BLOCKED_PATTERNS, or null
   */
  detectBlocked(html) {
    const htmlLower = html.toLowerCase();
    return BLOCKED_PATTERNS.find((pattern) => htmlLower.includes(pattern.toLowerCase())) ?? null;
  }
  /**
   * Extract visible text from HTML: drops <script> and <style> elements,
   * replaces remaining tags with spaces and collapses whitespace.
   */
  extractText(html) {
    return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
  }
  /** Whether `gotScraping` was found when the engine was constructed. */
  isAvailable() {
    return this.available;
  }
};
|
|
1287
|
+
var tlsClientEngine = new TlsClientEngine();
|
|
1288
|
+
|
|
1289
|
+
// src/cloudflare/detector.ts
|
|
1290
|
+
// CSS selectors of elements that appear on Cloudflare challenge pages.
var CLOUDFLARE_CHALLENGE_SELECTORS = [
  "#challenge-running",
  "#challenge-stage",
  "#challenge-form",
  ".cf-browser-verification",
  "#cf-wrapper",
  "#cf-hcaptcha-container",
  "#turnstile-wrapper"
];
// Lower-case text fragments that appear on Cloudflare challenge pages.
var CLOUDFLARE_TEXT_PATTERNS = [
  "checking if the site connection is secure",
  "this process is automatic. your browser will redirect",
  "ray id:",
  "performance & security by cloudflare"
];
// Lower-case markers showing that the page is served through Cloudflare.
var CLOUDFLARE_INFRA_PATTERNS2 = [
  "/cdn-cgi/",
  "cloudflare",
  "__cf_bm",
  "cf-ray"
];
// A page counts as a Cloudflare block page only when ALL of these are present.
var CLOUDFLARE_BLOCKED_PATTERNS = [
  "sorry, you have been blocked",
  "ray id:"
];
/**
 * Inspect the page currently loaded in `hero` for a Cloudflare challenge.
 *
 * A challenge is reported only when the HTML contains a Cloudflare
 * infrastructure marker AND at least one challenge indicator (element,
 * text fragment or block page). Never rejects: any failure while reading
 * the page yields a negative result whose `signals` describe the error.
 *
 * @param hero - Object exposing `document` (with `documentElement.outerHTML`
 *   and `querySelector`), both awaited here.
 * @returns {Promise<{ isChallenge: boolean, type: "none" | "js_challenge" | "blocked", confidence: number, signals: string[] }>}
 *   `confidence` is 100 for a challenge and 0 otherwise.
 */
async function detectChallenge(hero) {
  const negative = (signal) => ({
    isChallenge: false,
    type: "none",
    confidence: 0,
    signals: [signal]
  });
  try {
    if (!hero.document) {
      return negative("No document available");
    }
    const html = await hero.document.documentElement.outerHTML;
    const htmlLower = html.toLowerCase();
    // Only the first infrastructure marker found is recorded.
    const infraPattern = CLOUDFLARE_INFRA_PATTERNS2.find((pattern) => htmlLower.includes(pattern));
    if (infraPattern === void 0) {
      return negative("No Cloudflare infrastructure detected");
    }
    const signals = [`Cloudflare infra: "${infraPattern}"`];
    let type = "none";
    let hasChallengeIndicator = false;
    for (const selector of CLOUDFLARE_CHALLENGE_SELECTORS) {
      try {
        const element = await hero.document.querySelector(selector);
        if (element) {
          hasChallengeIndicator = true;
          signals.push(`Challenge element: ${selector}`);
          type = "js_challenge";
        }
      } catch {
        // A failing lookup for one selector must not stop the others.
      }
    }
    for (const pattern of CLOUDFLARE_TEXT_PATTERNS) {
      if (htmlLower.includes(pattern)) {
        hasChallengeIndicator = true;
        signals.push(`Challenge text: "${pattern}"`);
        type = type === "none" ? "js_challenge" : type;
      }
    }
    if (htmlLower.includes("waiting for") && htmlLower.includes("to respond")) {
      hasChallengeIndicator = true;
      signals.push('Challenge text: "waiting for...to respond"');
      type = type === "none" ? "js_challenge" : type;
    }
    // A block page overrides any previously chosen type.
    const hasBlocked = CLOUDFLARE_BLOCKED_PATTERNS.every((p) => htmlLower.includes(p));
    if (hasBlocked) {
      hasChallengeIndicator = true;
      signals.push("Cloudflare block page detected");
      type = "blocked";
    }
    const isChallenge = hasChallengeIndicator;
    return {
      isChallenge,
      type: isChallenge ? type : "none",
      confidence: isChallenge ? 100 : 0,
      signals
    };
  } catch (error) {
    // Thrown values are not guaranteed to be Error objects; reading
    // `.message` off null/undefined would itself throw from here.
    const reason = error instanceof Error ? error.message : String(error);
    return negative(`Error during detection: ${reason}`);
  }
}
|
|
1392
|
+
|
|
1393
|
+
// src/cloudflare/handler.ts
|
|
1394
|
+
/**
 * Poll the page in `hero` until a detected challenge goes away.
 *
 * Each iteration first checks whether the URL differs from
 * `options.initialUrl` ("url_redirect"), then re-runs `detectChallenge`
 * ("signals_cleared"). On either outcome it waits for DomContentLoaded
 * (up to 30 s, failures ignored) and for painting to stabilise before
 * returning. If neither happens within `maxWaitMs` the result is
 * `{ resolved: false, method: "timeout" }`.
 *
 * @param hero - Browser handle exposing `url`, `waitForLoad` and `waitForPaintingStable`.
 * @param {{ maxWaitMs?: number, pollIntervalMs?: number, verbose?: boolean, initialUrl: string }} options
 *   Defaults: maxWaitMs 45000, pollIntervalMs 500, verbose false.
 * @returns {Promise<{ resolved: boolean, method: "url_redirect" | "signals_cleared" | "timeout", waitedMs: number }>}
 *   `waitedMs` is the total time spent in this function, including the
 *   final page-settle wait.
 */
async function waitForChallengeResolution(hero, options) {
  const { maxWaitMs = 45e3, pollIntervalMs = 500, verbose = false, initialUrl } = options;
  const startTime = Date.now();
  const log = (msg) => verbose && console.log(` ${msg}`);
  // Shared tail of both success paths: let the new page load and settle,
  // then report how long the whole wait took.
  const settle = async (method, waitingMessage) => {
    log(waitingMessage);
    try {
      await hero.waitForLoad("DomContentLoaded", { timeoutMs: 3e4 });
      log(` DOMContentLoaded`);
    } catch {
      log(` DOMContentLoaded timeout, continuing...`);
    }
    await hero.waitForPaintingStable().catch(() => {
    });
    log(` Page stabilized`);
    // Measured here rather than at the top of the poll iteration, so the
    // load/settle time above is included.
    return { resolved: true, method, waitedMs: Date.now() - startTime };
  };
  while (Date.now() - startTime < maxWaitMs) {
    const elapsed = Date.now() - startTime;
    let currentUrl = initialUrl;
    try {
      currentUrl = await hero.url;
    } catch {
      // Reading the URL can fail; fall through to the detection below.
    }
    if (currentUrl !== initialUrl) {
      log(`\u2713 URL changed: ${initialUrl} \u2192 ${currentUrl}`);
      return settle("url_redirect", ` Waiting for new page to load...`);
    }
    const detection = await detectChallenge(hero);
    if (!detection.isChallenge) {
      log(`\u2713 Challenge signals cleared (confidence dropped to ${detection.confidence})`);
      return settle("signals_cleared", ` Waiting for page to load...`);
    }
    log(
      `\u23F3 ${(elapsed / 1e3).toFixed(1)}s - Still challenge (confidence: ${detection.confidence})`
    );
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  return {
    resolved: false,
    method: "timeout",
    waitedMs: Date.now() - startTime
  };
}
|
|
1444
|
+
|
|
1445
|
+
// src/engines/hero/index.ts
|
|
1446
|
+
var MIN_CONTENT_LENGTH3 = 100;
|
|
1447
|
+
var HeroEngine = class {
|
|
1448
|
+
config = ENGINE_CONFIGS.hero;
|
|
1449
|
+
async scrape(meta) {
|
|
1450
|
+
const startTime = Date.now();
|
|
1451
|
+
const { url, options, logger: logger4, abortSignal } = meta;
|
|
1452
|
+
const pool = options.pool;
|
|
1453
|
+
if (!pool) {
|
|
1454
|
+
throw new EngineUnavailableError("hero", "Browser pool not available");
|
|
1455
|
+
}
|
|
1456
|
+
if (abortSignal?.aborted) {
|
|
1457
|
+
throw new EngineTimeoutError("hero", 0);
|
|
1458
|
+
}
|
|
1459
|
+
logger4?.debug(`[hero] Starting browser scrape of ${url}`);
|
|
1460
|
+
try {
|
|
1461
|
+
const result = await pool.withBrowser(async (hero) => {
|
|
1462
|
+
let aborted = false;
|
|
1463
|
+
if (abortSignal) {
|
|
1464
|
+
abortSignal.addEventListener("abort", () => {
|
|
1465
|
+
aborted = true;
|
|
1466
|
+
}, { once: true });
|
|
1467
|
+
}
|
|
1468
|
+
const timeoutMs = options.timeoutMs || this.config.maxTimeout;
|
|
1469
|
+
await hero.goto(url, { timeoutMs });
|
|
1470
|
+
if (aborted) {
|
|
1471
|
+
throw new EngineTimeoutError("hero", Date.now() - startTime);
|
|
1472
|
+
}
|
|
1473
|
+
try {
|
|
1474
|
+
await hero.waitForLoad("DomContentLoaded", { timeoutMs });
|
|
1475
|
+
} catch {
|
|
1476
|
+
}
|
|
1477
|
+
await hero.waitForPaintingStable();
|
|
1478
|
+
if (aborted) {
|
|
1479
|
+
throw new EngineTimeoutError("hero", Date.now() - startTime);
|
|
1480
|
+
}
|
|
1481
|
+
const initialUrl = await hero.url;
|
|
1482
|
+
const detection = await detectChallenge(hero);
|
|
1483
|
+
if (detection.isChallenge) {
|
|
1484
|
+
logger4?.debug(`[hero] Challenge detected: ${detection.type}`);
|
|
1485
|
+
if (detection.type === "blocked") {
|
|
1486
|
+
throw new ChallengeDetectedError("hero", "blocked");
|
|
1487
|
+
}
|
|
1488
|
+
const resolution = await waitForChallengeResolution(hero, {
|
|
1489
|
+
maxWaitMs: 45e3,
|
|
1490
|
+
pollIntervalMs: 500,
|
|
1491
|
+
verbose: options.verbose,
|
|
1492
|
+
initialUrl
|
|
1493
|
+
});
|
|
1494
|
+
if (!resolution.resolved) {
|
|
1495
|
+
throw new ChallengeDetectedError("hero", `unresolved: ${detection.type}`);
|
|
1496
|
+
}
|
|
1497
|
+
logger4?.debug(`[hero] Challenge resolved via ${resolution.method} in ${resolution.waitedMs}ms`);
|
|
1498
|
+
}
|
|
1499
|
+
if (aborted) {
|
|
1500
|
+
throw new EngineTimeoutError("hero", Date.now() - startTime);
|
|
1501
|
+
}
|
|
1502
|
+
await this.waitForFinalPage(hero, url, logger4);
|
|
1503
|
+
if (aborted) {
|
|
1504
|
+
throw new EngineTimeoutError("hero", Date.now() - startTime);
|
|
1505
|
+
}
|
|
1506
|
+
if (options.waitForSelector) {
|
|
1507
|
+
try {
|
|
1508
|
+
await hero.waitForElement(hero.document.querySelector(options.waitForSelector), {
|
|
1509
|
+
timeoutMs
|
|
1510
|
+
});
|
|
1511
|
+
} catch {
|
|
1512
|
+
logger4?.debug(`[hero] Selector not found: ${options.waitForSelector}`);
|
|
1513
|
+
}
|
|
1514
|
+
}
|
|
1515
|
+
const html = await hero.document.documentElement.outerHTML;
|
|
1516
|
+
const finalUrl = await hero.url;
|
|
1517
|
+
const textContent = this.extractText(html);
|
|
1518
|
+
if (textContent.length < MIN_CONTENT_LENGTH3) {
|
|
1519
|
+
logger4?.debug(`[hero] Insufficient content: ${textContent.length} chars`);
|
|
1520
|
+
throw new InsufficientContentError("hero", textContent.length, MIN_CONTENT_LENGTH3);
|
|
1521
|
+
}
|
|
1522
|
+
const duration = Date.now() - startTime;
|
|
1523
|
+
logger4?.debug(`[hero] Success: ${html.length} chars in ${duration}ms`);
|
|
1524
|
+
return {
|
|
1525
|
+
html,
|
|
1526
|
+
url: finalUrl,
|
|
1527
|
+
statusCode: 200,
|
|
1528
|
+
// Hero doesn't expose status code directly
|
|
1529
|
+
engine: "hero",
|
|
1530
|
+
duration
|
|
1531
|
+
};
|
|
1532
|
+
});
|
|
1533
|
+
return result;
|
|
1534
|
+
} catch (error) {
|
|
1535
|
+
if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError || error instanceof EngineUnavailableError) {
|
|
1536
|
+
throw error;
|
|
1537
|
+
}
|
|
1538
|
+
if (error instanceof Error) {
|
|
1539
|
+
if (error.name === "TimeoutError" || error.message.includes("timeout")) {
|
|
1540
|
+
throw new EngineTimeoutError("hero", this.config.maxTimeout);
|
|
1541
|
+
}
|
|
1542
|
+
if (error.message.includes("Navigation") || error.message.includes("ERR_")) {
|
|
1543
|
+
throw new EngineError("hero", `Navigation failed: ${error.message}`, { cause: error });
|
|
1544
|
+
}
|
|
1545
|
+
throw new EngineError("hero", error.message, { cause: error });
|
|
1546
|
+
}
|
|
1547
|
+
throw new EngineError("hero", String(error));
|
|
1134
1548
|
}
|
|
1135
|
-
this.logger.error(`Failed to scrape ${url} after ${maxRetries + 1} attempts: ${lastError}`);
|
|
1136
|
-
return { result: null, error: lastError };
|
|
1137
1549
|
}
|
|
1138
1550
|
/**
|
|
1139
1551
|
* Wait for the final page to load after any Cloudflare redirects
|
|
1140
|
-
* Cloudflare often does silent redirects even when bypassed, we need to ensure
|
|
1141
|
-
* we're on the actual content page before scraping.
|
|
1142
1552
|
*/
|
|
1143
|
-
async waitForFinalPage(hero, originalUrl,
|
|
1553
|
+
async waitForFinalPage(hero, originalUrl, logger4) {
|
|
1144
1554
|
const maxWaitMs = 15e3;
|
|
1145
1555
|
const startTime = Date.now();
|
|
1146
|
-
const log = (msg) => verbose && this.logger.info(msg);
|
|
1147
1556
|
try {
|
|
1148
1557
|
await hero.waitForLoad("AllContentLoaded", { timeoutMs: maxWaitMs });
|
|
1149
1558
|
} catch {
|
|
@@ -1152,7 +1561,7 @@ var Scraper = class {
|
|
|
1152
1561
|
const normalizeUrl2 = (url) => url.replace(/\/+$/, "");
|
|
1153
1562
|
const urlChanged = normalizeUrl2(currentUrl) !== normalizeUrl2(originalUrl);
|
|
1154
1563
|
if (urlChanged || currentUrl.includes("__cf_chl")) {
|
|
1155
|
-
|
|
1564
|
+
logger4?.debug(`[hero] Cloudflare redirect detected: ${originalUrl} \u2192 ${currentUrl}`);
|
|
1156
1565
|
let lastUrl = currentUrl;
|
|
1157
1566
|
let stableCount = 0;
|
|
1158
1567
|
while (Date.now() - startTime < maxWaitMs) {
|
|
@@ -1167,7 +1576,7 @@ var Scraper = class {
|
|
|
1167
1576
|
} else {
|
|
1168
1577
|
stableCount = 0;
|
|
1169
1578
|
lastUrl = currentUrl;
|
|
1170
|
-
|
|
1579
|
+
logger4?.debug(`[hero] URL changed to: ${currentUrl}`);
|
|
1171
1580
|
}
|
|
1172
1581
|
} catch {
|
|
1173
1582
|
}
|
|
@@ -1181,7 +1590,223 @@ var Scraper = class {
|
|
|
1181
1590
|
await new Promise((resolve) => setTimeout(resolve, 2e3));
|
|
1182
1591
|
}
|
|
1183
1592
|
/**
|
|
1184
|
-
*
|
|
1593
|
+
* Extract visible text from HTML
|
|
1594
|
+
*/
|
|
1595
|
+
extractText(html) {
|
|
1596
|
+
return html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "").replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
|
|
1597
|
+
}
|
|
1598
|
+
isAvailable() {
|
|
1599
|
+
return true;
|
|
1600
|
+
}
|
|
1601
|
+
};
|
|
1602
|
+
// Shared HeroEngine instance registered with the orchestrator.
var heroEngine = new HeroEngine();
|
|
1603
|
+
|
|
1604
|
+
// src/engines/orchestrator.ts
|
|
1605
|
+
// Lookup table from engine name to the engine instance that implements it.
var ENGINE_REGISTRY = { http: httpEngine, tlsclient: tlsClientEngine, hero: heroEngine };
|
|
1610
|
+
/**
 * Runs a scrape through a cascade of engines, falling back to the next engine
 * whenever the current one fails with a retryable error.
 */
var EngineOrchestrator = class {
  options;
  engines;
  engineOrder;
  /**
   * @param options - `{ engines, skipEngines, forceEngine, logger, verbose }`, all optional
   */
  constructor(options = {}) {
    this.options = options;
    this.engineOrder = this.resolveEngineOrder();
    // Names that are not registered (e.g. a typo in `--engine`) are dropped
    // instead of crashing on `undefined.isAvailable()`; if nothing is left,
    // scrape() reports AllEnginesFailedError.
    this.engines = this.engineOrder
      .map((name) => Object.prototype.hasOwnProperty.call(ENGINE_REGISTRY, name) ? ENGINE_REGISTRY[name] : void 0)
      .filter((engine) => engine !== void 0 && engine.isAvailable());
  }
  /**
   * Resolve the engine order based on options
   *
   * `forceEngine` wins outright; otherwise `engines` (or the default order)
   * is used, minus anything listed in `skipEngines`.
   */
  resolveEngineOrder() {
    if (this.options.forceEngine) {
      return [this.options.forceEngine];
    }
    const order = this.options.engines || [...DEFAULT_ENGINE_ORDER];
    const skipped = this.options.skipEngines;
    return skipped ? order.filter((e) => !skipped.includes(e)) : order;
  }
  /**
   * Get available engines
   *
   * @returns names of the engines that will actually be tried, in order
   */
  getAvailableEngines() {
    return this.engines.map((e) => e.config.name);
  }
  /**
   * Scrape a URL using the engine cascade
   *
   * @param meta - Engine metadata (url, options, logger, abortSignal)
   * @returns Scrape result with engine metadata
   * @throws AllEnginesFailedError if all engines fail
   */
  async scrape(meta) {
    const attemptedEngines = [];
    const engineErrors = /* @__PURE__ */ new Map();
    const logger4 = meta.logger || this.options.logger;
    const verbose = this.options.verbose || meta.options.verbose;
    if (this.engines.length === 0) {
      throw new AllEnginesFailedError([], engineErrors);
    }
    // Verbose mode promotes progress messages from debug to info.
    const log = (msg) => {
      if (verbose) {
        logger4?.info(msg);
      } else {
        logger4?.debug(msg);
      }
    };
    log(`[orchestrator] Starting scrape of ${meta.url} with engines: ${this.engineOrder.join(" \u2192 ")}`);
    for (const engine of this.engines) {
      const engineName = engine.config.name;
      attemptedEngines.push(engineName);
      // Each attempt gets its own controller, aborted by the per-engine
      // timeout or by the caller's signal.
      const controller = new AbortController();
      const forwardAbort = () => controller.abort();
      const timeoutId = setTimeout(forwardAbort, engine.config.maxTimeout);
      const callerSignal = meta.abortSignal;
      if (callerSignal) {
        if (callerSignal.aborted) {
          // An already-aborted signal never dispatches "abort" again.
          controller.abort();
        } else {
          callerSignal.addEventListener("abort", forwardAbort, { once: true });
        }
      }
      try {
        log(`[orchestrator] Trying ${engineName} engine...`);
        const result = await engine.scrape({
          ...meta,
          abortSignal: controller.signal
        });
        log(`[orchestrator] \u2713 ${engineName} succeeded in ${result.duration}ms`);
        return {
          ...result,
          attemptedEngines,
          engineErrors
        };
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error));
        engineErrors.set(engineName, err);
        if (error instanceof ChallengeDetectedError) {
          log(`[orchestrator] ${engineName} detected challenge: ${error.challengeType}`);
        } else if (error instanceof InsufficientContentError) {
          log(`[orchestrator] ${engineName} insufficient content: ${error.contentLength} chars`);
        } else if (error instanceof HttpError) {
          log(`[orchestrator] ${engineName} HTTP error: ${error.statusCode}`);
        } else if (error instanceof EngineTimeoutError) {
          log(`[orchestrator] ${engineName} timed out after ${error.timeoutMs}ms`);
        } else if (error instanceof EngineUnavailableError) {
          log(`[orchestrator] ${engineName} unavailable: ${err.message}`);
        } else {
          log(`[orchestrator] ${engineName} failed: ${err.message}`);
        }
        if (!this.shouldRetry(error)) {
          log(`[orchestrator] Non-retryable error, stopping cascade`);
          break;
        }
        log(`[orchestrator] Falling back to next engine...`);
      } finally {
        // Always release the timer and the listener so that neither outlives
        // the attempt, whatever its outcome.
        clearTimeout(timeoutId);
        callerSignal?.removeEventListener("abort", forwardAbort);
      }
    }
    log(`[orchestrator] All engines failed for ${meta.url}`);
    throw new AllEnginesFailedError(attemptedEngines, engineErrors);
  }
  /**
   * Determine if we should retry with next engine
   *
   * @param error - the failure raised by the engine that was just tried
   * @returns `true` when the cascade should continue with the next engine
   */
  shouldRetry(error) {
    if (error instanceof ChallengeDetectedError || error instanceof InsufficientContentError || error instanceof EngineTimeoutError) {
      return true;
    }
    if (error instanceof HttpError) {
      return error.statusCode === 403 || error.statusCode === 404 || error.statusCode === 429 || error.statusCode >= 500;
    }
    if (error instanceof EngineUnavailableError) {
      return true;
    }
    if (error instanceof EngineError) {
      return error.retryable;
    }
    // Unknown failures are treated as retryable.
    return true;
  }
};
|
|
1731
|
+
|
|
1732
|
+
// src/scraper.ts
|
|
1733
|
+
var Scraper = class {
|
|
1734
|
+
options;
|
|
1735
|
+
logger = createLogger("scraper");
|
|
1736
|
+
robotsCache = /* @__PURE__ */ new Map();
|
|
1737
|
+
constructor(options) {
|
|
1738
|
+
this.options = {
|
|
1739
|
+
...DEFAULT_OPTIONS,
|
|
1740
|
+
...options
|
|
1741
|
+
};
|
|
1742
|
+
}
|
|
1743
|
+
/**
|
|
1744
|
+
* Get robots.txt rules for a URL, cached per domain
|
|
1745
|
+
*/
|
|
1746
|
+
async getRobotsRules(url) {
|
|
1747
|
+
const origin = new URL(url).origin;
|
|
1748
|
+
if (!this.robotsCache.has(origin)) {
|
|
1749
|
+
const rules = await fetchRobotsTxt(origin);
|
|
1750
|
+
this.robotsCache.set(origin, rules);
|
|
1751
|
+
}
|
|
1752
|
+
return this.robotsCache.get(origin) ?? null;
|
|
1753
|
+
}
|
|
1754
|
+
/**
|
|
1755
|
+
* Scrape all URLs
|
|
1756
|
+
*
|
|
1757
|
+
* @returns Scrape result with pages and metadata
|
|
1758
|
+
*/
|
|
1759
|
+
async scrape() {
|
|
1760
|
+
const startTime = Date.now();
|
|
1761
|
+
const results = await this.scrapeWithConcurrency();
|
|
1762
|
+
return this.buildScrapeResult(results, startTime);
|
|
1763
|
+
}
|
|
1764
|
+
/**
|
|
1765
|
+
* Scrape URLs with concurrency control
|
|
1766
|
+
*/
|
|
1767
|
+
async scrapeWithConcurrency() {
|
|
1768
|
+
const limit = pLimit(this.options.batchConcurrency || 1);
|
|
1769
|
+
const tasks = this.options.urls.map(
|
|
1770
|
+
(url, index) => limit(() => this.scrapeSingleUrlWithRetry(url, index))
|
|
1771
|
+
);
|
|
1772
|
+
const batchPromise = Promise.all(tasks);
|
|
1773
|
+
if (this.options.batchTimeoutMs && this.options.batchTimeoutMs > 0) {
|
|
1774
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
1775
|
+
setTimeout(() => {
|
|
1776
|
+
reject(new Error(`Batch operation timed out after ${this.options.batchTimeoutMs}ms`));
|
|
1777
|
+
}, this.options.batchTimeoutMs);
|
|
1778
|
+
});
|
|
1779
|
+
return Promise.race([batchPromise, timeoutPromise]);
|
|
1780
|
+
}
|
|
1781
|
+
return batchPromise;
|
|
1782
|
+
}
|
|
1783
|
+
/**
|
|
1784
|
+
* Scrape a single URL with retry logic
|
|
1785
|
+
*/
|
|
1786
|
+
async scrapeSingleUrlWithRetry(url, index) {
|
|
1787
|
+
const maxRetries = this.options.maxRetries || 2;
|
|
1788
|
+
let lastError;
|
|
1789
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
1790
|
+
try {
|
|
1791
|
+
const result = await this.scrapeSingleUrl(url, index);
|
|
1792
|
+
if (result) {
|
|
1793
|
+
return { result };
|
|
1794
|
+
}
|
|
1795
|
+
lastError = `Failed to scrape ${url}: No content returned`;
|
|
1796
|
+
} catch (error) {
|
|
1797
|
+
lastError = error.message;
|
|
1798
|
+
if (attempt < maxRetries) {
|
|
1799
|
+
const delay = Math.pow(2, attempt) * 1e3;
|
|
1800
|
+
this.logger.warn(`Retry ${attempt + 1}/${maxRetries} for ${url} in ${delay}ms`);
|
|
1801
|
+
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
1802
|
+
}
|
|
1803
|
+
}
|
|
1804
|
+
}
|
|
1805
|
+
this.logger.error(`Failed to scrape ${url} after ${maxRetries + 1} attempts: ${lastError}`);
|
|
1806
|
+
return { result: null, error: lastError };
|
|
1807
|
+
}
|
|
1808
|
+
/**
|
|
1809
|
+
* Scrape a single URL using the engine orchestrator
|
|
1185
1810
|
*/
|
|
1186
1811
|
async scrapeSingleUrl(url, index) {
|
|
1187
1812
|
const startTime = Date.now();
|
|
@@ -1190,98 +1815,84 @@ var Scraper = class {
|
|
|
1190
1815
|
throw new Error(`URL blocked by robots.txt: ${url}`);
|
|
1191
1816
|
}
|
|
1192
1817
|
try {
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
this.logger.warn(`Selector not found: ${this.options.waitForSelector}`);
|
|
1227
|
-
}
|
|
1228
|
-
}
|
|
1229
|
-
const html = await hero.document.documentElement.outerHTML;
|
|
1230
|
-
const cleanedHtml = cleanContent(html, url, {
|
|
1231
|
-
removeAds: this.options.removeAds,
|
|
1232
|
-
removeBase64Images: this.options.removeBase64Images,
|
|
1233
|
-
onlyMainContent: this.options.onlyMainContent,
|
|
1234
|
-
includeTags: this.options.includeTags,
|
|
1235
|
-
excludeTags: this.options.excludeTags
|
|
1818
|
+
const orchestrator = new EngineOrchestrator({
|
|
1819
|
+
engines: this.options.engines,
|
|
1820
|
+
skipEngines: this.options.skipEngines,
|
|
1821
|
+
forceEngine: this.options.forceEngine,
|
|
1822
|
+
logger: this.logger,
|
|
1823
|
+
verbose: this.options.verbose
|
|
1824
|
+
});
|
|
1825
|
+
const engineResult = await orchestrator.scrape({
|
|
1826
|
+
url,
|
|
1827
|
+
options: this.options,
|
|
1828
|
+
logger: this.logger
|
|
1829
|
+
});
|
|
1830
|
+
if (this.options.verbose) {
|
|
1831
|
+
this.logger.info(
|
|
1832
|
+
`[scraper] ${url} scraped with ${engineResult.engine} engine in ${engineResult.duration}ms (attempted: ${engineResult.attemptedEngines.join(" \u2192 ")})`
|
|
1833
|
+
);
|
|
1834
|
+
}
|
|
1835
|
+
const cleanedHtml = cleanContent(engineResult.html, engineResult.url, {
|
|
1836
|
+
removeAds: this.options.removeAds,
|
|
1837
|
+
removeBase64Images: this.options.removeBase64Images,
|
|
1838
|
+
onlyMainContent: this.options.onlyMainContent,
|
|
1839
|
+
includeTags: this.options.includeTags,
|
|
1840
|
+
excludeTags: this.options.excludeTags
|
|
1841
|
+
});
|
|
1842
|
+
const websiteMetadata = extractMetadata(cleanedHtml, engineResult.url);
|
|
1843
|
+
const duration = Date.now() - startTime;
|
|
1844
|
+
const markdown = this.options.formats.includes("markdown") ? htmlToMarkdown(cleanedHtml) : void 0;
|
|
1845
|
+
const htmlOutput = this.options.formats.includes("html") ? cleanedHtml : void 0;
|
|
1846
|
+
if (this.options.onProgress) {
|
|
1847
|
+
this.options.onProgress({
|
|
1848
|
+
completed: index + 1,
|
|
1849
|
+
total: this.options.urls.length,
|
|
1850
|
+
currentUrl: url
|
|
1236
1851
|
});
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
const
|
|
1241
|
-
if (
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
total: this.options.urls.length,
|
|
1245
|
-
currentUrl: url
|
|
1246
|
-
});
|
|
1247
|
-
}
|
|
1248
|
-
let proxyMetadata;
|
|
1249
|
-
if (this.options.proxy) {
|
|
1250
|
-
const proxy = this.options.proxy;
|
|
1251
|
-
if (proxy.url) {
|
|
1252
|
-
try {
|
|
1253
|
-
const proxyUrl = new URL(proxy.url);
|
|
1254
|
-
proxyMetadata = {
|
|
1255
|
-
host: proxyUrl.hostname,
|
|
1256
|
-
port: parseInt(proxyUrl.port, 10) || 80,
|
|
1257
|
-
country: proxy.country
|
|
1258
|
-
};
|
|
1259
|
-
} catch {
|
|
1260
|
-
}
|
|
1261
|
-
} else if (proxy.host && proxy.port) {
|
|
1852
|
+
}
|
|
1853
|
+
let proxyMetadata;
|
|
1854
|
+
if (this.options.proxy) {
|
|
1855
|
+
const proxy = this.options.proxy;
|
|
1856
|
+
if (proxy.url) {
|
|
1857
|
+
try {
|
|
1858
|
+
const proxyUrl = new URL(proxy.url);
|
|
1262
1859
|
proxyMetadata = {
|
|
1263
|
-
host:
|
|
1264
|
-
port:
|
|
1860
|
+
host: proxyUrl.hostname,
|
|
1861
|
+
port: parseInt(proxyUrl.port, 10) || 80,
|
|
1265
1862
|
country: proxy.country
|
|
1266
1863
|
};
|
|
1864
|
+
} catch {
|
|
1267
1865
|
}
|
|
1866
|
+
} else if (proxy.host && proxy.port) {
|
|
1867
|
+
proxyMetadata = {
|
|
1868
|
+
host: proxy.host,
|
|
1869
|
+
port: proxy.port,
|
|
1870
|
+
country: proxy.country
|
|
1871
|
+
};
|
|
1268
1872
|
}
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
}
|
|
1281
|
-
|
|
1282
|
-
|
|
1873
|
+
}
|
|
1874
|
+
const result = {
|
|
1875
|
+
markdown,
|
|
1876
|
+
html: htmlOutput,
|
|
1877
|
+
metadata: {
|
|
1878
|
+
baseUrl: url,
|
|
1879
|
+
totalPages: 1,
|
|
1880
|
+
scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1881
|
+
duration,
|
|
1882
|
+
website: websiteMetadata,
|
|
1883
|
+
proxy: proxyMetadata
|
|
1884
|
+
}
|
|
1885
|
+
};
|
|
1886
|
+
return result;
|
|
1283
1887
|
} catch (error) {
|
|
1284
|
-
|
|
1888
|
+
if (error instanceof AllEnginesFailedError) {
|
|
1889
|
+
const engineSummary = error.attemptedEngines.map((e) => `${e}: ${error.errors.get(e)?.message || "unknown"}`).join("; ");
|
|
1890
|
+
this.logger.error(`Failed to scrape ${url}: All engines failed - ${engineSummary}`);
|
|
1891
|
+
} else if (error instanceof Error) {
|
|
1892
|
+
this.logger.error(`Failed to scrape ${url}: ${error.message}`);
|
|
1893
|
+
} else {
|
|
1894
|
+
this.logger.error(`Failed to scrape ${url}: ${String(error)}`);
|
|
1895
|
+
}
|
|
1285
1896
|
if (this.options.onProgress) {
|
|
1286
1897
|
this.options.onProgress({
|
|
1287
1898
|
completed: index + 1,
|
|
@@ -2604,7 +3215,7 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2604
3215
|
"-f, --format <formats>",
|
|
2605
3216
|
"Content formats to include (comma-separated: markdown,html)",
|
|
2606
3217
|
"markdown"
|
|
2607
|
-
).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").option("--no-main-content", "Disable main content extraction (include full page)").option("--include-tags <selectors>", "CSS selectors for elements to include (comma-separated)").option("--exclude-tags <selectors>", "CSS selectors for elements to exclude (comma-separated)").action(async (urls, options) => {
|
|
3218
|
+
).option("-o, --output <file>", "Output file (stdout if omitted)").option("-c, --concurrency <n>", "Parallel requests", "1").option("-t, --timeout <ms>", "Request timeout in milliseconds", "30000").option("--proxy <url>", "Proxy URL (e.g., http://user:pass@host:port)").option("--user-agent <string>", "Custom user agent string").option("--batch-timeout <ms>", "Total timeout for entire batch operation", "300000").option("--show-chrome", "Show browser window for debugging").option("--standalone", "Force standalone mode (bypass daemon)").option("-p, --port <n>", `Daemon port (default: ${DEFAULT_DAEMON_PORT})`, String(DEFAULT_DAEMON_PORT)).option("-v, --verbose", "Enable verbose logging").option("--no-main-content", "Disable main content extraction (include full page)").option("--include-tags <selectors>", "CSS selectors for elements to include (comma-separated)").option("--exclude-tags <selectors>", "CSS selectors for elements to exclude (comma-separated)").option("--engine <name>", "Force a specific engine (http, tlsclient, hero)").option("--skip-engine <names>", "Skip specific engines (comma-separated: http,tlsclient,hero)").action(async (urls, options) => {
|
|
2608
3219
|
const port = parseInt(options.port, 10);
|
|
2609
3220
|
const useStandalone = options.standalone || false;
|
|
2610
3221
|
let useDaemon = false;
|
|
@@ -2636,6 +3247,7 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2636
3247
|
}
|
|
2637
3248
|
const includeTags = options.includeTags ? options.includeTags.split(",").map((s) => s.trim()) : void 0;
|
|
2638
3249
|
const excludeTags = options.excludeTags ? options.excludeTags.split(",").map((s) => s.trim()) : void 0;
|
|
3250
|
+
const skipEngines = options.skipEngine ? options.skipEngine.split(",").map((s) => s.trim()) : void 0;
|
|
2639
3251
|
const scrapeOptions = {
|
|
2640
3252
|
urls,
|
|
2641
3253
|
formats,
|
|
@@ -2651,6 +3263,9 @@ program.command("scrape <urls...>").description("Scrape one or more URLs").optio
|
|
|
2651
3263
|
// --no-main-content sets this to false
|
|
2652
3264
|
includeTags,
|
|
2653
3265
|
excludeTags,
|
|
3266
|
+
// Engine options
|
|
3267
|
+
forceEngine: options.engine,
|
|
3268
|
+
skipEngines,
|
|
2654
3269
|
onProgress: options.verbose ? ({ completed, total, currentUrl }) => {
|
|
2655
3270
|
console.error(`[${completed}/${total}] ${currentUrl}`);
|
|
2656
3271
|
} : void 0
|