@purepageio/fetch-engines 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1657 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +323 -0
- package/dist/index.d.ts +323 -8
- package/dist/index.js +1617 -4
- package/dist/index.js.map +1 -1
- package/package.json +14 -5
- package/dist/FetchEngine.d.ts +0 -47
- package/dist/FetchEngine.d.ts.map +0 -1
- package/dist/FetchEngine.js +0 -114
- package/dist/FetchEngine.js.map +0 -1
- package/dist/FetchEngine.test.d.ts +0 -2
- package/dist/FetchEngine.test.d.ts.map +0 -1
- package/dist/FetchEngine.test.js +0 -44
- package/dist/FetchEngine.test.js.map +0 -1
- package/dist/HybridEngine.d.ts +0 -21
- package/dist/HybridEngine.d.ts.map +0 -1
- package/dist/HybridEngine.js +0 -62
- package/dist/HybridEngine.js.map +0 -1
- package/dist/IEngine.d.ts +0 -22
- package/dist/IEngine.d.ts.map +0 -1
- package/dist/IEngine.js +0 -2
- package/dist/IEngine.js.map +0 -1
- package/dist/PlaywrightEngine.d.ts +0 -90
- package/dist/PlaywrightEngine.d.ts.map +0 -1
- package/dist/PlaywrightEngine.js +0 -505
- package/dist/PlaywrightEngine.js.map +0 -1
- package/dist/PlaywrightEngine.test.d.ts +0 -2
- package/dist/PlaywrightEngine.test.d.ts.map +0 -1
- package/dist/PlaywrightEngine.test.js +0 -207
- package/dist/PlaywrightEngine.test.js.map +0 -1
- package/dist/PuppeteerEngine.d.ts +0 -21
- package/dist/PuppeteerEngine.d.ts.map +0 -1
- package/dist/PuppeteerEngine.js +0 -412
- package/dist/PuppeteerEngine.js.map +0 -1
- package/dist/browser/BrowserPool.d.ts +0 -29
- package/dist/browser/BrowserPool.d.ts.map +0 -1
- package/dist/browser/BrowserPool.js +0 -378
- package/dist/browser/BrowserPool.js.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.d.ts +0 -78
- package/dist/browser/PlaywrightBrowserPool.d.ts.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.js +0 -445
- package/dist/browser/PlaywrightBrowserPool.js.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.test.d.ts +0 -2
- package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +0 -1
- package/dist/browser/PlaywrightBrowserPool.test.js +0 -422
- package/dist/browser/PlaywrightBrowserPool.test.js.map +0 -1
- package/dist/errors.d.ts +0 -20
- package/dist/errors.d.ts.map +0 -1
- package/dist/errors.js +0 -30
- package/dist/errors.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/types.d.ts +0 -167
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js +0 -2
- package/dist/types.js.map +0 -1
- package/dist/utils/markdown-converter.d.ts +0 -31
- package/dist/utils/markdown-converter.d.ts.map +0 -1
- package/dist/utils/markdown-converter.js +0 -796
- package/dist/utils/markdown-converter.js.map +0 -1
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,1657 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// --- Bundler runtime helpers (CommonJS <-> ESM interop) ---
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines every entry of `all` on `target` as an enumerable getter.
 * `all` maps export names to zero-argument functions returning the value.
 */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Copies the own properties of `from` onto `to` as live getters.
 * Keys already present on `to`, and the key equal to `except`, are skipped.
 * Enumerability mirrors the source descriptor (enumerable when none is found).
 * Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/**
 * Wraps a required module in an ESM-style namespace object.
 * If the importer is in node compatibility mode or this is not an ESM
 * file that has been converted to a CommonJS file using a Babel-
 * compatible transform (i.e. "__esModule" has not been set), then
 * "default" is set to the CommonJS "module.exports" for node compatibility.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  const base = needsDefault
    ? __defProp(target, "default", { value: mod, enumerable: true })
    : target;
  return __copyProps(base, mod);
};

/** Produces a CommonJS exports object (flagged `__esModule`) mirroring `mod`. */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
|
|
29
|
+
|
|
30
|
+
// src/index.ts
// Public surface of the package. Each export is a lazy getter, so the engine
// classes are only looked up when a consumer first reads the property.
var index_exports = {};
__export(index_exports, {
  FetchEngine: () => FetchEngine,
  HybridEngine: () => HybridEngine,
  PlaywrightEngine: () => PlaywrightEngine
});

// Expose the export table as the CommonJS module value.
module.exports = __toCommonJS(index_exports);
|
|
38
|
+
|
|
39
|
+
// src/utils/markdown-converter.ts
|
|
40
|
+
var import_turndown = __toESM(require("turndown"), 1);
|
|
41
|
+
var import_turndown_plugin_gfm = require("turndown-plugin-gfm");
|
|
42
|
+
var import_node_html_parser = require("node-html-parser");
|
|
43
|
+
// Elements dropped from the parsed document before any content extraction.
var PREPROCESSING_REMOVE_SELECTORS = [
  "script:not([type='application/ld+json'])", // Keep JSON-LD
  "style",
  "noscript",
  "iframe:not([title])" // Keep iframes with titles (potential embeds)
];

// Candidate selectors for the main content container of an article-like page.
var MAIN_CONTENT_SELECTORS = [
  // By semantics
  "article",
  "main",
  "[role='main']",
  "[role='article']",
  // By common class/id names (more robust patterns)
  "[class*='article-body']",
  "[class*='post-content']",
  "[class*='main-content']",
  "[class*='entry-content']",
  "[id*='article-body']",
  "[id*='main-content']",
  // Common CMS patterns
  ".article",
  ".post",
  ".content",
  ".entry",
  ".blog-post",
  // Fallback
  "body"
];

// Selector groups counted by the forum-page heuristic.
var FORUM_COMMENT_SELECTORS = [
  ".comment",
  ".comments",
  ".comtr",
  '[id^="comment-"]',
  'div[id^="c_"]'
];
var FORUM_THREAD_SELECTORS = [
  ".thread",
  ".post",
  '[id^="thread-"]'
];
var FORUM_VOTE_SELECTORS = [
  ".vote",
  ".score",
  ".upvote",
  ".downvote",
  ".votelinks"
];

// Selector groups used when assembling the content of a forum page.
var FORUM_MAIN_POST_SELECTORS = [
  ".fatitem",
  ".submission",
  ".op",
  ".original-post"
];
var FORUM_COMMENTS_CONTAINER_SELECTORS = [
  ".comment-tree",
  ".comments",
  "#comments"
];
var FORUM_OBVIOUS_NON_CONTENT_SELECTORS = [
  "header",
  "footer",
  ".nav",
  ".sidebar"
];

// Link-density heuristic: elements with less text than this are never judged.
var MIN_LINK_DENSITY_TEXT_LENGTH = 50;
// Link-text / total-text ratio above which an element is treated as boilerplate.
var DEFAULT_LINK_DENSITY_THRESHOLD = 0.4;
// Minimum number of comment or vote matches for a page to count as a forum.
var MIN_FORUM_INDICATOR_COUNT = 3;
// Class-name prefixes that carry a fenced code block's language.
var CODE_BLOCK_LANG_PREFIXES = ["language-", "lang-"];
// Longest run of newlines allowed in the final Markdown.
var POSTPROCESSING_MAX_CONSECUTIVE_NEWLINES = 2;
|
|
90
|
+
/**
 * Converts an HTML document into Markdown in three stages:
 * 1. preprocess the HTML (strip noise, pick the main content, collect metadata),
 * 2. run Turndown with the custom rules registered in the constructor,
 * 3. postprocess the Markdown (spacing, empty links, optional truncation).
 */
var MarkdownConverter = class {
  /** Turndown instance carrying the GFM plugin and the custom rules below. */
  turndownService;

  /** Creates the Turndown service with fixed style options and registers all rules. */
  constructor() {
    this.turndownService = new import_turndown.default({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      bulletListMarker: "-",
      strongDelimiter: "**",
      emDelimiter: "*",
      hr: "---",
      // Use nodeType check instead of window.HTMLElement
      keepReplacement: (_content, node) => {
        if (node.nodeType !== 1) return "";
        const isPresentation = node.getAttribute("role") === "presentation";
        if (isPresentation || node.classList?.contains("preserve")) {
          return node.outerHTML;
        }
        return "";
      }
    });
    this.turndownService.use(import_turndown_plugin_gfm.gfm);
    this.setupPrioritizedRules();
  }

  // --- Public Method ---
  /**
   * Converts HTML string to Markdown.
   * @param html The HTML string to convert.
   * @param options Conversion options; only `maxContentLength` is read here
   *   (via postprocessMarkdown).
   * @returns The converted Markdown string.
   */
  convert(html, options = {}) {
    const preprocessedHtml = this.preprocessHTML(html);
    const rawMarkdown = this.turndownService.turndown(preprocessedHtml);
    return this.postprocessMarkdown(rawMarkdown, options);
  }

  // --- Turndown Rule Setup ---
  /** Registers every custom rule group, in the order the groups are listed. */
  setupPrioritizedRules() {
    this.addContentExtractionRules();
    this.addStructureRules();
    this.addBlockRules();
    this.addInlineRules();
  }

  // We rely on preprocessing to remove nav/menus/high-link-density areas.
  // These rules primarily help Turndown understand the *structure* of the *intended* content.
  /** Registers the pass-through rule for main-content containers and the rule that drops unwanted tags. */
  addContentExtractionRules() {
    this.turndownService.addRule("main-content-marker", {
      filter: (node) => {
        if (node.nodeType !== 1) return false;
        if (node.tagName.toLowerCase() === "main") return true;
        if (["main", "article"].includes(node.getAttribute("role") || "")) return true;
        return MAIN_CONTENT_SELECTORS.some((selector) => {
          try {
            return node.matches(selector) && selector !== "body";
          } catch {
            // A selector the DOM implementation rejects simply does not match.
            return false;
          }
        });
      },
      // Just pass content through, this rule is mainly for filter priority/debugging
      replacement: (content) => content
    });

    // 'svg' is intentionally not in this list.
    const unwantedTags = [
      "script",
      "style",
      "noscript",
      "iframe",
      "button",
      "input",
      "select",
      "textarea",
      "form",
      "canvas",
      "audio",
      "video"
    ];
    this.turndownService.addRule("remove-unwanted", {
      filter: unwantedTags,
      replacement: () => ""
    });
  }

  /** Registers rules that surround <article> and <section> content with blank lines. */
  addStructureRules() {
    const withSeparation = (content) => "\n\n" + content + "\n\n";
    this.turndownService.addRule("article", {
      filter: "article",
      replacement: withSeparation
    });
    this.turndownService.addRule("section", {
      filter: "section",
      replacement: withSeparation
    });
  }

  /** Registers rules for lists, list items and blockquotes. */
  addBlockRules() {
    this.turndownService.addRule("list", {
      filter: ["ul", "ol"],
      replacement: (content, node) => {
        if (node.nodeType !== 1) return content;
        const parent = node.parentNode;
        // NOTE(review): the indent is a single space as seen in the source;
        // confirm it was not a wider indent collapsed during extraction.
        const indent = parent && parent.nodeName.toLowerCase() === "li" ? " " : "";
        const body = content
          .split("\n")
          .map((line) => indent + line.trimEnd())
          .join("\n")
          .trim();
        return "\n" + body + "\n";
      }
    });

    this.turndownService.addRule("listItem", {
      filter: "li",
      replacement(content, node, options) {
        // Strip per-line leading whitespace, then indent continuation lines.
        const body = content.replace(/^\s+/gm, "").replace(/\n(?!\s*$)/gm, "\n ");
        let prefix = options.bulletListMarker + " ";
        const parentNode = node.parentNode;
        if (parentNode && parentNode.nodeName === "OL") {
          try {
            const start = parentNode.getAttribute("start");
            const index = Array.prototype.indexOf.call(parentNode.children, node);
            prefix = (start ? Number(start) + index : index + 1) + ". ";
          } catch (e) {
            console.warn("Could not determine ordered list index:", e);
            prefix = "1. ";
          }
        }
        const trimmedContent = body.trim();
        const needsNewline = node.nextSibling && !/\n$/.test(trimmedContent);
        return prefix + trimmedContent + (needsNewline ? "\n" : "");
      }
    });

    this.turndownService.addRule("blockquote", {
      filter: "blockquote",
      replacement: (content) => {
        const quoted = content.trim().replace(/\n/g, "\n> ");
        return "\n\n> " + quoted + "\n\n";
      }
    });
  }

  /** Registers rules for links, figures, images, fenced code blocks and inline code. */
  addInlineRules() {
    this.turndownService.addRule("link", {
      filter: (node, _options) => {
        return node.nodeType === 1 && node.nodeName === "A" && !!node.getAttribute("href");
      },
      replacement: (content, node) => {
        const href = node.getAttribute("href") || "";
        const title = node.getAttribute("title");
        // A link without visible text falls back to showing its target.
        const text = content.trim() ? content.trim() : href;
        let decodedHref = href;
        try {
          if (href.includes("%")) {
            decodedHref = decodeURI(href);
          }
        } catch (e) {
          console.warn(`Failed to decode URI, keeping original: ${href}`, e);
        }
        return title ? `[${text}](${decodedHref} "${title}")` : `[${text}](${decodedHref})`;
      }
    });

    this.turndownService.addRule("figure", {
      filter: "figure",
      replacement: (content, node) => {
        if (node.nodeType !== 1) return content;
        const img = node.querySelector("img");
        const figcaption = node.querySelector("figcaption");
        let markdown = "";
        let remaining = content.trim();

        if (img) {
          const src = img.getAttribute("src") || "";
          const alt = img.getAttribute("alt") || "";
          const title = img.getAttribute("title");
          markdown = title ? `![${alt}](${src} "${title}")` : `![${alt}](${src})`;
          // The nested <img> was already rendered into `content` by the image
          // rule in the same form; remove it so the image is emitted once.
          remaining = remaining.replace(markdown, "").trim();
        }

        if (figcaption) {
          const captionText = figcaption.textContent?.trim();
          if (captionText) {
            markdown += "\n\n_" + captionText + "_";
            // Drop the caption (and stray emphasis markers) from the leftover content.
            remaining = remaining.replace(captionText, "").trim();
            remaining = remaining.replace(/^_+|_+$/g, "").trim();
          }
        }

        // Keep leftover content only when it looks like real text.
        if (remaining && (remaining.length > 10 || /[a-zA-Z0-9]/.test(remaining))) {
          markdown += "\n\n" + remaining;
        }
        return "\n\n" + markdown.trim() + "\n\n";
      }
    });

    this.turndownService.addRule("image", {
      filter: (node) => {
        return node.nodeType === 1 && node.nodeName === "IMG" && !!node.getAttribute("src");
      },
      replacement: (_content, node) => {
        const src = node.getAttribute("src") || "";
        const alt = node.getAttribute("alt") || "";
        const title = node.getAttribute("title");
        const imageMd = title ? `![${alt}](${src} "${title}")` : `![${alt}](${src})`;
        return "\n\n" + imageMd + "\n\n";
      }
    });

    this.turndownService.addRule("code-block", {
      filter: (node) => {
        if (node.nodeType !== 1) return false;
        if (node.tagName.toLowerCase() !== "pre") return false;
        const hasCodeChild = node.querySelector("code") !== null;
        const hasCodeClass = /highlight|syntax|code|listing|source/i.test(node.className);
        const hasLangAttribute = !!node.getAttribute("lang") || !!node.getAttribute("language");
        return hasCodeChild || hasCodeClass || hasLangAttribute;
      },
      replacement: (content, node) => {
        if (node.nodeType !== 1) return content.trim();
        const codeElement = node.querySelector("code");
        // Explicit lang/language attributes win over class-name prefixes.
        let language =
          node.getAttribute("lang") ||
          node.getAttribute("language") ||
          (codeElement ? codeElement.getAttribute("lang") || codeElement.getAttribute("language") : "") ||
          "";
        if (!language) {
          // Split on any whitespace so tab/newline separated class lists work too.
          const classes = (node.className + " " + (codeElement?.className || ""))
            .split(/\s+/)
            .filter(Boolean);
          for (const cls of classes) {
            const prefix = CODE_BLOCK_LANG_PREFIXES.find((p) => cls.startsWith(p));
            if (prefix === undefined) continue;
            language = cls.substring(prefix.length);
            if (language) break;
          }
        }
        const cleanedContent = content.trim();
        return "\n\n```" + language + "\n" + cleanedContent + "\n```\n\n";
      }
    });

    this.turndownService.addRule("inlineCode", {
      filter: (node) => node.nodeName === "CODE" && node.parentNode?.nodeName !== "PRE",
      replacement: (content) => {
        const trimmed = content.trim();
        if (!trimmed) return "";
        if (!trimmed.includes("`")) {
          return "`" + trimmed + "`";
        }
        // Content containing a backtick needs a double-backtick delimiter, padded
        // with spaces when the backtick touches either end.
        const delimiter = "``";
        if (trimmed.startsWith("`") || trimmed.endsWith("`")) {
          return `${delimiter} ${trimmed} ${delimiter}`;
        }
        return delimiter + trimmed + delimiter;
      }
    });
  }

  // --- HTML Preprocessing ---
  /**
   * Cleans the HTML, removes noise and high-link-density elements, selects the
   * main (article or forum) content and prefixes it with extracted metadata.
   * @param html Raw HTML string.
   * @returns Metadata lines followed by the cleaned content HTML; on any failure,
   *   the result of cleanupHtml on the input.
   */
  preprocessHTML(html) {
    try {
      html = this.cleanupHtml(html);
      const root = (0, import_node_html_parser.parse)(html, {
        comment: false,
        blockTextElements: { script: true, style: true, noscript: true }
      });
      if (root.nodeType === 3) {
        return root.textContent ?? "";
      }
      if (root.nodeType !== 1) {
        console.warn("Unexpected root node type after parsing:", root.nodeType);
        return root.toString();
      }

      for (const selector of PREPROCESSING_REMOVE_SELECTORS) {
        try {
          root.querySelectorAll(selector).forEach((el) => el.remove());
        } catch (e) {
          console.warn(`Skipping invalid selector during preprocessing: ${selector}`, e);
        }
      }
      this.removeHighLinkDensityElements(root, DEFAULT_LINK_DENSITY_THRESHOLD);

      const metadata = this.extractDocumentMetadata(root);
      const contentElement = this.detectForumPage(root)
        ? this.extractForumContentElement(root)
        : this.extractArticleContentElement(root);

      const rawContent =
        contentElement instanceof import_node_html_parser.HTMLElement
          ? contentElement.outerHTML
          : contentElement.textContent;
      const contentHtml = this.cleanupContentHtml(rawContent || "");
      // NOTE(review): the metadata lines are Markdown prepended to HTML that is
      // then fed to Turndown; confirm they survive its text escaping as intended.
      const metadataString = metadata.length > 0 ? metadata.join("\n\n") + "\n\n---\n\n" : "";
      return metadataString + contentHtml;
    } catch (error) {
      console.error("HTML preprocessing failed:", error);
      return this.cleanupHtml(html);
    }
  }

  /**
   * Strips a known junk marker, `{{ placeholder }}` template tokens and control
   * characters (tab, LF and CR are kept) from raw HTML.
   */
  cleanupHtml(html) {
    return html
      .replace(/AMIL:\[=-,amilft[^\s]*/g, "")
      .replace(/\{\{\s*[^}\s]+\s*}}/g, "")
      .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
  }

  /**
   * Removes framework/test attributes (data-react*, data-testid, ng-*,
   * _ngcontent-*, ng-/mat- classes) and HTML comments from the content, then
   * collapses runs of spaces/tabs and whitespace around newlines.
   */
  cleanupContentHtml(content) {
    return content
      .replace(/\s*data-(?:reactid|reactroot|react-|testid|v-|js-|qa-|cy-)[^=\s]*\s*=\s*(?:"[^"]*"|'[^']*'|\S+)/g, "")
      .replace(/\s*ng-[^=\s]*\s*=\s*(?:"[^"]*"|'[^']*'|\S+)/g, "")
      .replace(/\s*_ngcontent-[^\s]*\s*=""/g, "")
      .replace(/\s*class\s*=\s*"(ng-|mat-)[^"]*"/g, "")
      .replace(/<!--[\s\S]*?-->/g, "")
      .replace(/([ \t])+/g, " ")
      .replace(/\s*\n\s*/g, "\n")
      .trim();
  }

  /**
   * Removes container elements under `element` whose link density exceeds
   * `threshold`, unless they are, or contain, main content.
   * @param element Root element to scan; matching descendants are removed in place.
   * @param threshold Link-text / total-text ratio above which an element is removed.
   */
  removeHighLinkDensityElements(element, threshold) {
    const potentialBoilerplate = element.querySelectorAll(
      "div, nav, ul, aside, section, .sidebar, .widget, .menu, [role='navigation'], [role='menubar']"
    );
    for (const el of Array.from(potentialBoilerplate)) {
      if (!(el instanceof import_node_html_parser.HTMLElement)) continue;
      // Same length, link-count and ratio checks as the shared helper.
      if (!this.hasHighLinkDensity(el, threshold)) continue;

      const containsMainContent =
        el.querySelector('main, article, [role="main"], [role="article"]') !== null;
      const isMainContent = MAIN_CONTENT_SELECTORS.some((selector) => {
        try {
          return el.matches(selector);
        } catch {
          // Unsupported selector: treat as no match.
          return false;
        }
      });
      if (!containsMainContent && !isMainContent) {
        el.remove();
      }
    }
  }

  /**
   * Collects title, description, author, publish date, URL and JSON-LD from the
   * document. For each key the first non-empty source wins.
   * @param root Parsed document root.
   * @returns Markdown lines; the title (if any) is first as a `# ` heading.
   */
  extractDocumentMetadata(root) {
    const metadata = [];
    const addedMeta = /* @__PURE__ */ new Set();
    const addMeta = (key, value, isTitle = false) => {
      const cleanedValue = value?.trim();
      const keyId = key.toLowerCase();
      if (!cleanedValue || addedMeta.has(keyId)) return;
      if (isTitle) {
        metadata.unshift(`# ${cleanedValue}`);
      } else {
        metadata.push(`**${key}:** ${cleanedValue}`);
      }
      addedMeta.add(keyId);
    };

    // [key, selector, attribute]; a null attribute means "use textContent".
    const sources = [
      ["Title", "meta[property='og:title']", "content"],
      ["Title", "meta[name='twitter:title']", "content"],
      ["Title", "meta[name='DC.title']", "content"],
      ["Title", "title", null],
      ["Description", "meta[property='og:description']", "content"],
      ["Description", "meta[name='twitter:description']", "content"],
      ["Description", "meta[name='description']", "content"],
      ["Description", "meta[name='DC.description']", "content"],
      ["Author", "meta[name='author']", "content"],
      ["Author", "meta[property='article:author']", "content"],
      ["Author", "[rel='author']", null],
      ["Published", "meta[property='article:published_time']", "content"],
      ["Published", "meta[name='publish-date']", "content"],
      ["Published", "time[itemprop='datePublished']", "datetime"],
      ["Published", "time", "datetime"],
      ["URL", "link[rel='canonical']", "href"],
      ["URL", "meta[property='og:url']", "content"]
    ];
    for (const [key, selector, attribute] of sources) {
      const found = root.querySelector(selector);
      const value = attribute === null ? found?.textContent : found?.getAttribute(attribute);
      addMeta(key, value, key === "Title");
    }

    const jsonLdData = Array.from(root.querySelectorAll("script[type='application/ld+json']"))
      .map((script) => {
        try {
          const textContent = script.textContent;
          return textContent ? JSON.parse(textContent) : null;
        } catch (e) {
          // Malformed JSON-LD is skipped rather than failing the conversion.
          return null;
        }
      })
      .filter((item) => item !== null);
    if (jsonLdData.length > 0) {
      metadata.push("<details><summary>JSON-LD Metadata</summary>\n");
      metadata.push("```json", JSON.stringify(jsonLdData, null, 2), "```");
      metadata.push("</details>");
    }
    return metadata;
  }

  /**
   * Heuristically decides whether the document is a forum/discussion page,
   * from counts of comment/thread/vote elements and the canonical URL's hostname.
   * @param root Parsed document root.
   * @returns true when any forum indicator is met.
   */
  detectForumPage(root) {
    const countMatches = (selectors) =>
      selectors.reduce((count, selector) => {
        try {
          return root ? count + root.querySelectorAll(selector).length : count;
        } catch {
          // A selector the parser rejects contributes nothing.
          return count;
        }
      }, 0);

    const commentCount = countMatches(FORUM_COMMENT_SELECTORS);
    const threadCount = countMatches(FORUM_THREAD_SELECTORS);
    const voteCount = countMatches(FORUM_VOTE_SELECTORS);

    let isKnownForumHost = false;
    try {
      const canonicalUrl =
        root.querySelector('link[rel="canonical"]')?.getAttribute("href") ||
        root.querySelector('meta[property="og:url"]')?.getAttribute("content");
      if (canonicalUrl) {
        // The base only matters for relative URLs; absolute ones keep their own host.
        const hostname = new URL(canonicalUrl, "http://example.com").hostname.toLowerCase();
        const forumHostHints = ["reddit.com", "news.ycombinator.com", "forum", "discuss", "community"];
        isKnownForumHost = forumHostHints.some((hint) => hostname.includes(hint));
      }
    } catch (e) {
      console.warn("Could not parse URL for forum detection:", e);
    }

    return (
      commentCount >= MIN_FORUM_INDICATOR_COUNT ||
      threadCount > 1 || // More than one thread item is stronger indicator
      voteCount >= MIN_FORUM_INDICATOR_COUNT ||
      isKnownForumHost
    );
  }

  // Tries to find the main content element for an article-like page
  /**
   * Scores every element matched by MAIN_CONTENT_SELECTORS (text length, adjusted
   * by tag, role, boilerplate class, link density and paragraph count) and
   * returns the best one.
   * @param root Parsed document root.
   * @returns The highest-scoring element, or `root` when nothing qualifies.
   */
  extractArticleContentElement(root) {
    let bestCandidate = null;
    let maxScore = -1;
    for (const selector of MAIN_CONTENT_SELECTORS) {
      try {
        for (const element of Array.from(root.querySelectorAll(selector))) {
          if (!(element instanceof import_node_html_parser.HTMLElement)) continue;
          const textLength = (element.textContent || "").trim().length;
          // Tiny elements only count when they carry media.
          if (textLength < 100 && !element.querySelector("img, video, iframe, figure")) continue;

          let score = textLength;
          if (["ARTICLE", "MAIN"].includes(element.tagName)) score *= 1.5;
          if (["main", "article"].includes(element.getAttribute("role") || "")) score *= 1.5;
          if (["HEADER", "FOOTER", "NAV", "ASIDE"].includes(element.tagName)) score *= 0.3;
          try {
            const looksLikeChrome = element.matches(
              '.sidebar, .widget, .menu, .nav, .header, .footer, [role="navigation"], [role="complementary"], [role="banner"]'
            );
            if (looksLikeChrome) score *= 0.2;
          } catch {
            // matches() failing leaves the score unpenalized.
          }
          if (this.hasHighLinkDensity(element, 0.6)) {
            score *= 0.5;
          }
          if (element.querySelectorAll("p").length > 2) score *= 1.2;
          // <body> is only a fallback once a reasonable candidate exists.
          if (element.tagName === "BODY" && maxScore > 200) continue;
          if (score > maxScore) {
            maxScore = score;
            bestCandidate = element;
          }
        }
      } catch (e) {
        // Best effort: a selector that fails to evaluate is skipped.
      }
    }
    return bestCandidate || root;
  }

  // Tries to find the main content element(s) for a forum-like page
  /**
   * Builds a container holding clones of the main post and the comments container.
   * If neither is found, falls back to a cleaned clone of <body>, then to `root`.
   * @param root Parsed document root (not modified; only clones are edited).
   * @returns Element holding the forum content.
   */
  extractForumContentElement(root) {
    const { HTMLElement, parse } = import_node_html_parser;
    const tempContainer = parse("<div></div>").firstChild;
    const findFirst = (selectors) =>
      selectors.map((s) => root.querySelector(s)).find((el) => el instanceof HTMLElement);
    const stripNonContent = (target) => {
      for (const selector of FORUM_OBVIOUS_NON_CONTENT_SELECTORS) {
        try {
          target.querySelectorAll(selector).forEach((el) => el.remove());
        } catch {
          // Best effort: skip selectors the parser cannot evaluate.
        }
      }
    };

    try {
      const mainPost = findFirst(FORUM_MAIN_POST_SELECTORS);
      if (mainPost) {
        tempContainer.appendChild(mainPost.clone());
      }
    } catch (e) {
      console.warn("Error finding forum main post:", e);
    }

    try {
      const commentsContainer = findFirst(FORUM_COMMENTS_CONTAINER_SELECTORS);
      if (commentsContainer) {
        const clonedComments = commentsContainer.clone();
        if (clonedComments instanceof HTMLElement) {
          stripNonContent(clonedComments);
          tempContainer.appendChild(clonedComments);
        }
      }
    } catch (e) {
      console.warn("Error finding forum comments container:", e);
    }

    if (tempContainer.childNodes.length > 0) {
      return tempContainer;
    }

    const body = root.querySelector("body");
    if (body) {
      const clonedBody = body.clone();
      if (clonedBody instanceof HTMLElement) {
        stripNonContent(clonedBody);
        this.removeHighLinkDensityElements(clonedBody, DEFAULT_LINK_DENSITY_THRESHOLD);
        return clonedBody;
      }
    }
    return root;
  }

  // Helper function to check link density within an element
  /**
   * @param element Element to inspect.
   * @param threshold Link-text / total-text ratio to exceed.
   * @returns true when the element has at least MIN_LINK_DENSITY_TEXT_LENGTH
   *   characters of text, at least three links, and a ratio above `threshold`.
   */
  hasHighLinkDensity(element, threshold) {
    const textLength = (element.textContent || "").length;
    // Also guarantees a non-zero divisor below.
    if (textLength < MIN_LINK_DENSITY_TEXT_LENGTH) return false;
    const links = element.querySelectorAll("a");
    if (links.length < 3) return false;

    let linkTextLength = 0;
    links.forEach((link) => {
      // NOTE(review): closest("a") presumably returns the link itself, making
      // this check a no-op; kept as written pending confirmation.
      if (link.closest("a") === link) {
        linkTextLength += link.textContent?.length || 0;
      }
    });
    return linkTextLength / textLength > threshold;
  }

  // --- Markdown Postprocessing ---
  /**
   * Normalizes spacing around headings, lists, code fences and rules, removes
   * empty links/images, upgrades protocol-relative URLs, de-duplicates repeated
   * long lines and optionally truncates the result.
   * @param markdown Markdown produced by Turndown.
   * @param options Reads `maxContentLength`; when set and exceeded, the text is
   *   cut (preferably after a sentence end) and "... (truncated)" is appended.
   * @returns The cleaned, trimmed Markdown.
   */
  postprocessMarkdown(markdown, options) {
    let processed = markdown;
    // Blank lines around headings.
    processed = processed.replace(/^(\s*\n)?(#{1,6}\s.*)$/gm, "\n\n$2\n\n");
    // Blank line before list items and blockquote lines...
    processed = processed.replace(/^(\s*\n)?(([\*\-+>]|\d+\.)\s)/gm, (_match, _p1, p2) => "\n\n" + p2);
    // ...then pull consecutive list items back together.
    processed = processed.replace(
      /(\n([\*\-+]|\d+\.)\s(?:(?!\n\n|\n {2,}|\n\t)[\s\S])*?)\n(?=([\*\-+]|\d+\.)\s)/g,
      "$1"
    );
    // Remove empty links and empty images in one pass, so that an empty image
    // does not leave its leading "!" behind.
    processed = processed.replace(/!?\[\]\([^)]*\)/g, "");
    // Protocol-relative link/image targets become https.
    processed = processed.replace(/(!?\[[^\]]*\]\()(\/\/)/g, "$1https://");

    // Cap consecutive newlines.
    const maxNewlines = "\n".repeat(POSTPROCESSING_MAX_CONSECUTIVE_NEWLINES + 1);
    const newlineRegex = new RegExp(`${maxNewlines}+`, "g");
    processed = processed.replace(newlineRegex, "\n".repeat(POSTPROCESSING_MAX_CONSECUTIVE_NEWLINES));

    // Trim spaces/tabs at line edges.
    processed = processed.replace(/^[ \t]+|[ \t]+$/gm, "");
    // Blank lines around fenced code blocks.
    processed = processed.replace(/^(\s*\n)?(```(.*)\n[\s\S]*?\n```)(\s*\n)?/gm, "\n\n$2\n\n");
    // Collapse immediately repeated long lines.
    processed = processed.replace(/^(.{30,})$(\n\1)+/gm, "$1");
    // Blank line after a horizontal rule.
    processed = processed.replace(/(\n---\n)(\S)/g, "$1\n$2");

    const limit = options.maxContentLength;
    if (limit && processed.length > limit) {
      // Prefer cutting after a sentence end, unless that loses over half the budget.
      const truncatedPoint = processed.lastIndexOf(".", limit - 15);
      const sliceEnd = truncatedPoint > limit / 2 ? truncatedPoint + 1 : limit;
      processed = processed.slice(0, sliceEnd) + "... (truncated)";
    }
    return processed.trim();
  }
};
|
|
667
|
+
|
|
668
|
+
// src/errors.ts
|
|
669
|
+
/**
 * Base error type used by the fetch engines.
 * Carries an optional machine-readable code, the wrapped original error and an
 * HTTP status code alongside the usual message.
 */
var FetchError = class _FetchError extends Error {
  /** A specific error code (e.g., ERR_NAVIGATION_TIMEOUT, ERR_HTTP_ERROR). */
  code;
  /** The original error object, if available. */
  originalError;
  /** HTTP status code, if relevant. */
  statusCode;
  /**
   * Creates an instance of FetchError.
   * @param message The error message.
   * @param code Optional error code string.
   * @param originalError Optional original error; also exposed as the standard `cause`.
   * @param statusCode Optional HTTP status code.
   */
  constructor(message, code, originalError, statusCode) {
    // Link the wrapped error through the standard `cause` option so generic
    // tooling can follow the chain; omit it when nothing was wrapped so that
    // no `cause` property is created at all.
    super(message, originalError === void 0 ? void 0 : { cause: originalError });
    this.name = "FetchError";
    this.code = code;
    this.originalError = originalError;
    this.statusCode = statusCode;
    // V8-only: keep the constructor frame out of the captured stack.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, _FetchError);
    }
  }
};
|
|
694
|
+
|
|
695
|
+
// src/FetchEngine.ts
|
|
696
|
+
/**
 * Error raised for non-OK HTTP responses; a FetchError whose code is always
 * "ERR_HTTP_ERROR" and whose `statusCode` holds the response status.
 */
var FetchEngineHttpError = class extends FetchError {
  /**
   * @param message The error message.
   * @param statusCode HTTP status code of the failed response.
   */
  constructor(message, statusCode) {
    super(message, "ERR_HTTP_ERROR", undefined, statusCode);
    Object.assign(this, { statusCode, name: "FetchEngineHttpError" });
  }
};
|
|
703
|
+
/**
 * Lightweight engine that retrieves pages with the global `fetch` API and can
 * optionally convert the HTML to Markdown.
 */
var FetchEngine = class _FetchEngine {
  /** Engine-level options: DEFAULT_OPTIONS overlaid with the constructor argument. */
  options;
  static DEFAULT_OPTIONS = {
    markdown: false
  };
  /**
   * Creates an instance of FetchEngine.
   * @param options Configuration options for the FetchEngine.
   */
  constructor(options = {}) {
    this.options = { ..._FetchEngine.DEFAULT_OPTIONS, ...options };
  }
  /**
   * Fetches HTML or converts to Markdown from the specified URL.
   *
   * @param url The URL to fetch.
   * @param options Optional per-request overrides merged over the engine options
   *   (only `markdown` is read here).
   * @returns A Promise resolving to an HTMLFetchResult object.
   * @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
   * @throws {FetchError} With code ERR_NON_HTML_CONTENT if the content type is not HTML,
   *   or ERR_FETCH_FAILED for any other failure (the original error is attached).
   */
  async fetchHTML(url, options) {
    const effectiveOptions = { ...this.options, ...options };
    try {
      const response = await fetch(url, {
        redirect: "follow",
        headers: {
          // Standard browser-like headers
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
          Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
          "Accept-Language": "en-US,en;q=0.9"
        }
      });
      if (!response.ok) {
        throw new FetchEngineHttpError(`HTTP error! status: ${response.status}`, response.status);
      }
      const contentTypeHeader = response.headers.get("content-type");
      // Media types are case-insensitive, so compare on a lower-cased copy
      // ("Text/HTML; charset=UTF-8" is still HTML).
      if (!contentTypeHeader || !contentTypeHeader.toLowerCase().includes("text/html")) {
        throw new FetchError("Content-Type is not text/html", "ERR_NON_HTML_CONTENT");
      }
      const html = await response.text();
      const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
      const title = titleMatch ? titleMatch[1].trim() : null;
      let finalContent = html;
      let finalContentType = "html";
      if (effectiveOptions.markdown) {
        try {
          const converter = new MarkdownConverter();
          finalContent = converter.convert(html);
          finalContentType = "markdown";
        } catch (conversionError) {
          // Best effort: fall back to returning the raw HTML.
          console.error(`Markdown conversion failed for ${url} (FetchEngine):`, conversionError);
        }
      }
      return {
        content: finalContent,
        contentType: finalContentType,
        title,
        // Use the final URL after redirects
        url: response.url,
        isFromCache: false,
        statusCode: response.status,
        error: void 0
      };
    } catch (error) {
      // Errors raised deliberately above pass through untouched.
      if (error instanceof FetchEngineHttpError || error instanceof FetchError && error.code === "ERR_NON_HTML_CONTENT") {
        throw error;
      }
      const message = error instanceof Error ? error.message : "Unknown fetch error";
      throw new FetchError(`Fetch failed: ${message}`, "ERR_FETCH_FAILED", error instanceof Error ? error : void 0);
    }
  }
  /**
   * Cleans up resources used by the engine.
   * For FetchEngine, this is a no-op as it doesn't manage persistent resources.
   * @returns A Promise that resolves when cleanup is complete.
   */
  async cleanup() {
    return Promise.resolve();
  }
  /**
   * Retrieves metrics for the engine.
   * FetchEngine does not manage browsers, so it returns an empty array.
   * @returns An empty array.
   */
  getMetrics() {
    return [];
  }
};
|
|
792
|
+
|
|
793
|
+
// src/browser/PlaywrightBrowserPool.ts
|
|
794
|
+
var import_playwright = require("playwright");
|
|
795
|
+
var import_user_agents = __toESM(require("user-agents"), 1);
|
|
796
|
+
var import_uuid = require("uuid");
|
|
797
|
+
var import_p_queue = __toESM(require("p-queue"), 1);
|
|
798
|
+
var import_playwright_extra = require("playwright-extra");
|
|
799
|
+
var chromiumWithExtras;
|
|
800
|
+
var StealthPluginInstance;
|
|
801
|
+
/**
 * Lazily prepares the shared Chromium launcher (`chromiumWithExtras`) with the
 * stealth plugin applied. Subsequent calls are no-ops once it is ready.
 *
 * The launcher is only published to the module-level variable after the
 * plugin has been loaded and registered, so a failed import or a concurrent
 * caller can never observe a launcher without stealth applied.
 *
 * @throws {Error} If the stealth plugin module does not export a factory function.
 */
async function loadDependencies() {
  if (chromiumWithExtras) {
    return;
  }
  const launcher = (0, import_playwright_extra.addExtra)(import_playwright.chromium);
  const StealthPluginModule = await import("puppeteer-extra-plugin-stealth");
  // Handle both ESM-style ({ default: fn }) and CJS-style (fn) module shapes.
  const stealthPluginFactory = typeof StealthPluginModule.default === "function" ? StealthPluginModule.default : StealthPluginModule;
  if (typeof stealthPluginFactory !== "function") {
    throw new Error("puppeteer-extra-plugin-stealth export is not a function or module structure is unexpected.");
  }
  const stealthPlugin = stealthPluginFactory();
  launcher.use(stealthPlugin);
  // Another caller may have finished while this one awaited the import; keep the first result.
  if (!chromiumWithExtras) {
    StealthPluginInstance = stealthPlugin;
    chromiumWithExtras = launcher;
  }
}
|
|
813
|
+
/**
 * Pool of Playwright browser instances. Each instance owns one browser, one
 * context with request blocking installed, the set of pages handed out from
 * it, and a metrics record. Page acquisition is serialized through a queue
 * with concurrency 1, and a recurring health check retires and replaces
 * instances.
 */
var PlaywrightBrowserPool = class _PlaywrightBrowserPool {
  /** Live instances: { id, browser, context, pages, metrics, isHealthy, disconnectedHandler }. */
  pool = new Set();
  maxBrowsers;
  maxPagesPerContext;
  maxBrowserAge;
  healthCheckInterval;
  healthCheckTimer = null;
  maxIdleTime;
  isCleaningUp = false;
  useHeadedMode;
  blockedDomains;
  blockedResourceTypes;
  proxyConfig;
  static DEFAULT_BLOCKED_DOMAINS = [
    "doubleclick.net",
    "google-analytics.com",
    "googletagmanager.com",
    "googlesyndication.com",
    "googleadservices.com",
    "adservice.google.com",
    "facebook.net",
    "fbcdn.net",
    "connect.facebook.net",
    "ads-twitter.com",
    "platform.twitter.com",
    "analytics.tiktok.com",
    "ads.tiktok.com",
    "amazon-adsystem.com",
    "adnxs.com",
    "criteo.com",
    "scorecardresearch.com",
    "quantserve.com",
    "rubiconproject.com",
    "pubmatic.com",
    "taboola.com",
    "outbrain.com"
  ];
  static DEFAULT_BLOCKED_RESOURCE_TYPES = ["image", "font", "media", "websocket"];
  /** Serializes acquirePage() so instance selection/creation never interleaves. */
  acquireQueue = new import_p_queue.default({ concurrency: 1 });
  /**
   * @param config Optional pool settings. Missing values fall back to:
   *   maxBrowsers 2, maxPagesPerContext 6, maxBrowserAge 20 min,
   *   healthCheckInterval 60 s, useHeadedMode false, maxIdleTime 5 min.
   *   Empty or missing blockedDomains / blockedResourceTypes use the static defaults.
   */
  constructor(config = {}) {
    this.maxBrowsers = config.maxBrowsers ?? 2;
    this.maxPagesPerContext = config.maxPagesPerContext ?? 6;
    this.maxBrowserAge = config.maxBrowserAge ?? 20 * 60 * 1e3;
    this.healthCheckInterval = config.healthCheckInterval ?? 60 * 1e3;
    this.useHeadedMode = config.useHeadedMode ?? false;
    this.maxIdleTime = config.maxIdleTime ?? 5 * 60 * 1e3;
    this.blockedDomains = config.blockedDomains && config.blockedDomains.length > 0 ? config.blockedDomains : _PlaywrightBrowserPool.DEFAULT_BLOCKED_DOMAINS;
    this.blockedResourceTypes = config.blockedResourceTypes && config.blockedResourceTypes.length > 0 ? config.blockedResourceTypes : _PlaywrightBrowserPool.DEFAULT_BLOCKED_RESOURCE_TYPES;
    this.proxyConfig = config.proxy;
  }
  /** Loads the launcher dependencies, fills the pool and starts the health-check timer. */
  async initialize() {
    await loadDependencies();
    if (this.isCleaningUp) return;
    await this.ensureMinimumInstances();
    this.scheduleHealthCheck();
  }
  /** (Re)arms the one-shot health-check timer; a non-positive interval disables it. */
  scheduleHealthCheck() {
    if (this.isCleaningUp) return;
    if (this.healthCheckTimer) {
      clearTimeout(this.healthCheckTimer);
    }
    if (this.healthCheckInterval > 0) {
      this.healthCheckTimer = setTimeout(() => {
        // healthCheck reschedules itself; failures are deliberately ignored here.
        this.healthCheck().catch((_err) => {
        });
      }, this.healthCheckInterval);
    }
  }
  /**
   * Creates instances until the pool holds `maxBrowsers`. Stops at the first
   * creation failure, and stops early if cleanup starts while it is running.
   */
  async ensureMinimumInstances() {
    while (!this.isCleaningUp && this.pool.size < this.maxBrowsers) {
      try {
        await this.createBrowserInstance();
      } catch (_error) {
        break;
      }
    }
  }
  /**
   * Launches a browser, creates its context with request blocking, registers
   * it in the pool and returns the new instance record.
   * @throws Whatever launch / context setup throws; the browser is closed first
   *   if the failure happens after launch.
   */
  async createBrowserInstance() {
    await loadDependencies();
    const id = (0, import_uuid.v4)();
    const launchOptions = {
      headless: !this.useHeadedMode,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-accelerated-2d-canvas",
        "--no-first-run",
        "--no-zygote",
        "--disable-gpu",
        "--mute-audio",
        "--disable-background-networking"
      ],
      proxy: this.proxyConfig
    };
    const browser = await chromiumWithExtras.launch(launchOptions);
    let context;
    try {
      context = await browser.newContext({
        userAgent: new import_user_agents.default().toString(),
        // Slightly randomized viewport size.
        viewport: {
          width: 1280 + Math.floor(Math.random() * 120),
          height: 720 + Math.floor(Math.random() * 80)
        },
        javaScriptEnabled: true,
        ignoreHTTPSErrors: true
      });
      await context.route("**/*", async (route) => {
        const request = route.request();
        const url = request.url();
        const resourceType = request.resourceType();
        try {
          const hostname = new URL(url).hostname.toLowerCase();
          if (this.blockedDomains.some((domain) => hostname.includes(domain)) || this.blockedResourceTypes.includes(resourceType)) {
            await route.abort("aborted");
          } else {
            await route.continue();
          }
        } catch (_e) {
          // Fall back to letting the request through. The route may already be
          // handled or the page gone, in which case this rejects too; swallow
          // it so the handler never produces an unhandled rejection.
          try {
            await route.continue();
          } catch (_ignored) {
          }
        }
      });
    } catch (error) {
      // Context setup failed after launch: close the browser so it is not
      // left running outside the pool.
      try {
        await browser.close();
      } catch (_closeError) {
      }
      throw error;
    }
    const now = new Date();
    const metrics = {
      id,
      pagesCreated: 0,
      activePages: 0,
      lastUsed: now,
      errors: 0,
      createdAt: now,
      isHealthy: true
    };
    const instance = {
      id,
      browser,
      context,
      pages: new Set(),
      metrics,
      isHealthy: true,
      disconnectedHandler: () => {
      }
    };
    // Stored on the instance so closeAndRemoveInstance can detach it later.
    instance.disconnectedHandler = () => {
      if (instance.isHealthy) {
        instance.isHealthy = false;
        instance.metrics.isHealthy = false;
        this.healthCheck().catch((_err) => {
        });
      }
    };
    browser.on("disconnected", instance.disconnectedHandler);
    this.pool.add(instance);
    return instance;
  }
  /**
   * Returns the healthy instance with spare page capacity that currently has
   * the fewest open pages, or null when none qualifies.
   */
  findLeastLoadedInstance() {
    let best = null;
    for (const instance of this.pool) {
      if (!instance.isHealthy || instance.pages.size >= this.maxPagesPerContext) {
        continue;
      }
      if (!best || instance.pages.size < best.pages.size) {
        best = instance;
      }
    }
    return best;
  }
  /**
   * Queues a request for a new page on the least-loaded healthy instance,
   * creating an instance when none is usable and the pool has room.
   * @returns Promise (from the acquire queue) resolving to the new page.
   * @throws {Error} If the pool is shutting down, no instance can be found or
   *   created, or page creation fails.
   */
  acquirePage() {
    return this.acquireQueue.add(async () => {
      if (this.isCleaningUp) {
        throw new Error("Pool is shutting down.");
      }
      let bestInstance = this.findLeastLoadedInstance();
      if (!bestInstance && this.pool.size < this.maxBrowsers) {
        try {
          bestInstance = await this.createBrowserInstance();
        } catch (error) {
          throw new Error(`Failed to create new browser instance for acquisition: ${error.message}`, { cause: error });
        }
      }
      if (!bestInstance) {
        await this.ensureMinimumInstances();
        bestInstance = this.findLeastLoadedInstance();
        if (!bestInstance) {
          throw new Error("Failed to acquire Playwright page: No available or creatable browser instance.");
        }
      }
      const owner = bestInstance;
      try {
        const page = await owner.context.newPage();
        owner.pages.add(page);
        owner.metrics.pagesCreated++;
        owner.metrics.activePages = owner.pages.size;
        owner.metrics.lastUsed = new Date();
        page.on("close", () => {
          owner.pages.delete(page);
          owner.metrics.activePages = owner.pages.size;
          owner.metrics.lastUsed = new Date();
        });
        page.on("crash", () => {
          // A crashed page marks the whole instance unhealthy and triggers a health check.
          owner.metrics.errors++;
          owner.pages.delete(page);
          owner.isHealthy = false;
          owner.metrics.isHealthy = false;
          this.healthCheck().catch((_err) => {
          });
        });
        return page;
      } catch (error) {
        owner.metrics.errors++;
        owner.isHealthy = false;
        owner.metrics.isHealthy = false;
        this.healthCheck().catch((_err) => {
        });
        throw new Error(`Failed to create new page: ${error.message}`, { cause: error });
      }
    });
  }
  /**
   * Retires instances that are marked unhealthy, disconnected, older than
   * `maxBrowserAge`, or idle longer than `maxIdleTime` (idle removal only when
   * the pool holds more than one instance), then refills the pool and
   * reschedules itself.
   */
  async healthCheck() {
    if (this.isCleaningUp) return;
    const now = new Date();
    const checks = [];
    for (const instance of this.pool) {
      checks.push(
        (async () => {
          if (!instance.isHealthy) {
            // Instances flagged by the disconnect/crash/error handlers must be
            // removed here; skipping them would leave dead entries occupying
            // pool slots and block the creation of replacements.
            await this.closeAndRemoveInstance(instance, "marked unhealthy");
            return;
          }
          let shouldRemove = false;
          let reason = "unknown";
          if (!instance.browser.isConnected()) {
            shouldRemove = true;
            reason = "browser disconnected";
          }
          if (!shouldRemove && this.maxBrowserAge > 0 && now.getTime() - instance.metrics.createdAt.getTime() > this.maxBrowserAge) {
            shouldRemove = true;
            reason = "max age reached";
          }
          if (!shouldRemove && this.pool.size > 1 && // Only remove idle if pool has more than 1
          instance.pages.size === 0 && this.maxIdleTime > 0 && now.getTime() - instance.metrics.lastUsed.getTime() > this.maxIdleTime) {
            shouldRemove = true;
            reason = "idle timeout";
          }
          if (shouldRemove) {
            instance.isHealthy = false;
            instance.metrics.isHealthy = false;
            await this.closeAndRemoveInstance(instance, reason);
          } else {
            instance.isHealthy = true;
            instance.metrics.isHealthy = true;
          }
        })().catch((_err) => {
        })
      );
    }
    try {
      await Promise.allSettled(checks);
    } finally {
      await this.ensureMinimumInstances();
      this.scheduleHealthCheck();
    }
  }
  /**
   * Removes the instance from the pool, detaches its disconnect listener and
   * closes its context and browser; close errors are ignored. Does nothing if
   * the instance is no longer in the pool.
   * @param instance Instance record to retire.
   * @param _reason Human-readable reason (currently unused).
   */
  async closeAndRemoveInstance(instance, _reason) {
    const removed = this.pool.delete(instance);
    if (!removed) return;
    instance.browser.off("disconnected", instance.disconnectedHandler);
    try {
      await instance.context.close();
    } catch (_error) {
    }
    try {
      await instance.browser.close();
    } catch (_error) {
    }
  }
  /**
   * Closes a page obtained from acquirePage() and updates the owning
   * instance's bookkeeping. A failing close marks the owner unhealthy.
   * @param page Page to release; ignored when missing or already closed.
   */
  async releasePage(page) {
    if (!page || page.isClosed()) return;
    let ownerInstance;
    for (const instance of this.pool) {
      if (instance.pages.has(page)) {
        ownerInstance = instance;
        break;
      }
    }
    try {
      await page.close();
      if (ownerInstance) {
        ownerInstance.pages.delete(page);
        ownerInstance.metrics.activePages = ownerInstance.pages.size;
        ownerInstance.metrics.lastUsed = new Date();
      }
    } catch (_error) {
      if (ownerInstance) {
        ownerInstance.isHealthy = false;
        ownerInstance.metrics.isHealthy = false;
        ownerInstance.metrics.errors++;
        ownerInstance.pages.delete(page);
        ownerInstance.metrics.activePages = ownerInstance.pages.size;
      }
    }
  }
  /**
   * Stops the health-check timer, drains the acquire queue and closes every
   * instance. The `isCleaningUp` flag is always reset afterwards, even if a
   * step throws, so the pool is not left permanently "shutting down".
   */
  async cleanup() {
    if (this.isCleaningUp) return;
    this.isCleaningUp = true;
    try {
      if (this.healthCheckTimer) {
        clearTimeout(this.healthCheckTimer);
        this.healthCheckTimer = null;
      }
      this.acquireQueue.clear();
      await this.acquireQueue.onIdle();
      const closePromises = [...this.pool].map((instance) => this.closeAndRemoveInstance(instance, "cleanup"));
      this.pool.clear();
      await Promise.allSettled(closePromises);
    } finally {
      this.isCleaningUp = false;
    }
  }
  /** Returns a snapshot of each instance's metrics with live page count and health flag. */
  getMetrics() {
    return [...this.pool].map((instance) => ({
      ...instance.metrics,
      activePages: instance.pages.size,
      isHealthy: instance.isHealthy
    }));
  }
};
|
|
1134
|
+
|
|
1135
|
+
// src/PlaywrightEngine.ts
|
|
1136
|
+
var import_p_queue2 = __toESM(require("p-queue"), 1);
|
|
1137
|
+
var import_axios = __toESM(require("axios"), 1);
|
|
1138
|
+
/**
 * Returns a promise that resolves (with no value) after `time` milliseconds.
 */
function delay(time) {
  return new Promise((done) => {
    setTimeout(done, time);
  });
}
|
|
1141
|
+
var PlaywrightEngine = class _PlaywrightEngine {
|
|
1142
|
+
browserPool = null;
|
|
1143
|
+
queue;
|
|
1144
|
+
cache = /* @__PURE__ */ new Map();
|
|
1145
|
+
config;
|
|
1146
|
+
// Browser pooling safety flags
|
|
1147
|
+
initializingBrowserPool = false;
|
|
1148
|
+
isUsingHeadedMode = false;
|
|
1149
|
+
// Tracks current pool mode
|
|
1150
|
+
headedFallbackSites = /* @__PURE__ */ new Set();
|
|
1151
|
+
// Stores domains marked for headed mode
|
|
1152
|
+
// Default configuration - Ensure all required fields are present
|
|
1153
|
+
static DEFAULT_CONFIG = {
|
|
1154
|
+
concurrentPages: 3,
|
|
1155
|
+
maxRetries: 3,
|
|
1156
|
+
retryDelay: 5e3,
|
|
1157
|
+
cacheTTL: 15 * 60 * 1e3,
|
|
1158
|
+
useHttpFallback: true,
|
|
1159
|
+
useHeadedModeFallback: false,
|
|
1160
|
+
defaultFastMode: true,
|
|
1161
|
+
simulateHumanBehavior: true,
|
|
1162
|
+
maxBrowsers: 2,
|
|
1163
|
+
maxPagesPerContext: 6,
|
|
1164
|
+
maxBrowserAge: 20 * 60 * 1e3,
|
|
1165
|
+
healthCheckInterval: 60 * 1e3,
|
|
1166
|
+
poolBlockedDomains: [],
|
|
1167
|
+
poolBlockedResourceTypes: [],
|
|
1168
|
+
proxy: void 0,
|
|
1169
|
+
useHeadedMode: false,
|
|
1170
|
+
// ADDED default
|
|
1171
|
+
markdown: true
|
|
1172
|
+
};
|
|
1173
|
+
/**
|
|
1174
|
+
* Creates an instance of PlaywrightEngine.
|
|
1175
|
+
*
|
|
1176
|
+
* @param config Configuration options for the engine and its browser pool.
|
|
1177
|
+
* See `PlaywrightEngineConfig` for details.
|
|
1178
|
+
*/
|
|
1179
|
+
constructor(config = {}) {
|
|
1180
|
+
this.config = { ..._PlaywrightEngine.DEFAULT_CONFIG, ...config };
|
|
1181
|
+
this.queue = new import_p_queue2.default({ concurrency: this.config.concurrentPages });
|
|
1182
|
+
}
|
|
1183
|
+
/**
|
|
1184
|
+
* Initialize the browser pool with improved error handling and mode switching.
|
|
1185
|
+
*/
|
|
1186
|
+
async initializeBrowserPool(useHeadedMode = false) {
|
|
1187
|
+
if (this.browserPool && this.isUsingHeadedMode === useHeadedMode) {
|
|
1188
|
+
return;
|
|
1189
|
+
}
|
|
1190
|
+
if (this.initializingBrowserPool) {
|
|
1191
|
+
while (this.initializingBrowserPool) {
|
|
1192
|
+
await delay(100);
|
|
1193
|
+
}
|
|
1194
|
+
if (this.browserPool && this.isUsingHeadedMode === useHeadedMode) {
|
|
1195
|
+
return;
|
|
1196
|
+
}
|
|
1197
|
+
}
|
|
1198
|
+
this.initializingBrowserPool = true;
|
|
1199
|
+
try {
|
|
1200
|
+
if (this.browserPool && this.isUsingHeadedMode !== useHeadedMode) {
|
|
1201
|
+
await this.browserPool.cleanup();
|
|
1202
|
+
this.browserPool = null;
|
|
1203
|
+
}
|
|
1204
|
+
this.isUsingHeadedMode = useHeadedMode;
|
|
1205
|
+
this.browserPool = new PlaywrightBrowserPool({
|
|
1206
|
+
maxBrowsers: this.config.maxBrowsers,
|
|
1207
|
+
maxPagesPerContext: this.config.maxPagesPerContext,
|
|
1208
|
+
maxBrowserAge: this.config.maxBrowserAge,
|
|
1209
|
+
healthCheckInterval: this.config.healthCheckInterval,
|
|
1210
|
+
useHeadedMode,
|
|
1211
|
+
blockedDomains: this.config.poolBlockedDomains,
|
|
1212
|
+
blockedResourceTypes: this.config.poolBlockedResourceTypes,
|
|
1213
|
+
proxy: this.config.proxy
|
|
1214
|
+
});
|
|
1215
|
+
await this.browserPool.initialize();
|
|
1216
|
+
} catch (error) {
|
|
1217
|
+
this.browserPool = null;
|
|
1218
|
+
this.isUsingHeadedMode = false;
|
|
1219
|
+
throw error;
|
|
1220
|
+
} finally {
|
|
1221
|
+
this.initializingBrowserPool = false;
|
|
1222
|
+
}
|
|
1223
|
+
}
|
|
1224
|
+
/**
|
|
1225
|
+
* Fallback method using simple HTTP requests via Axios.
|
|
1226
|
+
* Ensures return type matches HTMLFetchResult.
|
|
1227
|
+
*/
|
|
1228
|
+
async fetchHTMLWithHttpFallback(url) {
|
|
1229
|
+
try {
|
|
1230
|
+
const response = await import_axios.default.get(url, {
|
|
1231
|
+
headers: {
|
|
1232
|
+
// Use more standard browser-like headers
|
|
1233
|
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
|
|
1234
|
+
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
|
|
1235
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
1236
|
+
"Accept-Encoding": "gzip, deflate, br",
|
|
1237
|
+
// Allow compression
|
|
1238
|
+
Referer: "https://www.google.com/",
|
|
1239
|
+
// Common referer
|
|
1240
|
+
"Upgrade-Insecure-Requests": "1",
|
|
1241
|
+
"Sec-Ch-Ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
|
|
1242
|
+
"Sec-Ch-Ua-Mobile": "?0",
|
|
1243
|
+
"Sec-Ch-Ua-Platform": '"Windows"',
|
|
1244
|
+
"Sec-Fetch-Dest": "document",
|
|
1245
|
+
"Sec-Fetch-Mode": "navigate",
|
|
1246
|
+
"Sec-Fetch-Site": "cross-site",
|
|
1247
|
+
"Sec-Fetch-User": "?1",
|
|
1248
|
+
Connection: "keep-alive"
|
|
1249
|
+
// Keep connection open
|
|
1250
|
+
// Avoid Cache-Control/Pragma unless specifically needed
|
|
1251
|
+
},
|
|
1252
|
+
maxRedirects: 5,
|
|
1253
|
+
timeout: 3e4,
|
|
1254
|
+
responseType: "text",
|
|
1255
|
+
// Decompress response automatically
|
|
1256
|
+
decompress: true
|
|
1257
|
+
});
|
|
1258
|
+
const titleMatch = response.data.match(/<title[^>]*>([^<]+)<\/title>/i);
|
|
1259
|
+
let title = titleMatch ? titleMatch[1].trim() : "";
|
|
1260
|
+
if (!title && /<html>([^<]+)<\/html>/.test(response.data)) {
|
|
1261
|
+
title = response.data.replace(/<\/?html>/g, "").trim();
|
|
1262
|
+
}
|
|
1263
|
+
const lowerHtml = response.data.toLowerCase();
|
|
1264
|
+
const isChallengeOrBot = /cloudflare|checking your browser|please wait|verification|captcha|attention required/i.test(lowerHtml);
|
|
1265
|
+
if (isChallengeOrBot) {
|
|
1266
|
+
throw new FetchError("Received challenge page via HTTP fallback", "ERR_CHALLENGE_PAGE");
|
|
1267
|
+
}
|
|
1268
|
+
const originalHtml = response.data;
|
|
1269
|
+
let finalContent = originalHtml;
|
|
1270
|
+
let finalContentType = "html";
|
|
1271
|
+
if (this.config.markdown) {
|
|
1272
|
+
try {
|
|
1273
|
+
const converter = new MarkdownConverter();
|
|
1274
|
+
finalContent = converter.convert(originalHtml);
|
|
1275
|
+
finalContentType = "markdown";
|
|
1276
|
+
} catch (conversionError) {
|
|
1277
|
+
console.error(`Markdown conversion failed for ${url} (HTTP fallback):`, conversionError);
|
|
1278
|
+
}
|
|
1279
|
+
}
|
|
1280
|
+
return {
|
|
1281
|
+
content: finalContent,
|
|
1282
|
+
contentType: finalContentType,
|
|
1283
|
+
title,
|
|
1284
|
+
// title is extracted from original HTML
|
|
1285
|
+
url: response.request?.res?.responseUrl || response.config.url || url,
|
|
1286
|
+
isFromCache: false,
|
|
1287
|
+
statusCode: response.status,
|
|
1288
|
+
error: void 0
|
|
1289
|
+
};
|
|
1290
|
+
} catch (error) {
|
|
1291
|
+
if (!(error instanceof FetchError)) {
|
|
1292
|
+
throw new FetchError(`HTTP fallback failed: ${error.message}`, "ERR_HTTP_FALLBACK_FAILED", error);
|
|
1293
|
+
}
|
|
1294
|
+
throw error;
|
|
1295
|
+
}
|
|
1296
|
+
}
|
|
1297
|
+
checkCache(url) {
|
|
1298
|
+
const cached = this.cache.get(url);
|
|
1299
|
+
if (cached && Date.now() - cached.timestamp < this.config.cacheTTL) {
|
|
1300
|
+
return cached.result;
|
|
1301
|
+
}
|
|
1302
|
+
if (cached) {
|
|
1303
|
+
this.cache.delete(url);
|
|
1304
|
+
}
|
|
1305
|
+
return null;
|
|
1306
|
+
}
|
|
1307
|
+
/**
|
|
1308
|
+
* Safely check if a page is still usable and connected.
|
|
1309
|
+
*/
|
|
1310
|
+
async isPageValid(page) {
|
|
1311
|
+
if (!page || page.isClosed()) return false;
|
|
1312
|
+
try {
|
|
1313
|
+
if (!page.context().browser()?.isConnected()) return false;
|
|
1314
|
+
await page.evaluate("1 + 1", { timeout: 1e3 });
|
|
1315
|
+
return true;
|
|
1316
|
+
} catch (error) {
|
|
1317
|
+
return false;
|
|
1318
|
+
}
|
|
1319
|
+
}
|
|
1320
|
+
/**
|
|
1321
|
+
* Simulate human-like interactions on the page.
|
|
1322
|
+
*/
|
|
1323
|
+
async simulateHumanBehavior(page) {
|
|
1324
|
+
if (!await this.isPageValid(page)) return;
|
|
1325
|
+
try {
|
|
1326
|
+
const viewport = page.viewportSize();
|
|
1327
|
+
if (!viewport) return;
|
|
1328
|
+
await page.mouse.move(Math.random() * viewport.width, Math.random() * viewport.height / 3, { steps: 5 });
|
|
1329
|
+
await delay(150 + Math.random() * 200);
|
|
1330
|
+
await page.mouse.move(
|
|
1331
|
+
Math.random() * viewport.width,
|
|
1332
|
+
viewport.height / 2 + Math.random() * viewport.height / 2,
|
|
1333
|
+
{ steps: 10 }
|
|
1334
|
+
);
|
|
1335
|
+
await delay(200 + Math.random() * 300);
|
|
1336
|
+
await page.evaluate(() => {
|
|
1337
|
+
window.scrollBy({
|
|
1338
|
+
top: window.innerHeight * (0.3 + Math.random() * 0.4),
|
|
1339
|
+
behavior: "smooth"
|
|
1340
|
+
});
|
|
1341
|
+
});
|
|
1342
|
+
await delay(400 + Math.random() * 600);
|
|
1343
|
+
await page.evaluate(() => {
|
|
1344
|
+
window.scrollBy({
|
|
1345
|
+
top: window.innerHeight * (0.2 + Math.random() * 0.3),
|
|
1346
|
+
behavior: "smooth"
|
|
1347
|
+
});
|
|
1348
|
+
});
|
|
1349
|
+
await delay(300 + Math.random() * 400);
|
|
1350
|
+
} catch (_error) {
|
|
1351
|
+
}
|
|
1352
|
+
}
|
|
1353
|
+
/**
|
|
1354
|
+
* Adds a result to the in-memory cache.
|
|
1355
|
+
*/
|
|
1356
|
+
addToCache(url, result) {
|
|
1357
|
+
if (this.config.cacheTTL <= 0) return;
|
|
1358
|
+
const entry = {
|
|
1359
|
+
result: { ...result, isFromCache: true },
|
|
1360
|
+
// Mark as cached
|
|
1361
|
+
timestamp: Date.now()
|
|
1362
|
+
};
|
|
1363
|
+
this.cache.set(url, entry);
|
|
1364
|
+
}
|
|
1365
|
+
/**
|
|
1366
|
+
* Public method to fetch HTML. Delegates to the internal recursive fetch method.
|
|
1367
|
+
*
|
|
1368
|
+
* @param url The URL to fetch.
|
|
1369
|
+
* @param options Optional settings for this specific fetch operation.
|
|
1370
|
+
* @param options.fastMode Overrides the engine's `defaultFastMode` configuration for this request.
|
|
1371
|
+
* @returns A Promise resolving to an HTMLFetchResult object.
|
|
1372
|
+
* @throws {FetchError} If the fetch fails after all retries or encounters critical errors.
|
|
1373
|
+
*/
|
|
1374
|
+
async fetchHTML(url, options = {}) {
|
|
1375
|
+
const fetchConfig = {
|
|
1376
|
+
...this.config,
|
|
1377
|
+
markdown: options.markdown === void 0 ? this.config.markdown : options.markdown,
|
|
1378
|
+
fastMode: options.fastMode === void 0 ? this.config.defaultFastMode : options.fastMode
|
|
1379
|
+
};
|
|
1380
|
+
return this._fetchRecursive(url, fetchConfig, 0, 0);
|
|
1381
|
+
}
|
|
1382
|
+
/**
|
|
1383
|
+
* Internal recursive method to handle fetching with retries.
|
|
1384
|
+
*
|
|
1385
|
+
* @param url URL to fetch
|
|
1386
|
+
* @param currentConfig The merged configuration including markdown option
|
|
1387
|
+
* @param retryAttempt Current retry attempt number (starts at 0)
|
|
1388
|
+
* @param parentRetryCount Tracks retries related to pool initialization errors (starts at 0)
|
|
1389
|
+
* @returns Promise resolving to HTMLFetchResult
|
|
1390
|
+
*/
|
|
1391
|
+
async _fetchRecursive(url, currentConfig, retryAttempt, parentRetryCount) {
|
|
1392
|
+
const useFastMode = currentConfig.fastMode;
|
|
1393
|
+
if (retryAttempt === 0 && parentRetryCount === 0) {
|
|
1394
|
+
const cachedResult = this.checkCache(url);
|
|
1395
|
+
if (cachedResult) {
|
|
1396
|
+
if (currentConfig.markdown && !cachedResult.content.startsWith("#") && !cachedResult.content.includes("\n\n---\n\n")) {
|
|
1397
|
+
try {
|
|
1398
|
+
const converter = new MarkdownConverter();
|
|
1399
|
+
cachedResult.content = converter.convert(cachedResult.content);
|
|
1400
|
+
} catch (e) {
|
|
1401
|
+
console.error("Failed to convert cached result to markdown", e);
|
|
1402
|
+
}
|
|
1403
|
+
} else if (!currentConfig.markdown && (cachedResult.content.startsWith("#") || cachedResult.content.includes("\n\n---\n\n"))) {
|
|
1404
|
+
console.warn("Cached result is Markdown, but HTML was requested. Re-fetching.");
|
|
1405
|
+
this.cache.delete(url);
|
|
1406
|
+
return this._fetchRecursive(url, currentConfig, 0, 0);
|
|
1407
|
+
}
|
|
1408
|
+
return cachedResult;
|
|
1409
|
+
}
|
|
1410
|
+
}
|
|
1411
|
+
try {
|
|
1412
|
+
if (currentConfig.useHttpFallback && retryAttempt === 0 && parentRetryCount === 0) {
|
|
1413
|
+
try {
|
|
1414
|
+
const httpResult = await this.fetchHTMLWithHttpFallback(url);
|
|
1415
|
+
if (this.config.cacheTTL > 0) {
|
|
1416
|
+
this.addToCache(url, httpResult);
|
|
1417
|
+
}
|
|
1418
|
+
return httpResult;
|
|
1419
|
+
} catch (httpError) {
|
|
1420
|
+
if (httpError instanceof FetchError && httpError.code === "ERR_CHALLENGE_PAGE") {
|
|
1421
|
+
} else {
|
|
1422
|
+
}
|
|
1423
|
+
}
|
|
1424
|
+
}
|
|
1425
|
+
const useHeadedMode = currentConfig.useHeadedModeFallback && (retryAttempt >= 2 || this.shouldUseHeadedMode(url)) || currentConfig.useHeadedMode;
|
|
1426
|
+
try {
|
|
1427
|
+
if (!this.browserPool || this.isUsingHeadedMode !== useHeadedMode) {
|
|
1428
|
+
await this.initializeBrowserPool(useHeadedMode);
|
|
1429
|
+
}
|
|
1430
|
+
} catch (initError) {
|
|
1431
|
+
if (parentRetryCount < 1) {
|
|
1432
|
+
await delay(currentConfig.retryDelay);
|
|
1433
|
+
return this._fetchRecursive(url, currentConfig, retryAttempt, parentRetryCount + 1);
|
|
1434
|
+
}
|
|
1435
|
+
throw new FetchError(
|
|
1436
|
+
`Pool init failed: ${initError.message}`,
|
|
1437
|
+
"ERR_POOL_INIT_FAILED",
|
|
1438
|
+
initError
|
|
1439
|
+
);
|
|
1440
|
+
}
|
|
1441
|
+
if (!this.browserPool) {
|
|
1442
|
+
throw new FetchError("Browser pool unavailable.", "ERR_POOL_UNAVAILABLE");
|
|
1443
|
+
}
|
|
1444
|
+
const result = await this.queue.add(
|
|
1445
|
+
() => this.fetchWithPlaywright(url, this.browserPool, useFastMode, currentConfig.markdown)
|
|
1446
|
+
);
|
|
1447
|
+
if (result && this.config.cacheTTL > 0) {
|
|
1448
|
+
this.addToCache(url, result);
|
|
1449
|
+
}
|
|
1450
|
+
if (!result) {
|
|
1451
|
+
throw new FetchError("Playwright fetch queued but no result.", "ERR_QUEUE_NO_RESULT");
|
|
1452
|
+
}
|
|
1453
|
+
return result;
|
|
1454
|
+
} catch (error) {
|
|
1455
|
+
if (useFastMode && retryAttempt === 0 && parentRetryCount === 0) {
|
|
1456
|
+
return this._fetchRecursive(url, { ...currentConfig, fastMode: false }, 0, parentRetryCount);
|
|
1457
|
+
}
|
|
1458
|
+
if (retryAttempt < currentConfig.maxRetries) {
|
|
1459
|
+
await delay(currentConfig.retryDelay);
|
|
1460
|
+
return this._fetchRecursive(url, currentConfig, retryAttempt + 1, parentRetryCount);
|
|
1461
|
+
}
|
|
1462
|
+
const finalError = error instanceof FetchError ? error : new FetchError(`Fetch failed: ${error.message}`, "ERR_FETCH_FAILED", error);
|
|
1463
|
+
throw new FetchError(
|
|
1464
|
+
`Fetch failed after ${currentConfig.maxRetries} retries: ${finalError.message}`,
|
|
1465
|
+
finalError.code,
|
|
1466
|
+
finalError.originalError || error
|
|
1467
|
+
);
|
|
1468
|
+
}
|
|
1469
|
+
}
|
|
1470
|
+
/**
|
|
1471
|
+
* Performs the actual page fetch using a Playwright page from the pool.
|
|
1472
|
+
* Ensures return type matches HTMLFetchResult.
|
|
1473
|
+
*/
|
|
1474
|
+
async fetchWithPlaywright(url, pool, fastMode, convertToMarkdown) {
|
|
1475
|
+
let page = null;
|
|
1476
|
+
try {
|
|
1477
|
+
page = await pool.acquirePage();
|
|
1478
|
+
await this.applyBlockingRules(page, fastMode);
|
|
1479
|
+
let response = null;
|
|
1480
|
+
try {
|
|
1481
|
+
response = await page.goto(url, {
|
|
1482
|
+
waitUntil: "domcontentloaded",
|
|
1483
|
+
timeout: 6e4
|
|
1484
|
+
});
|
|
1485
|
+
} catch (navigationError) {
|
|
1486
|
+
throw new FetchError(
|
|
1487
|
+
`Playwright navigation failed: ${navigationError.message}`,
|
|
1488
|
+
"ERR_NAVIGATION",
|
|
1489
|
+
navigationError
|
|
1490
|
+
);
|
|
1491
|
+
}
|
|
1492
|
+
if (!response) {
|
|
1493
|
+
throw new FetchError("Playwright navigation did not return a response.", "ERR_NO_RESPONSE");
|
|
1494
|
+
}
|
|
1495
|
+
if (!response.ok()) {
|
|
1496
|
+
throw new FetchError(
|
|
1497
|
+
`HTTP error status received: ${response.status()}`,
|
|
1498
|
+
"ERR_HTTP_ERROR",
|
|
1499
|
+
void 0,
|
|
1500
|
+
response.status()
|
|
1501
|
+
);
|
|
1502
|
+
}
|
|
1503
|
+
const contentType = response.headers()["content-type"] || "";
|
|
1504
|
+
if (!contentType.includes("html")) {
|
|
1505
|
+
throw new FetchError(`Invalid content type received: ${contentType}`, "ERR_NON_HTML_CONTENT");
|
|
1506
|
+
}
|
|
1507
|
+
if (!fastMode && this.config.simulateHumanBehavior) {
|
|
1508
|
+
await this.simulateHumanBehavior(page);
|
|
1509
|
+
}
|
|
1510
|
+
const html = await page.content();
|
|
1511
|
+
const title = await page.title();
|
|
1512
|
+
const finalUrl = page.url();
|
|
1513
|
+
const status = response?.status();
|
|
1514
|
+
let finalContent = html;
|
|
1515
|
+
let finalContentType = "html";
|
|
1516
|
+
if (convertToMarkdown) {
|
|
1517
|
+
try {
|
|
1518
|
+
const converter = new MarkdownConverter();
|
|
1519
|
+
finalContent = converter.convert(html);
|
|
1520
|
+
finalContentType = "markdown";
|
|
1521
|
+
} catch (conversionError) {
|
|
1522
|
+
console.error(`Markdown conversion failed for ${url} (Playwright):`, conversionError);
|
|
1523
|
+
}
|
|
1524
|
+
}
|
|
1525
|
+
return {
|
|
1526
|
+
content: finalContent,
|
|
1527
|
+
contentType: finalContentType,
|
|
1528
|
+
title: title || null,
|
|
1529
|
+
url: finalUrl,
|
|
1530
|
+
isFromCache: false,
|
|
1531
|
+
statusCode: status,
|
|
1532
|
+
error: void 0
|
|
1533
|
+
};
|
|
1534
|
+
} finally {
|
|
1535
|
+
if (page) {
|
|
1536
|
+
await pool.releasePage(page);
|
|
1537
|
+
}
|
|
1538
|
+
}
|
|
1539
|
+
}
|
|
1540
|
+
async applyBlockingRules(page, fastMode) {
|
|
1541
|
+
const blockedResources = fastMode ? this.config.poolBlockedResourceTypes.concat(["image", "font", "stylesheet", "media"]) : this.config.poolBlockedResourceTypes;
|
|
1542
|
+
const blockedDomains = this.config.poolBlockedDomains;
|
|
1543
|
+
if (blockedResources.length > 0 || blockedDomains.length > 0) {
|
|
1544
|
+
try {
|
|
1545
|
+
await page.route("**/*", (route) => {
|
|
1546
|
+
const resourceType = route.request().resourceType();
|
|
1547
|
+
const requestUrl = route.request().url();
|
|
1548
|
+
if (blockedResources.includes(resourceType)) {
|
|
1549
|
+
return route.abort();
|
|
1550
|
+
}
|
|
1551
|
+
if (blockedDomains.some(
|
|
1552
|
+
(pattern) => new RegExp(pattern.replace(/\./g, "\\.").replace(/\*/g, ".*")).test(requestUrl)
|
|
1553
|
+
)) {
|
|
1554
|
+
return route.abort();
|
|
1555
|
+
}
|
|
1556
|
+
return route.continue();
|
|
1557
|
+
});
|
|
1558
|
+
} catch (_error) {
|
|
1559
|
+
}
|
|
1560
|
+
}
|
|
1561
|
+
}
|
|
1562
|
+
/**
|
|
1563
|
+
* Cleans up resources used by the engine, primarily closing browser instances in the pool.
|
|
1564
|
+
*
|
|
1565
|
+
* It is crucial to call this method when finished with the engine instance to release resources.
|
|
1566
|
+
* @returns A Promise that resolves when cleanup is complete.
|
|
1567
|
+
*/
|
|
1568
|
+
async cleanup() {
|
|
1569
|
+
try {
|
|
1570
|
+
await this.queue.onIdle();
|
|
1571
|
+
this.queue.clear();
|
|
1572
|
+
if (this.browserPool) {
|
|
1573
|
+
await this.browserPool.cleanup();
|
|
1574
|
+
this.browserPool = null;
|
|
1575
|
+
}
|
|
1576
|
+
this.isUsingHeadedMode = false;
|
|
1577
|
+
} catch (_error) {
|
|
1578
|
+
}
|
|
1579
|
+
}
|
|
1580
|
+
/**
|
|
1581
|
+
* Retrieves metrics from the underlying browser pool.
|
|
1582
|
+
* @returns An array of BrowserMetrics objects, one for each active browser instance, or an empty array if the pool is not initialized.
|
|
1583
|
+
*/
|
|
1584
|
+
getMetrics() {
|
|
1585
|
+
if (this.browserPool) {
|
|
1586
|
+
return this.browserPool.getMetrics();
|
|
1587
|
+
}
|
|
1588
|
+
return [];
|
|
1589
|
+
}
|
|
1590
|
+
// Helper to check if a specific domain is marked for headed mode
|
|
1591
|
+
shouldUseHeadedMode(url) {
|
|
1592
|
+
if (!this.config.useHeadedModeFallback) return false;
|
|
1593
|
+
try {
|
|
1594
|
+
const domain = new URL(url).hostname;
|
|
1595
|
+
return this.headedFallbackSites.has(domain);
|
|
1596
|
+
} catch {
|
|
1597
|
+
return false;
|
|
1598
|
+
}
|
|
1599
|
+
}
|
|
1600
|
+
};
|
|
1601
|
+
|
|
1602
|
+
// src/HybridEngine.ts
|
|
1603
|
+
/**
 * Engine that tries the lightweight FetchEngine first and falls back to the
 * PlaywrightEngine when that attempt throws.
 */
var HybridEngine = class {
  fetchEngine;
  playwrightEngine;
  // Kept so per-request options can be layered over it for the Playwright fallback.
  config;
  /**
   * @param config Engine configuration. Only `markdown` is forwarded to the
   *   FetchEngine; the whole object is given to the PlaywrightEngine.
   */
  constructor(config = {}) {
    this.fetchEngine = new FetchEngine({ markdown: config.markdown });
    this.playwrightEngine = new PlaywrightEngine(config);
    this.config = config;
  }
  /**
   * Fetches a page, preferring the FetchEngine.
   *
   * The FetchEngine is called with the URL only. If it throws, a warning is
   * logged and the PlaywrightEngine is called with the stored config overlaid
   * by `options`.
   *
   * @param url The URL to fetch.
   * @param options Per-request options; applied to the Playwright fallback.
   * @returns The result from whichever engine succeeded.
   * @throws The PlaywrightEngine's error when the fallback fails as well.
   */
  async fetchHTML(url, options = {}) {
    try {
      return await this.fetchEngine.fetchHTML(url);
    } catch (fetchError) {
      console.warn(`FetchEngine failed for ${url}: ${fetchError.message}. Falling back to PlaywrightEngine.`);
    }
    // Base config first, then per-request overrides on top.
    const fallbackOptions = { ...this.config, ...options };
    try {
      return await this.playwrightEngine.fetchHTML(url, fallbackOptions);
    } catch (playwrightError) {
      console.error(`PlaywrightEngine fallback failed for ${url}: ${playwrightError.message}`);
      throw playwrightError;
    }
  }
  /**
   * Delegates getMetrics to the PlaywrightEngine.
   */
  getMetrics() {
    return this.playwrightEngine.getMetrics();
  }
  /**
   * Calls cleanup on both underlying engines and waits for both to settle;
   * a rejection from either one is not propagated.
   */
  async cleanup() {
    const pending = [
      // Although a no-op, call for consistency
      this.fetchEngine.cleanup(),
      this.playwrightEngine.cleanup()
    ];
    await Promise.allSettled(pending);
  }
};
|
|
1651
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
1652
|
+
0 && (module.exports = {
|
|
1653
|
+
FetchEngine,
|
|
1654
|
+
HybridEngine,
|
|
1655
|
+
PlaywrightEngine
|
|
1656
|
+
});
|
|
1657
|
+
//# sourceMappingURL=index.cjs.map
|