@purepageio/fetch-engines 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/dist/index.cjs +1657 -0
  2. package/dist/index.cjs.map +1 -0
  3. package/dist/index.d.cts +323 -0
  4. package/dist/index.d.ts +323 -8
  5. package/dist/index.js +1617 -4
  6. package/dist/index.js.map +1 -1
  7. package/package.json +14 -5
  8. package/dist/FetchEngine.d.ts +0 -47
  9. package/dist/FetchEngine.d.ts.map +0 -1
  10. package/dist/FetchEngine.js +0 -114
  11. package/dist/FetchEngine.js.map +0 -1
  12. package/dist/FetchEngine.test.d.ts +0 -2
  13. package/dist/FetchEngine.test.d.ts.map +0 -1
  14. package/dist/FetchEngine.test.js +0 -44
  15. package/dist/FetchEngine.test.js.map +0 -1
  16. package/dist/HybridEngine.d.ts +0 -21
  17. package/dist/HybridEngine.d.ts.map +0 -1
  18. package/dist/HybridEngine.js +0 -62
  19. package/dist/HybridEngine.js.map +0 -1
  20. package/dist/IEngine.d.ts +0 -22
  21. package/dist/IEngine.d.ts.map +0 -1
  22. package/dist/IEngine.js +0 -2
  23. package/dist/IEngine.js.map +0 -1
  24. package/dist/PlaywrightEngine.d.ts +0 -90
  25. package/dist/PlaywrightEngine.d.ts.map +0 -1
  26. package/dist/PlaywrightEngine.js +0 -505
  27. package/dist/PlaywrightEngine.js.map +0 -1
  28. package/dist/PlaywrightEngine.test.d.ts +0 -2
  29. package/dist/PlaywrightEngine.test.d.ts.map +0 -1
  30. package/dist/PlaywrightEngine.test.js +0 -207
  31. package/dist/PlaywrightEngine.test.js.map +0 -1
  32. package/dist/PuppeteerEngine.d.ts +0 -21
  33. package/dist/PuppeteerEngine.d.ts.map +0 -1
  34. package/dist/PuppeteerEngine.js +0 -412
  35. package/dist/PuppeteerEngine.js.map +0 -1
  36. package/dist/browser/BrowserPool.d.ts +0 -29
  37. package/dist/browser/BrowserPool.d.ts.map +0 -1
  38. package/dist/browser/BrowserPool.js +0 -378
  39. package/dist/browser/BrowserPool.js.map +0 -1
  40. package/dist/browser/PlaywrightBrowserPool.d.ts +0 -48
  41. package/dist/browser/PlaywrightBrowserPool.d.ts.map +0 -1
  42. package/dist/browser/PlaywrightBrowserPool.js +0 -378
  43. package/dist/browser/PlaywrightBrowserPool.js.map +0 -1
  44. package/dist/browser/PlaywrightBrowserPool.test.d.ts +0 -2
  45. package/dist/browser/PlaywrightBrowserPool.test.d.ts.map +0 -1
  46. package/dist/browser/PlaywrightBrowserPool.test.js +0 -422
  47. package/dist/browser/PlaywrightBrowserPool.test.js.map +0 -1
  48. package/dist/errors.d.ts +0 -20
  49. package/dist/errors.d.ts.map +0 -1
  50. package/dist/errors.js +0 -30
  51. package/dist/errors.js.map +0 -1
  52. package/dist/index.d.ts.map +0 -1
  53. package/dist/types.d.ts +0 -167
  54. package/dist/types.d.ts.map +0 -1
  55. package/dist/types.js +0 -2
  56. package/dist/types.js.map +0 -1
  57. package/dist/utils/markdown-converter.d.ts +0 -31
  58. package/dist/utils/markdown-converter.d.ts.map +0 -1
  59. package/dist/utils/markdown-converter.js +0 -796
  60. package/dist/utils/markdown-converter.js.map +0 -1
package/dist/index.cjs ADDED
@@ -0,0 +1,1657 @@
1
+ "use strict";
// Module-interop helpers. The first six declarations are short aliases for the
// Object reflection built-ins that the helpers below call.
2
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
// __export(target, all): for every key in `all`, defines an enumerable accessor
// on `target` whose getter is the function stored at all[name]. The value is
// therefore produced when the property is read, not when __export runs.
8
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
// __copyProps(to, from, except, desc): when `from` is an object or a function,
// defines on `to` a getter for each own property name of `from` that `to` does
// not already own and that is not `except`. Each getter reads from[key] at
// access time. The enumerable flag mirrors the source descriptor (true when no
// descriptor is found). `desc` is only used as scratch storage for that lookup.
// Returns `to`.
12
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
// __toESM(mod, isNodeMode, target): builds a wrapper object for a required
// module. `target` starts as a new object sharing mod's prototype (or {} when
// mod is null/undefined); a "default" property holding `mod` is added under the
// condition explained in the inline comment; then mod's own properties are
// copied over as getters via __copyProps.
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
// __toCommonJS(mod): returns a fresh object flagged with __esModule: true onto
// which every own property of `mod` is copied as a getter.
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+
30
+ // src/index.ts
// index_exports is the package's public surface. __export registers one getter
// per public name, so each class is looked up only when the property is read;
// that is why the classes may be assigned further down in this file.
// NOTE(review): HybridEngine and PlaywrightEngine are not defined in the part
// of the file visible here — presumably they are declared later in the bundle.
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ FetchEngine: () => FetchEngine,
34
+ HybridEngine: () => HybridEngine,
35
+ PlaywrightEngine: () => PlaywrightEngine
36
+ });
// Publish a CommonJS view of index_exports (getter-backed copy flagged with __esModule).
37
+ module.exports = __toCommonJS(index_exports);
38
+
39
+ // src/utils/markdown-converter.ts
40
+ var import_turndown = __toESM(require("turndown"), 1);
41
+ var import_turndown_plugin_gfm = require("turndown-plugin-gfm");
42
+ var import_node_html_parser = require("node-html-parser");
// Selectors whose matches MarkdownConverter.preprocessHTML removes from the
// parsed document before any content extraction happens.
43
+ var PREPROCESSING_REMOVE_SELECTORS = [
44
+ "script:not([type='application/ld+json'])",
45
+ // Keep JSON-LD
46
+ "style",
47
+ "noscript",
48
+ "iframe:not([title])"
49
+ // Keep iframes with titles (potential embeds)
50
+ ];
// Candidate selectors for the main content of a page. MarkdownConverter uses
// this list in three places: the "main-content-marker" Turndown rule, the
// exemption check in removeHighLinkDensityElements, and the candidate search in
// extractArticleContentElement.
51
+ var MAIN_CONTENT_SELECTORS = [
52
+ // By semantics
53
+ "article",
54
+ "main",
55
+ "[role='main']",
56
+ "[role='article']",
57
+ // By common class/id names (more robust patterns)
58
+ "[class*='article-body']",
59
+ "[class*='post-content']",
60
+ "[class*='main-content']",
61
+ "[class*='entry-content']",
62
+ "[id*='article-body']",
63
+ "[id*='main-content']",
64
+ // Common CMS patterns
65
+ ".article",
66
+ ".post",
67
+ ".content",
68
+ ".entry",
69
+ ".blog-post",
70
+ // Fallback
71
+ "body"
72
+ ];
// The next three lists are counted by detectForumPage: the number of elements
// matching each list feeds its forum heuristic.
73
+ var FORUM_COMMENT_SELECTORS = [
74
+ ".comment",
75
+ ".comments",
76
+ ".comtr",
77
+ '[id^="comment-"]',
78
+ 'div[id^="c_"]'
79
+ ];
80
+ var FORUM_THREAD_SELECTORS = [".thread", ".post", '[id^="thread-"]'];
81
+ var FORUM_VOTE_SELECTORS = [".vote", ".score", ".upvote", ".downvote", ".votelinks"];
// extractForumContentElement copies the first element found for each of these
// two lists (original post, then comments container) into its result.
82
+ var FORUM_MAIN_POST_SELECTORS = [".fatitem", ".submission", ".op", ".original-post"];
83
+ var FORUM_COMMENTS_CONTAINER_SELECTORS = [".comment-tree", ".comments", "#comments"];
// Elements matching these are stripped from the cloned forum content.
84
+ var FORUM_OBVIOUS_NON_CONTENT_SELECTORS = ["header", "footer", ".nav", ".sidebar"];
// Link-density checks skip any element whose text is shorter than this many characters.
85
+ var MIN_LINK_DENSITY_TEXT_LENGTH = 50;
// Ratio of link text to total text above which preprocessHTML treats an element as boilerplate.
86
+ var DEFAULT_LINK_DENSITY_THRESHOLD = 0.4;
// detectForumPage reports a forum once comment matches or vote matches reach this count.
87
+ var MIN_FORUM_INDICATOR_COUNT = 3;
// Class-name prefixes that the "code-block" rule strips to obtain the fence language.
88
+ var CODE_BLOCK_LANG_PREFIXES = ["language-", "lang-"];
// postprocessMarkdown collapses longer runs of newlines down to this many.
89
+ var POSTPROCESSING_MAX_CONSECUTIVE_NEWLINES = 2;
/**
 * Converts an HTML document into Markdown.
 *
 * The pipeline is: preprocessHTML (cleanup, boilerplate removal, metadata and
 * main-content extraction) -> Turndown with the custom rules registered below
 * -> postprocessMarkdown (spacing normalisation and optional truncation).
 */
var MarkdownConverter = class {
  /** Turndown instance configured in the constructor; every conversion runs through it. */
  turndownService;

  /**
   * Creates the Turndown service (ATX headings, fenced code blocks, "-" bullets),
   * enables the GFM plugin and registers this converter's custom rules.
   */
  constructor() {
    this.turndownService = new import_turndown.default({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      bulletListMarker: "-",
      strongDelimiter: "**",
      emDelimiter: "*",
      hr: "---",
      // Use nodeType check instead of window.HTMLElement
      keepReplacement: (_content, node) => {
        if (node.nodeType === 1) {
          const htmlElement = node;
          if (htmlElement.getAttribute("role") === "presentation" || htmlElement.classList?.contains("preserve")) {
            return htmlElement.outerHTML;
          }
        }
        return "";
      }
    });
    this.turndownService.use(import_turndown_plugin_gfm.gfm);
    this.setupPrioritizedRules();
  }

  // --- Public Method ---
  /**
   * Converts HTML string to Markdown.
   * @param html The HTML string to convert.
   * @param options Conversion options. `maxContentLength`, when set, caps the
   *   length of the returned Markdown (truncation marker included).
   * @returns The converted Markdown string.
   */
  convert(html, options = {}) {
    const preprocessedHtml = this.preprocessHTML(html);
    const markdown = this.turndownService.turndown(preprocessedHtml);
    return this.postprocessMarkdown(markdown, options);
  }

  // --- Turndown Rule Setup ---
  /** Registers all custom rules; the registration order below is preserved on purpose. */
  setupPrioritizedRules() {
    this.addContentExtractionRules();
    this.addStructureRules();
    this.addBlockRules();
    this.addInlineRules();
  }

  // We rely on preprocessing to remove nav/menus/high-link-density areas.
  // These rules primarily help Turndown understand the *structure* of the *intended* content.
  /** Registers the pass-through rule for main-content containers and the rule that drops unwanted tags. */
  addContentExtractionRules() {
    this.turndownService.addRule("main-content-marker", {
      filter: (node) => {
        if (node.nodeType !== 1) return false;
        const element = node;
        return element.tagName.toLowerCase() === "main" || ["main", "article"].includes(element.getAttribute("role") || "") || MAIN_CONTENT_SELECTORS.some((selector) => {
          try {
            // "body" is only a fallback selector and must not mark content.
            return element.matches(selector) && selector !== "body";
          } catch {
            // A selector the DOM implementation rejects simply does not match.
            return false;
          }
        });
      },
      // Just pass content through, this rule is mainly for filter priority/debugging
      replacement: (content) => content
    });
    const unwantedTags = [
      "script",
      "style",
      "noscript",
      "iframe",
      "button",
      "input",
      "select",
      "textarea",
      "form",
      "canvas",
      /*'svg' removed */
      "audio",
      "video"
    ];
    this.turndownService.addRule("remove-unwanted", {
      filter: unwantedTags,
      replacement: () => ""
    });
  }

  /** Registers rules that surround <article> and <section> content with blank lines. */
  addStructureRules() {
    this.turndownService.addRule("article", {
      filter: "article",
      // Add separation
      replacement: (content) => `\n\n${content}\n\n`
    });
    this.turndownService.addRule("section", {
      filter: "section",
      // Add separation
      replacement: (content) => `\n\n${content}\n\n`
    });
  }

  /** Registers the list, list-item and blockquote rules. */
  addBlockRules() {
    this.turndownService.addRule("list", {
      filter: ["ul", "ol"],
      replacement: (content, node) => {
        if (node.nodeType !== 1) return content;
        const parent = node.parentNode;
        // A list nested inside an <li> gets every line prefixed with the indent.
        const indent = parent && parent.nodeName.toLowerCase() === "li" ? " " : "";
        return "\n" + content.split("\n").map((line) => indent + line.trimEnd()).join("\n").trim() + "\n";
      }
    });
    this.turndownService.addRule("listItem", {
      filter: "li",
      // Standard function kept (rather than an arrow) as in the original rule.
      replacement: function(content, node, options) {
        content = content.replace(/^\s+/gm, "").replace(/\n(?!\s*$)/gm, "\n ");
        let prefix = options.bulletListMarker + " ";
        const parentNode = node.parentNode;
        if (parentNode && parentNode.nodeName === "OL") {
          try {
            // Ordered items are numbered from the list's `start` attribute, or from 1.
            const start = parentNode.getAttribute("start");
            const index = Array.prototype.indexOf.call(parentNode.children, node);
            prefix = (start ? Number(start) + index : index + 1) + ". ";
          } catch (e) {
            console.warn("Could not determine ordered list index:", e);
            prefix = "1. ";
          }
        }
        const trimmedContent = content.trim();
        return prefix + trimmedContent + (node.nextSibling && !/\n$/.test(trimmedContent) ? "\n" : "");
      }
    });
    this.turndownService.addRule("blockquote", {
      filter: "blockquote",
      replacement: (content) => {
        const trimmedContent = content.trim();
        return "\n\n> " + trimmedContent.replace(/\n/g, "\n> ") + "\n\n";
      }
    });
  }

  /** Registers the link, figure, image, fenced code block and inline code rules. */
  addInlineRules() {
    this.turndownService.addRule("link", {
      filter: (node, _options) => {
        return node.nodeType === 1 && node.nodeName === "A" && !!node.getAttribute("href");
      },
      replacement: (content, node) => {
        const element = node;
        const href = element.getAttribute("href") || "";
        const title = element.getAttribute("title");
        // Links without visible text fall back to showing their target.
        const text = content.trim() ? content.trim() : href;
        let decodedHref = href;
        try {
          if (href.includes("%")) {
            decodedHref = decodeURI(href);
          }
        } catch (e) {
          console.warn(`Failed to decode URI, keeping original: ${href}`, e);
        }
        return title ? `[${text}](${decodedHref} "${title}")` : `[${text}](${decodedHref})`;
      }
    });
    this.turndownService.addRule("figure", {
      filter: "figure",
      replacement: (content, node) => {
        if (node.nodeType !== 1) return content;
        const element = node;
        const img = element.querySelector("img");
        const figcaption = element.querySelector("figcaption");
        let markdown = "";
        let mainImgMd = "";
        if (img) {
          const src = img.getAttribute("src") || "";
          const alt = img.getAttribute("alt") || "";
          const title = img.getAttribute("title");
          mainImgMd = title ? `![${alt}](${src} "${title}")` : `![${alt}](${src})`;
        }
        let processedContent = content.trim();
        if (mainImgMd) {
          markdown = mainImgMd;
          // Strip the image exactly as rendered above (title included) so a
          // titled image is not emitted a second time with the leftover content.
          processedContent = processedContent.replace(mainImgMd, "").trim();
        }
        if (figcaption) {
          const captionText = figcaption.textContent?.trim();
          if (captionText) {
            markdown += `\n\n_${captionText}_`;
            processedContent = processedContent.replace(captionText, "").trim();
            processedContent = processedContent.replace(/^_+|_+$/g, "").trim();
          }
        }
        // Keep whatever else the figure contained, unless it is only short punctuation residue.
        if (processedContent && (processedContent.length > 10 || /[a-zA-Z0-9]/.test(processedContent))) {
          markdown += `\n\n${processedContent}`;
        }
        return "\n\n" + markdown.trim() + "\n\n";
      }
    });
    this.turndownService.addRule("image", {
      filter: (node) => {
        return node.nodeType === 1 && node.nodeName === "IMG" && !!node.getAttribute("src");
      },
      replacement: (_content, node) => {
        const element = node;
        const src = element.getAttribute("src") || "";
        const alt = element.getAttribute("alt") || "";
        const title = element.getAttribute("title");
        return title ? `\n\n![${alt}](${src} "${title}")\n\n` : `\n\n![${alt}](${src})\n\n`;
      }
    });
    this.turndownService.addRule("code-block", {
      // Only <pre> elements that look like code: a <code> child, a code-like
      // class name, or a lang/language attribute.
      filter: (node) => {
        if (node.nodeType !== 1) return false;
        const element = node;
        if (element.tagName.toLowerCase() !== "pre") return false;
        const hasCodeChild = element.querySelector("code") !== null;
        const hasCodeClass = /highlight|syntax|code|listing|source/i.test(element.className);
        const hasLangAttribute = !!element.getAttribute("lang") || !!element.getAttribute("language");
        return hasCodeChild || hasCodeClass || hasLangAttribute;
      },
      replacement: (content, node) => {
        if (node.nodeType !== 1) return content.trim();
        const element = node;
        const codeElement = element.querySelector("code");
        // Language lookup order: attributes on <pre>, attributes on <code>,
        // then the first class carrying a known prefix.
        let language = element.getAttribute("lang") || element.getAttribute("language") || (codeElement ? codeElement.getAttribute("lang") || codeElement.getAttribute("language") : "") || "";
        if (!language) {
          const classes = (element.className + " " + (codeElement?.className || "")).split(" ").filter(Boolean);
          for (const cls of classes) {
            const prefix = CODE_BLOCK_LANG_PREFIXES.find((candidate) => cls.startsWith(candidate));
            if (prefix !== void 0) {
              language = cls.substring(prefix.length);
            }
            if (language) break;
          }
        }
        const cleanedContent = content.trim();
        return `\n\n\`\`\`${language}\n${cleanedContent}\n\`\`\`\n\n`;
      }
    });
    this.turndownService.addRule("inlineCode", {
      filter: (node) => node.nodeName === "CODE" && node.parentNode?.nodeName !== "PRE",
      replacement: (content) => {
        const trimmed = content.trim();
        if (!trimmed) return "";
        const backtickRuns = trimmed.match(/`+/g);
        if (!backtickRuns) return "`" + trimmed + "`";
        // The delimiter must be longer than every backtick run inside the span,
        // otherwise an inner run would close the span early.
        const longestRun = backtickRuns.reduce((max, run) => Math.max(max, run.length), 0);
        const delimiter = "`".repeat(longestRun + 1);
        // A space keeps a leading/trailing backtick from merging with the delimiter.
        const pad = trimmed.startsWith("`") || trimmed.endsWith("`") ? " " : "";
        return delimiter + pad + trimmed + pad + delimiter;
      }
    });
  }

  // --- HTML Preprocessing ---
  /**
   * Prepares raw HTML for Turndown: strips template/control-character noise,
   * removes unwanted and link-heavy elements, extracts document metadata and
   * narrows the document to its main (article or forum) content.
   * @param html Raw HTML string.
   * @returns Metadata lines followed by the cleaned content HTML. If anything
   *   throws, the error is logged and the lightly cleaned input is returned.
   */
  preprocessHTML(html) {
    try {
      html = this.cleanupHtml(html);
      const root = (0, import_node_html_parser.parse)(html, {
        comment: false,
        blockTextElements: { script: true, style: true, noscript: true }
      });
      if (root.nodeType === 3) {
        return root.textContent ?? "";
      } else if (root.nodeType !== 1) {
        console.warn("Unexpected root node type after parsing:", root.nodeType);
        return root.toString();
      }
      const rootElement = root;
      PREPROCESSING_REMOVE_SELECTORS.forEach((selector) => {
        try {
          rootElement.querySelectorAll(selector).forEach((el) => el.remove());
        } catch (e) {
          console.warn(`Skipping invalid selector during preprocessing: ${selector}`, e);
        }
      });
      this.removeHighLinkDensityElements(rootElement, DEFAULT_LINK_DENSITY_THRESHOLD);
      const metadata = this.extractDocumentMetadata(rootElement);
      const contentElement = this.detectForumPage(rootElement) ? this.extractForumContentElement(rootElement) : this.extractArticleContentElement(rootElement);
      const rawContentHtml = contentElement instanceof import_node_html_parser.HTMLElement ? contentElement.outerHTML : contentElement.textContent;
      const contentHtml = this.cleanupContentHtml(rawContentHtml || "");
      // NOTE(review): the metadata lines are Markdown, yet they are prepended to
      // HTML that convert() then feeds through Turndown — confirm they come out
      // of the conversion as intended.
      const metadataString = metadata.length > 0 ? metadata.join("\n\n") + "\n\n---\n\n" : "";
      return metadataString + contentHtml;
    } catch (error) {
      console.error("HTML preprocessing failed:", error);
      return this.cleanupHtml(html);
    }
  }

  /**
   * Removes a known junk marker, `{{ placeholder }}` template tokens and ASCII
   * control characters (tab, LF and CR are kept) from raw HTML.
   */
  cleanupHtml(html) {
    return html.replace(/AMIL:\[=-,amilft[^\s]*/g, "").replace(/\{\{\s*[^}\s]+\s*}}/g, "").replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
  }

  /**
   * Strips framework/test attributes (data-react*, data-testid, ng-*, _ngcontent-*,
   * ng-/mat- class attributes) and HTML comments, then collapses whitespace.
   */
  cleanupContentHtml(content) {
    return content.replace(/\s*data-(?:reactid|reactroot|react-|testid|v-|js-|qa-|cy-)[^=\s]*\s*=\s*(?:"[^"]*"|'[^']*'|\S+)/g, "").replace(/\s*ng-[^=\s]*\s*=\s*(?:"[^"]*"|'[^']*'|\S+)/g, "").replace(/\s*_ngcontent-[^\s]*\s*=""/g, "").replace(/\s*class\s*=\s*"(ng-|mat-)[^"]*"/g, "").replace(/<!--[\s\S]*?-->/g, "").replace(/([ \t])+/g, " ").replace(/\s*\n\s*/g, "\n").trim();
  }

  /**
   * Removes container elements whose text is dominated by links (see
   * hasHighLinkDensity), unless the element is, or contains, main content.
   * @param element Root element to scan; it is mutated in place.
   * @param threshold Link-text / total-text ratio above which an element is removed.
   */
  removeHighLinkDensityElements(element, threshold) {
    const potentialBoilerplate = element.querySelectorAll(
      "div, nav, ul, aside, section, .sidebar, .widget, .menu, [role='navigation'], [role='menubar']"
    );
    for (const el of Array.from(potentialBoilerplate)) {
      if (!(el instanceof import_node_html_parser.HTMLElement)) continue;
      if (!this.hasHighLinkDensity(el, threshold)) continue;
      const containsMainContent = el.querySelector('main, article, [role="main"], [role="article"]') !== null;
      const isMainContent = MAIN_CONTENT_SELECTORS.some((selector) => {
        try {
          return el.matches(selector);
        } catch {
          return false;
        }
      });
      if (!containsMainContent && !isMainContent) {
        el.remove();
      }
    }
  }

  /**
   * Collects document metadata as Markdown lines. For each field the first
   * non-empty source wins; the title is placed first as a level-1 heading.
   * Parsable JSON-LD scripts are appended inside a <details> block.
   * @param root Parsed document root.
   * @returns Array of Markdown/HTML lines (possibly empty).
   */
  extractDocumentMetadata(root) {
    const metadata = [];
    const addedMeta = /* @__PURE__ */ new Set();
    const addMeta = (key, value, isTitle = false) => {
      const cleanedValue = value?.trim();
      if (cleanedValue && !addedMeta.has(key.toLowerCase())) {
        if (isTitle) {
          metadata.unshift(`# ${cleanedValue}`);
        } else {
          metadata.push(`**${key}:** ${cleanedValue}`);
        }
        addedMeta.add(key.toLowerCase());
      }
    };
    addMeta("Title", root.querySelector("meta[property='og:title']")?.getAttribute("content"), true);
    addMeta("Title", root.querySelector("meta[name='twitter:title']")?.getAttribute("content"), true);
    addMeta("Title", root.querySelector("meta[name='DC.title']")?.getAttribute("content"), true);
    addMeta("Title", root.querySelector("title")?.textContent, true);
    addMeta("Description", root.querySelector("meta[property='og:description']")?.getAttribute("content"));
    addMeta("Description", root.querySelector("meta[name='twitter:description']")?.getAttribute("content"));
    addMeta("Description", root.querySelector("meta[name='description']")?.getAttribute("content"));
    addMeta("Description", root.querySelector("meta[name='DC.description']")?.getAttribute("content"));
    addMeta("Author", root.querySelector("meta[name='author']")?.getAttribute("content"));
    addMeta("Author", root.querySelector("meta[property='article:author']")?.getAttribute("content"));
    addMeta("Author", root.querySelector("[rel='author']")?.textContent);
    addMeta("Published", root.querySelector("meta[property='article:published_time']")?.getAttribute("content"));
    addMeta("Published", root.querySelector("meta[name='publish-date']")?.getAttribute("content"));
    addMeta("Published", root.querySelector("time[itemprop='datePublished']")?.getAttribute("datetime"));
    addMeta("Published", root.querySelector("time")?.getAttribute("datetime"));
    addMeta("URL", root.querySelector("link[rel='canonical']")?.getAttribute("href"));
    addMeta("URL", root.querySelector("meta[property='og:url']")?.getAttribute("content"));
    const jsonLdScripts = root.querySelectorAll("script[type='application/ld+json']");
    if (jsonLdScripts.length > 0) {
      const jsonLdData = Array.from(jsonLdScripts).map((script) => {
        try {
          const textContent = script.textContent;
          return textContent ? JSON.parse(textContent) : null;
        } catch {
          // Malformed JSON-LD is skipped; it is optional metadata.
          return null;
        }
      }).filter((item) => item !== null);
      if (jsonLdData.length > 0 && !addedMeta.has("json-ld")) {
        metadata.push("<details><summary>JSON-LD Metadata</summary>\n");
        metadata.push("```json", JSON.stringify(jsonLdData, null, 2), "```");
        metadata.push("</details>");
        addedMeta.add("json-ld");
      }
    }
    return metadata;
  }

  /**
   * Heuristically decides whether the document is a forum/discussion page,
   * based on counts of comment/thread/vote elements and on the host name of
   * the canonical (or og:url) URL.
   * @param root Parsed document root.
   * @returns true when any forum indicator fires.
   */
  detectForumPage(root) {
    const countMatches = (selectors) => {
      return selectors.reduce((count, selector) => {
        try {
          return count + root.querySelectorAll(selector).length;
        } catch {
          // A selector that cannot be evaluated contributes nothing.
          return count;
        }
      }, 0);
    };
    const commentCount = countMatches(FORUM_COMMENT_SELECTORS);
    const threadCount = countMatches(FORUM_THREAD_SELECTORS);
    const voteCount = countMatches(FORUM_VOTE_SELECTORS);
    let isKnownForumHost = false;
    try {
      const canonicalUrl = root.querySelector('link[rel="canonical"]')?.getAttribute("href") || root.querySelector('meta[property="og:url"]')?.getAttribute("content");
      if (canonicalUrl) {
        // The base URL only matters when the canonical URL is relative.
        const hostname = new URL(canonicalUrl, "http://example.com").hostname.toLowerCase();
        isKnownForumHost = hostname.includes("reddit.com") || hostname.includes("news.ycombinator.com") || hostname.includes("forum") || hostname.includes("discuss") || hostname.includes("community");
      }
    } catch (e) {
      console.warn("Could not parse URL for forum detection:", e);
    }
    return commentCount >= MIN_FORUM_INDICATOR_COUNT || threadCount > 1 || // More than one thread item is stronger indicator
    voteCount >= MIN_FORUM_INDICATOR_COUNT || isKnownForumHost;
  }

  // Tries to find the main content element for an article-like page
  /**
   * Scores every element matched by MAIN_CONTENT_SELECTORS and returns the
   * best one. The score starts from text length and is scaled up for
   * article/main semantics and paragraph-rich elements, and scaled down for
   * header/footer/nav-like or link-heavy elements.
   * @param root Parsed document root.
   * @returns The best candidate element, or `root` when none qualifies.
   */
  extractArticleContentElement(root) {
    let bestCandidate = null;
    let maxScore = -1;
    for (const selector of MAIN_CONTENT_SELECTORS) {
      try {
        const elements = root.querySelectorAll(selector);
        for (const element of Array.from(elements)) {
          if (!(element instanceof import_node_html_parser.HTMLElement)) continue;
          const textLength = (element.textContent || "").trim().length;
          // Very short candidates only count when they carry media.
          if (textLength < 100 && !element.querySelector("img, video, iframe, figure")) continue;
          let score = textLength;
          if (["ARTICLE", "MAIN"].includes(element.tagName)) score *= 1.5;
          if (["main", "article"].includes(element.getAttribute("role") || "")) score *= 1.5;
          if (["HEADER", "FOOTER", "NAV", "ASIDE"].includes(element.tagName)) score *= 0.3;
          try {
            if (element.matches(
              '.sidebar, .widget, .menu, .nav, .header, .footer, [role="navigation"], [role="complementary"], [role="banner"]'
            )) {
              score *= 0.2;
            }
          } catch {
            // Best effort: if the selector cannot be evaluated the penalty is skipped.
          }
          if (this.hasHighLinkDensity(element, 0.6)) {
            score *= 0.5;
          }
          if (element.querySelectorAll("p").length > 2) score *= 1.2;
          // <body> is only a fallback once a reasonable candidate exists.
          if (element.tagName === "BODY" && maxScore > 200) continue;
          if (score > maxScore) {
            maxScore = score;
            bestCandidate = element;
          }
        }
      } catch {
        // Best effort: a selector that fails is skipped and the search continues.
      }
    }
    return bestCandidate || root;
  }

  // Tries to find the main content element(s) for a forum-like page
  /**
   * Builds a container holding a clone of the original post and a clone of
   * the comments container (with obvious non-content removed). When neither
   * is found it falls back to a cleaned clone of <body>, then to `root`.
   * @param root Parsed document root.
   * @returns Element holding the forum content.
   */
  extractForumContentElement(root) {
    const tempContainer = (0, import_node_html_parser.parse)("<div></div>").firstChild;
    const isElement = (el) => el instanceof import_node_html_parser.HTMLElement;
    const stripNonContent = (container) => {
      FORUM_OBVIOUS_NON_CONTENT_SELECTORS.forEach((selector) => {
        try {
          container.querySelectorAll(selector).forEach((el) => el.remove());
        } catch {
          // Best effort: a selector that fails is skipped.
        }
      });
    };
    try {
      const mainPost = FORUM_MAIN_POST_SELECTORS.map((s) => root.querySelector(s)).find(isElement);
      if (mainPost) {
        tempContainer.appendChild(mainPost.clone());
      }
    } catch (e) {
      console.warn("Error finding forum main post:", e);
    }
    try {
      const commentsContainer = FORUM_COMMENTS_CONTAINER_SELECTORS.map((s) => root.querySelector(s)).find(isElement);
      if (commentsContainer) {
        const clonedComments = commentsContainer.clone();
        if (isElement(clonedComments)) {
          stripNonContent(clonedComments);
          tempContainer.appendChild(clonedComments);
        }
      }
    } catch (e) {
      console.warn("Error finding forum comments container:", e);
    }
    if (tempContainer.childNodes.length > 0) {
      return tempContainer;
    }
    const body = root.querySelector("body");
    if (body) {
      const clonedBody = body.clone();
      if (isElement(clonedBody)) {
        stripNonContent(clonedBody);
        this.removeHighLinkDensityElements(clonedBody, DEFAULT_LINK_DENSITY_THRESHOLD);
        return clonedBody;
      }
    }
    return root;
  }

  // Helper function to check link density within an element
  /**
   * Reports whether the share of an element's text that sits inside links
   * exceeds `threshold`. Elements with little text or fewer than three links
   * are never considered link-heavy.
   * @param element Element to inspect.
   * @param threshold Ratio (link text length / total text length) to exceed.
   * @returns true when the link density is above the threshold.
   */
  hasHighLinkDensity(element, threshold) {
    const textLength = (element.textContent || "").length;
    if (textLength < MIN_LINK_DENSITY_TEXT_LENGTH) return false;
    const links = element.querySelectorAll("a");
    if (links.length < 3) return false;
    let linkTextLength = 0;
    for (const link of links) {
      // Guard kept from the original: an anchor is counted only when its own
      // closest("a") lookup resolves to itself.
      if (link.closest("a") === link) {
        linkTextLength += link.textContent?.length || 0;
      }
    }
    return linkTextLength / textLength > threshold;
  }

  // --- Markdown Postprocessing ---
  /**
   * Normalises Turndown output: spacing around headings, list items and code
   * fences, removal of empty links/images, protocol-relative URL repair,
   * newline collapsing, de-duplication of repeated long lines, and optional
   * truncation.
   * @param markdown Markdown produced by Turndown.
   * @param options Conversion options; `maxContentLength` caps the result.
   * @returns The cleaned (and possibly truncated) Markdown.
   */
  postprocessMarkdown(markdown, options) {
    let processed = markdown;
    // Blank lines around headings.
    processed = processed.replace(/^(\s*\n)?(#{1,6}\s.*)$/gm, "\n\n$2\n\n");
    // Blank line before list items / blockquote markers ...
    processed = processed.replace(/^(\s*\n)?(([\*\-+>]|\d+\.)\s)/gm, (_match, _p1, p2) => `\n\n${p2}`);
    // ... then rejoin consecutive list items.
    processed = processed.replace(
      /(\n([\*\-+]|\d+\.)\s(?:(?!\n\n|\n {2,}|\n\t)[\s\S])*?)\n(?=([\*\-+]|\d+\.)\s)/g,
      "$1"
    );
    // Drop links and images without text/alt.
    processed = processed.replace(/\[\]\([^)]*\)/g, "");
    processed = processed.replace(/!\[\]\([^)]*\)/g, "");
    // Protocol-relative targets become https.
    processed = processed.replace(/(!?\[[^\]]*\]\()(\/\/)/g, "$1https://");
    const maxNewlines = "\n".repeat(POSTPROCESSING_MAX_CONSECUTIVE_NEWLINES + 1);
    const newlineRegex = new RegExp(`${maxNewlines}+`, "g");
    processed = processed.replace(newlineRegex, "\n".repeat(POSTPROCESSING_MAX_CONSECUTIVE_NEWLINES));
    processed = processed.replace(/^[ \t]+|[ \t]+$/gm, "");
    processed = processed.replace(/^(\s*\n)?(```(.*)\n[\s\S]*?\n```)(\s*\n)?/gm, "\n\n$2\n\n");
    // Collapse immediately repeated long lines.
    processed = processed.replace(/^(.{30,})$(\n\1)+/gm, "$1");
    processed = processed.replace(/(\n---\n)(\S)/g, "$1\n$2");
    const limit = options.maxContentLength;
    if (limit && processed.length > limit) {
      const suffix = "... (truncated)";
      if (limit <= suffix.length) {
        // No room for the marker: hard cut so the cap is still honoured.
        processed = processed.slice(0, limit);
      } else {
        // Reserve room for the marker so the result never exceeds `limit`.
        const budget = limit - suffix.length;
        const lastSentenceEnd = processed.lastIndexOf(".", budget - 1);
        // Prefer ending on a sentence, unless that would discard more than half the budgeted text.
        const sliceEnd = lastSentenceEnd > limit / 2 ? lastSentenceEnd + 1 : budget;
        processed = processed.slice(0, sliceEnd) + suffix;
      }
    }
    return processed.trim();
  }
};
667
+
668
+ // src/errors.ts
/**
 * Base error type for fetch failures. Carries an optional machine-readable
 * code, the underlying error and an HTTP status code.
 */
var FetchError = class _FetchError extends Error {
  /** A specific error code (e.g., ERR_NAVIGATION_TIMEOUT, ERR_HTTP_ERROR). */
  code;
  /** The original error object, if available. */
  originalError;
  /** HTTP status code, if relevant. */
  statusCode;
  /**
   * Creates an instance of FetchError.
   * @param message The error message.
   * @param code Optional error code string.
   * @param originalError Optional original error. When provided it is also
   *   exposed through the standard `cause` property.
   * @param statusCode Optional HTTP status code.
   */
  constructor(message, code, originalError, statusCode) {
    // Only pass `cause` when there is one, so errors without an underlying
    // failure do not grow a `cause: undefined` property.
    super(message, originalError === void 0 ? void 0 : { cause: originalError });
    this.name = "FetchError";
    this.code = code;
    this.originalError = originalError;
    this.statusCode = statusCode;
    // Error.captureStackTrace is not available on every engine, hence the guard.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, _FetchError);
    }
  }
};
694
+
695
+ // src/FetchEngine.ts
/**
 * Error raised for a non-OK HTTP response. It is a FetchError whose code is
 * always "ERR_HTTP_ERROR" and which carries the response status code.
 */
var FetchEngineHttpError = class extends FetchError {
  /**
   * @param message The error message.
   * @param statusCode HTTP status code of the failed response.
   */
  constructor(message, statusCode) {
    // There is no underlying error object for a plain HTTP failure.
    const noOriginalError = void 0;
    super(message, "ERR_HTTP_ERROR", noOriginalError, statusCode);
    this.name = "FetchEngineHttpError";
    this.statusCode = statusCode;
  }
};
703
/**
 * Engine that retrieves pages with the global `fetch` function and
 * optionally converts the HTML to Markdown. It holds no browser and no
 * persistent resources.
 */
var FetchEngine = class _FetchEngine {
  /** Engine-level options: the defaults merged with the constructor argument. */
  options;
  static DEFAULT_OPTIONS = {
    markdown: false
  };
  /**
   * Creates an instance of FetchEngine.
   * @param options Configuration options for the FetchEngine.
   */
  constructor(options = {}) {
    this.options = { ..._FetchEngine.DEFAULT_OPTIONS, ...options };
  }
  /**
   * Fetches HTML or converts to Markdown from the specified URL.
   *
   * @param url The URL to fetch.
   * @param options Optional per-request overrides of the engine options.
   *   Keys whose value is `undefined` are ignored, so they never mask the
   *   engine-level setting.
   * @returns A Promise resolving to an HTMLFetchResult object.
   * @throws {FetchEngineHttpError} If the HTTP response status is not ok (e.g., 404, 500).
   * @throws {FetchError} With code "ERR_NON_HTML_CONTENT" if the response is
   *   not `text/html`, or "ERR_FETCH_FAILED" for any other failure.
   */
  async fetchHTML(url, options) {
    const effectiveOptions = { ...this.options };
    for (const [key, value] of Object.entries(options ?? {})) {
      if (value !== undefined) {
        effectiveOptions[key] = value;
      }
    }
    try {
      const response = await fetch(url, {
        redirect: "follow",
        headers: {
          // Standard browser-like headers
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
          Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
          "Accept-Language": "en-US,en;q=0.9"
        }
      });
      if (!response.ok) {
        throw new FetchEngineHttpError(`HTTP error! status: ${response.status}`, response.status);
      }
      // Media types are case-insensitive, so normalise before matching.
      const contentTypeHeader = response.headers.get("content-type");
      if (!contentTypeHeader || !contentTypeHeader.toLowerCase().includes("text/html")) {
        throw new FetchError("Content-Type is not text/html", "ERR_NON_HTML_CONTENT");
      }
      const html = await response.text();
      const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
      const title = titleMatch ? titleMatch[1].trim() : null;
      let finalContent = html;
      let finalContentType = "html";
      if (effectiveOptions.markdown) {
        try {
          const converter = new MarkdownConverter();
          finalContent = converter.convert(html);
          finalContentType = "markdown";
        } catch (conversionError) {
          // Best effort: fall back to returning the raw HTML.
          console.error(`Markdown conversion failed for ${url} (FetchEngine):`, conversionError);
        }
      }
      return {
        content: finalContent,
        contentType: finalContentType,
        title,
        // Use the final URL after redirects
        url: response.url,
        isFromCache: false,
        statusCode: response.status,
        error: void 0
      };
    } catch (error) {
      // Errors raised deliberately above pass through untouched; everything
      // else is wrapped as ERR_FETCH_FAILED.
      if (error instanceof FetchEngineHttpError || error instanceof FetchError && error.code === "ERR_NON_HTML_CONTENT") {
        throw error;
      }
      const message = error instanceof Error ? error.message : "Unknown fetch error";
      throw new FetchError(`Fetch failed: ${message}`, "ERR_FETCH_FAILED", error instanceof Error ? error : void 0);
    }
  }
  /**
   * Cleans up resources used by the engine.
   * For FetchEngine, this is a no-op as it doesn't manage persistent resources.
   * @returns A Promise that resolves when cleanup is complete.
   */
  async cleanup() {
    return Promise.resolve();
  }
  /**
   * Retrieves metrics for the engine.
   * FetchEngine does not manage browsers, so it returns an empty array.
   * @returns An empty array.
   */
  getMetrics() {
    return [];
  }
};
792
+
793
+ // src/browser/PlaywrightBrowserPool.ts
794
+ var import_playwright = require("playwright");
795
+ var import_user_agents = __toESM(require("user-agents"), 1);
796
+ var import_uuid = require("uuid");
797
+ var import_p_queue = __toESM(require("p-queue"), 1);
798
+ var import_playwright_extra = require("playwright-extra");
799
// Chromium launcher wrapped by playwright-extra; only published once the
// stealth plugin has been registered on it.
var chromiumWithExtras;
// The stealth plugin instance registered on `chromiumWithExtras`.
var StealthPluginInstance;
// In-flight or completed initialisation, shared by every caller.
var loadDependenciesPromise;
/**
 * Lazily prepares the stealth-enabled Chromium launcher.
 *
 * The work runs at most once: concurrent callers await the same promise, so
 * none of them can observe the launcher before the stealth plugin has been
 * applied. If initialisation fails the cached promise is dropped so that a
 * later call can retry.
 *
 * @returns A Promise that resolves once `chromiumWithExtras` is ready.
 * @throws {Error} If the stealth plugin module does not expose a factory function.
 */
async function loadDependencies() {
  if (!loadDependenciesPromise) {
    loadDependenciesPromise = (async () => {
      const launcher = (0, import_playwright_extra.addExtra)(import_playwright.chromium);
      const StealthPluginModule = await import("puppeteer-extra-plugin-stealth");
      // The factory may be the default export or the module object itself.
      const stealthPluginFactory = typeof StealthPluginModule.default === "function" ? StealthPluginModule.default : StealthPluginModule;
      if (typeof stealthPluginFactory !== "function") {
        throw new Error("puppeteer-extra-plugin-stealth export is not a function or module structure is unexpected.");
      }
      StealthPluginInstance = stealthPluginFactory();
      launcher.use(StealthPluginInstance);
      // Publish last, so a half-configured launcher is never visible.
      chromiumWithExtras = launcher;
    })().catch((error) => {
      loadDependenciesPromise = void 0;
      throw error;
    });
  }
  await loadDependenciesPromise;
}
813
/**
 * Pool of Chromium browsers launched through the stealth-enabled launcher.
 *
 * Each pool entry owns one browser and one browser context. Pages are handed
 * out from the least-loaded healthy entry, and a health check evicts entries
 * that are disconnected, marked unhealthy, too old or idle.
 */
var PlaywrightBrowserPool = class _PlaywrightBrowserPool {
  /** Browser instances currently owned by the pool. */
  pool = new Set();
  /** Upper bound on the number of pool entries (default 2). */
  maxBrowsers;
  /** Maximum number of open pages per entry (default 6). */
  maxPagesPerContext;
  /** Age in ms after which an entry is recycled; 0 or less disables (default 20 min). */
  maxBrowserAge;
  /** Delay in ms between scheduled health checks; 0 or less disables (default 60 s). */
  healthCheckInterval;
  /** Handle of the pending health-check timer, if any. */
  healthCheckTimer = null;
  /** Idle time in ms after which a page-less entry may be removed (default 5 min). */
  maxIdleTime;
  /** True while `cleanup` is running; most operations bail out when set. */
  isCleaningUp = false;
  /** When true, browsers are launched with `headless: false`. */
  useHeadedMode;
  /** Domain fragments whose requests are aborted. */
  blockedDomains;
  /** Resource types whose requests are aborted. */
  blockedResourceTypes;
  /** Proxy settings forwarded to the browser launch options. */
  proxyConfig;
  static DEFAULT_BLOCKED_DOMAINS = [
    "doubleclick.net",
    "google-analytics.com",
    "googletagmanager.com",
    "googlesyndication.com",
    "googleadservices.com",
    "adservice.google.com",
    "facebook.net",
    "fbcdn.net",
    "connect.facebook.net",
    "ads-twitter.com",
    "platform.twitter.com",
    "analytics.tiktok.com",
    "ads.tiktok.com",
    "amazon-adsystem.com",
    "adnxs.com",
    "criteo.com",
    "scorecardresearch.com",
    "quantserve.com",
    "rubiconproject.com",
    "pubmatic.com",
    "taboola.com",
    "outbrain.com"
  ];
  static DEFAULT_BLOCKED_RESOURCE_TYPES = ["image", "font", "media", "websocket"];
  /** Serialises page acquisition (concurrency 1). */
  acquireQueue = new import_p_queue.default({ concurrency: 1 });
  /**
   * @param config Pool settings; every field is optional and falls back to
   *   the defaults documented on the corresponding property. Empty
   *   `blockedDomains` / `blockedResourceTypes` arrays select the static
   *   default lists.
   */
  constructor(config = {}) {
    this.maxBrowsers = config.maxBrowsers ?? 2;
    this.maxPagesPerContext = config.maxPagesPerContext ?? 6;
    this.maxBrowserAge = config.maxBrowserAge ?? 20 * 60 * 1e3;
    this.healthCheckInterval = config.healthCheckInterval ?? 60 * 1e3;
    this.useHeadedMode = config.useHeadedMode ?? false;
    this.maxIdleTime = config.maxIdleTime ?? 5 * 60 * 1e3;
    this.blockedDomains = config.blockedDomains && config.blockedDomains.length > 0 ? config.blockedDomains : _PlaywrightBrowserPool.DEFAULT_BLOCKED_DOMAINS;
    this.blockedResourceTypes = config.blockedResourceTypes && config.blockedResourceTypes.length > 0 ? config.blockedResourceTypes : _PlaywrightBrowserPool.DEFAULT_BLOCKED_RESOURCE_TYPES;
    this.proxyConfig = config.proxy;
  }
  /** Loads the launcher, fills the pool and starts the health-check timer. */
  async initialize() {
    await loadDependencies();
    if (this.isCleaningUp) return;
    await this.ensureMinimumInstances();
    this.scheduleHealthCheck();
  }
  /** (Re)arms the one-shot health-check timer; does nothing during cleanup. */
  scheduleHealthCheck() {
    if (this.isCleaningUp) return;
    if (this.healthCheckTimer) {
      clearTimeout(this.healthCheckTimer);
    }
    if (this.healthCheckInterval > 0) {
      this.healthCheckTimer = setTimeout(() => {
        // Health-check failures are deliberately ignored (best effort).
        this.healthCheck().catch((_err) => {
        });
      }, this.healthCheckInterval);
    }
  }
  /**
   * Creates browsers until the pool holds `maxBrowsers` entries. Stops at the
   * first launch failure instead of throwing.
   */
  async ensureMinimumInstances() {
    if (this.isCleaningUp) return;
    while (this.pool.size < this.maxBrowsers) {
      try {
        await this.createBrowserInstance();
      } catch (error) {
        break;
      }
    }
  }
  /**
   * Launches a browser, creates its context with request blocking installed,
   * and registers the resulting entry in the pool.
   *
   * @returns The newly created pool entry.
   * @throws Whatever the launch or context setup throws. In the latter case
   *   the already launched browser is closed first so it is not leaked.
   */
  async createBrowserInstance() {
    await loadDependencies();
    const id = (0, import_uuid.v4)();
    const launchOptions = {
      headless: !this.useHeadedMode,
      args: [
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-dev-shm-usage",
        "--disable-accelerated-2d-canvas",
        "--no-first-run",
        "--no-zygote",
        "--disable-gpu",
        "--mute-audio",
        "--disable-background-networking"
      ],
      proxy: this.proxyConfig
    };
    const browser = await chromiumWithExtras.launch(launchOptions);
    let context;
    try {
      context = await browser.newContext({
        userAgent: new import_user_agents.default().toString(),
        // Slightly randomised viewport around 1280x720.
        viewport: {
          width: 1280 + Math.floor(Math.random() * 120),
          height: 720 + Math.floor(Math.random() * 80)
        },
        javaScriptEnabled: true,
        ignoreHTTPSErrors: true
      });
      await context.route("**/*", async (route) => {
        const request = route.request();
        const url = request.url();
        const resourceType = request.resourceType();
        try {
          const hostname = new URL(url).hostname.toLowerCase();
          if (this.blockedDomains.some((domain) => hostname.includes(domain)) || this.blockedResourceTypes.includes(resourceType)) {
            await route.abort("aborted");
          } else {
            await route.continue();
          }
        } catch (_e) {
          // E.g. an unparsable URL: let the request through.
          await route.continue();
        }
      });
    } catch (setupError) {
      // The browser never made it into the pool, so nothing else would
      // ever close it.
      try {
        await browser.close();
      } catch (_closeError) {
      }
      throw setupError;
    }
    const now = new Date();
    const metrics = {
      id,
      pagesCreated: 0,
      activePages: 0,
      lastUsed: now,
      errors: 0,
      createdAt: now,
      isHealthy: true
    };
    const instance = {
      id,
      browser,
      context,
      pages: new Set(),
      metrics,
      isHealthy: true,
      disconnectedHandler: () => {
      }
    };
    // Defined after `instance` exists because the handler refers to it.
    instance.disconnectedHandler = () => {
      if (instance.isHealthy) {
        instance.isHealthy = false;
        instance.metrics.isHealthy = false;
        this.healthCheck().catch((_err) => {
        });
      }
    };
    browser.on("disconnected", instance.disconnectedHandler);
    this.pool.add(instance);
    return instance;
  }
  /**
   * Returns the healthy entry with spare page capacity that currently has
   * the fewest open pages, or null when there is none.
   */
  findLeastLoadedInstance() {
    let bestInstance = null;
    for (const instance of this.pool) {
      if (instance.isHealthy && instance.pages.size < this.maxPagesPerContext) {
        if (!bestInstance || instance.pages.size < bestInstance.pages.size) {
          bestInstance = instance;
        }
      }
    }
    return bestInstance;
  }
  /**
   * Opens a new page on the least-loaded healthy browser, creating a browser
   * first when none has capacity and the pool is below `maxBrowsers`.
   *
   * @returns A Promise for the new page (queued through `acquireQueue`).
   * @throws {Error} If the pool is shutting down, no browser is available or
   *   can be created, or the page cannot be opened.
   */
  acquirePage() {
    return this.acquireQueue.add(async () => {
      if (this.isCleaningUp) {
        throw new Error("Pool is shutting down.");
      }
      let bestInstance = this.findLeastLoadedInstance();
      if (!bestInstance && this.pool.size < this.maxBrowsers) {
        try {
          bestInstance = await this.createBrowserInstance();
        } catch (error) {
          throw new Error(`Failed to create new browser instance for acquisition: ${error.message}`);
        }
      }
      if (!bestInstance) {
        await this.ensureMinimumInstances();
        bestInstance = this.findLeastLoadedInstance();
        if (!bestInstance) {
          throw new Error("Failed to acquire Playwright page: No available or creatable browser instance.");
        }
      }
      const owner = bestInstance;
      try {
        const page = await owner.context.newPage();
        owner.pages.add(page);
        owner.metrics.pagesCreated++;
        owner.metrics.activePages = owner.pages.size;
        owner.metrics.lastUsed = new Date();
        page.on("close", () => {
          owner.pages.delete(page);
          owner.metrics.activePages = owner.pages.size;
          owner.metrics.lastUsed = new Date();
        });
        page.on("crash", () => {
          owner.metrics.errors++;
          owner.pages.delete(page);
          owner.isHealthy = false;
          owner.metrics.isHealthy = false;
          this.healthCheck().catch((_err) => {
          });
        });
        return page;
      } catch (error) {
        owner.metrics.errors++;
        owner.isHealthy = false;
        owner.metrics.isHealthy = false;
        this.healthCheck().catch((_err) => {
        });
        throw new Error(`Failed to create new page: ${error.message}`);
      }
    });
  }
  /**
   * Evicts entries that are disconnected, marked unhealthy, older than
   * `maxBrowserAge`, or idle beyond `maxIdleTime`. It then refills the pool
   * and re-arms the timer.
   *
   * Entries flagged unhealthy elsewhere (disconnect, page crash, failed page
   * open or close) are evicted here. Otherwise they would keep occupying a
   * pool slot without ever serving a page again.
   */
  async healthCheck() {
    if (this.isCleaningUp) return;
    const now = new Date();
    const checks = [];
    for (const instance of this.pool) {
      checks.push(
        (async () => {
          let shouldRemove = false;
          let reason = "unknown";
          if (!instance.browser.isConnected()) {
            shouldRemove = true;
            reason = "browser disconnected";
          }
          if (!shouldRemove && !instance.isHealthy) {
            shouldRemove = true;
            reason = "marked unhealthy";
          }
          if (!shouldRemove && this.maxBrowserAge > 0 && now.getTime() - instance.metrics.createdAt.getTime() > this.maxBrowserAge) {
            shouldRemove = true;
            reason = "max age reached";
          }
          if (!shouldRemove && this.pool.size > 1 && // Only remove idle if pool has more than 1
          instance.pages.size === 0 && this.maxIdleTime > 0 && now.getTime() - instance.metrics.lastUsed.getTime() > this.maxIdleTime) {
            shouldRemove = true;
            reason = "idle timeout";
          }
          if (shouldRemove) {
            instance.isHealthy = false;
            instance.metrics.isHealthy = false;
            await this.closeAndRemoveInstance(instance, reason);
          }
        })().catch((_err) => {
        })
      );
    }
    try {
      await Promise.allSettled(checks);
    } finally {
      await this.ensureMinimumInstances();
      this.scheduleHealthCheck();
    }
  }
  /**
   * Removes an entry from the pool and closes its context and browser.
   * Close failures are ignored. Does nothing if the entry was already removed.
   *
   * @param instance The pool entry to dispose of.
   * @param _reason Why the entry is removed (currently unused).
   */
  async closeAndRemoveInstance(instance, _reason) {
    const removed = this.pool.delete(instance);
    if (!removed) return;
    // Detach first so closing the browser does not re-trigger a health check.
    instance.browser.off("disconnected", instance.disconnectedHandler);
    try {
      await instance.context.close();
    } catch (_error) {
    }
    try {
      await instance.browser.close();
    } catch (_error) {
    }
  }
  /**
   * Closes a page and updates the bookkeeping of its owning entry. If closing
   * fails the owner is marked unhealthy.
   *
   * @param page The page to release; missing or already closed pages are ignored.
   */
  async releasePage(page) {
    if (!page || page.isClosed()) return;
    let ownerInstance;
    for (const instance of this.pool) {
      if (instance.pages.has(page)) {
        ownerInstance = instance;
        break;
      }
    }
    try {
      await page.close();
      if (ownerInstance) {
        ownerInstance.pages.delete(page);
        ownerInstance.metrics.activePages = ownerInstance.pages.size;
        ownerInstance.metrics.lastUsed = new Date();
      }
    } catch (error) {
      if (ownerInstance) {
        ownerInstance.isHealthy = false;
        ownerInstance.metrics.isHealthy = false;
        ownerInstance.metrics.errors++;
        ownerInstance.pages.delete(page);
        ownerInstance.metrics.activePages = ownerInstance.pages.size;
      }
    }
  }
  /**
   * Stops the health-check timer, drains the acquisition queue and closes
   * every browser. The pool can be used again afterwards because the
   * `isCleaningUp` flag is reset at the end.
   */
  async cleanup() {
    if (this.isCleaningUp) return;
    this.isCleaningUp = true;
    if (this.healthCheckTimer) {
      clearTimeout(this.healthCheckTimer);
      this.healthCheckTimer = null;
    }
    this.acquireQueue.clear();
    await this.acquireQueue.onIdle();
    const closePromises = [...this.pool].map((instance) => this.closeAndRemoveInstance(instance, "cleanup"));
    this.pool.clear();
    await Promise.allSettled(closePromises);
    this.isCleaningUp = false;
  }
  /**
   * @returns A snapshot of the metrics of every pool entry, with
   *   `activePages` and `isHealthy` taken from the live entry state.
   */
  getMetrics() {
    return [...this.pool].map((instance) => ({
      ...instance.metrics,
      activePages: instance.pages.size,
      isHealthy: instance.isHealthy
    }));
  }
};
1134
+
1135
+ // src/PlaywrightEngine.ts
1136
+ var import_p_queue2 = __toESM(require("p-queue"), 1);
1137
+ var import_axios = __toESM(require("axios"), 1);
1138
/**
 * Returns a Promise that settles (with no value) once `time` milliseconds
 * have elapsed.
 */
function delay(time) {
  return new Promise((wake) => {
    setTimeout(wake, time);
  });
}
1141
/**
 * Engine that fetches pages through a pool of Playwright browsers, with an
 * optional plain-HTTP first attempt, an in-memory result cache, and retries.
 */
var PlaywrightEngine = class _PlaywrightEngine {
  /** Browser pool; created lazily by `initializeBrowserPool`. */
  browserPool = null;
  /** Queue limiting concurrent Playwright fetches to `config.concurrentPages`. */
  queue;
  /** URL -> `{ result, timestamp }` cache entries. */
  cache = new Map();
  /** Effective configuration (defaults merged with the constructor argument). */
  config;
  // Browser pooling safety flags
  initializingBrowserPool = false;
  // Tracks current pool mode
  isUsingHeadedMode = false;
  // Stores domains marked for headed mode
  headedFallbackSites = new Set();
  // Default configuration - Ensure all required fields are present
  static DEFAULT_CONFIG = {
    concurrentPages: 3,
    maxRetries: 3,
    retryDelay: 5e3,
    cacheTTL: 15 * 60 * 1e3,
    useHttpFallback: true,
    useHeadedModeFallback: false,
    defaultFastMode: true,
    simulateHumanBehavior: true,
    maxBrowsers: 2,
    maxPagesPerContext: 6,
    maxBrowserAge: 20 * 60 * 1e3,
    healthCheckInterval: 60 * 1e3,
    poolBlockedDomains: [],
    poolBlockedResourceTypes: [],
    proxy: void 0,
    useHeadedMode: false,
    markdown: true
  };
  /**
   * Creates an instance of PlaywrightEngine.
   *
   * @param config Configuration options for the engine and its browser pool.
   *               See `PlaywrightEngineConfig` for details.
   */
  constructor(config = {}) {
    this.config = { ..._PlaywrightEngine.DEFAULT_CONFIG, ...config };
    this.queue = new import_p_queue2.default({ concurrency: this.config.concurrentPages });
  }
  /**
   * Initialize the browser pool with improved error handling and mode switching.
   *
   * Does nothing when a pool in the requested mode already exists. A pool in
   * the other mode is cleaned up and replaced. Callers that arrive while an
   * initialisation is in progress poll until it has finished.
   *
   * @param useHeadedMode Whether the pool should launch headed browsers.
   * @throws Whatever pool creation or initialisation throws; the engine is
   *   left without a pool in that case.
   */
  async initializeBrowserPool(useHeadedMode = false) {
    if (this.browserPool && this.isUsingHeadedMode === useHeadedMode) {
      return;
    }
    if (this.initializingBrowserPool) {
      while (this.initializingBrowserPool) {
        await delay(100);
      }
      if (this.browserPool && this.isUsingHeadedMode === useHeadedMode) {
        return;
      }
    }
    this.initializingBrowserPool = true;
    try {
      if (this.browserPool && this.isUsingHeadedMode !== useHeadedMode) {
        await this.browserPool.cleanup();
        this.browserPool = null;
      }
      this.isUsingHeadedMode = useHeadedMode;
      this.browserPool = new PlaywrightBrowserPool({
        maxBrowsers: this.config.maxBrowsers,
        maxPagesPerContext: this.config.maxPagesPerContext,
        maxBrowserAge: this.config.maxBrowserAge,
        healthCheckInterval: this.config.healthCheckInterval,
        useHeadedMode,
        blockedDomains: this.config.poolBlockedDomains,
        blockedResourceTypes: this.config.poolBlockedResourceTypes,
        proxy: this.config.proxy
      });
      await this.browserPool.initialize();
    } catch (error) {
      this.browserPool = null;
      this.isUsingHeadedMode = false;
      throw error;
    } finally {
      this.initializingBrowserPool = false;
    }
  }
  /**
   * Fallback method using simple HTTP requests via Axios.
   * Ensures return type matches HTMLFetchResult.
   *
   * @param url The URL to fetch.
   * @param convertToMarkdown Whether to convert the HTML to Markdown.
   *   Defaults to the engine-level `markdown` setting, so existing
   *   single-argument calls behave as before.
   * @throws {FetchError} "ERR_CHALLENGE_PAGE" when the body looks like a bot
   *   challenge, "ERR_HTTP_FALLBACK_FAILED" for any other failure.
   */
  async fetchHTMLWithHttpFallback(url, convertToMarkdown = this.config.markdown) {
    try {
      const response = await import_axios.default.get(url, {
        headers: {
          // Use more standard browser-like headers
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
          Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
          "Accept-Language": "en-US,en;q=0.9",
          // Allow compression
          "Accept-Encoding": "gzip, deflate, br",
          // Common referer
          Referer: "https://www.google.com/",
          "Upgrade-Insecure-Requests": "1",
          "Sec-Ch-Ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
          "Sec-Ch-Ua-Mobile": "?0",
          "Sec-Ch-Ua-Platform": '"Windows"',
          "Sec-Fetch-Dest": "document",
          "Sec-Fetch-Mode": "navigate",
          "Sec-Fetch-Site": "cross-site",
          "Sec-Fetch-User": "?1",
          // Keep connection open
          Connection: "keep-alive"
          // Avoid Cache-Control/Pragma unless specifically needed
        },
        maxRedirects: 5,
        timeout: 3e4,
        responseType: "text",
        // Decompress response automatically
        decompress: true
      });
      const titleMatch = response.data.match(/<title[^>]*>([^<]+)<\/title>/i);
      let title = titleMatch ? titleMatch[1].trim() : "";
      // A body that is only text inside <html>...</html>: use that text as the title.
      if (!title && /<html>([^<]+)<\/html>/.test(response.data)) {
        title = response.data.replace(/<\/?html>/g, "").trim();
      }
      const lowerHtml = response.data.toLowerCase();
      const isChallengeOrBot = /cloudflare|checking your browser|please wait|verification|captcha|attention required/i.test(lowerHtml);
      if (isChallengeOrBot) {
        throw new FetchError("Received challenge page via HTTP fallback", "ERR_CHALLENGE_PAGE");
      }
      const originalHtml = response.data;
      let finalContent = originalHtml;
      let finalContentType = "html";
      if (convertToMarkdown) {
        try {
          const converter = new MarkdownConverter();
          finalContent = converter.convert(originalHtml);
          finalContentType = "markdown";
        } catch (conversionError) {
          // Best effort: fall back to returning the raw HTML.
          console.error(`Markdown conversion failed for ${url} (HTTP fallback):`, conversionError);
        }
      }
      return {
        content: finalContent,
        contentType: finalContentType,
        // title is extracted from original HTML
        title,
        url: response.request?.res?.responseUrl || response.config.url || url,
        isFromCache: false,
        statusCode: response.status,
        error: void 0
      };
    } catch (error) {
      if (!(error instanceof FetchError)) {
        throw new FetchError(`HTTP fallback failed: ${error.message}`, "ERR_HTTP_FALLBACK_FAILED", error);
      }
      throw error;
    }
  }
  /**
   * Looks up a non-expired cache entry for `url`. Expired entries are
   * removed.
   *
   * @returns The cached result object, or null when there is no fresh entry.
   */
  checkCache(url) {
    const cached = this.cache.get(url);
    if (cached && Date.now() - cached.timestamp < this.config.cacheTTL) {
      return cached.result;
    }
    if (cached) {
      this.cache.delete(url);
    }
    return null;
  }
  /**
   * Safely check if a page is still usable and connected.
   */
  async isPageValid(page) {
    if (!page || page.isClosed()) return false;
    try {
      if (!page.context().browser()?.isConnected()) return false;
      // NOTE(review): the second argument looks intended as a timeout, but
      // Playwright documents it as the argument passed to the evaluated
      // expression — confirm against the Page.evaluate reference.
      await page.evaluate("1 + 1", { timeout: 1e3 });
      return true;
    } catch (error) {
      return false;
    }
  }
  /**
   * Simulate human-like interactions on the page: two mouse moves and two
   * smooth scrolls separated by randomised pauses. Failures are ignored.
   */
  async simulateHumanBehavior(page) {
    if (!await this.isPageValid(page)) return;
    try {
      const viewport = page.viewportSize();
      if (!viewport) return;
      await page.mouse.move(Math.random() * viewport.width, Math.random() * viewport.height / 3, { steps: 5 });
      await delay(150 + Math.random() * 200);
      await page.mouse.move(
        Math.random() * viewport.width,
        viewport.height / 2 + Math.random() * viewport.height / 2,
        { steps: 10 }
      );
      await delay(200 + Math.random() * 300);
      await page.evaluate(() => {
        window.scrollBy({
          top: window.innerHeight * (0.3 + Math.random() * 0.4),
          behavior: "smooth"
        });
      });
      await delay(400 + Math.random() * 600);
      await page.evaluate(() => {
        window.scrollBy({
          top: window.innerHeight * (0.2 + Math.random() * 0.3),
          behavior: "smooth"
        });
      });
      await delay(300 + Math.random() * 400);
    } catch (_error) {
    }
  }
  /**
   * Adds a result to the in-memory cache (no-op when `cacheTTL` is 0 or less).
   */
  addToCache(url, result) {
    if (this.config.cacheTTL <= 0) return;
    const entry = {
      // Mark as cached
      result: { ...result, isFromCache: true },
      timestamp: Date.now()
    };
    this.cache.set(url, entry);
  }
  /**
   * Public method to fetch HTML. Delegates to the internal recursive fetch method.
   *
   * @param url The URL to fetch.
   * @param options Optional settings for this specific fetch operation.
   * @param options.fastMode Overrides the engine's `defaultFastMode` configuration for this request.
   * @param options.markdown Overrides the engine's `markdown` configuration for this request.
   * @returns A Promise resolving to an HTMLFetchResult object.
   * @throws {FetchError} If the fetch fails after all retries or encounters critical errors.
   */
  async fetchHTML(url, options = {}) {
    const fetchConfig = {
      ...this.config,
      markdown: options.markdown === void 0 ? this.config.markdown : options.markdown,
      fastMode: options.fastMode === void 0 ? this.config.defaultFastMode : options.fastMode
    };
    return this._fetchRecursive(url, fetchConfig, 0, 0);
  }
  /**
   * Internal recursive method to handle fetching with retries.
   *
   * Order of attempts on the first call: cache, then the plain-HTTP
   * fallback (if enabled), then Playwright. A failing fast-mode attempt is
   * repeated once without fast mode before the regular retries start.
   *
   * @param url URL to fetch
   * @param currentConfig The merged configuration including markdown option
   * @param retryAttempt Current retry attempt number (starts at 0)
   * @param parentRetryCount Tracks retries related to pool initialization errors (starts at 0)
   * @returns Promise resolving to HTMLFetchResult
   */
  async _fetchRecursive(url, currentConfig, retryAttempt, parentRetryCount) {
    const useFastMode = currentConfig.fastMode;
    if (retryAttempt === 0 && parentRetryCount === 0) {
      const cachedResult = this.checkCache(url);
      if (cachedResult) {
        // Trust the recorded content type rather than guessing from the text.
        const cachedIsMarkdown = cachedResult.contentType === "markdown";
        if (currentConfig.markdown && !cachedIsMarkdown) {
          try {
            const converter = new MarkdownConverter();
            // Return a converted copy; the cached HTML entry stays intact.
            return {
              ...cachedResult,
              content: converter.convert(cachedResult.content),
              contentType: "markdown"
            };
          } catch (e) {
            console.error("Failed to convert cached result to markdown", e);
            return cachedResult;
          }
        }
        if (!currentConfig.markdown && cachedIsMarkdown) {
          console.warn("Cached result is Markdown, but HTML was requested. Re-fetching.");
          this.cache.delete(url);
          return this._fetchRecursive(url, currentConfig, 0, 0);
        }
        return cachedResult;
      }
    }
    try {
      if (currentConfig.useHttpFallback && retryAttempt === 0 && parentRetryCount === 0) {
        try {
          // Honour the per-request markdown choice, not just the engine default.
          const httpResult = await this.fetchHTMLWithHttpFallback(url, currentConfig.markdown);
          if (this.config.cacheTTL > 0) {
            this.addToCache(url, httpResult);
          }
          return httpResult;
        } catch (httpError) {
          // Any HTTP failure (including a challenge page) falls through to Playwright.
        }
      }
      const useHeadedMode = currentConfig.useHeadedModeFallback && (retryAttempt >= 2 || this.shouldUseHeadedMode(url)) || currentConfig.useHeadedMode;
      try {
        if (!this.browserPool || this.isUsingHeadedMode !== useHeadedMode) {
          await this.initializeBrowserPool(useHeadedMode);
        }
      } catch (initError) {
        // One extra attempt is granted for pool initialisation failures.
        if (parentRetryCount < 1) {
          await delay(currentConfig.retryDelay);
          return this._fetchRecursive(url, currentConfig, retryAttempt, parentRetryCount + 1);
        }
        throw new FetchError(
          `Pool init failed: ${initError.message}`,
          "ERR_POOL_INIT_FAILED",
          initError
        );
      }
      if (!this.browserPool) {
        throw new FetchError("Browser pool unavailable.", "ERR_POOL_UNAVAILABLE");
      }
      const result = await this.queue.add(
        () => this.fetchWithPlaywright(url, this.browserPool, useFastMode, currentConfig.markdown)
      );
      if (result && this.config.cacheTTL > 0) {
        this.addToCache(url, result);
      }
      if (!result) {
        throw new FetchError("Playwright fetch queued but no result.", "ERR_QUEUE_NO_RESULT");
      }
      return result;
    } catch (error) {
      if (useFastMode && retryAttempt === 0 && parentRetryCount === 0) {
        return this._fetchRecursive(url, { ...currentConfig, fastMode: false }, 0, parentRetryCount);
      }
      if (retryAttempt < currentConfig.maxRetries) {
        await delay(currentConfig.retryDelay);
        return this._fetchRecursive(url, currentConfig, retryAttempt + 1, parentRetryCount);
      }
      const errorMessage = error instanceof Error ? error.message : String(error);
      const finalError = error instanceof FetchError ? error : new FetchError(`Fetch failed: ${errorMessage}`, "ERR_FETCH_FAILED", error);
      throw new FetchError(
        `Fetch failed after ${currentConfig.maxRetries} retries: ${finalError.message}`,
        finalError.code,
        finalError.originalError || error,
        // Keep the HTTP status so callers can still tell a 404 from a 500.
        finalError.statusCode
      );
    }
  }
  /**
   * Performs the actual page fetch using a Playwright page from the pool.
   * Ensures return type matches HTMLFetchResult.
   *
   * @param url The URL to navigate to.
   * @param pool The browser pool that supplies (and later reclaims) the page.
   * @param fastMode When true, extra resource types are blocked and the
   *   human-behaviour simulation is skipped.
   * @param convertToMarkdown Whether to convert the page HTML to Markdown.
   * @throws {FetchError} "ERR_NAVIGATION", "ERR_NO_RESPONSE", "ERR_HTTP_ERROR"
   *   or "ERR_NON_HTML_CONTENT" depending on what went wrong.
   */
  async fetchWithPlaywright(url, pool, fastMode, convertToMarkdown) {
    let page = null;
    try {
      page = await pool.acquirePage();
      await this.applyBlockingRules(page, fastMode);
      let response = null;
      try {
        response = await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 6e4
        });
      } catch (navigationError) {
        throw new FetchError(
          `Playwright navigation failed: ${navigationError.message}`,
          "ERR_NAVIGATION",
          navigationError
        );
      }
      if (!response) {
        throw new FetchError("Playwright navigation did not return a response.", "ERR_NO_RESPONSE");
      }
      if (!response.ok()) {
        throw new FetchError(
          `HTTP error status received: ${response.status()}`,
          "ERR_HTTP_ERROR",
          void 0,
          response.status()
        );
      }
      const contentType = response.headers()["content-type"] || "";
      if (!contentType.includes("html")) {
        throw new FetchError(`Invalid content type received: ${contentType}`, "ERR_NON_HTML_CONTENT");
      }
      if (!fastMode && this.config.simulateHumanBehavior) {
        await this.simulateHumanBehavior(page);
      }
      const html = await page.content();
      const title = await page.title();
      const finalUrl = page.url();
      const status = response.status();
      let finalContent = html;
      let finalContentType = "html";
      if (convertToMarkdown) {
        try {
          const converter = new MarkdownConverter();
          finalContent = converter.convert(html);
          finalContentType = "markdown";
        } catch (conversionError) {
          // Best effort: fall back to returning the raw HTML.
          console.error(`Markdown conversion failed for ${url} (Playwright):`, conversionError);
        }
      }
      return {
        content: finalContent,
        contentType: finalContentType,
        title: title || null,
        url: finalUrl,
        isFromCache: false,
        statusCode: status,
        error: void 0
      };
    } finally {
      // Always hand the page back, whether the fetch succeeded or not.
      if (page) {
        await pool.releasePage(page);
      }
    }
  }
  /**
   * Installs a per-page route that aborts requests for blocked resource types
   * and for URLs matching a blocked domain pattern (`*` acts as a wildcard).
   * In fast mode, images, fonts, stylesheets and media are blocked as well.
   * Setup failures are ignored.
   */
  async applyBlockingRules(page, fastMode) {
    const blockedResources = fastMode ? this.config.poolBlockedResourceTypes.concat(["image", "font", "stylesheet", "media"]) : this.config.poolBlockedResourceTypes;
    const blockedDomains = this.config.poolBlockedDomains;
    if (blockedResources.length === 0 && blockedDomains.length === 0) {
      return;
    }
    try {
      // Compile the patterns once per page instead of once per request.
      const domainMatchers = blockedDomains.map(
        (pattern) => new RegExp(pattern.replace(/\./g, "\\.").replace(/\*/g, ".*"))
      );
      await page.route("**/*", (route) => {
        const resourceType = route.request().resourceType();
        const requestUrl = route.request().url();
        if (blockedResources.includes(resourceType)) {
          return route.abort();
        }
        if (domainMatchers.some((matcher) => matcher.test(requestUrl))) {
          return route.abort();
        }
        return route.continue();
      });
    } catch (_error) {
    }
  }
  /**
   * Cleans up resources used by the engine, primarily closing browser instances in the pool.
   *
   * It is crucial to call this method when finished with the engine instance to release resources.
   * Errors raised during cleanup are ignored.
   * @returns A Promise that resolves when cleanup is complete.
   */
  async cleanup() {
    try {
      await this.queue.onIdle();
      this.queue.clear();
      if (this.browserPool) {
        await this.browserPool.cleanup();
        this.browserPool = null;
      }
      this.isUsingHeadedMode = false;
    } catch (_error) {
    }
  }
  /**
   * Retrieves metrics from the underlying browser pool.
   * @returns An array of BrowserMetrics objects, one for each active browser instance, or an empty array if the pool is not initialized.
   */
  getMetrics() {
    if (this.browserPool) {
      return this.browserPool.getMetrics();
    }
    return [];
  }
  // Helper to check if a specific domain is marked for headed mode
  shouldUseHeadedMode(url) {
    if (!this.config.useHeadedModeFallback) return false;
    try {
      const domain = new URL(url).hostname;
      return this.headedFallbackSites.has(domain);
    } catch {
      return false;
    }
  }
};
1601
+
1602
+ // src/HybridEngine.ts
1603
/**
 * Engine that tries the lightweight FetchEngine first and falls back to the
 * PlaywrightEngine whenever that attempt throws.
 */
var HybridEngine = class {
  /** Fast path: plain `fetch`-based engine. */
  fetchEngine;
  /** Fallback: browser-based engine. */
  playwrightEngine;
  // Store config for potential per-request PW overrides
  config;
  /**
   * @param config Configuration handed to the PlaywrightEngine; its
   *   `markdown` field is also passed to the FetchEngine.
   */
  constructor(config = {}) {
    this.fetchEngine = new FetchEngine({ markdown: config.markdown });
    this.playwrightEngine = new PlaywrightEngine(config);
    this.config = config;
  }
  /**
   * Fetches a page, preferring the FetchEngine and falling back to the
   * PlaywrightEngine on any FetchEngine error.
   *
   * @param url The URL to fetch.
   * @param options Optional per-request settings. `markdown` is forwarded to
   *   both engines, so both attempts honour it. The fallback also receives
   *   the remaining options merged over the engine config.
   * @returns A Promise resolving to the result of whichever engine succeeded.
   * @throws The PlaywrightEngine error when the fallback fails as well.
   */
  async fetchHTML(url, options = {}) {
    // Forward only an explicit choice so the FetchEngine's own setting
    // still applies when the caller did not specify one.
    const fetchOptions = options.markdown === void 0 ? void 0 : { markdown: options.markdown };
    try {
      const fetchResult = await this.fetchEngine.fetchHTML(url, fetchOptions);
      return fetchResult;
    } catch (fetchError) {
      console.warn(`FetchEngine failed for ${url}: ${fetchError.message}. Falling back to PlaywrightEngine.`);
      const playwrightOptions = {
        // Start with base config given to HybridEngine
        ...this.config,
        // Override with per-request options
        ...options
      };
      try {
        const playwrightResult = await this.playwrightEngine.fetchHTML(url, playwrightOptions);
        return playwrightResult;
      } catch (playwrightError) {
        console.error(`PlaywrightEngine fallback failed for ${url}: ${playwrightError.message}`);
        throw playwrightError;
      }
    }
  }
  /**
   * Delegates getMetrics to the PlaywrightEngine.
   */
  getMetrics() {
    return this.playwrightEngine.getMetrics();
  }
  /**
   * Calls cleanup on both underlying engines.
   */
  async cleanup() {
    await Promise.allSettled([
      // Although a no-op, call for consistency
      this.fetchEngine.cleanup(),
      this.playwrightEngine.cleanup()
    ]);
  }
};
1651
+ // Annotate the CommonJS export names for ESM import in node:
1652
+ 0 && (module.exports = {
1653
+ FetchEngine,
1654
+ HybridEngine,
1655
+ PlaywrightEngine
1656
+ });
1657
+ //# sourceMappingURL=index.cjs.map