@purepageio/fetch-engines 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,794 @@
1
+ import TurndownService from "turndown";
2
+ import { gfm } from "turndown-plugin-gfm";
3
+ import { parse, HTMLElement as NHPHTMLElement } from "node-html-parser";
4
// --- Constants ---
// All selector lists are frozen: they are shared by every converter instance
// and are only ever read, so accidental mutation should fail loudly.

/**
 * Preprocessing - selectors removed from the parsed document up front
 * (balanced approach: JSON-LD scripts and titled iframes are excluded).
 */
const PREPROCESSING_REMOVE_SELECTORS = Object.freeze([
  "script:not([type='application/ld+json'])", // Keep JSON-LD
  "style",
  "noscript",
  "iframe:not([title])", // Keep iframes with titles (potential embeds)
]);

/**
 * Preprocessing - selectors for identifying potential main content.
 * The final entry, "body", is the fallback.
 */
const MAIN_CONTENT_SELECTORS = Object.freeze([
  // By semantics
  "article",
  "main",
  "[role='main']",
  "[role='article']",
  // By common class/id names (substring matches)
  "[class*='article-body']",
  "[class*='post-content']",
  "[class*='main-content']",
  "[class*='entry-content']",
  "[id*='article-body']",
  "[id*='main-content']",
  // Common CMS patterns
  ".article",
  ".post",
  ".content",
  ".entry",
  ".blog-post",
  // Fallback
  "body",
]);

/** Preprocessing - selectors counted as "comment" indicators during forum detection. */
const FORUM_COMMENT_SELECTORS = Object.freeze([
  ".comment",
  ".comments",
  ".comtr",
  '[id^="comment-"]',
  'div[id^="c_"]',
]);

/** Selectors counted as "thread" indicators during forum detection. */
const FORUM_THREAD_SELECTORS = Object.freeze([".thread", ".post", '[id^="thread-"]']);

/** Selectors counted as "vote" indicators during forum detection. */
const FORUM_VOTE_SELECTORS = Object.freeze([".vote", ".score", ".upvote", ".downvote", ".votelinks"]);

/** Selectors tried, in order, to locate a forum's main post. */
const FORUM_MAIN_POST_SELECTORS = Object.freeze([".fatitem", ".submission", ".op", ".original-post"]);

/** Selectors tried, in order, to locate a forum's comments container. */
const FORUM_COMMENTS_CONTAINER_SELECTORS = Object.freeze([".comment-tree", ".comments", "#comments"]);

/** Selectors stripped from extracted forum content. */
const FORUM_OBVIOUS_NON_CONTENT_SELECTORS = Object.freeze(["header", "footer", ".nav", ".sidebar"]);

// Preprocessing - Link Density
/** Elements whose text is shorter than this are never evaluated for link density. */
const MIN_LINK_DENSITY_TEXT_LENGTH = 50;
/** Link-text / total-text ratio above which an element is treated as boilerplate. */
const DEFAULT_LINK_DENSITY_THRESHOLD = 0.4;

// Preprocessing - Forum Detection
/** Minimum number of comment or vote indicator matches for a page to count as a forum. */
const MIN_FORUM_INDICATOR_COUNT = 3;

// Turndown - Code block detection
/** Class-name prefixes that carry a code block's language (e.g. "language-js"). */
const CODE_BLOCK_LANG_PREFIXES = Object.freeze(["language-", "lang-"]);

// Postprocessing
/** Maximum run of consecutive newlines kept in the output (2 keeps paragraphs separate). */
const POSTPROCESSING_MAX_CONSECUTIVE_NEWLINES = 2;
57
+ // --- Class Definition ---
58
/**
 * Converts an HTML document into Markdown in three stages:
 *
 * 1. `preprocessHTML` parses the HTML, strips boilerplate and picks the main
 *    content element (article-style or forum-style).
 * 2. Turndown, configured with the GFM plugin and the custom rules registered
 *    by this class, turns that HTML into Markdown.
 * 3. `postprocessMarkdown` tidies the Markdown (spacing, empty links/images,
 *    optional truncation) without touching the contents of fenced code blocks.
 */
export class MarkdownConverter {
  /** Turndown instance carrying the GFM plugin and all custom rules. */
  turndownService;

  constructor() {
    this.turndownService = new TurndownService({
      headingStyle: "atx",
      codeBlockStyle: "fenced",
      bulletListMarker: "-",
      strongDelimiter: "**",
      emDelimiter: "*",
      hr: "---",
      // Kept nodes are emitted as raw HTML only when explicitly marked;
      // everything else that is "kept" is dropped. nodeType is checked
      // (1 === Node.ELEMENT_NODE) instead of `window.HTMLElement` because
      // `window` is not defined outside a browser.
      keepReplacement: (_content, node) => {
        if (
          node.nodeType === 1 &&
          (node.getAttribute("role") === "presentation" || node.classList?.contains("preserve"))
        ) {
          return node.outerHTML;
        }
        return "";
      },
    });
    this.turndownService.use(gfm);
    this.setupPrioritizedRules();
  }

  // --- Public Method ---

  /**
   * Converts an HTML string to Markdown.
   *
   * @param {string} html - The HTML string to convert.
   * @param {{ maxContentLength?: number }} [options] - Conversion options;
   *   `maxContentLength` enables truncation in `postprocessMarkdown`.
   * @returns {string} The converted Markdown string.
   */
  convert(html, options = {}) {
    const preprocessedHtml = this.preprocessHTML(html);
    const markdown = this.turndownService.turndown(preprocessedHtml);
    return this.postprocessMarkdown(markdown, options);
  }

  // --- Turndown Rule Setup ---

  /** Registers every custom Turndown rule, group by group. */
  setupPrioritizedRules() {
    this.addContentExtractionRules();
    this.addStructureRules();
    this.addBlockRules();
    this.addInlineRules();
  }

  /**
   * Registers the pass-through rule for main-content containers and the rule
   * that drops tags which must never reach the Markdown output.
   *
   * Preprocessing is relied upon to remove nav/menus/high-link-density areas;
   * these rules mainly help Turndown with the structure of the intended content.
   */
  addContentExtractionRules() {
    this.turndownService.addRule("main-content-marker", {
      filter: (node) => {
        // Node.ELEMENT_NODE is 1
        if (node.nodeType !== 1) {
          return false;
        }
        if (node.tagName.toLowerCase() === "main") {
          return true;
        }
        if (["main", "article"].includes(node.getAttribute("role") || "")) {
          return true;
        }
        // "body" is only a fallback selector and must not mark content here.
        return MAIN_CONTENT_SELECTORS.some((selector) => {
          try {
            return selector !== "body" && node.matches(selector);
          } catch {
            return false;
          }
        });
      },
      // Content is passed through untouched.
      replacement: (content) => content,
    });

    // Elements that should definitely not be in Markdown ('svg' is intentionally not listed).
    const unwantedTags = [
      "script",
      "style",
      "noscript",
      "iframe",
      "button",
      "input",
      "select",
      "textarea",
      "form",
      "canvas",
      "audio",
      "video",
    ];
    this.turndownService.addRule("remove-unwanted", {
      filter: unwantedTags,
      replacement: () => "",
    });
  }

  /** Registers rules that surround `<article>` and `<section>` content with blank lines. */
  addStructureRules() {
    const separated = (content) => `\n\n${content}\n\n`;
    this.turndownService.addRule("article", { filter: "article", replacement: separated });
    this.turndownService.addRule("section", { filter: "section", replacement: separated });
    // Headings use Turndown's default ATX handling.
  }

  /** Registers rules for lists, list items and blockquotes (tables rely on the GFM plugin). */
  addBlockRules() {
    this.turndownService.addRule("list", {
      filter: ["ul", "ol"],
      replacement: (content, node) => {
        // Node.ELEMENT_NODE is 1
        if (node.nodeType !== 1) {
          return content;
        }
        // A list whose parent is a list item is nested and gets indented.
        const parent = node.parentNode;
        const indent = parent && parent.nodeName.toLowerCase() === "li" ? " " : "";
        const body = content
          .split("\n")
          .map((line) => indent + line.trimEnd())
          .join("\n")
          .trim();
        return "\n" + body + "\n";
      },
    });

    this.turndownService.addRule("listItem", {
      filter: "li",
      replacement: (content, node, options) => {
        const body = content
          .replace(/^\s+/gm, "") // Remove leading whitespace from each line
          .replace(/\n(?!\s*$)/gm, "\n ") // Indent continuation lines
          .trim();
        let prefix = options.bulletListMarker + " ";
        const parentNode = node.parentNode;
        if (parentNode && parentNode.nodeName === "OL") {
          try {
            const start = parentNode.getAttribute("start");
            const index = Array.prototype.indexOf.call(parentNode.children, node);
            prefix = (start ? Number(start) + index : index + 1) + ". ";
          } catch (e) {
            console.warn("Could not determine ordered list index:", e);
            prefix = "1. "; // Fallback
          }
        }
        // `body` is trimmed, so it never ends with a newline; one is added
        // whenever another sibling follows.
        return prefix + body + (node.nextSibling ? "\n" : "");
      },
    });

    this.turndownService.addRule("blockquote", {
      filter: "blockquote",
      replacement: (content) => "\n\n> " + content.trim().replace(/\n/g, "\n> ") + "\n\n",
    });
  }

  /** Registers rules for links, figures, standalone images, code blocks and inline code. */
  addInlineRules() {
    // Links - proper formatting and title preservation
    this.turndownService.addRule("link", {
      filter: (node, _options) => node.nodeType === 1 && node.nodeName === "A" && !!node.getAttribute("href"),
      replacement: (content, node) => {
        const href = node.getAttribute("href") || "";
        const title = node.getAttribute("title");
        // Fall back to the href when the link has no visible text.
        const text = content.trim() ? content.trim() : href;
        let destination = href;
        if (href.includes("%")) {
          try {
            const decoded = decodeURI(href);
            // A decoded URL containing whitespace, parentheses or angle
            // brackets would no longer be a valid Markdown link destination,
            // so the encoded form is kept in that case.
            if (!/[\s()<>]/.test(decoded)) {
              destination = decoded;
            }
          } catch (e) {
            console.warn(`Failed to decode URI, keeping original: ${href}`, e);
          }
        }
        return title ? `[${text}](${destination} "${title}")` : `[${text}](${destination})`;
      },
    });

    // Figures - image first, italic caption below, then any remaining content
    this.turndownService.addRule("figure", {
      filter: "figure",
      replacement: (content, node) => {
        // nodeType check instead of `instanceof window.HTMLElement`:
        // `window` is not defined outside a browser.
        if (node.nodeType !== 1) {
          return content;
        }
        const img = node.querySelector("img");
        const figcaption = node.querySelector("figcaption");
        let markdown = "";
        let remaining = content.trim();
        if (img) {
          const src = img.getAttribute("src") || "";
          const alt = img.getAttribute("alt") || "";
          const title = img.getAttribute("title");
          const plainImage = `![${alt}](${src})`;
          const renderedImage = title ? `![${alt}](${src} "${title}")` : plainImage;
          markdown = renderedImage;
          // Drop the image Turndown already rendered inside `content`, in
          // both its titled and untitled form, so it is not emitted twice.
          remaining = remaining.replace(renderedImage, "").replace(plainImage, "").trim();
        }
        if (figcaption) {
          const captionText = figcaption.textContent?.trim();
          if (captionText) {
            markdown += `\n\n_${captionText}_`; // Italic caption below the image
            remaining = remaining.replace(captionText, "").trim();
            remaining = remaining.replace(/^_+|_+$/g, "").trim(); // Leftover emphasis markers
          }
        }
        // Append leftovers only when they are more than insignificant residue.
        if (remaining && (remaining.length > 10 || /[a-zA-Z0-9]/.test(remaining))) {
          markdown += `\n\n${remaining}`;
        }
        return "\n\n" + markdown.trim() + "\n\n";
      },
    });

    // Standalone images (block display)
    this.turndownService.addRule("image", {
      filter: (node) => node.nodeType === 1 && node.nodeName === "IMG" && !!node.getAttribute("src"),
      replacement: (_content, node) => {
        const src = node.getAttribute("src") || "";
        const alt = node.getAttribute("alt") || "";
        const title = node.getAttribute("title");
        return title ? `\n\n![${alt}](${src} "${title}")\n\n` : `\n\n![${alt}](${src})\n\n`;
      },
    });

    // Code blocks - <pre> with a <code> child, a code-like class, or a language attribute
    this.turndownService.addRule("code-block", {
      filter: (node) => {
        if (node.nodeType !== 1 || node.tagName.toLowerCase() !== "pre") {
          return false;
        }
        return (
          node.querySelector("code") !== null ||
          /highlight|syntax|code|listing|source/i.test(node.className) ||
          !!node.getAttribute("lang") ||
          !!node.getAttribute("language")
        );
      },
      replacement: (content, node) => {
        // nodeType check instead of `instanceof window.HTMLElement`:
        // `window` is not defined outside a browser.
        if (node.nodeType !== 1) {
          return content.trim();
        }
        const codeElement = node.querySelector("code");
        // 1. Language from attributes on <pre> or <code>
        let language =
          node.getAttribute("lang") ||
          node.getAttribute("language") ||
          (codeElement ? codeElement.getAttribute("lang") || codeElement.getAttribute("language") : "") ||
          "";
        // 2. Language from the first non-empty "language-*" / "lang-*" class
        if (!language) {
          const classNames = `${node.className} ${codeElement?.className || ""}`.split(/\s+/).filter(Boolean);
          language =
            classNames
              .map((cls) => {
                const prefix = CODE_BLOCK_LANG_PREFIXES.find((candidate) => cls.startsWith(candidate));
                return prefix ? cls.substring(prefix.length) : "";
              })
              .find(Boolean) ?? "";
        }
        // Strip leading blank lines and trailing whitespace only; the
        // indentation of the first code line must survive.
        const code = content.replace(/^(?:[ \t]*\r?\n)+/, "").trimEnd();
        return "\n\n```" + language + "\n" + code + "\n```\n\n";
      },
    });

    // Inline code
    this.turndownService.addRule("inlineCode", {
      filter: (node) => node.nodeName === "CODE" && node.parentNode?.nodeName !== "PRE",
      replacement: (content) => {
        const code = content.trim();
        if (!code) {
          return ""; // Don't render empty code tags
        }
        if (!code.includes("`")) {
          return "`" + code + "`";
        }
        // Content containing backticks needs a double-backtick delimiter, and
        // padding when a backtick touches either end.
        const pad = code.startsWith("`") || code.endsWith("`") ? " " : "";
        return "``" + pad + code + pad + "``";
      },
    });
  }

  // --- HTML Preprocessing ---

  /**
   * Cleans the raw HTML, removes boilerplate, extracts the main content and
   * prepends extracted document metadata.
   *
   * @param {string} html - Raw HTML.
   * @returns {string} Metadata lines (if any) followed by the content HTML.
   *   On any failure the error is logged and the result of `cleanupHtml(html)`
   *   is returned instead.
   */
  preprocessHTML(html) {
    try {
      html = this.cleanupHtml(html);
      const root = parse(html, {
        comment: false,
        blockTextElements: { script: true, style: true, noscript: true },
      });
      if (root.nodeType === 3) {
        // Node.TEXT_NODE
        return root.textContent ?? "";
      }
      if (root.nodeType !== 1) {
        // Anything other than Node.ELEMENT_NODE
        console.warn("Unexpected root node type after parsing:", root.nodeType);
        return root.toString();
      }
      for (const selector of PREPROCESSING_REMOVE_SELECTORS) {
        try {
          root.querySelectorAll(selector).forEach((el) => el.remove());
        } catch (e) {
          console.warn(`Skipping invalid selector during preprocessing: ${selector}`, e);
        }
      }
      this.removeHighLinkDensityElements(root, DEFAULT_LINK_DENSITY_THRESHOLD);
      const metadata = this.extractDocumentMetadata(root);
      const contentElement = this.detectForumPage(root)
        ? this.extractForumContentElement(root)
        : this.extractArticleContentElement(root);
      const rawContent = contentElement instanceof NHPHTMLElement ? contentElement.outerHTML : contentElement.textContent;
      const contentHtml = this.cleanupContentHtml(rawContent || "");
      const metadataString = metadata.length > 0 ? metadata.join("\n\n") + "\n\n---\n\n" : "";
      return metadataString + contentHtml;
    } catch (error) {
      console.error("HTML preprocessing failed:", error);
      return this.cleanupHtml(html);
    }
  }

  /**
   * Removes noise from raw HTML before parsing.
   *
   * @param {string} html - Raw HTML.
   * @returns {string} HTML without the known junk pattern, simple
   *   `{{variable}}` placeholders, and control characters (tab, newline and
   *   carriage return are kept).
   */
  cleanupHtml(html) {
    return html
      .replace(/AMIL:\[=-,amilft[^\s]*/g, "")
      .replace(/\{\{\s*[^}\s]+\s*}}/g, "")
      .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
  }

  /**
   * Strips framework-specific attributes and comments from extracted content
   * HTML and normalises its whitespace.
   *
   * Whitespace inside `<pre>…</pre>` is left untouched so that code keeps its
   * indentation and line structure.
   *
   * @param {string} content - Extracted content HTML.
   * @returns {string} Cleaned, trimmed HTML.
   */
  cleanupContentHtml(content) {
    const stripped = content
      // Framework-specific data-* attributes
      .replace(/\s*data-(?:reactid|reactroot|react-|testid|v-|js-|qa-|cy-)[^=\s]*\s*=\s*(?:"[^"]*"|'[^']*'|\S+)/g, "")
      // Angular-specific attributes and classes
      .replace(/\s*ng-[^=\s]*\s*=\s*(?:"[^"]*"|'[^']*'|\S+)/g, "")
      .replace(/\s*_ngcontent-[^\s]*\s*=""/g, "")
      .replace(/\s*class\s*=\s*"(ng-|mat-)[^"]*"/g, "")
      // Comment nodes
      .replace(/<!--[\s\S]*?-->/g, "");
    // split() with a capturing group puts every <pre> element at an odd index.
    return stripped
      .split(/(<pre\b[\s\S]*?<\/pre\s*>)/i)
      .map((segment, index) =>
        index % 2 === 1
          ? segment
          : segment
              .replace(/([ \t])+/g, " ") // Collapse runs of spaces/tabs
              .replace(/\s*\n\s*/g, "\n"), // Trim whitespace around newlines
      )
      .join("")
      .trim();
  }

  /**
   * Computes the share of an element's text that sits inside links.
   *
   * @param {object} element - Element exposing `textContent` and `querySelectorAll`.
   * @returns {number|null} Link-text length divided by total text length, or
   *   `null` when the element is not evaluated (text shorter than
   *   `MIN_LINK_DENSITY_TEXT_LENGTH`, or fewer than three links).
   */
  computeLinkDensity(element) {
    const textLength = (element.textContent || "").length;
    if (textLength === 0 || textLength < MIN_LINK_DENSITY_TEXT_LENGTH) {
      return null;
    }
    const links = Array.from(element.querySelectorAll("a"));
    if (links.length < 3) {
      return null; // Require a minimum number of links
    }
    let linkTextLength = 0;
    for (const link of links) {
      // Skip links nested inside another link; their text is already part of
      // the outer link's text. The search starts at the parent because
      // closest() on the link itself would always match the link.
      if (link.parentNode?.closest?.("a")) {
        continue;
      }
      linkTextLength += link.textContent?.length || 0;
    }
    return linkTextLength / textLength;
  }

  /**
   * Tests whether an element matches a selector without assuming the element
   * implements `matches()`.
   *
   * NOTE(review): the original code called `matches()` on node-html-parser
   * elements behind a `@ts-expect-error`, which suggests the method may not
   * exist there — confirm. When it is missing, the element is looked up among
   * its parent's `querySelectorAll(selector)` results instead.
   *
   * @param {object} el - Element to test.
   * @param {string} selector - CSS selector.
   * @returns {boolean} `true` on a match; `false` otherwise, including when
   *   the selector is rejected or the element has no parent to query.
   */
  elementMatches(el, selector) {
    try {
      if (typeof el.matches === "function") {
        return Boolean(el.matches(selector));
      }
      const scope = el.parentNode;
      if (!scope || typeof scope.querySelectorAll !== "function") {
        return false;
      }
      return Array.from(scope.querySelectorAll(selector)).includes(el);
    } catch {
      return false;
    }
  }

  /**
   * Removes container elements whose link density exceeds `threshold`, unless
   * they contain, or are themselves, a main-content candidate.
   *
   * @param {object} element - Root element to clean (mutated in place).
   * @param {number} threshold - Link-density ratio above which an element is removed.
   */
  removeHighLinkDensityElements(element, threshold) {
    const potentialBoilerplate = element.querySelectorAll(
      "div, nav, ul, aside, section, .sidebar, .widget, .menu, [role='navigation'], [role='menubar']",
    );
    for (const el of Array.from(potentialBoilerplate)) {
      if (!(el instanceof NHPHTMLElement)) {
        continue;
      }
      const density = this.computeLinkDensity(el);
      if (density === null || !(density > threshold)) {
        continue;
      }
      const containsMainContent = el.querySelector('main, article, [role="main"], [role="article"]') !== null;
      const isMainContent = MAIN_CONTENT_SELECTORS.some((selector) => this.elementMatches(el, selector));
      if (!containsMainContent && !isMainContent) {
        el.remove();
      }
    }
  }

  /**
   * Collects document metadata as Markdown lines.
   *
   * For each key only the first non-empty source wins. The title is emitted
   * as an H1 at the front; other keys as `**Key:** value`. JSON-LD scripts
   * that parse successfully are appended inside a `<details>` block.
   *
   * @param {object} root - Parsed document root.
   * @returns {string[]} Metadata lines, possibly empty.
   */
  extractDocumentMetadata(root) {
    const metadata = [];
    const addedMeta = new Set(); // Lower-cased keys already emitted
    const addMeta = (key, value, isTitle = false) => {
      const cleanedValue = value?.trim();
      const normalizedKey = key.toLowerCase();
      if (!cleanedValue || addedMeta.has(normalizedKey)) {
        return;
      }
      if (isTitle) {
        metadata.unshift(`# ${cleanedValue}`);
      } else {
        metadata.push(`**${key}:** ${cleanedValue}`);
      }
      addedMeta.add(normalizedKey);
    };
    const attr = (selector, name) => root.querySelector(selector)?.getAttribute(name);
    const text = (selector) => root.querySelector(selector)?.textContent;

    // 1. Title (specific sources first, <title> last)
    addMeta("Title", attr("meta[property='og:title']", "content"), true);
    addMeta("Title", attr("meta[name='twitter:title']", "content"), true);
    addMeta("Title", attr("meta[name='DC.title']", "content"), true);
    addMeta("Title", text("title"), true);
    // 2. Description
    addMeta("Description", attr("meta[property='og:description']", "content"));
    addMeta("Description", attr("meta[name='twitter:description']", "content"));
    addMeta("Description", attr("meta[name='description']", "content"));
    addMeta("Description", attr("meta[name='DC.description']", "content"));
    // 3. Author
    addMeta("Author", attr("meta[name='author']", "content"));
    addMeta("Author", attr("meta[property='article:author']", "content"));
    addMeta("Author", text("[rel='author']"));
    // 4. Publication date
    addMeta("Published", attr("meta[property='article:published_time']", "content"));
    addMeta("Published", attr("meta[name='publish-date']", "content"));
    addMeta("Published", attr("time[itemprop='datePublished']", "datetime"));
    addMeta("Published", attr("time", "datetime")); // Generic time tag
    // 5. Canonical URL
    addMeta("URL", attr("link[rel='canonical']", "href"));
    addMeta("URL", attr("meta[property='og:url']", "content"));

    // 6. JSON-LD
    const jsonLdScripts = root.querySelectorAll("script[type='application/ld+json']");
    if (jsonLdScripts.length > 0) {
      const jsonLdData = Array.from(jsonLdScripts)
        .map((script) => {
          try {
            const scriptText = script.textContent;
            return scriptText ? JSON.parse(scriptText) : null;
          } catch {
            return null; // Invalid JSON-LD is skipped on purpose
          }
        })
        .filter((item) => item !== null);
      if (jsonLdData.length > 0 && !addedMeta.has("json-ld")) {
        // details/summary for collapsibility
        metadata.push("<details><summary>JSON-LD Metadata</summary>\n");
        metadata.push("```json", JSON.stringify(jsonLdData, null, 2), "```");
        metadata.push("</details>");
        addedMeta.add("json-ld");
      }
    }
    return metadata;
  }

  /**
   * Heuristically decides whether the document is a forum/discussion page.
   *
   * @param {object} root - Parsed document root.
   * @returns {boolean} `true` when there are at least
   *   `MIN_FORUM_INDICATOR_COUNT` comment or vote matches, more than one
   *   thread match, or the canonical/og URL hostname looks like a forum.
   */
  detectForumPage(root) {
    const countMatches = (selectors) =>
      selectors.reduce((count, selector) => {
        try {
          return root ? count + root.querySelectorAll(selector).length : count;
        } catch {
          return count; // Ignore selector errors
        }
      }, 0);
    const commentCount = countMatches(FORUM_COMMENT_SELECTORS);
    const threadCount = countMatches(FORUM_THREAD_SELECTORS);
    const voteCount = countMatches(FORUM_VOTE_SELECTORS);

    let isKnownForumHost = false;
    try {
      const canonicalUrl =
        root.querySelector('link[rel="canonical"]')?.getAttribute("href") ||
        root.querySelector('meta[property="og:url"]')?.getAttribute("content");
      if (canonicalUrl) {
        // The dummy base only matters when the canonical URL is relative.
        const hostname = new URL(canonicalUrl, "http://example.com").hostname.toLowerCase();
        isKnownForumHost = ["reddit.com", "news.ycombinator.com", "forum", "discuss", "community"].some((marker) =>
          hostname.includes(marker),
        );
      }
    } catch (e) {
      console.warn("Could not parse URL for forum detection:", e);
    }

    return (
      commentCount >= MIN_FORUM_INDICATOR_COUNT ||
      threadCount > 1 || // More than one thread item is a stronger indicator
      voteCount >= MIN_FORUM_INDICATOR_COUNT ||
      isKnownForumHost
    );
  }

  /**
   * Picks the element most likely to hold the main content of an article-like
   * page by scoring every match of `MAIN_CONTENT_SELECTORS`.
   *
   * The score starts at the trimmed text length and is boosted for
   * article/main tags or roles and for more than two paragraphs, and reduced
   * for boilerplate tags, boilerplate classes/roles and high link density.
   *
   * @param {object} root - Parsed document root.
   * @returns {object} The best-scoring element, or `root` when none qualifies.
   */
  extractArticleContentElement(root) {
    const boilerplateSelector =
      '.sidebar, .widget, .menu, .nav, .header, .footer, [role="navigation"], [role="complementary"], [role="banner"]';
    let bestCandidate = null;
    let maxScore = -1;
    for (const selector of MAIN_CONTENT_SELECTORS) {
      try {
        for (const element of Array.from(root.querySelectorAll(selector))) {
          if (!(element instanceof NHPHTMLElement)) {
            continue;
          }
          const textLength = (element.textContent || "").trim().length;
          // Needs some minimum text or at least one media element.
          if (textLength < 100 && !element.querySelector("img, video, iframe, figure")) {
            continue;
          }
          let score = textLength;
          if (["ARTICLE", "MAIN"].includes(element.tagName)) {
            score *= 1.5;
          }
          if (["main", "article"].includes(element.getAttribute("role") || "")) {
            score *= 1.5;
          }
          if (["HEADER", "FOOTER", "NAV", "ASIDE"].includes(element.tagName)) {
            score *= 0.3;
          }
          if (this.elementMatches(element, boilerplateSelector)) {
            score *= 0.2;
          }
          // Slightly higher threshold than the one used for removal.
          if (this.hasHighLinkDensity(element, 0.6)) {
            score *= 0.5;
          }
          if (element.querySelectorAll("p").length > 2) {
            score *= 1.2;
          }
          // The whole body is only acceptable when nothing else scored well.
          if (element.tagName === "BODY" && maxScore > 200) {
            continue;
          }
          if (score > maxScore) {
            maxScore = score;
            bestCandidate = element;
          }
        }
      } catch {
        // Best effort: a selector the parser rejects is skipped.
      }
    }
    return bestCandidate || root;
  }

  /**
   * Builds the content element for a forum-like page: a fresh `<div>` holding
   * clones of the main post and of the cleaned comments container.
   *
   * @param {object} root - Parsed document root.
   * @returns {object} The combined container; if neither part is found, a
   *   cleaned clone of `<body>`; if that is unavailable too, `root` itself.
   */
  extractForumContentElement(root) {
    const findFirst = (selectors) =>
      selectors.map((selector) => root.querySelector(selector)).find((el) => el instanceof NHPHTMLElement);
    const stripNonContent = (container) => {
      for (const selector of FORUM_OBVIOUS_NON_CONTENT_SELECTORS) {
        try {
          container.querySelectorAll(selector).forEach((el) => el.remove());
        } catch {
          /* ignore */
        }
      }
    };
    const tempContainer = parse("<div></div>").firstChild;

    // 1. Main post/submission
    try {
      const mainPost = findFirst(FORUM_MAIN_POST_SELECTORS);
      if (mainPost) {
        tempContainer.appendChild(mainPost.clone());
      }
    } catch (e) {
      console.warn("Error finding forum main post:", e);
    }

    // 2. Comments container, cleaned after cloning
    try {
      const commentsContainer = findFirst(FORUM_COMMENTS_CONTAINER_SELECTORS);
      if (commentsContainer) {
        const clonedComments = commentsContainer.clone();
        if (clonedComments instanceof NHPHTMLElement) {
          stripNonContent(clonedComments);
          tempContainer.appendChild(clonedComments);
        }
      }
    } catch (e) {
      console.warn("Error finding forum comments container:", e);
    }

    if (tempContainer.childNodes.length > 0) {
      return tempContainer;
    }

    // Fallback: the cleaned body
    const body = root.querySelector("body");
    if (body) {
      const clonedBody = body.clone();
      if (clonedBody instanceof NHPHTMLElement) {
        stripNonContent(clonedBody);
        this.removeHighLinkDensityElements(clonedBody, DEFAULT_LINK_DENSITY_THRESHOLD);
        return clonedBody;
      }
    }

    // Ultimate fallback: the original root
    return root;
  }

  /**
   * Reports whether an element's link density exceeds `threshold`.
   *
   * @param {object} element - Element exposing `textContent` and `querySelectorAll`.
   * @param {number} threshold - Link-density ratio to compare against.
   * @returns {boolean} `false` when the element is not evaluated at all (too
   *   little text or fewer than three links).
   */
  hasHighLinkDensity(element, threshold) {
    const density = this.computeLinkDensity(element);
    return density !== null && density > threshold;
  }

  // --- Markdown Postprocessing ---

  /**
   * Cleans up Markdown produced by Turndown.
   *
   * Fenced code blocks are set aside first and restored verbatim at the end,
   * so none of the text rewrites below can alter code (indentation,
   * `# comment` lines, repeated lines, ...).
   *
   * @param {string} markdown - Markdown to clean.
   * @param {{ maxContentLength?: number }} [options] - When
   *   `maxContentLength` is set and exceeded, the text is cut (preferably
   *   after a sentence end) and "... (truncated)" is appended; the result may
   *   therefore be longer than `maxContentLength` by up to the suffix length.
   * @returns {string} The cleaned, trimmed Markdown.
   */
  postprocessMarkdown(markdown, options = {}) {
    // 0. Stash fenced code blocks behind NUL-delimited placeholders. Existing
    //    NULs are removed first so a placeholder can never be forged by input.
    const codeBlocks = [];
    let processed = markdown
      .replace(/\u0000/g, "")
      .replace(/^([ \t]*)```([^\n]*)\n([\s\S]*?)\n[ \t]*```[ \t]*$/gm, (_match, indent, info, code) => {
        // Remove the opening fence's own indentation from every code line so
        // the block starts at column 0 but keeps its relative indentation.
        const dedent = new RegExp(`^[ \\t]{0,${indent.length}}`);
        const body = code
          .split("\n")
          .map((line) => line.replace(dedent, "").trimEnd())
          .join("\n");
        codeBlocks.push("```" + info.trimEnd() + "\n" + body + "\n```");
        return `\u0000CODE${codeBlocks.length - 1}\u0000`;
      });

    // 1. Blank lines around headings
    processed = processed.replace(/^(\s*\n)?(#{1,6}\s.*)$/gm, "\n\n$2\n\n");
    // 2. Blank line before list/quote markers, then join simple adjacent items
    processed = processed.replace(/^(\s*\n)?(([\*\-+>]|\d+\.)\s)/gm, (_match, _p1, p2) => `\n\n${p2}`);
    processed = processed.replace(/(\n([\*\-+]|\d+\.)\s(?:(?!\n\n|\n {2,}|\n\t)[\s\S])*?)\n(?=([\*\-+]|\d+\.)\s)/g, "$1");
    // 3. Remove empty links `[](...)` and empty images `![](...)`. A single
    //    pattern is used so an empty image does not leave a stray "!".
    processed = processed.replace(/!?\[\]\([^)]*\)/g, "");
    // 4. Give protocol-relative URLs (//host/...) an https scheme.
    //    Root-relative URLs would need a base URL, which is not available here.
    processed = processed.replace(/(!?\[[^\]]*\]\()(\/\/)/g, "$1https://");
    // 5. Trim leading/trailing spaces and tabs on every line
    processed = processed.replace(/^[ \t]+|[ \t]+$/gm, "");
    // 6. Blank lines around (stashed) code blocks
    processed = processed.replace(/\s*(\u0000CODE\d+\u0000)\s*/g, "\n\n$1\n\n");
    // 7. Cap consecutive newlines; runs after every step that inserts them
    const excessNewlines = new RegExp(`\\n{${POSTPROCESSING_MAX_CONSECUTIVE_NEWLINES + 1},}`, "g");
    processed = processed.replace(excessNewlines, "\n".repeat(POSTPROCESSING_MAX_CONSECUTIVE_NEWLINES));
    // 8. Collapse immediately repeated long lines (simple duplication check)
    processed = processed.replace(/^(.{30,})$(\n\1)+/gm, "$1");
    // 9. Blank line after a `---` separator
    processed = processed.replace(/(\n---\n)(\S)/g, "$1\n$2");
    // 10. Restore code blocks; a function replacer keeps "$" in code literal
    processed = processed.replace(/\u0000CODE(\d+)\u0000/g, (_match, index) => codeBlocks[Number(index)]);

    // 11. Truncate to max length if specified
    const { maxContentLength } = options;
    if (maxContentLength && processed.length > maxContentLength) {
      // Prefer a sentence boundary, looking back a little from the limit.
      const sentenceEnd = processed.lastIndexOf(".", maxContentLength - 15);
      const sliceEnd = sentenceEnd > maxContentLength / 2 ? sentenceEnd + 1 : maxContentLength;
      processed = processed.slice(0, sliceEnd) + "... (truncated)";
    }

    // 12. Final trim
    return processed.trim();
  }
}
794
+ //# sourceMappingURL=markdown-converter.js.map