defuddle 0.5.4 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/defuddle.js CHANGED
@@ -4,113 +4,9 @@ exports.Defuddle = void 0;
4
4
  const metadata_1 = require("./metadata");
5
5
  const extractor_registry_1 = require("./extractor-registry");
6
6
  const constants_1 = require("./constants");
7
- const math_full_1 = require("./elements/math.full");
8
- const code_1 = require("./elements/code");
9
- const footnotes_1 = require("./elements/footnotes");
10
- const headings_1 = require("./elements/headings");
7
+ const standardize_1 = require("./standardize");
11
8
  const scoring_1 = require("./scoring");
12
- const ELEMENT_STANDARDIZATION_RULES = [
13
- ...math_full_1.mathRules,
14
- ...code_1.codeBlockRules,
15
- ...headings_1.headingRules,
16
- // Convert divs with paragraph role to actual paragraphs
17
- {
18
- selector: 'div[data-testid^="paragraph"], div[role="paragraph"]',
19
- element: 'p',
20
- transform: (el, doc) => {
21
- const p = doc.createElement('p');
22
- // Copy innerHTML
23
- p.innerHTML = el.innerHTML;
24
- // Copy allowed attributes
25
- Array.from(el.attributes).forEach(attr => {
26
- if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
27
- p.setAttribute(attr.name, attr.value);
28
- }
29
- });
30
- return p;
31
- }
32
- },
33
- // Convert divs with list roles to actual lists
34
- {
35
- selector: 'div[role="list"]',
36
- element: 'ul',
37
- // Custom handler for list type detection and transformation
38
- transform: (el, doc) => {
39
- // First determine if this is an ordered list
40
- const firstItem = el.querySelector('div[role="listitem"] .label');
41
- const label = firstItem?.textContent?.trim() || '';
42
- const isOrdered = label.match(/^\d+\)/);
43
- // Create the appropriate list type
44
- const list = doc.createElement(isOrdered ? 'ol' : 'ul');
45
- // Process each list item
46
- const items = el.querySelectorAll('div[role="listitem"]');
47
- items.forEach(item => {
48
- const li = doc.createElement('li');
49
- const content = item.querySelector('.content');
50
- if (content) {
51
- // Convert any paragraph divs inside content
52
- const paragraphDivs = content.querySelectorAll('div[role="paragraph"]');
53
- paragraphDivs.forEach(div => {
54
- const p = doc.createElement('p');
55
- p.innerHTML = div.innerHTML;
56
- div.replaceWith(p);
57
- });
58
- // Convert any nested lists recursively
59
- const nestedLists = content.querySelectorAll('div[role="list"]');
60
- nestedLists.forEach(nestedList => {
61
- const firstNestedItem = nestedList.querySelector('div[role="listitem"] .label');
62
- const nestedLabel = firstNestedItem?.textContent?.trim() || '';
63
- const isNestedOrdered = nestedLabel.match(/^\d+\)/);
64
- const newNestedList = doc.createElement(isNestedOrdered ? 'ol' : 'ul');
65
- // Process nested items
66
- const nestedItems = nestedList.querySelectorAll('div[role="listitem"]');
67
- nestedItems.forEach(nestedItem => {
68
- const nestedLi = doc.createElement('li');
69
- const nestedContent = nestedItem.querySelector('.content');
70
- if (nestedContent) {
71
- // Convert paragraph divs in nested items
72
- const nestedParagraphs = nestedContent.querySelectorAll('div[role="paragraph"]');
73
- nestedParagraphs.forEach(div => {
74
- const p = doc.createElement('p');
75
- p.innerHTML = div.innerHTML;
76
- div.replaceWith(p);
77
- });
78
- nestedLi.innerHTML = nestedContent.innerHTML;
79
- }
80
- newNestedList.appendChild(nestedLi);
81
- });
82
- nestedList.replaceWith(newNestedList);
83
- });
84
- li.innerHTML = content.innerHTML;
85
- }
86
- list.appendChild(li);
87
- });
88
- return list;
89
- }
90
- },
91
- {
92
- selector: 'div[role="listitem"]',
93
- element: 'li',
94
- // Custom handler for list item content
95
- transform: (el, doc) => {
96
- const content = el.querySelector('.content');
97
- if (!content)
98
- return el;
99
- // Convert any paragraph divs inside content
100
- const paragraphDivs = content.querySelectorAll('div[role="paragraph"]');
101
- paragraphDivs.forEach(div => {
102
- const p = doc.createElement('p');
103
- p.innerHTML = div.innerHTML;
104
- div.replaceWith(p);
105
- });
106
- return content;
107
- }
108
- }
109
- ];
110
- // Type guard
111
- function isElement(node) {
112
- return node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE;
113
- }
9
+ const utils_1 = require("./utils");
114
10
  class Defuddle {
115
11
  /**
116
12
  * Create a new Defuddle instance
@@ -131,7 +27,7 @@ class Defuddle {
131
27
  const schemaOrgData = metadata_1.MetadataExtractor.extractSchemaOrgData(this.doc);
132
28
  const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData);
133
29
  try {
134
- // Try to use a specific extractor first
30
+ // Use site-specific extractor first, if there is one
135
31
  const url = this.options.url || this.doc.URL;
136
32
  const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
137
33
  if (extractor && extractor.canExtract()) {
@@ -153,13 +49,14 @@ class Defuddle {
153
49
  extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase()
154
50
  };
155
51
  }
156
- // Evaluate styles and sizes on original document
52
+ // Continue if there is no extractor...
53
+ // Evaluate mobile styles and sizes on original document
157
54
  const mobileStyles = this._evaluateMediaQueries(this.doc);
158
- // Check for small images in original document, excluding lazy-loaded ones
55
+ // Find small images in original document, excluding lazy-loaded ones
159
56
  const smallImages = this.findSmallImages(this.doc);
160
57
  // Clone document
161
58
  const clone = this.doc.cloneNode(true);
162
- // Apply mobile style to clone
59
+ // Apply mobile styles to clone
163
60
  this.applyMobileStyles(clone, mobileStyles);
164
61
  // Find main content
165
62
  const mainContent = this.findMainContent(clone);
@@ -172,14 +69,18 @@ class Defuddle {
172
69
  parseTime: Math.round(endTime - startTime)
173
70
  };
174
71
  }
175
- // Remove small images identified from original document
72
+ // Remove small images
176
73
  this.removeSmallImages(clone, smallImages);
177
- // Perform other destructive operations on the clone
74
+ // Remove hidden elements using computed styles
178
75
  this.removeHiddenElements(clone);
76
+ // Remove non-content blocks by scoring
77
+ // Tries to find lists, navigation based on text content and link density
78
+ scoring_1.ContentScorer.scoreAndRemove(clone, this.debug);
79
+ // Remove clutter using selectors
179
80
  this.removeClutter(clone);
180
- // Clean up the main content
181
- this.cleanContent(mainContent, metadata);
182
- const content = mainContent ? mainContent.outerHTML : this.doc.body.innerHTML;
81
+ // Normalize the main content
82
+ (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
83
+ const content = mainContent.outerHTML;
183
84
  const endTime = Date.now();
184
85
  return {
185
86
  content,
@@ -301,35 +202,10 @@ class Defuddle {
301
202
  }
302
203
  });
303
204
  }
304
- getWindow(doc) {
305
- // First try defaultView
306
- if (doc.defaultView) {
307
- return doc.defaultView;
308
- }
309
- // Then try ownerWindow
310
- if (doc.ownerWindow) {
311
- return doc.ownerWindow;
312
- }
313
- // Finally try to get window from document
314
- if (doc.window) {
315
- return doc.window;
316
- }
317
- return null;
318
- }
319
- getComputedStyle(element) {
320
- const win = this.getWindow(element.ownerDocument);
321
- if (!win)
322
- return null;
323
- return win.getComputedStyle(element);
324
- }
325
205
  removeHiddenElements(doc) {
326
206
  let count = 0;
327
207
  const elementsToRemove = new Set();
328
- // First pass: Get all elements matching hidden selectors
329
- const hiddenElements = doc.querySelectorAll(constants_1.HIDDEN_ELEMENT_SELECTORS);
330
- hiddenElements.forEach(el => elementsToRemove.add(el));
331
- count += hiddenElements.length;
332
- // Second pass: Get all elements and check their styles
208
+ // Get all elements and check their styles
333
209
  const allElements = Array.from(doc.getElementsByTagName('*'));
334
210
  // Process styles in batches to minimize layout thrashing
335
211
  const BATCH_SIZE = 100;
@@ -365,8 +241,7 @@ class Defuddle {
365
241
  }
366
242
  });
367
243
  }
368
- // Final pass: Batch remove all hidden elements
369
- elementsToRemove.forEach(el => el.remove());
244
+ // Batch remove all hidden elements
370
245
  this._log('Removed hidden elements:', count);
371
246
  }
372
247
  removeClutter(doc) {
@@ -425,754 +300,6 @@ class Defuddle {
425
300
  processingTime: `${(endTime - startTime).toFixed(2)}ms`
426
301
  });
427
302
  }
428
- flattenDivs(element) {
429
- let processedCount = 0;
430
- const startTime = Date.now();
431
- // Process in batches to maintain performance
432
- let keepProcessing = true;
433
- // Helper function to check if an element directly contains inline content
434
- // This helps prevent unwrapping divs that visually act as paragraphs.
435
- function hasDirectInlineContent(el) {
436
- for (const child of el.childNodes) {
437
- // Check for non-empty text nodes
438
- if (child.nodeType === constants_1.NODE_TYPE.TEXT_NODE && child.textContent?.trim()) {
439
- return true;
440
- }
441
- // Check for element nodes that are considered inline
442
- if (child.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())) {
443
- return true;
444
- }
445
- }
446
- return false;
447
- }
448
- const shouldPreserveElement = (el) => {
449
- const tagName = el.tagName.toLowerCase();
450
- // Check if element should be preserved
451
- if (constants_1.PRESERVE_ELEMENTS.has(tagName))
452
- return true;
453
- // Check for semantic roles
454
- const role = el.getAttribute('role');
455
- if (role && ['article', 'main', 'navigation', 'banner', 'contentinfo'].includes(role)) {
456
- return true;
457
- }
458
- // Check for semantic classes
459
- const className = el.className;
460
- if (typeof className === 'string' && className.toLowerCase().match(/(?:article|main|content|footnote|reference|bibliography)/)) {
461
- return true;
462
- }
463
- // Check if div contains mixed content types that should be preserved
464
- if (tagName === 'div') {
465
- const children = Array.from(el.children);
466
- const hasPreservedElements = children.some(child => constants_1.PRESERVE_ELEMENTS.has(child.tagName.toLowerCase()) ||
467
- child.getAttribute('role') === 'article' ||
468
- (child.className && typeof child.className === 'string' &&
469
- child.className.toLowerCase().match(/(?:article|main|content|footnote|reference|bibliography)/)));
470
- if (hasPreservedElements)
471
- return true;
472
- }
473
- return false;
474
- };
475
- const isWrapperDiv = (div) => {
476
- // If it directly contains inline content, it's NOT a wrapper
477
- if (hasDirectInlineContent(div)) {
478
- return false;
479
- }
480
- // Check if it's just empty space
481
- if (!div.textContent?.trim())
482
- return true;
483
- // Check if it only contains other divs or block elements
484
- const children = Array.from(div.children);
485
- if (children.length === 0)
486
- return true;
487
- // Check if all children are block elements
488
- const allBlockElements = children.every(child => {
489
- const tag = child.tagName.toLowerCase();
490
- return tag === 'div' || tag === 'p' || tag === 'h1' || tag === 'h2' ||
491
- tag === 'h3' || tag === 'h4' || tag === 'h5' || tag === 'h6' ||
492
- tag === 'ul' || tag === 'ol' || tag === 'pre' || tag === 'blockquote' ||
493
- tag === 'figure';
494
- });
495
- if (allBlockElements)
496
- return true;
497
- // Check for common wrapper patterns
498
- const className = div.className.toLowerCase();
499
- const isWrapper = /(?:wrapper|container|layout|row|col|grid|flex|outer|inner|content-area)/i.test(className);
500
- if (isWrapper)
501
- return true;
502
- // Check if it has excessive whitespace or empty text nodes
503
- const textNodes = Array.from(div.childNodes).filter(node => node.nodeType === constants_1.NODE_TYPE.TEXT_NODE && node.textContent?.trim() // TEXT_NODE
504
- );
505
- if (textNodes.length === 0)
506
- return true;
507
- // Check if it's a div that only contains block elements
508
- const hasOnlyBlockElements = children.length > 0 && !children.some(child => {
509
- const tag = child.tagName.toLowerCase();
510
- return constants_1.INLINE_ELEMENTS.has(tag);
511
- });
512
- if (hasOnlyBlockElements)
513
- return true;
514
- return false;
515
- };
516
- // Function to process a single div
517
- const processDiv = (div) => {
518
- // Skip processing if div has been removed or should be preserved
519
- if (!div.isConnected || shouldPreserveElement(div))
520
- return false;
521
- // Case 1: Empty div or div with only whitespace
522
- if (!div.hasChildNodes() || !div.textContent?.trim()) {
523
- div.remove();
524
- processedCount++;
525
- return true;
526
- }
527
- // Case 2: Top-level div - be more aggressive
528
- if (div.parentElement === element) {
529
- const children = Array.from(div.children);
530
- const hasOnlyBlockElements = children.length > 0 && !children.some(child => {
531
- const tag = child.tagName.toLowerCase();
532
- return constants_1.INLINE_ELEMENTS.has(tag);
533
- });
534
- if (hasOnlyBlockElements) {
535
- const fragment = this.doc.createDocumentFragment();
536
- while (div.firstChild) {
537
- fragment.appendChild(div.firstChild);
538
- }
539
- div.replaceWith(fragment);
540
- processedCount++;
541
- return true;
542
- }
543
- }
544
- // Case 3: Wrapper div - merge up aggressively
545
- if (isWrapperDiv(div)) {
546
- // Special case: if div only contains block elements, merge them up
547
- const children = Array.from(div.children);
548
- const onlyBlockElements = !children.some(child => {
549
- const tag = child.tagName.toLowerCase();
550
- return constants_1.INLINE_ELEMENTS.has(tag);
551
- });
552
- if (onlyBlockElements) {
553
- const fragment = this.doc.createDocumentFragment();
554
- while (div.firstChild) {
555
- fragment.appendChild(div.firstChild);
556
- }
557
- div.replaceWith(fragment);
558
- processedCount++;
559
- return true;
560
- }
561
- // Otherwise handle as normal wrapper
562
- const fragment = this.doc.createDocumentFragment();
563
- while (div.firstChild) {
564
- fragment.appendChild(div.firstChild);
565
- }
566
- div.replaceWith(fragment);
567
- processedCount++;
568
- return true;
569
- }
570
- // Case 4: Div only contains text and/or inline elements - convert to paragraph
571
- const childNodes = Array.from(div.childNodes);
572
- const hasOnlyInlineOrText = childNodes.length > 0 && childNodes.every(child => (child.nodeType === constants_1.NODE_TYPE.TEXT_NODE) ||
573
- (child.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())));
574
- if (hasOnlyInlineOrText && div.textContent?.trim()) { // Ensure there's actual content
575
- const p = this.doc.createElement('p');
576
- // Move all children (including inline tags like <font>) to the new <p>
577
- while (div.firstChild) {
578
- p.appendChild(div.firstChild);
579
- }
580
- div.replaceWith(p);
581
- processedCount++;
582
- return true;
583
- }
584
- // Case 5: Div has single child - unwrap only if child is block-level
585
- if (div.children.length === 1) {
586
- const child = div.firstElementChild;
587
- const childTag = child.tagName.toLowerCase();
588
- // Only unwrap if the single child is a block element and not preserved
589
- if (constants_1.BLOCK_ELEMENTS.includes(childTag) && !shouldPreserveElement(child)) {
590
- div.replaceWith(child);
591
- processedCount++;
592
- return true;
593
- }
594
- }
595
- // Case 6: Deeply nested div - merge up
596
- let nestingDepth = 0;
597
- let parent = div.parentElement;
598
- while (parent) {
599
- if (parent.tagName.toLowerCase() === 'div') {
600
- nestingDepth++;
601
- }
602
- parent = parent.parentElement;
603
- }
604
- // Only unwrap if nested AND does not contain direct inline content
605
- if (nestingDepth > 0 && !hasDirectInlineContent(div)) {
606
- const fragment = this.doc.createDocumentFragment();
607
- while (div.firstChild) {
608
- fragment.appendChild(div.firstChild);
609
- }
610
- div.replaceWith(fragment);
611
- processedCount++;
612
- return true;
613
- }
614
- return false;
615
- };
616
- // First pass: Process top-level divs
617
- const processTopLevelDivs = () => {
618
- const topDivs = Array.from(element.children).filter(el => el.tagName.toLowerCase() === 'div');
619
- let modified = false;
620
- topDivs.forEach(div => {
621
- if (processDiv(div)) {
622
- modified = true;
623
- }
624
- });
625
- return modified;
626
- };
627
- // Second pass: Process remaining divs from deepest to shallowest
628
- const processRemainingDivs = () => {
629
- const allDivs = Array.from(element.getElementsByTagName('div'))
630
- .sort((a, b) => {
631
- // Count nesting depth
632
- const getDepth = (el) => {
633
- let depth = 0;
634
- let parent = el.parentElement;
635
- while (parent) {
636
- if (parent.tagName.toLowerCase() === 'div')
637
- depth++;
638
- parent = parent.parentElement;
639
- }
640
- return depth;
641
- };
642
- return getDepth(b) - getDepth(a); // Process deepest first
643
- });
644
- let modified = false;
645
- allDivs.forEach(div => {
646
- if (processDiv(div)) {
647
- modified = true;
648
- }
649
- });
650
- return modified;
651
- };
652
- // Final cleanup pass - aggressively flatten remaining divs
653
- const finalCleanup = () => {
654
- const remainingDivs = Array.from(element.getElementsByTagName('div'));
655
- let modified = false;
656
- remainingDivs.forEach(div => {
657
- // Only perform final cleanup/unwrap if the div is still connected,
658
- // not preserved, and does not contain direct inline content.
659
- if (div.isConnected && !shouldPreserveElement(div) && !hasDirectInlineContent(div)) {
660
- const children = Array.from(div.children);
661
- const onlyParagraphs = children.length > 0 && children.every(child => child.tagName.toLowerCase() === 'p');
662
- // Unwrap if it only contains paragraphs OR is identified as a wrapper
663
- if (onlyParagraphs || isWrapperDiv(div)) {
664
- const fragment = this.doc.createDocumentFragment();
665
- while (div.firstChild) {
666
- fragment.appendChild(div.firstChild);
667
- }
668
- div.replaceWith(fragment);
669
- processedCount++;
670
- modified = true;
671
- }
672
- }
673
- });
674
- return modified;
675
- };
676
- // Execute all passes until no more changes
677
- do {
678
- keepProcessing = false;
679
- if (processTopLevelDivs())
680
- keepProcessing = true;
681
- if (processRemainingDivs())
682
- keepProcessing = true;
683
- if (finalCleanup())
684
- keepProcessing = true;
685
- } while (keepProcessing);
686
- const endTime = Date.now();
687
- this._log('Flattened divs:', {
688
- count: processedCount,
689
- processingTime: `${(endTime - startTime).toFixed(2)}ms`
690
- });
691
- }
692
- cleanContent(element, metadata) {
693
- this.standardizeSpaces(element);
694
- // Remove HTML comments
695
- this.removeHtmlComments(element);
696
- // Handle H1 elements - remove first one and convert others to H2
697
- this.standardizeHeadings(element, metadata.title);
698
- // Standardize footnotes and citations
699
- (0, footnotes_1.standardizeFootnotes)(element);
700
- // Handle lazy-loaded images
701
- this.handleLazyImages(element);
702
- // Convert embedded content to standard formats
703
- this.standardizeElements(element);
704
- // If not debug mode, do the full cleanup
705
- if (!this.debug) {
706
- // First pass of div flattening
707
- this.flattenDivs(element);
708
- // Strip unwanted attributes
709
- this.stripUnwantedAttributes(element);
710
- // Remove empty elements
711
- this.removeEmptyElements(element);
712
- // Remove trailing headings
713
- this.removeTrailingHeadings(element);
714
- // Final pass of div flattening after cleanup operations
715
- this.flattenDivs(element);
716
- // Standardize consecutive br elements
717
- this.stripExtraBrElements(element);
718
- // Clean up empty lines
719
- this.removeEmptyLines(element);
720
- }
721
- else {
722
- // In debug mode, still do basic cleanup but preserve structure
723
- this.stripUnwantedAttributes(element);
724
- this.removeEmptyElements(element);
725
- this.removeTrailingHeadings(element);
726
- this.stripExtraBrElements(element);
727
- this._log('Debug mode: Skipping div flattening to preserve structure');
728
- }
729
- }
730
- standardizeSpaces(element) {
731
- const processNode = (node) => {
732
- // Skip pre and code elements
733
- if (node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) {
734
- const tag = node.tagName.toLowerCase();
735
- if (tag === 'pre' || tag === 'code') {
736
- return;
737
- }
738
- }
739
- // Process text nodes
740
- if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) {
741
- const text = node.textContent || '';
742
- // Replace &nbsp; with regular spaces, except when it's a single &nbsp; between words
743
- const newText = text.replace(/\xA0+/g, (match) => {
744
- // If it's a single &nbsp; between word characters, preserve it
745
- if (match.length === 1) {
746
- const prev = node.previousSibling?.textContent?.slice(-1);
747
- const next = node.nextSibling?.textContent?.charAt(0);
748
- if (prev?.match(/\w/) && next?.match(/\w/)) {
749
- return '\xA0';
750
- }
751
- }
752
- return ' '.repeat(match.length);
753
- });
754
- if (newText !== text) {
755
- node.textContent = newText;
756
- }
757
- }
758
- // Process children recursively
759
- if (node.hasChildNodes()) {
760
- Array.from(node.childNodes).forEach(processNode);
761
- }
762
- };
763
- processNode(element);
764
- }
765
- removeTrailingHeadings(element) {
766
- let removedCount = 0;
767
- const hasContentAfter = (el) => {
768
- // Check if there's any meaningful content after this element
769
- let nextContent = '';
770
- let sibling = el.nextSibling;
771
- // First check direct siblings
772
- while (sibling) {
773
- if (sibling.nodeType === constants_1.NODE_TYPE.TEXT_NODE) { // TEXT_NODE
774
- nextContent += sibling.textContent || '';
775
- }
776
- else if (sibling.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) { // ELEMENT_NODE
777
- // If we find an element sibling, check its content
778
- nextContent += sibling.textContent || '';
779
- }
780
- sibling = sibling.nextSibling;
781
- }
782
- // If we found meaningful content at this level, return true
783
- if (nextContent.trim()) {
784
- return true;
785
- }
786
- // If no content found at this level and we have a parent,
787
- // check for content after the parent
788
- const parent = el.parentElement;
789
- if (parent && parent !== element) {
790
- return hasContentAfter(parent);
791
- }
792
- return false;
793
- };
794
- // Process all headings from bottom to top
795
- const headings = Array.from(element.querySelectorAll('h1, h2, h3, h4, h5, h6'))
796
- .reverse();
797
- headings.forEach(heading => {
798
- if (!hasContentAfter(heading)) {
799
- heading.remove();
800
- removedCount++;
801
- }
802
- else {
803
- // Stop processing once we find a heading with content after it
804
- return;
805
- }
806
- });
807
- if (removedCount > 0) {
808
- this._log('Removed trailing headings:', removedCount);
809
- }
810
- }
811
- standardizeHeadings(element, title) {
812
- const normalizeText = (text) => {
813
- return text
814
- .replace(/\u00A0/g, ' ') // Convert non-breaking spaces to regular spaces
815
- .replace(/\s+/g, ' ') // Normalize all whitespace to single spaces
816
- .trim()
817
- .toLowerCase();
818
- };
819
- const h1s = element.getElementsByTagName('h1');
820
- Array.from(h1s).forEach(h1 => {
821
- const h2 = this.doc.createElement('h2');
822
- h2.innerHTML = h1.innerHTML;
823
- // Copy allowed attributes
824
- Array.from(h1.attributes).forEach(attr => {
825
- if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
826
- h2.setAttribute(attr.name, attr.value);
827
- }
828
- });
829
- h1.parentNode?.replaceChild(h2, h1);
830
- });
831
- // Remove first H2 if it matches title
832
- const h2s = element.getElementsByTagName('h2');
833
- if (h2s.length > 0) {
834
- const firstH2 = h2s[0];
835
- const firstH2Text = normalizeText(firstH2.textContent || '');
836
- const normalizedTitle = normalizeText(title);
837
- if (normalizedTitle && normalizedTitle === firstH2Text) {
838
- firstH2.remove();
839
- }
840
- }
841
- }
842
- removeHtmlComments(element) {
843
- let removedCount = 0;
844
- // Get all elements and check their child nodes
845
- const allElements = Array.from(element.getElementsByTagName('*'));
846
- // Process each element's child nodes
847
- allElements.forEach(el => {
848
- const childNodes = Array.from(el.childNodes);
849
- childNodes.forEach(node => {
850
- if (node.nodeType === 8) { // 8 is the node type for comments
851
- node.remove();
852
- removedCount++;
853
- }
854
- });
855
- });
856
- this._log('Removed HTML comments:', removedCount);
857
- }
858
- stripUnwantedAttributes(element) {
859
- let attributeCount = 0;
860
- const processElement = (el) => {
861
- // Skip SVG elements - preserve all their attributes
862
- if (el.tagName.toLowerCase() === 'svg' || el.namespaceURI === 'http://www.w3.org/2000/svg') {
863
- return;
864
- }
865
- const attributes = Array.from(el.attributes);
866
- const tag = el.tagName.toLowerCase();
867
- attributes.forEach(attr => {
868
- const attrName = attr.name.toLowerCase();
869
- const attrValue = attr.value;
870
- // Special cases for preserving specific attributes
871
- if (
872
- // Preserve footnote IDs
873
- (attrName === 'id' && (attrValue.startsWith('fnref:') || // Footnote reference
874
- attrValue.startsWith('fn:') || // Footnote content
875
- attrValue === 'footnotes' // Footnotes container
876
- )) ||
877
- // Preserve code block language classes and footnote backref class
878
- (attrName === 'class' && ((tag === 'code' && attrValue.startsWith('language-')) ||
879
- attrValue === 'footnote-backref'))) {
880
- return;
881
- }
882
- // In debug mode, allow debug attributes and data- attributes
883
- if (this.debug) {
884
- if (!constants_1.ALLOWED_ATTRIBUTES.has(attrName) &&
885
- !constants_1.ALLOWED_ATTRIBUTES_DEBUG.has(attrName) &&
886
- !attrName.startsWith('data-')) {
887
- el.removeAttribute(attr.name);
888
- attributeCount++;
889
- }
890
- }
891
- else {
892
- // In normal mode, only allow standard attributes
893
- if (!constants_1.ALLOWED_ATTRIBUTES.has(attrName)) {
894
- el.removeAttribute(attr.name);
895
- attributeCount++;
896
- }
897
- }
898
- });
899
- };
900
- processElement(element);
901
- element.querySelectorAll('*').forEach(processElement);
902
- this._log('Stripped attributes:', attributeCount);
903
- }
904
- removeEmptyElements(element) {
905
- let removedCount = 0;
906
- let iterations = 0;
907
- let keepRemoving = true;
908
- while (keepRemoving) {
909
- iterations++;
910
- keepRemoving = false;
911
- // Get all elements without children, working from deepest first
912
- const emptyElements = Array.from(element.getElementsByTagName('*')).filter(el => {
913
- if (constants_1.ALLOWED_EMPTY_ELEMENTS.has(el.tagName.toLowerCase())) {
914
- return false;
915
- }
916
- // Check if element has only whitespace or &nbsp;
917
- const textContent = el.textContent || '';
918
- const hasOnlyWhitespace = textContent.trim().length === 0;
919
- const hasNbsp = textContent.includes('\u00A0'); // Unicode non-breaking space
920
- // Check if element has no meaningful children
921
- const hasNoChildren = !el.hasChildNodes() ||
922
- (Array.from(el.childNodes).every(node => {
923
- if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) { // TEXT_NODE
924
- const nodeText = node.textContent || '';
925
- return nodeText.trim().length === 0 && !nodeText.includes('\u00A0');
926
- }
927
- return false;
928
- }));
929
- // Special case: Check for divs that only contain spans with commas
930
- if (el.tagName.toLowerCase() === 'div') {
931
- const children = Array.from(el.children);
932
- const hasOnlyCommaSpans = children.length > 0 && children.every(child => {
933
- if (child.tagName.toLowerCase() !== 'span')
934
- return false;
935
- const content = child.textContent?.trim() || '';
936
- return content === ',' || content === '' || content === ' ';
937
- });
938
- if (hasOnlyCommaSpans)
939
- return true;
940
- }
941
- return hasOnlyWhitespace && !hasNbsp && hasNoChildren;
942
- });
943
- if (emptyElements.length > 0) {
944
- emptyElements.forEach(el => {
945
- el.remove();
946
- removedCount++;
947
- });
948
- keepRemoving = true;
949
- }
950
- }
951
- this._log('Removed empty elements:', removedCount, 'iterations:', iterations);
952
- }
953
- stripExtraBrElements(element) {
954
- let processedCount = 0;
955
- const startTime = Date.now();
956
- // Get all br elements directly
957
- const brElements = Array.from(element.getElementsByTagName('br'));
958
- // Keep track of consecutive br elements
959
- let consecutiveBrs = [];
960
- // Helper to process collected br elements
961
- const processBrs = () => {
962
- if (consecutiveBrs.length > 2) {
963
- // Keep only two br elements
964
- for (let i = 2; i < consecutiveBrs.length; i++) {
965
- consecutiveBrs[i].remove();
966
- processedCount++;
967
- }
968
- }
969
- consecutiveBrs = [];
970
- };
971
- // Process all br elements
972
- brElements.forEach(currentNode => {
973
- // Check if this br is consecutive with previous ones
974
- let isConsecutive = false;
975
- if (consecutiveBrs.length > 0) {
976
- const lastBr = consecutiveBrs[consecutiveBrs.length - 1];
977
- let node = currentNode.previousSibling;
978
- // Skip whitespace text nodes
979
- while (node && node.nodeType === constants_1.NODE_TYPE.TEXT_NODE && !node.textContent?.trim()) {
980
- node = node.previousSibling;
981
- }
982
- if (node === lastBr) {
983
- isConsecutive = true;
984
- }
985
- }
986
- if (isConsecutive) {
987
- consecutiveBrs.push(currentNode);
988
- }
989
- else {
990
- // Process any previously collected brs before starting new group
991
- processBrs();
992
- consecutiveBrs = [currentNode];
993
- }
994
- });
995
- // Process any remaining br elements
996
- processBrs();
997
- const endTime = Date.now();
998
- this._log('Standardized br elements:', {
999
- removed: processedCount,
1000
- processingTime: `${(endTime - startTime).toFixed(2)}ms`
1001
- });
1002
- }
1003
- removeEmptyLines(element) {
1004
- let removedCount = 0;
1005
- const startTime = Date.now();
1006
- // First pass: remove empty text nodes
1007
- const removeEmptyTextNodes = (node) => {
1008
- // Skip if inside pre or code
1009
- if (node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) {
1010
- const tag = node.tagName.toLowerCase();
1011
- if (tag === 'pre' || tag === 'code') {
1012
- return;
1013
- }
1014
- }
1015
- // Process children first (depth-first)
1016
- const children = Array.from(node.childNodes);
1017
- children.forEach(removeEmptyTextNodes);
1018
- // Then handle this node
1019
- if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) {
1020
- const text = node.textContent || '';
1021
- // If it's completely empty or just special characters/whitespace, remove it
1022
- if (!text || text.match(/^[\u200C\u200B\u200D\u200E\u200F\uFEFF\xA0\s]*$/)) {
1023
- node.parentNode?.removeChild(node);
1024
- removedCount++;
1025
- }
1026
- else {
1027
- // Clean up the text content while preserving important spaces
1028
- const newText = text
1029
- .replace(/\n{3,}/g, '\n\n') // More than 2 newlines -> 2 newlines
1030
- .replace(/^[\n\r\t]+/, '') // Remove leading newlines/tabs (preserve spaces)
1031
- .replace(/[\n\r\t]+$/, '') // Remove trailing newlines/tabs (preserve spaces)
1032
- .replace(/[ \t]*\n[ \t]*/g, '\n') // Remove spaces around newlines
1033
- .replace(/[ \t]{3,}/g, ' ') // 3+ spaces -> 1 space
1034
- .replace(/^[ ]+$/, ' ') // Multiple spaces between elements -> single space
1035
- .replace(/\s+([,.!?:;])/g, '$1') // Remove spaces before punctuation
1036
- // Clean up zero-width characters and multiple non-breaking spaces
1037
- .replace(/[\u200C\u200B\u200D\u200E\u200F\uFEFF]+/g, '')
1038
- .replace(/(?:\xA0){2,}/g, '\xA0'); // Multiple &nbsp; -> single &nbsp;
1039
- if (newText !== text) {
1040
- node.textContent = newText;
1041
- removedCount += text.length - newText.length;
1042
- }
1043
- }
1044
- }
1045
- };
1046
- // Second pass: clean up empty elements and normalize spacing
1047
- const cleanupEmptyElements = (node) => {
1048
- if (!isElement(node))
1049
- return;
1050
- // Skip pre and code elements
1051
- const tag = node.tagName.toLowerCase();
1052
- if (tag === 'pre' || tag === 'code') {
1053
- return;
1054
- }
1055
- // Process children first (depth-first)
1056
- Array.from(node.childNodes)
1057
- .filter(isElement)
1058
- .forEach(cleanupEmptyElements);
1059
- // Then normalize this element's whitespace
1060
- node.normalize(); // Combine adjacent text nodes
1061
- // Special handling for block elements
1062
- const isBlockElement = this.getComputedStyle(node)?.display === 'block';
1063
- // Only remove empty text nodes at the start and end if they contain just newlines/tabs
1064
- // For block elements, also remove spaces
1065
- const startPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
1066
- const endPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
1067
- while (node.firstChild &&
1068
- node.firstChild.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
1069
- (node.firstChild.textContent || '').match(startPattern)) {
1070
- node.removeChild(node.firstChild);
1071
- removedCount++;
1072
- }
1073
- while (node.lastChild &&
1074
- node.lastChild.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
1075
- (node.lastChild.textContent || '').match(endPattern)) {
1076
- node.removeChild(node.lastChild);
1077
- removedCount++;
1078
- }
1079
- // Ensure there's a space between inline elements if needed
1080
- if (!isBlockElement) {
1081
- const children = Array.from(node.childNodes);
1082
- for (let i = 0; i < children.length - 1; i++) {
1083
- const current = children[i];
1084
- const next = children[i + 1];
1085
- // Only add space between elements or between element and text
1086
- if (isElement(current) || isElement(next)) {
1087
- // Don't add space if next content starts with punctuation
1088
- const nextContent = next.textContent || '';
1089
- const currentContent = current.textContent || '';
1090
- if (!nextContent.match(/^[,.!?:;]/) &&
1091
- !currentContent.match(/[,.!?:;]$/)) {
1092
- // Check if there's already a space
1093
- const hasSpace = (current.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
1094
- (current.textContent || '').endsWith(' ')) ||
1095
- (next.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
1096
- (next.textContent || '').startsWith(' '));
1097
- if (!hasSpace) {
1098
- const space = this.doc.createTextNode(' ');
1099
- node.insertBefore(space, next);
1100
- }
1101
- }
1102
- }
1103
- }
1104
- }
1105
- };
1106
- // Run both passes
1107
- removeEmptyTextNodes(element);
1108
- cleanupEmptyElements(element);
1109
- const endTime = Date.now();
1110
- this._log('Removed empty lines:', {
1111
- charactersRemoved: removedCount,
1112
- processingTime: `${(endTime - startTime).toFixed(2)}ms`
1113
- });
1114
- }
1115
- handleLazyImages(element) {
1116
- let processedCount = 0;
1117
- const lazyImages = element.querySelectorAll('img[data-src], img[data-srcset]');
1118
- lazyImages.forEach(img => {
1119
- // Check if element is an image by checking tag name and required properties
1120
- if (img.tagName.toLowerCase() !== 'img' || !('src' in img) || !('srcset' in img)) {
1121
- return;
1122
- }
1123
- // Handle data-src
1124
- const dataSrc = img.getAttribute('data-src');
1125
- if (dataSrc && !img.getAttribute('src')) {
1126
- img.setAttribute('src', dataSrc);
1127
- processedCount++;
1128
- }
1129
- // Handle data-srcset
1130
- const dataSrcset = img.getAttribute('data-srcset');
1131
- if (dataSrcset && !img.getAttribute('srcset')) {
1132
- img.setAttribute('srcset', dataSrcset);
1133
- processedCount++;
1134
- }
1135
- // Remove lazy loading related classes and attributes
1136
- img.classList.remove('lazy', 'lazyload');
1137
- img.removeAttribute('data-ll-status');
1138
- img.removeAttribute('data-src');
1139
- img.removeAttribute('data-srcset');
1140
- });
1141
- this._log('Processed lazy images:', processedCount);
1142
- }
1143
- standardizeElements(element) {
1144
- let processedCount = 0;
1145
- // Convert elements based on standardization rules
1146
- ELEMENT_STANDARDIZATION_RULES.forEach(rule => {
1147
- const elements = element.querySelectorAll(rule.selector);
1148
- elements.forEach(el => {
1149
- if (rule.transform) {
1150
- // If there's a transform function, use it to create the new element
1151
- const transformed = rule.transform(el, this.doc);
1152
- el.replaceWith(transformed);
1153
- processedCount++;
1154
- }
1155
- });
1156
- });
1157
- // Convert lite-youtube elements
1158
- const liteYoutubeElements = element.querySelectorAll('lite-youtube');
1159
- liteYoutubeElements.forEach(el => {
1160
- const videoId = el.getAttribute('videoid');
1161
- if (!videoId)
1162
- return;
1163
- const iframe = this.doc.createElement('iframe');
1164
- iframe.width = '560';
1165
- iframe.height = '315';
1166
- iframe.src = `https://www.youtube.com/embed/${videoId}`;
1167
- iframe.title = el.getAttribute('videotitle') || 'YouTube video player';
1168
- iframe.frameBorder = '0';
1169
- iframe.allow = 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share';
1170
- iframe.setAttribute('allowfullscreen', '');
1171
- el.replaceWith(iframe);
1172
- processedCount++;
1173
- });
1174
- this._log('Converted embedded elements:', processedCount);
1175
- }
1176
303
  // Find small IMG and SVG elements
1177
304
  findSmallImages(doc) {
1178
305
  const MIN_DIMENSION = 33;
@@ -1184,21 +311,7 @@ class Defuddle {
1184
311
  const elements = [
1185
312
  ...Array.from(doc.getElementsByTagName('img')),
1186
313
  ...Array.from(doc.getElementsByTagName('svg'))
1187
- ].filter(element => {
1188
- // Skip lazy-loaded images that haven't been processed yet
1189
- // and math images which may be small
1190
- if (element.tagName.toLowerCase() === 'img') {
1191
- const ignoredImage = element.classList.contains('lazy') ||
1192
- element.classList.contains('lazyload') ||
1193
- element.classList.contains('latex') ||
1194
- element.hasAttribute('decoding') ||
1195
- element.hasAttribute('data-src') ||
1196
- element.hasAttribute('data-srcset') ||
1197
- element.hasAttribute('loading');
1198
- return !ignoredImage;
1199
- }
1200
- return true;
1201
- });
314
+ ];
1202
315
  if (elements.length === 0) {
1203
316
  return smallImages;
1204
317
  }
@@ -1342,7 +455,7 @@ class Defuddle {
1342
455
  const elements = doc.querySelectorAll(selector);
1343
456
  elements.forEach(element => {
1344
457
  // Base score from selector priority (earlier = higher)
1345
- let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) * 10;
458
+ let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) * 40;
1346
459
  // Add score based on content analysis
1347
460
  score += scoring_1.ContentScorer.scoreElement(element);
1348
461
  candidates.push({ element, score });
@@ -1416,6 +529,9 @@ class Defuddle {
1416
529
  }
1417
530
  return parts.join(' > ');
1418
531
  }
532
+ getComputedStyle(element) {
533
+ return (0, utils_1.getComputedStyle)(element);
534
+ }
1419
535
  }
1420
536
  exports.Defuddle = Defuddle;
1421
537
  //# sourceMappingURL=defuddle.js.map