npm - defuddle - Versions diffs - 0.5.4 → 0.6.1 - Mend

defuddle 0.5.4 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/dist/constants.d.ts +0 -1
package/dist/constants.js +88 -28
package/dist/constants.js.map +1 -1
package/dist/defuddle.d.ts +1 -14
package/dist/defuddle.js +23 -907
package/dist/defuddle.js.map +1 -1
package/dist/elements/images.d.ts +8 -0
package/dist/elements/images.js +779 -0
package/dist/elements/images.js.map +1 -0
package/dist/elements/math.core.js +59 -0
package/dist/elements/math.core.js.map +1 -0
package/dist/elements/math.js +9 -0
package/dist/elements/math.js.map +1 -0
package/dist/index.full.js +1 -1
package/dist/index.js +1 -1
package/dist/scoring.d.ts +17 -0
package/dist/scoring.js +216 -0
package/dist/scoring.js.map +1 -1
package/dist/standardize.d.ts +2 -0
package/dist/standardize.js +830 -0
package/dist/standardize.js.map +1 -0
package/dist/utils.d.ts +4 -0
package/dist/utils.js +38 -0
package/dist/utils.js.map +1 -0
package/package.json +1 -1
package/dist/elements/math.full.js +0 -121
package/dist/elements/math.full.js.map +0 -1

package/dist/defuddle.js CHANGED

@@ -4,113 +4,9 @@ exports.Defuddle = void 0;
 const metadata_1 = require("./metadata");
 const extractor_registry_1 = require("./extractor-registry");
 const constants_1 = require("./constants");
-const math_full_1 = require("./elements/math.full");
-const code_1 = require("./elements/code");
-const footnotes_1 = require("./elements/footnotes");
-const headings_1 = require("./elements/headings");
+const standardize_1 = require("./standardize");
 const scoring_1 = require("./scoring");
-const ELEMENT_STANDARDIZATION_RULES = [
-    ...math_full_1.mathRules,
-    ...code_1.codeBlockRules,
-    ...headings_1.headingRules,
-    // Convert divs with paragraph role to actual paragraphs
-    {
-        selector: 'div[data-testid^="paragraph"], div[role="paragraph"]',
-        element: 'p',
-        transform: (el, doc) => {
-            const p = doc.createElement('p');
-            // Copy innerHTML
-            p.innerHTML = el.innerHTML;
-            // Copy allowed attributes
-            Array.from(el.attributes).forEach(attr => {
-                if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
-                    p.setAttribute(attr.name, attr.value);
-                }
-            });
-            return p;
-        }
-    },
-    // Convert divs with list roles to actual lists
-    {
-        selector: 'div[role="list"]',
-        element: 'ul',
-        // Custom handler for list type detection and transformation
-        transform: (el, doc) => {
-            // First determine if this is an ordered list
-            const firstItem = el.querySelector('div[role="listitem"] .label');
-            const label = firstItem?.textContent?.trim() || '';
-            const isOrdered = label.match(/^\d+\)/);
-            // Create the appropriate list type
-            const list = doc.createElement(isOrdered ? 'ol' : 'ul');
-            // Process each list item
-            const items = el.querySelectorAll('div[role="listitem"]');
-            items.forEach(item => {
-                const li = doc.createElement('li');
-                const content = item.querySelector('.content');
-                if (content) {
-                    // Convert any paragraph divs inside content
-                    const paragraphDivs = content.querySelectorAll('div[role="paragraph"]');
-                    paragraphDivs.forEach(div => {
-                        const p = doc.createElement('p');
-                        p.innerHTML = div.innerHTML;
-                        div.replaceWith(p);
-                    });
-                    // Convert any nested lists recursively
-                    const nestedLists = content.querySelectorAll('div[role="list"]');
-                    nestedLists.forEach(nestedList => {
-                        const firstNestedItem = nestedList.querySelector('div[role="listitem"] .label');
-                        const nestedLabel = firstNestedItem?.textContent?.trim() || '';
-                        const isNestedOrdered = nestedLabel.match(/^\d+\)/);
-                        const newNestedList = doc.createElement(isNestedOrdered ? 'ol' : 'ul');
-                        // Process nested items
-                        const nestedItems = nestedList.querySelectorAll('div[role="listitem"]');
-                        nestedItems.forEach(nestedItem => {
-                            const nestedLi = doc.createElement('li');
-                            const nestedContent = nestedItem.querySelector('.content');
-                            if (nestedContent) {
-                                // Convert paragraph divs in nested items
-                                const nestedParagraphs = nestedContent.querySelectorAll('div[role="paragraph"]');
-                                nestedParagraphs.forEach(div => {
-                                    const p = doc.createElement('p');
-                                    p.innerHTML = div.innerHTML;
-                                    div.replaceWith(p);
-                                });
-                                nestedLi.innerHTML = nestedContent.innerHTML;
-                            }
-                            newNestedList.appendChild(nestedLi);
-                        });
-                        nestedList.replaceWith(newNestedList);
-                    });
-                    li.innerHTML = content.innerHTML;
-                }
-                list.appendChild(li);
-            });
-            return list;
-        }
-    },
-    {
-        selector: 'div[role="listitem"]',
-        element: 'li',
-        // Custom handler for list item content
-        transform: (el, doc) => {
-            const content = el.querySelector('.content');
-            if (!content)
-                return el;
-            // Convert any paragraph divs inside content
-            const paragraphDivs = content.querySelectorAll('div[role="paragraph"]');
-            paragraphDivs.forEach(div => {
-                const p = doc.createElement('p');
-                p.innerHTML = div.innerHTML;
-                div.replaceWith(p);
-            });
-            return content;
-        }
-    }
-];
-// Type guard
-function isElement(node) {
-    return node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE;
-}
+const utils_1 = require("./utils");
 class Defuddle {
     /**
      * Create a new Defuddle instance
@@ -131,7 +27,7 @@ class Defuddle {
         const schemaOrgData = metadata_1.MetadataExtractor.extractSchemaOrgData(this.doc);
         const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData);
         try {
-            // Try to use a specific extractor first
+            // Use site-specific extractor first, if there is one
             const url = this.options.url || this.doc.URL;
             const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
             if (extractor && extractor.canExtract()) {
@@ -153,13 +49,14 @@ class Defuddle {
                     extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase()
                 };
             }
-            // Evaluate styles and sizes on original document
+            // Continue if there is no extractor...
+            // Evaluate mobile styles and sizes on original document
             const mobileStyles = this._evaluateMediaQueries(this.doc);
-            // Check for small images in original document, excluding lazy-loaded ones
+            // Find small images in original document, excluding lazy-loaded ones
             const smallImages = this.findSmallImages(this.doc);
             // Clone document
             const clone = this.doc.cloneNode(true);
-            // Apply mobile style to clone
+            // Apply mobile styles to clone
             this.applyMobileStyles(clone, mobileStyles);
             // Find main content
             const mainContent = this.findMainContent(clone);
@@ -172,14 +69,18 @@ class Defuddle {
                     parseTime: Math.round(endTime - startTime)
                 };
             }
-            // Remove small images identified from original document
+            // Remove small images
             this.removeSmallImages(clone, smallImages);
-            // Perform other destructive operations on the clone
+            // Remove hidden elements using computed styles
             this.removeHiddenElements(clone);
+            // Remove non-content blocks by scoring
+            // Tries to find lists, navigation based on text content and link density
+            scoring_1.ContentScorer.scoreAndRemove(clone, this.debug);
+            // Remove clutter using selectors
             this.removeClutter(clone);
-            // Clean up the main content
-            this.cleanContent(mainContent, metadata);
-            const content = mainContent ? mainContent.outerHTML : this.doc.body.innerHTML;
+            // Normalize the main content
+            (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
+            const content = mainContent.outerHTML;
             const endTime = Date.now();
             return {
                 content,
@@ -301,35 +202,10 @@ class Defuddle {
             }
         });
     }
-    getWindow(doc) {
-        // First try defaultView
-        if (doc.defaultView) {
-            return doc.defaultView;
-        }
-        // Then try ownerWindow
-        if (doc.ownerWindow) {
-            return doc.ownerWindow;
-        }
-        // Finally try to get window from document
-        if (doc.window) {
-            return doc.window;
-        }
-        return null;
-    }
-    getComputedStyle(element) {
-        const win = this.getWindow(element.ownerDocument);
-        if (!win)
-            return null;
-        return win.getComputedStyle(element);
-    }
     removeHiddenElements(doc) {
         let count = 0;
         const elementsToRemove = new Set();
-        // First pass: Get all elements matching hidden selectors
-        const hiddenElements = doc.querySelectorAll(constants_1.HIDDEN_ELEMENT_SELECTORS);
-        hiddenElements.forEach(el => elementsToRemove.add(el));
-        count += hiddenElements.length;
-        // Second pass: Get all elements and check their styles
+        // Get all elements and check their styles
         const allElements = Array.from(doc.getElementsByTagName('*'));
         // Process styles in batches to minimize layout thrashing
         const BATCH_SIZE = 100;
@@ -365,8 +241,7 @@ class Defuddle {
                 }
             });
         }
-        // Final pass: Batch remove all hidden elements
-        elementsToRemove.forEach(el => el.remove());
+        // Batch remove all hidden elements
         this._log('Removed hidden elements:', count);
     }
     removeClutter(doc) {
@@ -425,754 +300,6 @@ class Defuddle {
             processingTime: `${(endTime - startTime).toFixed(2)}ms`
         });
     }
-    flattenDivs(element) {
-        let processedCount = 0;
-        const startTime = Date.now();
-        // Process in batches to maintain performance
-        let keepProcessing = true;
-        // Helper function to check if an element directly contains inline content
-        // This helps prevent unwrapping divs that visually act as paragraphs.
-        function hasDirectInlineContent(el) {
-            for (const child of el.childNodes) {
-                // Check for non-empty text nodes
-                if (child.nodeType === constants_1.NODE_TYPE.TEXT_NODE && child.textContent?.trim()) {
-                    return true;
-                }
-                // Check for element nodes that are considered inline
-                if (child.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())) {
-                    return true;
-                }
-            }
-            return false;
-        }
-        const shouldPreserveElement = (el) => {
-            const tagName = el.tagName.toLowerCase();
-            // Check if element should be preserved
-            if (constants_1.PRESERVE_ELEMENTS.has(tagName))
-                return true;
-            // Check for semantic roles
-            const role = el.getAttribute('role');
-            if (role && ['article', 'main', 'navigation', 'banner', 'contentinfo'].includes(role)) {
-                return true;
-            }
-            // Check for semantic classes
-            const className = el.className;
-            if (typeof className === 'string' && className.toLowerCase().match(/(?:article|main|content|footnote|reference|bibliography)/)) {
-                return true;
-            }
-            // Check if div contains mixed content types that should be preserved
-            if (tagName === 'div') {
-                const children = Array.from(el.children);
-                const hasPreservedElements = children.some(child => constants_1.PRESERVE_ELEMENTS.has(child.tagName.toLowerCase()) ||
-                    child.getAttribute('role') === 'article' ||
-                    (child.className && typeof child.className === 'string' &&
-                        child.className.toLowerCase().match(/(?:article|main|content|footnote|reference|bibliography)/)));
-                if (hasPreservedElements)
-                    return true;
-            }
-            return false;
-        };
-        const isWrapperDiv = (div) => {
-            // If it directly contains inline content, it's NOT a wrapper
-            if (hasDirectInlineContent(div)) {
-                return false;
-            }
-            // Check if it's just empty space
-            if (!div.textContent?.trim())
-                return true;
-            // Check if it only contains other divs or block elements
-            const children = Array.from(div.children);
-            if (children.length === 0)
-                return true;
-            // Check if all children are block elements
-            const allBlockElements = children.every(child => {
-                const tag = child.tagName.toLowerCase();
-                return tag === 'div' || tag === 'p' || tag === 'h1' || tag === 'h2' ||
-                    tag === 'h3' || tag === 'h4' || tag === 'h5' || tag === 'h6' ||
-                    tag === 'ul' || tag === 'ol' || tag === 'pre' || tag === 'blockquote' ||
-                    tag === 'figure';
-            });
-            if (allBlockElements)
-                return true;
-            // Check for common wrapper patterns
-            const className = div.className.toLowerCase();
-            const isWrapper = /(?:wrapper|container|layout|row|col|grid|flex|outer|inner|content-area)/i.test(className);
-            if (isWrapper)
-                return true;
-            // Check if it has excessive whitespace or empty text nodes
-            const textNodes = Array.from(div.childNodes).filter(node => node.nodeType === constants_1.NODE_TYPE.TEXT_NODE && node.textContent?.trim() // TEXT_NODE
-            );
-            if (textNodes.length === 0)
-                return true;
-            // Check if it's a div that only contains block elements
-            const hasOnlyBlockElements = children.length > 0 && !children.some(child => {
-                const tag = child.tagName.toLowerCase();
-                return constants_1.INLINE_ELEMENTS.has(tag);
-            });
-            if (hasOnlyBlockElements)
-                return true;
-            return false;
-        };
-        // Function to process a single div
-        const processDiv = (div) => {
-            // Skip processing if div has been removed or should be preserved
-            if (!div.isConnected || shouldPreserveElement(div))
-                return false;
-            // Case 1: Empty div or div with only whitespace
-            if (!div.hasChildNodes() || !div.textContent?.trim()) {
-                div.remove();
-                processedCount++;
-                return true;
-            }
-            // Case 2: Top-level div - be more aggressive
-            if (div.parentElement === element) {
-                const children = Array.from(div.children);
-                const hasOnlyBlockElements = children.length > 0 && !children.some(child => {
-                    const tag = child.tagName.toLowerCase();
-                    return constants_1.INLINE_ELEMENTS.has(tag);
-                });
-                if (hasOnlyBlockElements) {
-                    const fragment = this.doc.createDocumentFragment();
-                    while (div.firstChild) {
-                        fragment.appendChild(div.firstChild);
-                    }
-                    div.replaceWith(fragment);
-                    processedCount++;
-                    return true;
-                }
-            }
-            // Case 3: Wrapper div - merge up aggressively
-            if (isWrapperDiv(div)) {
-                // Special case: if div only contains block elements, merge them up
-                const children = Array.from(div.children);
-                const onlyBlockElements = !children.some(child => {
-                    const tag = child.tagName.toLowerCase();
-                    return constants_1.INLINE_ELEMENTS.has(tag);
-                });
-                if (onlyBlockElements) {
-                    const fragment = this.doc.createDocumentFragment();
-                    while (div.firstChild) {
-                        fragment.appendChild(div.firstChild);
-                    }
-                    div.replaceWith(fragment);
-                    processedCount++;
-                    return true;
-                }
-                // Otherwise handle as normal wrapper
-                const fragment = this.doc.createDocumentFragment();
-                while (div.firstChild) {
-                    fragment.appendChild(div.firstChild);
-                }
-                div.replaceWith(fragment);
-                processedCount++;
-                return true;
-            }
-            // Case 4: Div only contains text and/or inline elements - convert to paragraph
-            const childNodes = Array.from(div.childNodes);
-            const hasOnlyInlineOrText = childNodes.length > 0 && childNodes.every(child => (child.nodeType === constants_1.NODE_TYPE.TEXT_NODE) ||
-                (child.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())));
-            if (hasOnlyInlineOrText && div.textContent?.trim()) { // Ensure there's actual content
-                const p = this.doc.createElement('p');
-                // Move all children (including inline tags like <font>) to the new <p>
-                while (div.firstChild) {
-                    p.appendChild(div.firstChild);
-                }
-                div.replaceWith(p);
-                processedCount++;
-                return true;
-            }
-            // Case 5: Div has single child - unwrap only if child is block-level
-            if (div.children.length === 1) {
-                const child = div.firstElementChild;
-                const childTag = child.tagName.toLowerCase();
-                // Only unwrap if the single child is a block element and not preserved
-                if (constants_1.BLOCK_ELEMENTS.includes(childTag) && !shouldPreserveElement(child)) {
-                    div.replaceWith(child);
-                    processedCount++;
-                    return true;
-                }
-            }
-            // Case 6: Deeply nested div - merge up
-            let nestingDepth = 0;
-            let parent = div.parentElement;
-            while (parent) {
-                if (parent.tagName.toLowerCase() === 'div') {
-                    nestingDepth++;
-                }
-                parent = parent.parentElement;
-            }
-            // Only unwrap if nested AND does not contain direct inline content
-            if (nestingDepth > 0 && !hasDirectInlineContent(div)) {
-                const fragment = this.doc.createDocumentFragment();
-                while (div.firstChild) {
-                    fragment.appendChild(div.firstChild);
-                }
-                div.replaceWith(fragment);
-                processedCount++;
-                return true;
-            }
-            return false;
-        };
-        // First pass: Process top-level divs
-        const processTopLevelDivs = () => {
-            const topDivs = Array.from(element.children).filter(el => el.tagName.toLowerCase() === 'div');
-            let modified = false;
-            topDivs.forEach(div => {
-                if (processDiv(div)) {
-                    modified = true;
-                }
-            });
-            return modified;
-        };
-        // Second pass: Process remaining divs from deepest to shallowest
-        const processRemainingDivs = () => {
-            const allDivs = Array.from(element.getElementsByTagName('div'))
-                .sort((a, b) => {
-                // Count nesting depth
-                const getDepth = (el) => {
-                    let depth = 0;
-                    let parent = el.parentElement;
-                    while (parent) {
-                        if (parent.tagName.toLowerCase() === 'div')
-                            depth++;
-                        parent = parent.parentElement;
-                    }
-                    return depth;
-                };
-                return getDepth(b) - getDepth(a); // Process deepest first
-            });
-            let modified = false;
-            allDivs.forEach(div => {
-                if (processDiv(div)) {
-                    modified = true;
-                }
-            });
-            return modified;
-        };
-        // Final cleanup pass - aggressively flatten remaining divs
-        const finalCleanup = () => {
-            const remainingDivs = Array.from(element.getElementsByTagName('div'));
-            let modified = false;
-            remainingDivs.forEach(div => {
-                // Only perform final cleanup/unwrap if the div is still connected,
-                // not preserved, and does not contain direct inline content.
-                if (div.isConnected && !shouldPreserveElement(div) && !hasDirectInlineContent(div)) {
-                    const children = Array.from(div.children);
-                    const onlyParagraphs = children.length > 0 && children.every(child => child.tagName.toLowerCase() === 'p');
-                    // Unwrap if it only contains paragraphs OR is identified as a wrapper
-                    if (onlyParagraphs || isWrapperDiv(div)) {
-                        const fragment = this.doc.createDocumentFragment();
-                        while (div.firstChild) {
-                            fragment.appendChild(div.firstChild);
-                        }
-                        div.replaceWith(fragment);
-                        processedCount++;
-                        modified = true;
-                    }
-                }
-            });
-            return modified;
-        };
-        // Execute all passes until no more changes
-        do {
-            keepProcessing = false;
-            if (processTopLevelDivs())
-                keepProcessing = true;
-            if (processRemainingDivs())
-                keepProcessing = true;
-            if (finalCleanup())
-                keepProcessing = true;
-        } while (keepProcessing);
-        const endTime = Date.now();
-        this._log('Flattened divs:', {
-            count: processedCount,
-            processingTime: `${(endTime - startTime).toFixed(2)}ms`
-        });
-    }
-    cleanContent(element, metadata) {
-        this.standardizeSpaces(element);
-        // Remove HTML comments
-        this.removeHtmlComments(element);
-        // Handle H1 elements - remove first one and convert others to H2
-        this.standardizeHeadings(element, metadata.title);
-        // Standardize footnotes and citations
-        (0, footnotes_1.standardizeFootnotes)(element);
-        // Handle lazy-loaded images
-        this.handleLazyImages(element);
-        // Convert embedded content to standard formats
-        this.standardizeElements(element);
-        // If not debug mode, do the full cleanup
-        if (!this.debug) {
-            // First pass of div flattening
-            this.flattenDivs(element);
-            // Strip unwanted attributes
-            this.stripUnwantedAttributes(element);
-            // Remove empty elements
-            this.removeEmptyElements(element);
-            // Remove trailing headings
-            this.removeTrailingHeadings(element);
-            // Final pass of div flattening after cleanup operations
-            this.flattenDivs(element);
-            // Standardize consecutive br elements
-            this.stripExtraBrElements(element);
-            // Clean up empty lines
-            this.removeEmptyLines(element);
-        }
-        else {
-            // In debug mode, still do basic cleanup but preserve structure
-            this.stripUnwantedAttributes(element);
-            this.removeEmptyElements(element);
-            this.removeTrailingHeadings(element);
-            this.stripExtraBrElements(element);
-            this._log('Debug mode: Skipping div flattening to preserve structure');
-        }
-    }
-    standardizeSpaces(element) {
-        const processNode = (node) => {
-            // Skip pre and code elements
-            if (node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) {
-                const tag = node.tagName.toLowerCase();
-                if (tag === 'pre' || tag === 'code') {
-                    return;
-                }
-            }
-            // Process text nodes
-            if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) {
-                const text = node.textContent || '';
-                // Replace &nbsp; with regular spaces, except when it's a single &nbsp; between words
-                const newText = text.replace(/\xA0+/g, (match) => {
-                    // If it's a single &nbsp; between word characters, preserve it
-                    if (match.length === 1) {
-                        const prev = node.previousSibling?.textContent?.slice(-1);
-                        const next = node.nextSibling?.textContent?.charAt(0);
-                        if (prev?.match(/\w/) && next?.match(/\w/)) {
-                            return '\xA0';
-                        }
-                    }
-                    return ' '.repeat(match.length);
-                });
-                if (newText !== text) {
-                    node.textContent = newText;
-                }
-            }
-            // Process children recursively
-            if (node.hasChildNodes()) {
-                Array.from(node.childNodes).forEach(processNode);
-            }
-        };
-        processNode(element);
-    }
-    removeTrailingHeadings(element) {
-        let removedCount = 0;
-        const hasContentAfter = (el) => {
-            // Check if there's any meaningful content after this element
-            let nextContent = '';
-            let sibling = el.nextSibling;
-            // First check direct siblings
-            while (sibling) {
-                if (sibling.nodeType === constants_1.NODE_TYPE.TEXT_NODE) { // TEXT_NODE
-                    nextContent += sibling.textContent || '';
-                }
-                else if (sibling.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) { // ELEMENT_NODE
-                    // If we find an element sibling, check its content
-                    nextContent += sibling.textContent || '';
-                }
-                sibling = sibling.nextSibling;
-            }
-            // If we found meaningful content at this level, return true
-            if (nextContent.trim()) {
-                return true;
-            }
-            // If no content found at this level and we have a parent,
-            // check for content after the parent
-            const parent = el.parentElement;
-            if (parent && parent !== element) {
-                return hasContentAfter(parent);
-            }
-            return false;
-        };
-        // Process all headings from bottom to top
-        const headings = Array.from(element.querySelectorAll('h1, h2, h3, h4, h5, h6'))
-            .reverse();
-        headings.forEach(heading => {
-            if (!hasContentAfter(heading)) {
-                heading.remove();
-                removedCount++;
-            }
-            else {
-                // Stop processing once we find a heading with content after it
-                return;
-            }
-        });
-        if (removedCount > 0) {
-            this._log('Removed trailing headings:', removedCount);
-        }
-    }
-    standardizeHeadings(element, title) {
-        const normalizeText = (text) => {
-            return text
-                .replace(/\u00A0/g, ' ') // Convert non-breaking spaces to regular spaces
-                .replace(/\s+/g, ' ') // Normalize all whitespace to single spaces
-                .trim()
-                .toLowerCase();
-        };
-        const h1s = element.getElementsByTagName('h1');
-        Array.from(h1s).forEach(h1 => {
-            const h2 = this.doc.createElement('h2');
-            h2.innerHTML = h1.innerHTML;
-            // Copy allowed attributes
-            Array.from(h1.attributes).forEach(attr => {
-                if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
-                    h2.setAttribute(attr.name, attr.value);
-                }
-            });
-            h1.parentNode?.replaceChild(h2, h1);
-        });
-        // Remove first H2 if it matches title
-        const h2s = element.getElementsByTagName('h2');
-        if (h2s.length > 0) {
-            const firstH2 = h2s[0];
-            const firstH2Text = normalizeText(firstH2.textContent || '');
-            const normalizedTitle = normalizeText(title);
-            if (normalizedTitle && normalizedTitle === firstH2Text) {
-                firstH2.remove();
-            }
-        }
-    }
-    removeHtmlComments(element) {
-        let removedCount = 0;
-        // Get all elements and check their child nodes
-        const allElements = Array.from(element.getElementsByTagName('*'));
-        // Process each element's child nodes
-        allElements.forEach(el => {
-            const childNodes = Array.from(el.childNodes);
-            childNodes.forEach(node => {
-                if (node.nodeType === 8) { // 8 is the node type for comments
-                    node.remove();
-                    removedCount++;
-                }
-            });
-        });
-        this._log('Removed HTML comments:', removedCount);
-    }
-    stripUnwantedAttributes(element) {
-        let attributeCount = 0;
-        const processElement = (el) => {
-            // Skip SVG elements - preserve all their attributes
-            if (el.tagName.toLowerCase() === 'svg' || el.namespaceURI === 'http://www.w3.org/2000/svg') {
-                return;
-            }
-            const attributes = Array.from(el.attributes);
-            const tag = el.tagName.toLowerCase();
-            attributes.forEach(attr => {
-                const attrName = attr.name.toLowerCase();
-                const attrValue = attr.value;
-                // Special cases for preserving specific attributes
-                if (
-                // Preserve footnote IDs
-                (attrName === 'id' && (attrValue.startsWith('fnref:') || // Footnote reference
-                    attrValue.startsWith('fn:') || // Footnote content
-                    attrValue === 'footnotes' // Footnotes container
-                )) ||
-                    // Preserve code block language classes and footnote backref class
-                    (attrName === 'class' && ((tag === 'code' && attrValue.startsWith('language-')) ||
-                        attrValue === 'footnote-backref'))) {
-                    return;
-                }
-                // In debug mode, allow debug attributes and data- attributes
-                if (this.debug) {
-                    if (!constants_1.ALLOWED_ATTRIBUTES.has(attrName) &&
-                        !constants_1.ALLOWED_ATTRIBUTES_DEBUG.has(attrName) &&
-                        !attrName.startsWith('data-')) {
-                        el.removeAttribute(attr.name);
-                        attributeCount++;
-                    }
-                }
-                else {
-                    // In normal mode, only allow standard attributes
-                    if (!constants_1.ALLOWED_ATTRIBUTES.has(attrName)) {
-                        el.removeAttribute(attr.name);
-                        attributeCount++;
-                    }
-                }
-            });
-        };
-        processElement(element);
-        element.querySelectorAll('*').forEach(processElement);
-        this._log('Stripped attributes:', attributeCount);
-    }
-    removeEmptyElements(element) {
-        let removedCount = 0;
-        let iterations = 0;
-        let keepRemoving = true;
-        while (keepRemoving) {
-            iterations++;
-            keepRemoving = false;
-            // Get all elements without children, working from deepest first
-            const emptyElements = Array.from(element.getElementsByTagName('*')).filter(el => {
-                if (constants_1.ALLOWED_EMPTY_ELEMENTS.has(el.tagName.toLowerCase())) {
-                    return false;
-                }
-                // Check if element has only whitespace or &nbsp;
-                const textContent = el.textContent || '';
-                const hasOnlyWhitespace = textContent.trim().length === 0;
-                const hasNbsp = textContent.includes('\u00A0'); // Unicode non-breaking space
-                // Check if element has no meaningful children
-                const hasNoChildren = !el.hasChildNodes() ||
-                    (Array.from(el.childNodes).every(node => {
-                        if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) { // TEXT_NODE
-                            const nodeText = node.textContent || '';
-                            return nodeText.trim().length === 0 && !nodeText.includes('\u00A0');
-                        }
-                        return false;
-                    }));
-                // Special case: Check for divs that only contain spans with commas
-                if (el.tagName.toLowerCase() === 'div') {
-                    const children = Array.from(el.children);
-                    const hasOnlyCommaSpans = children.length > 0 && children.every(child => {
-                        if (child.tagName.toLowerCase() !== 'span')
-                            return false;
-                        const content = child.textContent?.trim() || '';
-                        return content === ',' || content === '' || content === ' ';
-                    });
-                    if (hasOnlyCommaSpans)
-                        return true;
-                }
-                return hasOnlyWhitespace && !hasNbsp && hasNoChildren;
-            });
-            if (emptyElements.length > 0) {
-                emptyElements.forEach(el => {
-                    el.remove();
-                    removedCount++;
-                });
-                keepRemoving = true;
-            }
-        }
-        this._log('Removed empty elements:', removedCount, 'iterations:', iterations);
-    }
-    stripExtraBrElements(element) {
-        let processedCount = 0;
-        const startTime = Date.now();
-        // Get all br elements directly
-        const brElements = Array.from(element.getElementsByTagName('br'));
-        // Keep track of consecutive br elements
-        let consecutiveBrs = [];
-        // Helper to process collected br elements
-        const processBrs = () => {
-            if (consecutiveBrs.length > 2) {
-                // Keep only two br elements
-                for (let i = 2; i < consecutiveBrs.length; i++) {
-                    consecutiveBrs[i].remove();
-                    processedCount++;
-                }
-            }
-            consecutiveBrs = [];
-        };
-        // Process all br elements
-        brElements.forEach(currentNode => {
-            // Check if this br is consecutive with previous ones
-            let isConsecutive = false;
-            if (consecutiveBrs.length > 0) {
-                const lastBr = consecutiveBrs[consecutiveBrs.length - 1];
-                let node = currentNode.previousSibling;
-                // Skip whitespace text nodes
-                while (node && node.nodeType === constants_1.NODE_TYPE.TEXT_NODE && !node.textContent?.trim()) {
-                    node = node.previousSibling;
-                }
-                if (node === lastBr) {
-                    isConsecutive = true;
-                }
-            }
-            if (isConsecutive) {
-                consecutiveBrs.push(currentNode);
-            }
-            else {
-                // Process any previously collected brs before starting new group
-                processBrs();
-                consecutiveBrs = [currentNode];
-            }
-        });
-        // Process any remaining br elements
-        processBrs();
-        const endTime = Date.now();
-        this._log('Standardized br elements:', {
-            removed: processedCount,
-            processingTime: `${(endTime - startTime).toFixed(2)}ms`
-        });
-    }
-    removeEmptyLines(element) {
-        let removedCount = 0;
-        const startTime = Date.now();
-        // First pass: remove empty text nodes
-        const removeEmptyTextNodes = (node) => {
-            // Skip if inside pre or code
-            if (node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) {
-                const tag = node.tagName.toLowerCase();
-                if (tag === 'pre' || tag === 'code') {
-                    return;
-                }
-            }
-            // Process children first (depth-first)
-            const children = Array.from(node.childNodes);
-            children.forEach(removeEmptyTextNodes);
-            // Then handle this node
-            if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) {
-                const text = node.textContent || '';
-                // If it's completely empty or just special characters/whitespace, remove it
-                if (!text || text.match(/^[\u200C\u200B\u200D\u200E\u200F\uFEFF\xA0\s]*$/)) {
-                    node.parentNode?.removeChild(node);
-                    removedCount++;
-                }
-                else {
-                    // Clean up the text content while preserving important spaces
-                    const newText = text
-                        .replace(/\n{3,}/g, '\n\n') // More than 2 newlines -> 2 newlines
-                        .replace(/^[\n\r\t]+/, '') // Remove leading newlines/tabs (preserve spaces)
-                        .replace(/[\n\r\t]+$/, '') // Remove trailing newlines/tabs (preserve spaces)
-                        .replace(/[ \t]*\n[ \t]*/g, '\n') // Remove spaces around newlines
-                        .replace(/[ \t]{3,}/g, ' ') // 3+ spaces -> 1 space
-                        .replace(/^[ ]+$/, ' ') // Multiple spaces between elements -> single space
-                        .replace(/\s+([,.!?:;])/g, '$1') // Remove spaces before punctuation
-                        // Clean up zero-width characters and multiple non-breaking spaces
-                        .replace(/[\u200C\u200B\u200D\u200E\u200F\uFEFF]+/g, '')
-                        .replace(/(?:\xA0){2,}/g, '\xA0'); // Multiple &nbsp; -> single &nbsp;
-                    if (newText !== text) {
-                        node.textContent = newText;
-                        removedCount += text.length - newText.length;
-                    }
-                }
-            }
-        };
-        // Second pass: clean up empty elements and normalize spacing
-        const cleanupEmptyElements = (node) => {
-            if (!isElement(node))
-                return;
-            // Skip pre and code elements
-            const tag = node.tagName.toLowerCase();
-            if (tag === 'pre' || tag === 'code') {
-                return;
-            }
-            // Process children first (depth-first)
-            Array.from(node.childNodes)
-                .filter(isElement)
-                .forEach(cleanupEmptyElements);
-            // Then normalize this element's whitespace
-            node.normalize(); // Combine adjacent text nodes
-            // Special handling for block elements
-            const isBlockElement = this.getComputedStyle(node)?.display === 'block';
-            // Only remove empty text nodes at the start and end if they contain just newlines/tabs
-            // For block elements, also remove spaces
-            const startPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
-            const endPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
-            while (node.firstChild &&
-                node.firstChild.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
-                (node.firstChild.textContent || '').match(startPattern)) {
-                node.removeChild(node.firstChild);
-                removedCount++;
-            }
-            while (node.lastChild &&
-                node.lastChild.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
-                (node.lastChild.textContent || '').match(endPattern)) {
-                node.removeChild(node.lastChild);
-                removedCount++;
-            }
-            // Ensure there's a space between inline elements if needed
-            if (!isBlockElement) {
-                const children = Array.from(node.childNodes);
-                for (let i = 0; i < children.length - 1; i++) {
-                    const current = children[i];
-                    const next = children[i + 1];
-                    // Only add space between elements or between element and text
-                    if (isElement(current) || isElement(next)) {
-                        // Don't add space if next content starts with punctuation
-                        const nextContent = next.textContent || '';
-                        const currentContent = current.textContent || '';
-                        if (!nextContent.match(/^[,.!?:;]/) &&
-                            !currentContent.match(/[,.!?:;]$/)) {
-                            // Check if there's already a space
-                            const hasSpace = (current.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
-                                (current.textContent || '').endsWith(' ')) ||
-                                (next.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
-                                    (next.textContent || '').startsWith(' '));
-                            if (!hasSpace) {
-                                const space = this.doc.createTextNode(' ');
-                                node.insertBefore(space, next);
-                            }
-                        }
-                    }
-                }
-            }
-        };
-        // Run both passes
-        removeEmptyTextNodes(element);
-        cleanupEmptyElements(element);
-        const endTime = Date.now();
-        this._log('Removed empty lines:', {
-            charactersRemoved: removedCount,
-            processingTime: `${(endTime - startTime).toFixed(2)}ms`
-        });
-    }
-    handleLazyImages(element) {
-        let processedCount = 0;
-        const lazyImages = element.querySelectorAll('img[data-src], img[data-srcset]');
-        lazyImages.forEach(img => {
-            // Check if element is an image by checking tag name and required properties
-            if (img.tagName.toLowerCase() !== 'img' || !('src' in img) || !('srcset' in img)) {
-                return;
-            }
-            // Handle data-src
-            const dataSrc = img.getAttribute('data-src');
-            if (dataSrc && !img.getAttribute('src')) {
-                img.setAttribute('src', dataSrc);
-                processedCount++;
-            }
-            // Handle data-srcset
-            const dataSrcset = img.getAttribute('data-srcset');
-            if (dataSrcset && !img.getAttribute('srcset')) {
-                img.setAttribute('srcset', dataSrcset);
-                processedCount++;
-            }
-            // Remove lazy loading related classes and attributes
-            img.classList.remove('lazy', 'lazyload');
-            img.removeAttribute('data-ll-status');
-            img.removeAttribute('data-src');
-            img.removeAttribute('data-srcset');
-        });
-        this._log('Processed lazy images:', processedCount);
-    }
-    standardizeElements(element) {
-        let processedCount = 0;
-        // Convert elements based on standardization rules
-        ELEMENT_STANDARDIZATION_RULES.forEach(rule => {
-            const elements = element.querySelectorAll(rule.selector);
-            elements.forEach(el => {
-                if (rule.transform) {
-                    // If there's a transform function, use it to create the new element
-                    const transformed = rule.transform(el, this.doc);
-                    el.replaceWith(transformed);
-                    processedCount++;
-                }
-            });
-        });
-        // Convert lite-youtube elements
-        const liteYoutubeElements = element.querySelectorAll('lite-youtube');
-        liteYoutubeElements.forEach(el => {
-            const videoId = el.getAttribute('videoid');
-            if (!videoId)
-                return;
-            const iframe = this.doc.createElement('iframe');
-            iframe.width = '560';
-            iframe.height = '315';
-            iframe.src = `https://www.youtube.com/embed/${videoId}`;
-            iframe.title = el.getAttribute('videotitle') || 'YouTube video player';
-            iframe.frameBorder = '0';
-            iframe.allow = 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share';
-            iframe.setAttribute('allowfullscreen', '');
-            el.replaceWith(iframe);
-            processedCount++;
-        });
-        this._log('Converted embedded elements:', processedCount);
-    }
     // Find small IMG and SVG elements
     findSmallImages(doc) {
         const MIN_DIMENSION = 33;
@@ -1184,21 +311,7 @@ class Defuddle {
         const elements = [
             ...Array.from(doc.getElementsByTagName('img')),
             ...Array.from(doc.getElementsByTagName('svg'))
-        ].filter(element => {
-            // Skip lazy-loaded images that haven't been processed yet
-            // and math images which may be small
-            if (element.tagName.toLowerCase() === 'img') {
-                const ignoredImage = element.classList.contains('lazy') ||
-                    element.classList.contains('lazyload') ||
-                    element.classList.contains('latex') ||
-                    element.hasAttribute('decoding') ||
-                    element.hasAttribute('data-src') ||
-                    element.hasAttribute('data-srcset') ||
-                    element.hasAttribute('loading');
-                return !ignoredImage;
-            }
-            return true;
-        });
+        ];
         if (elements.length === 0) {
             return smallImages;
         }
@@ -1342,7 +455,7 @@ class Defuddle {
             const elements = doc.querySelectorAll(selector);
             elements.forEach(element => {
                 // Base score from selector priority (earlier = higher)
-                let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) * 10;
+                let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) * 40;
                 // Add score based on content analysis
                 score += scoring_1.ContentScorer.scoreElement(element);
                 candidates.push({ element, score });
@@ -1416,6 +529,9 @@ class Defuddle {
         }
         return parts.join(' > ');
     }
+    getComputedStyle(element) {
+        return (0, utils_1.getComputedStyle)(element);
+    }
 }
 exports.Defuddle = Defuddle;
 //# sourceMappingURL=defuddle.js.map