npm - defuddle - Version diff - 0.11.0 → 0.12.0 - Mend

defuddle 0.11.0 → 0.12.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54)

package/README.md +23 -8
package/dist/cli.js +2 -1
package/dist/cli.js.map +1 -1
package/dist/constants.d.ts +2 -0
package/dist/constants.js +12 -1
package/dist/constants.js.map +1 -1
package/dist/defuddle.d.ts +48 -1
package/dist/defuddle.js +519 -213
package/dist/defuddle.js.map +1 -1
package/dist/elements/footnotes.js +2 -1
package/dist/elements/footnotes.js.map +1 -1
package/dist/extractor-registry.d.ts +1 -0
package/dist/extractor-registry.js +3 -0
package/dist/extractor-registry.js.map +1 -1
package/dist/extractors/_base.d.ts +6 -0
package/dist/extractors/_base.js +8 -0
package/dist/extractors/_base.js.map +1 -1
package/dist/extractors/github.d.ts +10 -2
package/dist/extractors/github.js +158 -71
package/dist/extractors/github.js.map +1 -1
package/dist/extractors/hackernews.js +18 -72
package/dist/extractors/hackernews.js.map +1 -1
package/dist/extractors/reddit.d.ts +1 -2
package/dist/extractors/reddit.js +41 -94
package/dist/extractors/reddit.js.map +1 -1
package/dist/extractors/x-oembed.d.ts +0 -1
package/dist/extractors/x-oembed.js +20 -27
package/dist/extractors/x-oembed.js.map +1 -1
package/dist/extractors/youtube.d.ts +37 -0
package/dist/extractors/youtube.js +409 -9
package/dist/extractors/youtube.js.map +1 -1
package/dist/index.full.js +1 -1
package/dist/index.js +1 -1
package/dist/metadata.d.ts +5 -0
package/dist/metadata.js +28 -0
package/dist/metadata.js.map +1 -1
package/dist/node.js +0 -5
package/dist/node.js.map +1 -1
package/dist/scoring.d.ts +6 -1
package/dist/scoring.js +66 -19
package/dist/scoring.js.map +1 -1
package/dist/standardize.js +64 -60
package/dist/standardize.js.map +1 -1
package/dist/types.d.ts +9 -0
package/dist/utils/comments.d.ts +44 -0
package/dist/utils/comments.js +103 -0
package/dist/utils/comments.js.map +1 -0
package/dist/utils/dom.d.ts +9 -0
package/dist/utils/dom.js +20 -0
package/dist/utils/dom.js.map +1 -1
package/dist/utils/transcript.d.ts +37 -0
package/dist/utils/transcript.js +61 -0
package/dist/utils/transcript.js.map +1 -0
package/package.json +1 -1

package/dist/defuddle.js CHANGED Viewed

@@ -9,6 +9,23 @@ const footnotes_1 = require("./elements/footnotes");
 const scoring_1 = require("./scoring");
 const utils_1 = require("./utils");
 const dom_1 = require("./utils/dom");
+/** Keys from extractor variables that map to top-level DefuddleResponse fields */
+const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
+// Content pattern detection constants
+const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
+const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
+const BOILERPLATE_PATTERNS = [
+    /^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
+    /^A version of this (?:article|story) (?:appeared|was published) in\b/i,
+    /^Originally (?:published|appeared) (?:in|on|at)\b/i,
+];
+const METADATA_STRIP_PATTERNS = [
+    /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi,
+    /\b\d+(?:st|nd|rd|th)?\b/g,
+    /\bmin(?:ute)?s?\b/gi,
+    /\bread\b/gi,
+    /[|·•—–\-,.\s]/g,
+];
 class Defuddle {
     /**
      * Create a new Defuddle instance
@@ -16,10 +33,23 @@ class Defuddle {
      * @param options - Options for parsing
      */
     constructor(doc, options = {}) {
+        this._schemaOrgData = undefined;
+        this._schemaOrgExtracted = false;
         this.doc = doc;
         this.options = options;
         this.debug = options.debug || false;
     }
+    /**
+     * Lazily extract and cache schema.org data. Must be called before
+     * parse() strips script tags from the document.
+     */
+    getSchemaOrgData() {
+        if (!this._schemaOrgExtracted) {
+            this._schemaOrgData = this._extractSchemaOrgData(this.doc);
+            this._schemaOrgExtracted = true;
+        }
+        return this._schemaOrgData;
+    }
     /**
      * Parse the document and extract its main content
      */
@@ -48,7 +78,8 @@ class Defuddle {
             this._log('Still very little content, retrying without scoring/partial selectors (possible index page)');
             const indexRetry = this.parseInternal({
                 removeLowScoring: false,
-                removePartialSelectors: false
+                removePartialSelectors: false,
+                removeContentPatterns: false
             });
             if (indexRetry.wordCount > result.wordCount) {
                 this._log('Index page retry produced more content');
@@ -125,8 +156,7 @@ class Defuddle {
                     el.removeAttribute(attr.name);
                 }
                 else if (['href', 'src', 'action', 'formaction', 'xlink:href'].includes(name)) {
-                    const val = attr.value.replace(/[\s\u0000-\u001F]+/g, '').toLowerCase();
-                    if (val.startsWith('javascript:') || val.startsWith('data:text/html')) {
+                    if ((0, dom_1.isDangerousUrl)(attr.value)) {
                         el.removeAttribute(attr.name);
                     }
                 }
@@ -245,46 +275,61 @@ class Defuddle {
         return url;
     }
     /**
-     * Parse the document, falling back to async extractors if sync parse yields no content
+     * Parse the document asynchronously. Checks for extractors that prefer
+     * async (e.g. YouTube transcripts) before sync, then falls back to async
+     * extractors if sync parse yields no content.
      */
     async parseAsync() {
+        if (this.options.useAsync !== false) {
+            const asyncResult = await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry));
+            if (asyncResult)
+                return asyncResult;
+        }
         const result = this.parse();
         if (result.wordCount > 0 || this.options.useAsync === false) {
             return result;
         }
+        return (await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry))) ?? result;
+    }
+    /**
+     * Fetch only async variables (e.g. transcript) without re-parsing.
+     * Safe to call after parse() — uses cached schema.org data since
+     * parse() strips script tags from the document.
+     */
+    async fetchAsyncVariables() {
+        if (this.options.useAsync === false)
+            return null;
         try {
             const url = this.options.url || this.doc.URL;
-            const schemaOrgData = this._extractSchemaOrgData(this.doc);
-            const extractor = extractor_registry_1.ExtractorRegistry.findAsyncExtractor(this.doc, url, schemaOrgData);
+            const schemaOrgData = this.getSchemaOrgData();
+            const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
+            if (extractor) {
+                const extracted = await extractor.extractAsync();
+                return this.getExtractorVariables(extracted.variables) || null;
+            }
+        }
+        catch (error) {
+            console.error('Defuddle', 'Error fetching async variables:', error);
+        }
+        return null;
+    }
+    async tryAsyncExtractor(finder) {
+        try {
+            const url = this.options.url || this.doc.URL;
+            const schemaOrgData = this.getSchemaOrgData();
+            const extractor = finder(this.doc, url, schemaOrgData);
             if (extractor) {
                 const startTime = Date.now();
                 const extracted = await extractor.extractAsync();
-                const contentHtml = this.resolveContentUrls(extracted.contentHtml);
                 const pageMetaTags = this._collectMetaTags();
                 const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
-                const endTime = Date.now();
-                return {
-                    content: contentHtml,
-                    title: extracted.variables?.title || metadata.title,
-                    description: metadata.description,
-                    domain: metadata.domain,
-                    favicon: metadata.favicon,
-                    image: metadata.image,
-                    published: extracted.variables?.published || metadata.published,
-                    author: extracted.variables?.author || metadata.author,
-                    site: extracted.variables?.site || metadata.site,
-                    schemaOrgData: metadata.schemaOrgData,
-                    wordCount: this.countWords(extracted.contentHtml),
-                    parseTime: Math.round(endTime - startTime),
-                    extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
-                    metaTags: pageMetaTags
-                };
+                return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
             }
         }
         catch (error) {
             console.error('Defuddle', 'Error in async extraction:', error);
         }
-        return result;
+        return null;
     }
     /**
      * Internal parse method that does the actual work
@@ -297,16 +342,23 @@ class Defuddle {
             removeHiddenElements: true,
             removeLowScoring: true,
             removeSmallImages: true,
+            removeContentPatterns: true,
             standardize: true,
             ...this.options,
             ...overrideOptions
         };
         const debugRemovals = [];
-        // Extract schema.org data
-        const schemaOrgData = this._extractSchemaOrgData(this.doc);
-        const pageMetaTags = this._collectMetaTags();
-        // Extract metadata
-        const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
+        // Extract schema.org data (cached — must happen before _stripUnsafeElements removes scripts)
+        const schemaOrgData = this.getSchemaOrgData();
+        // Cache meta tags and metadata across retries
+        if (!this._metaTags) {
+            this._metaTags = this._collectMetaTags();
+        }
+        const pageMetaTags = this._metaTags;
+        if (!this._metadata) {
+            this._metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
+        }
+        const metadata = this._metadata;
         if (options.removeImages) {
             this.removeImages(this.doc);
         }
@@ -316,35 +368,25 @@ class Defuddle {
             const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
             if (extractor && extractor.canExtract()) {
                 const extracted = extractor.extract();
-                const contentHtml = this.resolveContentUrls(extracted.contentHtml);
-                const endTime = Date.now();
-                // console.log('Using extractor:', extractor.constructor.name.replace('Extractor', ''));
-                return {
-                    content: contentHtml,
-                    title: extracted.variables?.title || metadata.title,
-                    description: metadata.description,
-                    domain: metadata.domain,
-                    favicon: metadata.favicon,
-                    image: metadata.image,
-                    published: extracted.variables?.published || metadata.published,
-                    author: extracted.variables?.author || metadata.author,
-                    site: extracted.variables?.site || metadata.site,
-                    schemaOrgData: metadata.schemaOrgData,
-                    wordCount: this.countWords(extracted.contentHtml),
-                    parseTime: Math.round(endTime - startTime),
-                    extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
-                    metaTags: pageMetaTags
-                };
+                return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
             }
             // Continue if there is no extractor...
-            // Evaluate mobile styles and sizes on original document
-            const mobileStyles = this._evaluateMediaQueries(this.doc);
-            // Find small images in original document, excluding lazy-loaded ones
-            const smallImages = this.findSmallImages(this.doc);
+            // Evaluate mobile styles and sizes on original document (cached across retries)
+            if (!this._mobileStyles) {
+                this._mobileStyles = this._evaluateMediaQueries(this.doc);
+            }
+            const mobileStyles = this._mobileStyles;
+            // Find small images in original document (cached across retries)
+            if (!this._smallImages) {
+                this._smallImages = this.findSmallImages(this.doc);
+            }
+            const smallImages = this._smallImages;
             // Clone document
             const clone = this.doc.cloneNode(true);
             // Flatten shadow DOM content into the clone
             this.flattenShadowRoots(this.doc, clone);
+            // Resolve React streaming SSR suspense boundaries
+            this.resolveStreamedContent(clone);
             // Apply mobile styles to clone
             this.applyMobileStyles(clone, mobileStyles);
             // Find main content
@@ -382,12 +424,16 @@ class Defuddle {
             // Remove non-content blocks by scoring
             // Tries to find lists, navigation based on text content and link density
             if (options.removeLowScoring) {
-                scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals);
+                scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
             }
             // Remove clutter using selectors
             if (options.removeExactSelectors || options.removePartialSelectors) {
                 this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals);
             }
+            // Remove elements by content patterns (read time, boilerplate, article cards)
+            if (options.removeContentPatterns && mainContent) {
+                this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
+            }
             // Normalize the main content
             if (options.standardize) {
                 (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
@@ -425,17 +471,33 @@ class Defuddle {
         }
     }
     countWords(content) {
-        // Parse HTML content to extract text
-        const tempDiv = this.doc.createElement('div');
-        tempDiv.appendChild((0, dom_1.parseHTML)(this.doc, content));
-        // Get text content, removing extra whitespace
-        const text = tempDiv.textContent || '';
-        const words = text
-            .trim()
-            .replace(/\s+/g, ' ') // Replace multiple spaces with single space
-            .split(' ')
-            .filter(word => word.length > 0); // Filter out empty strings
-        return words.length;
+        // Strip HTML tags and decode common entities without DOM parsing
+        const text = content
+            .replace(/<[^>]*>/g, ' ')
+            .replace(/&nbsp;/gi, ' ')
+            .replace(/&amp;/gi, '&')
+            .replace(/&lt;/gi, '<')
+            .replace(/&gt;/gi, '>')
+            .replace(/&quot;/gi, '"')
+            .replace(/&#\d+;/g, ' ')
+            .replace(/&\w+;/g, ' ');
+        const trimmed = text.trim();
+        if (!trimmed)
+            return 0;
+        // Count words by splitting on whitespace
+        let count = 0;
+        let inWord = false;
+        for (let i = 0; i < trimmed.length; i++) {
+            const isSpace = trimmed.charCodeAt(i) <= 32;
+            if (!isSpace && !inWord) {
+                count++;
+                inWord = true;
+            }
+            else if (isSpace) {
+                inWord = false;
+            }
+        }
+        return count;
     }
     // Make all other methods private by removing the static keyword and using private
     _log(...args) {
@@ -535,36 +597,34 @@ class Defuddle {
     removeHiddenElements(doc, debugRemovals) {
         let count = 0;
         const elementsToRemove = new Map();
-        // Use querySelectorAll instead of getElementsByTagName because
-        // linkedom's cloneNode does not wire up live HTMLCollections.
-        const allElements = Array.from(doc.querySelectorAll('*'));
-        // Process styles in batches to minimize layout thrashing
-        const BATCH_SIZE = 100;
-        for (let i = 0; i < allElements.length; i += BATCH_SIZE) {
-            const batch = allElements.slice(i, i + BATCH_SIZE);
-            // Read phase - gather all computedStyles
-            const styles = batch.map(element => {
+        // Check inline styles and CSS class-based hidden patterns.
+        const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i;
+        // Only use getComputedStyle in browser environments where it's meaningful.
+        // In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable.
+        const defaultView = doc.defaultView;
+        const isBrowser = typeof window !== 'undefined' && defaultView === window;
+        const allElements = doc.querySelectorAll('*');
+        for (const element of allElements) {
+            // Skip elements that contain math — sites like Wikipedia wrap MathML
+            // in display:none spans for accessibility (the visible version is an
+            // image/SVG fallback). We need to preserve these for math extraction.
+            if (element.querySelector('math, [data-mathml], .katex-mathml') ||
+                element.tagName.toLowerCase() === 'math') {
+                continue;
+            }
+            // Check inline style for hidden patterns
+            const style = element.getAttribute('style');
+            if (style && hiddenStylePattern.test(style)) {
+                const reason = style.includes('display') ? 'display:none' :
+                    style.includes('visibility') ? 'visibility:hidden' : 'opacity:0';
+                elementsToRemove.set(element, reason);
+                count++;
+                continue;
+            }
+            // Use getComputedStyle only in real browser environments
+            if (isBrowser) {
                 try {
-                    return element.ownerDocument.defaultView?.getComputedStyle(element);
-                }
-                catch (e) {
-                    // If we can't get computed style, check inline styles
-                    const style = element.getAttribute('style');
-                    if (!style)
-                        return null;
-                    // Create a temporary style element to parse inline styles
-                    const tempStyle = doc.createElement('style');
-                    tempStyle.textContent = `* { ${style} }`;
-                    doc.head.appendChild(tempStyle);
-                    const computedStyle = element.ownerDocument.defaultView?.getComputedStyle(element);
-                    doc.head.removeChild(tempStyle);
-                    return computedStyle;
-                }
-            });
-            // Write phase - mark elements for removal
-            batch.forEach((element, index) => {
-                const computedStyle = styles[index];
-                if (computedStyle) {
+                    const computedStyle = defaultView.getComputedStyle(element);
                     let reason = '';
                     if (computedStyle.display === 'none')
                         reason = 'display:none';
@@ -575,25 +635,24 @@ class Defuddle {
                     if (reason) {
                         elementsToRemove.set(element, reason);
                         count++;
+                        continue;
                     }
                 }
-                // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
-                // "sm:hidden", "not-machine:hidden") which JSDOM/linkedom can't
-                // resolve through computed styles.
-                if (!elementsToRemove.has(element)) {
-                    const className = element.getAttribute('class') || '';
-                    if (className) {
-                        const tokens = className.split(/\s+/);
-                        for (const token of tokens) {
-                            if (token === 'hidden' || token.endsWith(':hidden')) {
-                                elementsToRemove.set(element, `class:${token}`);
-                                count++;
-                                break;
-                            }
-                        }
+                catch (e) { }
+            }
+            // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
+            // "sm:hidden", "not-machine:hidden")
+            const className = element.getAttribute('class') || '';
+            if (className) {
+                const tokens = className.split(/\s+/);
+                for (const token of tokens) {
+                    if (token === 'hidden' || token.endsWith(':hidden')) {
+                        elementsToRemove.set(element, `class:${token}`);
+                        count++;
+                        break;
                     }
                 }
-            });
+            }
         }
         // Batch remove all hidden elements
         elementsToRemove.forEach((reason, el) => {
@@ -719,106 +778,50 @@ class Defuddle {
     findSmallImages(doc) {
         const MIN_DIMENSION = 33;
         const smallImages = new Set();
-        const transformRegex = /scale\(([\d.]+)\)/;
-        const startTime = Date.now();
         let processedCount = 0;
-        // 1. Read phase - Gather all elements in a single pass
-        const elements = [
-            ...Array.from(doc.getElementsByTagName('img')),
-            ...Array.from(doc.getElementsByTagName('svg'))
-        ];
-        if (elements.length === 0) {
-            return smallImages;
-        }
-        // 2. Batch process - Collect all measurements in one go
-        const measurements = elements.map(element => ({
-            element,
-            // Static attributes (no reflow)
-            naturalWidth: element.tagName.toLowerCase() === 'img' ?
-                parseInt(element.getAttribute('width') || '0') || 0 : 0,
-            naturalHeight: element.tagName.toLowerCase() === 'img' ?
-                parseInt(element.getAttribute('height') || '0') || 0 : 0,
-            attrWidth: parseInt(element.getAttribute('width') || '0'),
-            attrHeight: parseInt(element.getAttribute('height') || '0')
-        }));
-        // 3. Batch compute styles - Process in chunks to avoid long tasks
-        const BATCH_SIZE = 50;
-        for (let i = 0; i < measurements.length; i += BATCH_SIZE) {
-            const batch = measurements.slice(i, i + BATCH_SIZE);
-            try {
-                // Read phase - compute all styles at once
-                const styles = batch.map(({ element }) => {
-                    try {
-                        return element.ownerDocument.defaultView?.getComputedStyle(element);
-                    }
-                    catch (e) {
-                        return null;
-                    }
-                });
-                // Get bounding rectangles if available
-                const rects = batch.map(({ element }) => {
-                    try {
-                        return element.getBoundingClientRect();
-                    }
-                    catch (e) {
-                        return null;
-                    }
-                });
-                // Process phase - no DOM operations
-                batch.forEach((measurement, index) => {
-                    try {
-                        const style = styles[index];
-                        const rect = rects[index];
-                        if (!style)
-                            return;
-                        // Get transform scale in the same batch
-                        const transform = style.transform;
-                        const scale = transform ?
-                            parseFloat(transform.match(transformRegex)?.[1] || '1') : 1;
-                        // Calculate effective dimensions
-                        const widths = [
-                            measurement.naturalWidth,
-                            measurement.attrWidth,
-                            parseInt(style.width) || 0,
-                            rect ? rect.width * scale : 0
-                        ].filter(dim => typeof dim === 'number' && dim > 0);
-                        const heights = [
-                            measurement.naturalHeight,
-                            measurement.attrHeight,
-                            parseInt(style.height) || 0,
-                            rect ? rect.height * scale : 0
-                        ].filter(dim => typeof dim === 'number' && dim > 0);
-                        // Decision phase - no DOM operations
-                        if (widths.length > 0 && heights.length > 0) {
-                            const effectiveWidth = Math.min(...widths);
-                            const effectiveHeight = Math.min(...heights);
-                            if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
-                                const identifier = this.getElementIdentifier(measurement.element);
-                                if (identifier) {
-                                    smallImages.add(identifier);
-                                    processedCount++;
-                                }
-                            }
-                        }
-                    }
-                    catch (e) {
-                        if (this.debug) {
-                            console.warn('Defuddle: Failed to process element dimensions:', e);
-                        }
-                    }
-                });
+        const elements = doc.querySelectorAll('img, svg');
+        const defaultView = doc.defaultView;
+        const isBrowser = typeof window !== 'undefined' && defaultView === window;
+        for (const element of elements) {
+            const attrWidth = parseInt(element.getAttribute('width') || '0');
+            const attrHeight = parseInt(element.getAttribute('height') || '0');
+            // Check inline style dimensions
+            const style = element.getAttribute('style') || '';
+            const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
+            const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
+            // Use getComputedStyle and getBoundingClientRect only in browser
+            let computedWidth = 0, computedHeight = 0;
+            if (isBrowser) {
+                try {
+                    const cs = defaultView.getComputedStyle(element);
+                    computedWidth = parseInt(cs.width) || 0;
+                    computedHeight = parseInt(cs.height) || 0;
+                }
+                catch (e) { }
+                try {
+                    const rect = element.getBoundingClientRect();
+                    if (rect.width > 0)
+                        computedWidth = computedWidth || rect.width;
+                    if (rect.height > 0)
+                        computedHeight = computedHeight || rect.height;
+                }
+                catch (e) { }
             }
-            catch (e) {
-                if (this.debug) {
-                    console.warn('Defuddle: Failed to process batch:', e);
+            const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0);
+            const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0);
+            if (widths.length > 0 && heights.length > 0) {
+                const effectiveWidth = Math.min(...widths);
+                const effectiveHeight = Math.min(...heights);
+                if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
+                    const identifier = this.getElementIdentifier(element);
+                    if (identifier) {
+                        smallImages.add(identifier);
+                        processedCount++;
+                    }
                 }
             }
         }
-        const endTime = Date.now();
-        this._log('Found small elements:', {
-            count: processedCount,
-            processingTime: `${(endTime - startTime).toFixed(2)}ms`
-        });
+        this._log('Found small elements:', processedCount);
         return smallImages;
     }
     removeSmallImages(doc, smallImages) {
@@ -953,13 +956,11 @@ class Defuddle {
     }
     findContentByScoring(doc) {
         const candidates = [];
-        constants_1.BLOCK_ELEMENTS.forEach((tag) => {
-            Array.from(doc.getElementsByTagName(tag)).forEach((element) => {
-                const score = scoring_1.ContentScorer.scoreElement(element);
-                if (score > 0) {
-                    candidates.push({ score, element });
-                }
-            });
+        doc.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR).forEach((element) => {
+            const score = scoring_1.ContentScorer.scoreElement(element);
+            if (score > 0) {
+                candidates.push({ score, element });
+            }
         });
         return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null;
     }
@@ -1050,12 +1051,12 @@ class Defuddle {
      * Walks both trees in parallel so positional correspondence is exact.
      */
     flattenShadowRoots(original, clone) {
-        const origElements = Array.from(original.body.getElementsByTagName('*'));
+        const origElements = Array.from(original.body.querySelectorAll('*'));
         // Find the first element with a shadow root (also serves as the hasShadowRoots check)
         const firstShadow = origElements.find(el => el.shadowRoot);
         if (!firstShadow)
             return;
-        const cloneElements = Array.from(clone.body.getElementsByTagName('*'));
+        const cloneElements = Array.from(clone.body.querySelectorAll('*'));
         // Check if we can directly read shadow DOM content (main world / Node.js).
         // In content script isolated worlds, shadowRoot exists but content is empty.
         const canReadShadow = (firstShadow.shadowRoot?.childNodes?.length ?? 0) > 0;
@@ -1096,6 +1097,68 @@ class Defuddle {
             }
         }
     }
+    /**
+     * Resolve React streaming SSR suspense boundaries.
+     * React's streaming SSR places content in hidden divs (id="S:0") and
+     * template placeholders (id="B:0") with $RC scripts to swap them.
+     * Since we don't execute scripts, we perform the swap manually.
+     */
+    resolveStreamedContent(doc) {
+        // Find $RC("B:X","S:X") calls in inline scripts
+        const scripts = doc.querySelectorAll('script');
+        const swaps = [];
+        const rcPattern = /\$RC\("(B:\d+)","(S:\d+)"\)/g;
+        for (const script of scripts) {
+            const text = script.textContent || '';
+            if (!text.includes('$RC('))
+                continue;
+            rcPattern.lastIndex = 0;
+            let match;
+            while ((match = rcPattern.exec(text)) !== null) {
+                swaps.push({ templateId: match[1], contentId: match[2] });
+            }
+        }
+        if (swaps.length === 0)
+            return;
+        let swapCount = 0;
+        for (const { templateId, contentId } of swaps) {
+            const template = doc.getElementById(templateId);
+            const content = doc.getElementById(contentId);
+            if (!template || !content)
+                continue;
+            const parent = template.parentNode;
+            if (!parent)
+                continue;
+            // Remove the fallback/skeleton content after the template
+            // until the <!--/$--> comment marker
+            let next = template.nextSibling;
+            let foundMarker = false;
+            while (next) {
+                const following = next.nextSibling;
+                if (next.nodeType === 8 && next.data === '/$') {
+                    next.remove();
+                    foundMarker = true;
+                    break;
+                }
+                next.remove();
+                next = following;
+            }
+            // Skip swap if marker wasn't found — malformed streaming output
+            if (!foundMarker)
+                continue;
+            // Insert content children before the template position
+            while (content.firstChild) {
+                parent.insertBefore(content.firstChild, template);
+            }
+            // Clean up the template and hidden div
+            template.remove();
+            content.remove();
+            swapCount++;
+        }
+        if (swapCount > 0) {
+            this._log('Resolved streamed content:', swapCount, 'suspense boundaries');
+        }
+    }
     /**
      * Replace a shadow DOM host element with a div containing its shadow content.
      * Custom elements (tag names with hyphens) would re-initialize when inserted
@@ -1187,6 +1250,249 @@ class Defuddle {
     _decodeHTMLEntities(text) {
         return (0, dom_1.decodeHTMLEntities)(this.doc, text);
     }
+    /**
+     * Build a DefuddleResponse from an extractor result with metadata
+     */
+    buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags) {
+        const contentHtml = this.resolveContentUrls(extracted.contentHtml);
+        const variables = this.getExtractorVariables(extracted.variables);
+        return {
+            content: contentHtml,
+            title: extracted.variables?.title || metadata.title,
+            description: metadata.description,
+            domain: metadata.domain,
+            favicon: metadata.favicon,
+            image: metadata.image,
+            language: extracted.variables?.language || metadata.language,
+            published: extracted.variables?.published || metadata.published,
+            author: extracted.variables?.author || metadata.author,
+            site: extracted.variables?.site || metadata.site,
+            schemaOrgData: metadata.schemaOrgData,
+            wordCount: this.countWords(extracted.contentHtml),
+            parseTime: Math.round(Date.now() - startTime),
+            extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
+            metaTags: pageMetaTags,
+            ...(variables ? { variables } : {}),
+        };
+    }
+    /**
+     * Filter extractor variables to only include custom ones
+     * (exclude standard fields that are already mapped to top-level properties)
+     */
+    getExtractorVariables(variables) {
+        if (!variables)
+            return undefined;
+        const custom = {};
+        let hasCustom = false;
+        for (const [key, value] of Object.entries(variables)) {
+            if (!STANDARD_VARIABLE_KEYS.has(key)) {
+                custom[key] = value;
+                hasCustom = true;
+            }
+        }
+        return hasCustom ? custom : undefined;
+    }
+    /**
+     * Content-based pattern removal for elements that can't be detected by
+     * CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
+     *
+     * Runs four passes over mainContent, mutating it in place:
+     *  1. Read time metadata: p/span/div/time elements outside pre/code with at
+     *     most 15 words that match both CONTENT_DATE_PATTERN and
+     *     CONTENT_READ_TIME_PATTERN, contain no p/div/section/article
+     *     descendants, and are empty once METADATA_STRIP_PATTERNS are stripped.
+     *  2. Boundary dates: <time> elements (widened to same-text inline wrappers
+     *     or a wrapping <p>) of at most 10 words whose text is first found
+     *     within 200 characters of the start or end of the content text. The
+     *     content text is captured once, before this pass removes anything.
+     *  3. Section breadcrumbs: div/span/p elements of at most 10 words with no
+     *     block descendants whose first a[href] resolves to a path that is not
+     *     "/", differs from the current path, and is a prefix of it. Skipped
+     *     when the page URL cannot be parsed.
+     *     NOTE(review): this is a plain string-prefix test, so "/blog" also
+     *     matches a page at "/blogger/x".
+     *  4. Boilerplate: for the first p/div/span/section of 3 to 50 words that
+     *     matches a BOILERPLATE_PATTERNS entry and whose truncation target
+     *     starts at least 200 characters into the content text, the target
+     *     (nearest ancestor-or-self below mainContent with a following element
+     *     sibling) is removed together with the element siblings after it and
+     *     after each of its ancestors below mainContent. The method returns
+     *     right after this first truncation.
+     *
+     * @param mainContent - Root element that is searched and modified.
+     * @param debugRemovals - Optional array; when this.debug is set, a
+     *     { step, reason, text } record is pushed for every removal.
+     */
+    removeByContentPattern(mainContent, debugRemovals) {
+        // Remove read time metadata (e.g. "Mar 4th 2026 | 3 min read")
+        // Only removes leaf elements whose text is PURELY date + read time,
+        // not mixed with other meaningful content like tag names.
+        const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time'));
+        for (const el of candidates) {
+            if (!el.parentNode)
+                continue;
+            if (el.closest('pre') || el.closest('code'))
+                continue;
+            const text = el.textContent?.trim() || '';
+            const words = text.split(/\s+/).length;
+            // Match date + read time in short elements
+            if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
+                // Ensure this is a leaf-ish element, not a large container
+                if (el.querySelectorAll('p, div, section, article').length === 0) {
+                    // Verify the text is ONLY date + read time metadata
+                    // by stripping all date/time words and checking nothing remains
+                    let cleaned = text;
+                    for (const pattern of METADATA_STRIP_PATTERNS) {
+                        cleaned = cleaned.replace(pattern, '');
+                    }
+                    if (cleaned.trim().length > 0)
+                        continue;
+                    if (this.debug && debugRemovals) {
+                        debugRemovals.push({
+                            step: 'removeByContentPattern',
+                            reason: 'read time metadata',
+                            text: (0, utils_1.textPreview)(el)
+                        });
+                    }
+                    el.remove();
+                }
+            }
+        }
+        // Remove standalone time/date elements near the start or end of content.
+        // A <time> in its own paragraph at the boundary is metadata (publish date),
+        // but <time> inline within prose should be preserved (see issue #136).
+        const timeElements = Array.from(mainContent.querySelectorAll('time'));
+        const contentText = mainContent.textContent || '';
+        for (const time of timeElements) {
+            if (!time.parentNode)
+                continue;
+            // Walk up through inline/formatting wrappers only (i, em, span, b, strong, small)
+            // Stop at block elements to avoid removing containers with other content.
+            let target = time;
+            let targetText = target.textContent?.trim() || '';
+            while (target.parentElement && target.parentElement !== mainContent) {
+                const parentTag = target.parentElement.tagName.toLowerCase();
+                const parentText = target.parentElement.textContent?.trim() || '';
+                // If parent is a <p> that only wraps this time, include it
+                if (parentTag === 'p' && parentText === targetText) {
+                    target = target.parentElement;
+                    break;
+                }
+                // Only walk through inline formatting wrappers
+                if (['i', 'em', 'span', 'b', 'strong', 'small'].includes(parentTag) &&
+                    parentText === targetText) {
+                    target = target.parentElement;
+                    targetText = parentText;
+                    continue;
+                }
+                break;
+            }
+            const text = target.textContent?.trim() || '';
+            const words = text.split(/\s+/).length;
+            if (words > 10)
+                continue;
+            // Check if this element is near the start or end of mainContent
+            const pos = contentText.indexOf(text);
+            const distFromEnd = contentText.length - (pos + text.length);
+            if (pos > 200 && distFromEnd > 200)
+                continue;
+            if (this.debug && debugRemovals) {
+                debugRemovals.push({
+                    step: 'removeByContentPattern',
+                    reason: 'boundary date element',
+                    text: (0, utils_1.textPreview)(target)
+                });
+            }
+            target.remove();
+        }
+        // Remove section breadcrumbs
+        // Short elements containing a link to a parent section of the current URL.
+        const url = this.options.url || this.doc.URL || '';
+        let urlPath = '';
+        try {
+            urlPath = new URL(url).pathname;
+        }
+        catch { }
+        if (urlPath) {
+            const shortElements = mainContent.querySelectorAll('div, span, p');
+            for (const el of shortElements) {
+                if (!el.parentNode)
+                    continue;
+                const text = el.textContent?.trim() || '';
+                const words = text.split(/\s+/).length;
+                if (words > 10)
+                    continue;
+                // Must be a leaf-ish element (no block children)
+                if (el.querySelectorAll('p, div, section, article').length > 0)
+                    continue;
+                const link = el.querySelector('a[href]');
+                if (!link)
+                    continue;
+                try {
+                    const linkPath = new URL(link.getAttribute('href') || '', url).pathname;
+                    if (linkPath !== '/' && linkPath !== urlPath && urlPath.startsWith(linkPath)) {
+                        if (this.debug && debugRemovals) {
+                            debugRemovals.push({
+                                step: 'removeByContentPattern',
+                                reason: 'section breadcrumb',
+                                text: (0, utils_1.textPreview)(el)
+                            });
+                        }
+                        el.remove();
+                    }
+                }
+                catch { }
+            }
+        }
+        // Remove boilerplate sentences and trailing non-content.
+        // Search elements for end-of-article boilerplate, then truncate
+        // from the best ancestor that has siblings to remove.
+        const fullText = mainContent.textContent || '';
+        const boilerplateElements = mainContent.querySelectorAll('p, div, span, section');
+        for (const el of boilerplateElements) {
+            if (!el.parentNode)
+                continue;
+            const text = el.textContent?.trim() || '';
+            const words = text.split(/\s+/).length;
+            if (words > 50 || words < 3)
+                continue;
+            for (const pattern of BOILERPLATE_PATTERNS) {
+                if (pattern.test(text)) {
+                    // Walk up to find an ancestor that has next siblings to truncate.
+                    // Don't walk all the way to mainContent's direct child — if there's
+                    // a single wrapper div, that would remove everything.
+                    let target = el;
+                    while (target.parentElement && target.parentElement !== mainContent) {
+                        if (target.nextElementSibling)
+                            break;
+                        target = target.parentElement;
+                    }
+                    // Only truncate if there's substantial content before the boilerplate
+                    const targetText = target.textContent || '';
+                    const targetPos = fullText.indexOf(targetText);
+                    if (targetPos < 200)
+                        continue;
+                    // Collect ancestors before modifying the DOM
+                    const ancestors = [];
+                    let anc = target.parentElement;
+                    while (anc && anc !== mainContent) {
+                        ancestors.push(anc);
+                        anc = anc.parentElement;
+                    }
+                    // Remove target element and its following siblings
+                    this.removeTrailingSiblings(target, true, debugRemovals);
+                    // Cascade upward: remove following siblings at each
+                    // ancestor level too. Everything after the boilerplate
+                    // in document order is non-content.
+                    for (const ancestor of ancestors) {
+                        this.removeTrailingSiblings(ancestor, false, debugRemovals);
+                    }
+                    return;
+                }
+            }
+        }
+    }
+    /**
+     * Remove an element's following siblings, and optionally the element itself.
+     */
+    removeTrailingSiblings(element, removeSelf, debugRemovals) {
+        let sibling = element.nextElementSibling;
+        while (sibling) {
+            const next = sibling.nextElementSibling;
+            if (this.debug && debugRemovals) {
+                debugRemovals.push({
+                    step: 'removeByContentPattern',
+                    reason: 'trailing non-content',
+                    text: (0, utils_1.textPreview)(sibling)
+                });
+            }
+            sibling.remove();
+            sibling = next;
+        }
+        if (removeSelf) {
+            if (this.debug && debugRemovals) {
+                debugRemovals.push({
+                    step: 'removeByContentPattern',
+                    reason: 'boilerplate text',
+                    text: (0, utils_1.textPreview)(element)
+                });
+            }
+            element.remove();
+        }
+    }
 }
 exports.Defuddle = Defuddle;
 //# sourceMappingURL=defuddle.js.map