npm - defuddle - Versions diffs - 0.12.0 → 0.14.0 - Mend

defuddle 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65)

package/README.md +59 -21
package/dist/cli.js +54 -49
package/dist/cli.js.map +1 -1
package/dist/constants.d.ts +9 -0
package/dist/constants.js +50 -10
package/dist/constants.js.map +1 -1
package/dist/defuddle.d.ts +8 -1
package/dist/defuddle.js +404 -86
package/dist/defuddle.js.map +1 -1
package/dist/elements/callouts.d.ts +6 -0
package/dist/elements/callouts.js +74 -0
package/dist/elements/callouts.js.map +1 -0
package/dist/elements/code.js +31 -9
package/dist/elements/code.js.map +1 -1
package/dist/elements/headings.d.ts +6 -0
package/dist/elements/headings.js +55 -50
package/dist/elements/headings.js.map +1 -1
package/dist/elements/images.js +10 -1
package/dist/elements/images.js.map +1 -1
package/dist/elements/math.base.js +1 -4
package/dist/elements/math.base.js.map +1 -1
package/dist/extractor-registry.d.ts +5 -5
package/dist/extractor-registry.js +8 -8
package/dist/extractor-registry.js.map +1 -1
package/dist/extractors/_base.d.ts +6 -1
package/dist/extractors/_base.js +2 -1
package/dist/extractors/_base.js.map +1 -1
package/dist/extractors/github.js +3 -3
package/dist/extractors/github.js.map +1 -1
package/dist/extractors/hackernews.js +1 -1
package/dist/extractors/hackernews.js.map +1 -1
package/dist/extractors/reddit.js +7 -4
package/dist/extractors/reddit.js.map +1 -1
package/dist/extractors/twitter.js +3 -1
package/dist/extractors/twitter.js.map +1 -1
package/dist/extractors/youtube.d.ts +35 -2
package/dist/extractors/youtube.js +359 -30
package/dist/extractors/youtube.js.map +1 -1
package/dist/fetch.d.ts +13 -0
package/dist/fetch.js +181 -0
package/dist/fetch.js.map +1 -0
package/dist/index.full.js +1 -1
package/dist/index.js +1 -1
package/dist/markdown.js +81 -33
package/dist/markdown.js.map +1 -1
package/dist/metadata.js +1 -1
package/dist/metadata.js.map +1 -1
package/dist/node.d.ts +12 -5
package/dist/node.js +53 -17
package/dist/node.js.map +1 -1
package/dist/scoring.js +15 -10
package/dist/scoring.js.map +1 -1
package/dist/standardize.js +112 -60
package/dist/standardize.js.map +1 -1
package/dist/types.d.ts +14 -0
package/dist/utils/dom.d.ts +5 -0
package/dist/utils/dom.js +8 -0
package/dist/utils/dom.js.map +1 -1
package/dist/utils/linkedom-compat.d.ts +5 -0
package/dist/utils/linkedom-compat.js +23 -0
package/dist/utils/linkedom-compat.js.map +1 -0
package/dist/utils.d.ts +6 -0
package/dist/utils.js +36 -0
package/dist/utils.js.map +1 -1
package/package.json +3 -4

package/dist/defuddle.js CHANGED

@@ -2,16 +2,20 @@
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.Defuddle = void 0;
 const metadata_1 = require("./metadata");
+const headings_1 = require("./elements/headings");
 const extractor_registry_1 = require("./extractor-registry");
 const constants_1 = require("./constants");
 const standardize_1 = require("./standardize");
 const footnotes_1 = require("./elements/footnotes");
+const callouts_1 = require("./elements/callouts");
 const scoring_1 = require("./scoring");
 const utils_1 = require("./utils");
 const dom_1 = require("./utils/dom");
 /** Keys from extractor variables that map to top-level DefuddleResponse fields */
 const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
 // Content pattern detection constants
+const STYLE_WIDTH_PATTERN = /width\s*:\s*(\d+)/;
+const STYLE_HEIGHT_PATTERN = /height\s*:\s*(\d+)/;
 const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
 const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
 const BOILERPLATE_PATTERNS = [
@@ -72,6 +76,36 @@ class Defuddle {
             }
         }
         // If still very little content, the page may be an index/listing page
+        // or a page that reveals content at runtime from a hidden wrapper.
+        // Retry once with hidden-element removal disabled.
+        if (result.wordCount < 50) {
+            this._log('Still very little content, retrying without hidden-element removal');
+            const hiddenRetry = this.parseInternal({
+                removeHiddenElements: false
+            });
+            if (hiddenRetry.wordCount > result.wordCount * 2) {
+                this._log('Hidden-element retry produced more content');
+                result = hiddenRetry;
+            }
+            // Try targeting the largest hidden subtree directly to avoid body-level
+            // leftovers (e.g. FPS counters) when hidden content is the real article.
+            const hiddenSelector = this.findLargestHiddenContentSelector();
+            if (hiddenSelector) {
+                this._log('Retrying with hidden content selector:', hiddenSelector);
+                const hiddenSelectorRetry = this.parseInternal({
+                    removeHiddenElements: false,
+                    removePartialSelectors: false,
+                    contentSelector: hiddenSelector
+                });
+                if (hiddenSelectorRetry.wordCount > result.wordCount ||
+                    (hiddenSelectorRetry.wordCount > Math.max(20, result.wordCount * 0.7) &&
+                        hiddenSelectorRetry.content.length < result.content.length)) {
+                    this._log('Hidden-selector retry produced better focused content');
+                    result = hiddenSelectorRetry;
+                }
+            }
+        }
+        // If still very little content, the page may be an index/listing page
         // where card elements were scored as non-content or removed by partial
         // selectors (e.g. "post-preview"). Retry with both disabled.
         if (result.wordCount < 50) {
@@ -95,17 +129,17 @@ class Defuddle {
         // longer than what we extracted, the scorer likely picked the wrong
         // element from a feed. Find the correct element in the DOM.
         const schemaText = this._getSchemaText(result.schemaOrgData);
-        if (schemaText && this.countWords(schemaText) > result.wordCount) {
+        if (schemaText && this.countHtmlWords(schemaText) > result.wordCount) {
             const contentHtml = this._findContentBySchemaText(schemaText);
             if (contentHtml) {
                 this._log('Found DOM content matching schema.org text');
                 result.content = contentHtml;
-                result.wordCount = this.countWords(contentHtml);
+                result.wordCount = this.countHtmlWords(contentHtml);
             }
             else {
                 this._log('Using schema.org text as content (DOM element not found)');
                 result.content = schemaText;
-                result.wordCount = this.countWords(schemaText);
+                result.wordCount = this.countHtmlWords(schemaText);
             }
         }
         return result;
@@ -113,17 +147,30 @@ class Defuddle {
     /**
      * Extract text content from schema.org data (e.g. SocialMediaPosting, Article)
      */
-    _getSchemaText(schemaOrgData) {
-        if (!schemaOrgData)
+    _getSchemaText(schemaOrgData, depth = 0) {
+        if (!schemaOrgData || depth > 10)
             return '';
         const items = Array.isArray(schemaOrgData) ? schemaOrgData : [schemaOrgData];
         for (const item of items) {
+            // Recurse into nested arrays
+            if (Array.isArray(item)) {
+                const found = this._getSchemaText(item, depth + 1);
+                if (found)
+                    return found;
+                continue;
+            }
             if (item?.text && typeof item.text === 'string') {
                 return item.text;
             }
             if (item?.articleBody && typeof item.articleBody === 'string') {
                 return item.articleBody;
             }
+            // Traverse @graph arrays (common in JSON-LD with multiple entities)
+            if (item?.['@graph'] && Array.isArray(item['@graph'])) {
+                const found = this._getSchemaText(item['@graph'], depth + 1);
+                if (found)
+                    return found;
+            }
         }
         return '';
     }
@@ -164,39 +211,43 @@ class Defuddle {
         }
     }
     /**
-     * Find a DOM element whose text matches the schema.org text content.
-     * Used when the content scorer picked the wrong element from a feed page.
-     * Returns the element's inner HTML including sibling media (images, etc.)
+     * Find the smallest DOM element whose text contains the search phrase
+     * and whose word count is at least 80% of the expected count.
+     * Shared by _findSchemaContentElement and _findContentBySchemaText.
      */
-    _findContentBySchemaText(schemaText) {
-        const body = this.doc.body;
-        if (!body)
-            return '';
-        // Use the first paragraph as the search phrase.
-        // DOM textContent concatenates <p> elements without separators,
-        // so we can't cross paragraph boundaries when matching.
+    _findElementBySchemaText(root, schemaText) {
         const firstPara = schemaText.split(/\n\s*\n/)[0]?.trim() || '';
         const searchPhrase = firstPara.substring(0, 100).trim();
         if (!searchPhrase)
-            return '';
-        const schemaWordCount = this.countWords(schemaText);
-        // Find the smallest element whose text contains the search phrase
-        // and whose word count is close to the schema text's word count
+            return null;
+        const schemaWordCount = (0, utils_1.countWords)(schemaText);
         let bestMatch = null;
         let bestSize = Infinity;
-        const allElements = body.querySelectorAll('*');
+        const allElements = root.querySelectorAll('*');
         for (const el of allElements) {
-            const elText = (el.textContent || '');
+            if (el === root)
+                continue;
+            const elText = el.textContent || '';
             if (!elText.includes(searchPhrase))
                 continue;
-            const elWords = elText.trim().split(/\s+/).length;
-            // Element should contain roughly the same amount of text
-            // (allow some slack for surrounding whitespace / minor extras)
+            const elWords = (0, utils_1.countWords)(elText);
             if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
                 bestSize = elWords;
                 bestMatch = el;
             }
         }
+        return bestMatch;
+    }
+    /**
+     * Find a DOM element whose text matches the schema.org text content.
+     * Used when the content scorer picked the wrong element from a feed page.
+     * Returns the element's inner HTML including sibling media (images, etc.)
+     */
+    _findContentBySchemaText(schemaText) {
+        const body = this.doc.body;
+        if (!body)
+            return '';
+        const bestMatch = this._findElementBySchemaText(body, schemaText);
         if (!bestMatch)
             return '';
         // Read the largest sibling image src BEFORE resolveRelativeUrls
@@ -230,6 +281,8 @@ class Defuddle {
                 catch { }
             }
         }
+        // Remove heading anchor links before serialization (e.g. <h2>Title<a href="#foo">#</a></h2>)
+        (0, headings_1.removeHeadingAnchors)(bestMatch);
         // Now resolve URLs in the text content
         this.resolveRelativeUrls(bestMatch);
         let html = (0, dom_1.serializeHTML)(bestMatch);
@@ -241,6 +294,27 @@ class Defuddle {
         }
         return html;
     }
+    findLargestHiddenContentSelector() {
+        const body = this.doc.body;
+        if (!body)
+            return undefined;
+        const candidates = Array.from(body.querySelectorAll(constants_1.HIDDEN_EXACT_SKIP_SELECTOR)).filter(el => {
+            const className = el.getAttribute('class') || '';
+            return !className.includes('math');
+        });
+        let best = null;
+        let bestWords = 0;
+        for (const el of candidates) {
+            const words = (0, utils_1.countWords)(el.textContent || '');
+            if (words > bestWords) {
+                best = el;
+                bestWords = words;
+            }
+        }
+        if (!best || bestWords < 30)
+            return undefined;
+        return this.getElementSelector(best);
+    }
     /**
      * Get the largest available src from an img element,
      * checking srcset for higher-resolution versions.
@@ -302,7 +376,8 @@ class Defuddle {
         try {
             const url = this.options.url || this.doc.URL;
             const schemaOrgData = this.getSchemaOrgData();
-            const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
+            const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
+            const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData, extractorOpts);
             if (extractor) {
                 const extracted = await extractor.extractAsync();
                 return this.getExtractorVariables(extracted.variables) || null;
@@ -317,7 +392,8 @@ class Defuddle {
         try {
             const url = this.options.url || this.doc.URL;
             const schemaOrgData = this.getSchemaOrgData();
-            const extractor = finder(this.doc, url, schemaOrgData);
+            const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
+            const extractor = finder(this.doc, url, schemaOrgData, extractorOpts);
             if (extractor) {
                 const startTime = Date.now();
                 const extracted = await extractor.extractAsync();
@@ -336,6 +412,25 @@ class Defuddle {
      */
     parseInternal(overrideOptions = {}) {
         const startTime = Date.now();
+        // Guard against empty/broken documents (e.g. empty HTML, bot-blocked pages)
+        if (!this.doc.documentElement) {
+            const url = this.options.url || '';
+            return {
+                content: '',
+                title: '',
+                description: '',
+                domain: url ? new URL(url).hostname : '',
+                favicon: '',
+                image: '',
+                language: '',
+                parseTime: Date.now() - startTime,
+                published: '',
+                author: '',
+                site: '',
+                schemaOrgData: null,
+                wordCount: 0,
+            };
+        }
         const options = {
             removeExactSelectors: true,
             removePartialSelectors: true,
@@ -344,6 +439,7 @@ class Defuddle {
             removeSmallImages: true,
             removeContentPatterns: true,
             standardize: true,
+            includeReplies: 'extractors',
             ...this.options,
             ...overrideOptions
         };
@@ -365,7 +461,11 @@ class Defuddle {
         try {
             // Use site-specific extractor first, if there is one
             const url = options.url || this.doc.URL;
-            const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
+            const extractorOpts = {
+                includeReplies: options.includeReplies,
+                language: options.language,
+            };
+            const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData, extractorOpts);
             if (extractor && extractor.canExtract()) {
                 const extracted = extractor.extract();
                 return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
@@ -383,6 +483,9 @@ class Defuddle {
             const smallImages = this._smallImages;
             // Clone document
             const clone = this.doc.cloneNode(true);
+            // Merge adjacent text nodes that some DOM implementations (e.g. linkedom)
+            // create when parsing HTML entities like &#39;
+            clone.body?.normalize();
             // Flatten shadow DOM content into the clone
             this.flattenShadowRoots(this.doc, clone);
             // Resolve React streaming SSR suspense boundaries
@@ -398,20 +501,36 @@ class Defuddle {
             if (!mainContent) {
                 mainContent = this.findMainContent(clone);
             }
+            // If we fell back to <body>, try using schema.org articleBody/text
+            // to find a more specific content element within the DOM.
+            if (mainContent && mainContent.tagName.toLowerCase() === 'body') {
+                const schemaText = this._getSchemaText(schemaOrgData);
+                if (schemaText) {
+                    const schemaContent = this._findElementBySchemaText(clone.body, schemaText);
+                    if (schemaContent) {
+                        this._log('Found content element via schema.org text');
+                        mainContent = schemaContent;
+                    }
+                }
+            }
             if (!mainContent) {
-                const fallbackContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
+                const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
                 const endTime = Date.now();
                 return {
                     content: fallbackContent,
                     ...metadata,
-                    wordCount: this.countWords(fallbackContent),
+                    wordCount: this.countHtmlWords(fallbackContent),
                     parseTime: Math.round(endTime - startTime),
                     metaTags: pageMetaTags
                 };
             }
+            // Remove <wbr> elements — word break opportunity hints that carry no
+            // content but cause unwanted whitespace during standardization.
+            mainContent.querySelectorAll('wbr').forEach(el => el.remove());
             // Standardize footnotes before cleanup (CSS sidenotes use display:none)
             if (options.standardize) {
                 (0, footnotes_1.standardizeFootnotes)(mainContent);
+                (0, callouts_1.standardizeCallouts)(mainContent);
             }
             // Remove small images
             if (options.removeSmallImages) {
@@ -421,15 +540,17 @@ class Defuddle {
             if (options.removeHiddenElements) {
                 this.removeHiddenElements(clone, debugRemovals);
             }
-            // Remove non-content blocks by scoring
-            // Tries to find lists, navigation based on text content and link density
+            // Remove clutter using selectors — deterministic removal of known
+            // non-content elements (nav, footer, .sidebar, etc.) by class/id.
+            // Runs before scoring so the heuristic scorer sees a cleaner DOM.
+            if (options.removeExactSelectors || options.removePartialSelectors) {
+                this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
+            }
+            // Remove non-content blocks by scoring — heuristic removal based
+            // on link density, text ratios, and navigation indicators.
             if (options.removeLowScoring) {
                 scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
             }
-            // Remove clutter using selectors
-            if (options.removeExactSelectors || options.removePartialSelectors) {
-                this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals);
-            }
             // Remove elements by content patterns (read time, boilerplate, article cards)
             if (options.removeContentPatterns && mainContent) {
                 this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
@@ -445,7 +566,7 @@ class Defuddle {
             const result = {
                 content,
                 ...metadata,
-                wordCount: this.countWords(content),
+                wordCount: this.countHtmlWords(content),
                 parseTime: Math.round(endTime - startTime),
                 metaTags: pageMetaTags
             };
@@ -459,18 +580,18 @@ class Defuddle {
         }
         catch (error) {
             console.error('Defuddle', 'Error processing document:', error);
-            const errorContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
+            const errorContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
             const endTime = Date.now();
             return {
                 content: errorContent,
                 ...metadata,
-                wordCount: this.countWords(errorContent),
+                wordCount: this.countHtmlWords(errorContent),
                 parseTime: Math.round(endTime - startTime),
                 metaTags: pageMetaTags
             };
         }
     }
-    countWords(content) {
+    countHtmlWords(content) {
         // Strip HTML tags and decode common entities without DOM parsing
         const text = content
             .replace(/<[^>]*>/g, ' ')
@@ -481,25 +602,8 @@ class Defuddle {
             .replace(/&quot;/gi, '"')
             .replace(/&#\d+;/g, ' ')
             .replace(/&\w+;/g, ' ');
-        const trimmed = text.trim();
-        if (!trimmed)
-            return 0;
-        // Count words by splitting on whitespace
-        let count = 0;
-        let inWord = false;
-        for (let i = 0; i < trimmed.length; i++) {
-            const isSpace = trimmed.charCodeAt(i) <= 32;
-            if (!isSpace && !inWord) {
-                count++;
-                inWord = true;
-            }
-            else if (isSpace) {
-                inWord = false;
-            }
-        }
-        return count;
+        return (0, utils_1.countWords)(text);
     }
-    // Make all other methods private by removing the static keyword and using private
     _log(...args) {
         if (this.debug) {
             console.log('Defuddle:', ...args);
@@ -509,6 +613,8 @@ class Defuddle {
         const mobileStyles = [];
         const maxWidthRegex = /max-width[^:]*:\s*(\d+)/;
         try {
+            if (!doc.styleSheets)
+                return mobileStyles;
             // Get all styles, including inline styles
             const sheets = Array.from(doc.styleSheets).filter(sheet => {
                 try {
@@ -646,7 +752,7 @@ class Defuddle {
             if (className) {
                 const tokens = className.split(/\s+/);
                 for (const token of tokens) {
-                    if (token === 'hidden' || token.endsWith(':hidden')) {
+                    if (token === 'hidden' || token.endsWith(':hidden') || token === 'invisible' || token.endsWith(':invisible')) {
                         elementsToRemove.set(element, `class:${token}`);
                         count++;
                         break;
@@ -667,7 +773,7 @@ class Defuddle {
         });
         this._log('Removed hidden elements:', count);
     }
-    removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals) {
+    removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals, skipHiddenExactSelectors = false) {
         const startTime = Date.now();
         let exactSelectorCount = 0;
         let partialSelectorCount = 0;
@@ -675,9 +781,17 @@ class Defuddle {
         const elementsToRemove = new Map();
         // First collect elements matching exact selectors
         if (removeExact) {
-            const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS.join(','));
+            const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS_JOINED);
             exactElements.forEach(el => {
                 if (el?.parentNode) {
+                    if (skipHiddenExactSelectors) {
+                        const hiddenAncestor = el.closest(constants_1.HIDDEN_EXACT_SKIP_SELECTOR);
+                        const role = (el.getAttribute('role') || '').toLowerCase();
+                        if (el.matches(constants_1.HIDDEN_EXACT_SELECTOR) ||
+                            (hiddenAncestor && role === 'dialog')) {
+                            return;
+                        }
+                    }
                     // Skip elements inside code blocks (e.g. syntax highlighting spans)
                     if (el.closest('pre, code')) {
                         return;
@@ -688,16 +802,12 @@ class Defuddle {
             });
         }
         if (removePartial) {
-            // Pre-compile regexes and combine into a single regex for better performance
-            const combinedPattern = constants_1.PARTIAL_SELECTORS.join('|');
-            const partialRegex = new RegExp(combinedPattern, 'i');
-            // Pre-compile individual regexes for debug pattern identification
+            // Pre-compile individual regexes for debug pattern identification only
             const individualRegexes = this.debug
                 ? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') }))
                 : null;
-            // Create an efficient attribute selector for elements we care about
-            const attributeSelector = constants_1.TEST_ATTRIBUTES.map(attr => `[${attr}]`).join(',');
-            const allElements = doc.querySelectorAll(attributeSelector);
+            // Use pre-built attribute selector for elements we care about
+            const allElements = doc.querySelectorAll(constants_1.TEST_ATTRIBUTES_SELECTOR);
             // Process elements for partial matches
             allElements.forEach(el => {
                 // Skip if already marked for removal
@@ -707,13 +817,13 @@ class Defuddle {
                 // Skip code elements and elements containing code blocks
                 // where class names indicate language/syntax, not page structure
                 const tag = el.tagName;
-                if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre')) {
+                if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre') || el.closest('code, pre')) {
                     return;
                 }
                 // Get all relevant attributes and combine into a single string
                 const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
                     if (attr === 'class') {
-                        return el.className && typeof el.className === 'string' ? el.className : '';
+                        return (0, dom_1.getClassName)(el);
                     }
                     if (attr === 'id') {
                         return el.id || '';
@@ -725,7 +835,7 @@ class Defuddle {
                     return;
                 }
                 // Check for partial match using single regex test
-                if (partialRegex.test(attrs)) {
+                if (constants_1.PARTIAL_SELECTORS_REGEX.test(attrs)) {
                     const matchedPattern = individualRegexes
                         ? individualRegexes.find(r => r.regex.test(attrs))?.pattern
                         : undefined;
@@ -787,8 +897,8 @@ class Defuddle {
             const attrHeight = parseInt(element.getAttribute('height') || '0');
             // Check inline style dimensions
             const style = element.getAttribute('style') || '';
-            const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
-            const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
+            const styleWidth = parseInt(style.match(STYLE_WIDTH_PATTERN)?.[1] || '0');
+            const styleHeight = parseInt(style.match(STYLE_HEIGHT_PATTERN)?.[1] || '0');
             // Use getComputedStyle and getBoundingClientRect only in browser
             let computedWidth = 0, computedHeight = 0;
             if (isBrowser) {
@@ -856,7 +966,7 @@ class Defuddle {
                 return `srcset:${dataSrcset}`;
         }
         const id = element.id || '';
-        const className = element.className || '';
+        const className = (0, dom_1.getClassName)(element);
         const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
         if (id)
             return `id:${id}`;
@@ -912,7 +1022,7 @@ class Defuddle {
         let best = top;
         for (let i = 1; i < candidates.length; i++) {
             const child = candidates[i];
-            const childWords = (child.element.textContent || '').split(/\s+/).length;
+            const childWords = (0, utils_1.countWords)(child.element.textContent || '');
             if (child.selectorIndex < best.selectorIndex && best.element.contains(child.element) && childWords > 50) {
                 // Count how many candidates share this selector index inside
                 // the top element. Use top (not best) as the stable reference
@@ -972,8 +1082,8 @@ class Defuddle {
             if (current.id) {
                 selector += '#' + current.id;
             }
-            else if (current.className && typeof current.className === 'string') {
-                selector += '.' + current.className.trim().split(/\s+/).join('.');
+            else if ((0, dom_1.getClassName)(current)) {
+                selector += '.' + (0, dom_1.getClassName)(current).trim().split(/\s+/).join('.');
             }
             parts.unshift(selector);
             current = current.parentElement;
@@ -987,15 +1097,35 @@ class Defuddle {
      * Resolve relative URLs to absolute within a DOM element
      */
     resolveRelativeUrls(element) {
-        const baseUrl = this.options.url || this.doc.URL;
-        if (!baseUrl)
+        const docUrl = this.options.url || this.doc.URL;
+        if (!docUrl)
             return;
+        // Respect <base href> for relative URL resolution, matching browser behavior
+        let baseUrl = docUrl;
+        const baseEl = this.doc.querySelector('base[href]');
+        if (baseEl) {
+            const baseHref = baseEl.getAttribute('href');
+            if (baseHref) {
+                try {
+                    baseUrl = new URL(baseHref, docUrl).href;
+                }
+                catch {
+                    // Invalid base href, fall back to document URL
+                }
+            }
+        }
         const resolve = (url) => {
+            // Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
+            // Normalize these before URL resolution.
+            const normalized = url
+                .trim()
+                .replace(/^\\?["']+/, '')
+                .replace(/\\?["']+$/, '');
             try {
-                return new URL(url, baseUrl).href;
+                return new URL(normalized, baseUrl).href;
             }
             catch {
-                return url;
+                return normalized || url;
             }
         };
         element.querySelectorAll('[href]').forEach(el => {
@@ -1051,6 +1181,8 @@ class Defuddle {
      * Walks both trees in parallel so positional correspondence is exact.
      */
     flattenShadowRoots(original, clone) {
+        if (!original.body || !clone.body)
+            return;
         const origElements = Array.from(original.body.querySelectorAll('*'));
         // Find the first element with a shadow root (also serves as the hasShadowRoots check)
         const firstShadow = origElements.find(el => el.shadowRoot);
@@ -1268,7 +1400,7 @@ class Defuddle {
             author: extracted.variables?.author || metadata.author,
             site: extracted.variables?.site || metadata.site,
             schemaOrgData: metadata.schemaOrgData,
-            wordCount: this.countWords(extracted.contentHtml),
+            wordCount: this.countHtmlWords(extracted.contentHtml),
             parseTime: Math.round(Date.now() - startTime),
             extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
             metaTags: pageMetaTags,
@@ -1307,7 +1439,7 @@ class Defuddle {
             if (el.closest('pre') || el.closest('code'))
                 continue;
             const text = el.textContent?.trim() || '';
-            const words = text.split(/\s+/).length;
+            const words = (0, utils_1.countWords)(text);
             // Match date + read time in short elements
             if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
                 // Ensure this is a leaf-ish element, not a large container
@@ -1361,7 +1493,7 @@ class Defuddle {
                 break;
             }
             const text = target.textContent?.trim() || '';
-            const words = text.split(/\s+/).length;
+            const words = (0, utils_1.countWords)(text);
             if (words > 10)
                 continue;
             // Check if this element is near the start or end of mainContent
@@ -1378,12 +1510,78 @@ class Defuddle {
             }
             target.remove();
         }
+        // Remove blog post metadata lists near content boundaries.
+        // These are short <ul>/<ol> elements where every item is a brief
+        // label + value pair (date, reading time, share, etc.) with no
+        // prose sentences. Detected structurally: all items are very short,
+        // none contain sentence-ending punctuation, and the total text is minimal.
+        const metadataLists = mainContent.querySelectorAll('ul, ol');
+        for (const list of metadataLists) {
+            if (!list.parentNode)
+                continue;
+            const items = Array.from(list.children).filter(el => el.tagName === 'LI');
+            if (items.length < 2 || items.length > 8)
+                continue;
+            // Must be near the start or end of content
+            const listText = list.textContent?.trim() || '';
+            const listPos = contentText.indexOf(listText);
+            const distFromEnd = contentText.length - (listPos + listText.length);
+            if (listPos > 500 && distFromEnd > 500)
+                continue;
+            // Skip lists introduced by a preceding paragraph (e.g. "Features include:")
+            // — those are content lists, not standalone metadata
+            const prevSibling = list.previousElementSibling;
+            if (prevSibling) {
+                const prevText = prevSibling.textContent?.trim() || '';
+                if (prevText.endsWith(':'))
+                    continue;
+            }
+            // Every item must be very short (label + value) with no prose
+            let isMetadata = true;
+            for (const item of items) {
+                const text = item.textContent?.trim() || '';
+                const words = (0, utils_1.countWords)(text);
+                if (words > 8) {
+                    isMetadata = false;
+                    break;
+                }
+                // Prose has sentence-ending punctuation; metadata doesn't
+                if (/[.!?]$/.test(text)) {
+                    isMetadata = false;
+                    break;
+                }
+            }
+            if (!isMetadata)
+                continue;
+            // Total text should be very short — this is metadata, not content
+            if ((0, utils_1.countWords)(listText) > 30)
+                continue;
+            // Walk up to find the container to remove (e.g. a wrapper div)
+            let target = list;
+            while (target.parentElement && target.parentElement !== mainContent) {
+                const parentText = target.parentElement.textContent?.trim() || '';
+                if (parentText !== listText)
+                    break;
+                target = target.parentElement;
+            }
+            if (this.debug && debugRemovals) {
+                debugRemovals.push({
+                    step: 'removeByContentPattern',
+                    reason: 'blog metadata list',
+                    text: (0, utils_1.textPreview)(target)
+                });
+            }
+            target.remove();
+        }
         // Remove section breadcrumbs
         // Short elements containing a link to a parent section of the current URL.
         const url = this.options.url || this.doc.URL || '';
         let urlPath = '';
+        let pageHost = '';
         try {
-            urlPath = new URL(url).pathname;
+            const parsedUrl = new URL(url);
+            urlPath = parsedUrl.pathname;
+            pageHost = parsedUrl.hostname.replace(/^www\./, '');
         }
         catch { }
         if (urlPath) {
@@ -1392,7 +1590,7 @@ class Defuddle {
                 if (!el.parentNode)
                     continue;
                 const text = el.textContent?.trim() || '';
-                const words = text.split(/\s+/).length;
+                const words = (0, utils_1.countWords)(text);
                 if (words > 10)
                     continue;
                 // Must be a leaf-ish element (no block children)
@@ -1417,6 +1615,126 @@ class Defuddle {
                 catch { }
             }
         }
+        // Remove trailing external link lists — a heading + list of purely
+        // off-site links as the last content block (affiliate picks, product
+        // roundups, etc.). Only removed when nothing meaningful follows.
+        if (pageHost) {
+            const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6');
+            for (const heading of headings) {
+                if (!heading.parentNode)
+                    continue;
+                const list = heading.nextElementSibling;
+                if (!list || (list.tagName !== 'UL' && list.tagName !== 'OL'))
+                    continue;
+                const items = Array.from(list.children).filter(el => el.tagName === 'LI');
+                if (items.length < 2)
+                    continue;
+                // The list must be the last meaningful block — nothing after it
+                // except whitespace or empty elements. Walk up through ancestors
+                // to check siblings at each level up to mainContent.
+                let trailingContent = false;
+                let checkEl = list;
+                while (checkEl && checkEl !== mainContent) {
+                    let sibling = checkEl.nextElementSibling;
+                    while (sibling) {
+                        if ((sibling.textContent?.trim() || '').length > 0) {
+                            trailingContent = true;
+                            break;
+                        }
+                        sibling = sibling.nextElementSibling;
+                    }
+                    if (trailingContent)
+                        break;
+                    checkEl = checkEl.parentElement;
+                }
+                if (trailingContent)
+                    continue;
+                // Every list item must be primarily a link pointing off-site
+                let allExternalLinks = true;
+                for (const item of items) {
+                    const links = item.querySelectorAll('a[href]');
+                    if (links.length === 0) {
+                        allExternalLinks = false;
+                        break;
+                    }
+                    const itemText = item.textContent?.trim() || '';
+                    let linkTextLen = 0;
+                    for (const link of links) {
+                        linkTextLen += (link.textContent?.trim() || '').length;
+                        try {
+                            const linkHost = new URL(link.getAttribute('href') || '', url).hostname.replace(/^www\./, '');
+                            if (linkHost === pageHost) {
+                                allExternalLinks = false;
+                                break;
+                            }
+                        }
+                        catch { }
+                    }
+                    if (!allExternalLinks)
+                        break;
+                    if (linkTextLen < itemText.length * 0.6) {
+                        allExternalLinks = false;
+                        break;
+                    }
+                }
+                if (!allExternalLinks)
+                    continue;
+                if (this.debug && debugRemovals) {
+                    debugRemovals.push({
+                        step: 'removeByContentPattern',
+                        reason: 'trailing external link list',
+                        text: (0, utils_1.textPreview)(heading)
+                    });
+                    debugRemovals.push({
+                        step: 'removeByContentPattern',
+                        reason: 'trailing external link list',
+                        text: (0, utils_1.textPreview)(list)
+                    });
+                }
+                list.remove();
+                heading.remove();
+            }
+        }
+        // Remove trailing thin sections — the last few direct children of
+        // mainContent that contain a heading but very little prose. These are
+        // typically CTAs, newsletter prompts, or promotional sections that
+        // have been partially stripped by prior removal steps.
+        const totalWords = (0, utils_1.countWords)(mainContent.textContent || '');
+        if (totalWords > 300) {
+            // Walk backwards from the last direct child of mainContent,
+            // collecting trailing elements that are thin (empty or very short prose).
+            // Exclude SVG text (path data) from word counts — it's not prose.
+            const trailingEls = [];
+            let trailingWords = 0;
+            let child = mainContent.lastElementChild;
+            while (child) {
+                // Count prose words, excluding SVG path data which inflates word counts
+                let svgWords = 0;
+                for (const svg of child.querySelectorAll('svg')) {
+                    svgWords += (0, utils_1.countWords)(svg.textContent || '');
+                }
+                const words = (0, utils_1.countWords)(child.textContent?.trim() || '') - svgWords;
+                if (words > 25)
+                    break;
+                trailingWords += words;
+                trailingEls.push(child);
+                child = child.previousElementSibling;
+            }
+            // Must have a heading in the trailing elements and total < 15% of content.
+            // Skip if trailing elements contain content indicators (math, code, tables, images).
+            if (trailingEls.length >= 1 && trailingWords < totalWords * 0.15) {
+                const hasHeading = trailingEls.some(el => /^H[1-6]$/.test(el.tagName) || el.querySelector('h1, h2, h3, h4, h5, h6'));
+                const hasContent = trailingEls.some(el => el.querySelector(constants_1.CONTENT_ELEMENT_SELECTOR));
+                if (hasHeading && !hasContent) {
+                    for (const el of trailingEls) {
+                        if (this.debug && debugRemovals) {
+                            debugRemovals.push({ step: 'removeByContentPattern', reason: 'trailing thin section', text: (0, utils_1.textPreview)(el) });
+                        }
+                        el.remove();
+                    }
+                }
+            }
+        }
         // Remove boilerplate sentences and trailing non-content.
         // Search elements for end-of-article boilerplate, then truncate
         // from the best ancestor that has siblings to remove.
@@ -1426,7 +1744,7 @@ class Defuddle {
             if (!el.parentNode)
                 continue;
             const text = el.textContent?.trim() || '';
-            const words = text.split(/\s+/).length;
+            const words = (0, utils_1.countWords)(text);
             if (words > 50 || words < 3)
                 continue;
             for (const pattern of BOILERPLATE_PATTERNS) {