npm - defuddle - Versions diffs - 0.13.0 → 0.14.0 - Mend

defuddle 0.13.0 → 0.14.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (54)

package/README.md +33 -0
package/dist/cli.js +47 -10
package/dist/cli.js.map +1 -1
package/dist/constants.d.ts +2 -0
package/dist/constants.js +29 -2
package/dist/constants.js.map +1 -1
package/dist/defuddle.d.ts +6 -0
package/dist/defuddle.js +287 -40
package/dist/defuddle.js.map +1 -1
package/dist/elements/callouts.d.ts +6 -0
package/dist/elements/callouts.js +74 -0
package/dist/elements/callouts.js.map +1 -0
package/dist/elements/headings.d.ts +6 -0
package/dist/elements/headings.js +13 -0
package/dist/elements/headings.js.map +1 -1
package/dist/elements/images.js +10 -1
package/dist/elements/images.js.map +1 -1
package/dist/elements/math.base.js +1 -4
package/dist/elements/math.base.js.map +1 -1
package/dist/extractor-registry.d.ts +5 -5
package/dist/extractor-registry.js +8 -8
package/dist/extractor-registry.js.map +1 -1
package/dist/extractors/_base.d.ts +6 -1
package/dist/extractors/_base.js +2 -1
package/dist/extractors/_base.js.map +1 -1
package/dist/extractors/github.js +3 -3
package/dist/extractors/github.js.map +1 -1
package/dist/extractors/hackernews.js +1 -1
package/dist/extractors/hackernews.js.map +1 -1
package/dist/extractors/reddit.js +7 -4
package/dist/extractors/reddit.js.map +1 -1
package/dist/extractors/twitter.js +3 -1
package/dist/extractors/twitter.js.map +1 -1
package/dist/extractors/youtube.d.ts +13 -0
package/dist/extractors/youtube.js +140 -20
package/dist/extractors/youtube.js.map +1 -1
package/dist/fetch.d.ts +13 -0
package/dist/fetch.js +181 -0
package/dist/fetch.js.map +1 -0
package/dist/index.full.js +1 -1
package/dist/index.js +1 -1
package/dist/markdown.js +76 -33
package/dist/markdown.js.map +1 -1
package/dist/metadata.js +1 -1
package/dist/metadata.js.map +1 -1
package/dist/scoring.js +11 -6
package/dist/scoring.js.map +1 -1
package/dist/standardize.js +24 -57
package/dist/standardize.js.map +1 -1
package/dist/types.d.ts +14 -0
package/dist/utils/dom.d.ts +5 -0
package/dist/utils/dom.js +8 -0
package/dist/utils/dom.js.map +1 -1
package/package.json +1 -1

package/dist/defuddle.js CHANGED Viewed

@@ -2,16 +2,20 @@
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.Defuddle = void 0;
 const metadata_1 = require("./metadata");
+const headings_1 = require("./elements/headings");
 const extractor_registry_1 = require("./extractor-registry");
 const constants_1 = require("./constants");
 const standardize_1 = require("./standardize");
 const footnotes_1 = require("./elements/footnotes");
+const callouts_1 = require("./elements/callouts");
 const scoring_1 = require("./scoring");
 const utils_1 = require("./utils");
 const dom_1 = require("./utils/dom");
 /** Keys from extractor variables that map to top-level DefuddleResponse fields */
 const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
 // Content pattern detection constants
+const STYLE_WIDTH_PATTERN = /width\s*:\s*(\d+)/;
+const STYLE_HEIGHT_PATTERN = /height\s*:\s*(\d+)/;
 const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
 const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
 const BOILERPLATE_PATTERNS = [
@@ -143,17 +147,30 @@ class Defuddle {
     /**
      * Extract text content from schema.org data (e.g. SocialMediaPosting, Article)
      */
-    _getSchemaText(schemaOrgData) {
-        if (!schemaOrgData)
+    _getSchemaText(schemaOrgData, depth = 0) {
+        if (!schemaOrgData || depth > 10)
             return '';
         const items = Array.isArray(schemaOrgData) ? schemaOrgData : [schemaOrgData];
         for (const item of items) {
+            // Recurse into nested arrays
+            if (Array.isArray(item)) {
+                const found = this._getSchemaText(item, depth + 1);
+                if (found)
+                    return found;
+                continue;
+            }
             if (item?.text && typeof item.text === 'string') {
                 return item.text;
             }
             if (item?.articleBody && typeof item.articleBody === 'string') {
                 return item.articleBody;
             }
+            // Traverse @graph arrays (common in JSON-LD with multiple entities)
+            if (item?.['@graph'] && Array.isArray(item['@graph'])) {
+                const found = this._getSchemaText(item['@graph'], depth + 1);
+                if (found)
+                    return found;
+            }
         }
         return '';
     }
@@ -194,39 +211,43 @@ class Defuddle {
         }
     }
     /**
-     * Find a DOM element whose text matches the schema.org text content.
-     * Used when the content scorer picked the wrong element from a feed page.
-     * Returns the element's inner HTML including sibling media (images, etc.)
+     * Find the smallest DOM element whose text contains the search phrase
+     * and whose word count is at least 80% of the expected count.
+     * Shared by _findSchemaContentElement and _findContentBySchemaText.
      */
-    _findContentBySchemaText(schemaText) {
-        const body = this.doc.body;
-        if (!body)
-            return '';
-        // Use the first paragraph as the search phrase.
-        // DOM textContent concatenates <p> elements without separators,
-        // so we can't cross paragraph boundaries when matching.
+    _findElementBySchemaText(root, schemaText) {
         const firstPara = schemaText.split(/\n\s*\n/)[0]?.trim() || '';
         const searchPhrase = firstPara.substring(0, 100).trim();
         if (!searchPhrase)
-            return '';
-        const schemaWordCount = this.countHtmlWords(schemaText);
-        // Find the smallest element whose text contains the search phrase
-        // and whose word count is close to the schema text's word count
+            return null;
+        const schemaWordCount = (0, utils_1.countWords)(schemaText);
         let bestMatch = null;
         let bestSize = Infinity;
-        const allElements = body.querySelectorAll('*');
+        const allElements = root.querySelectorAll('*');
         for (const el of allElements) {
-            const elText = (el.textContent || '');
+            if (el === root)
+                continue;
+            const elText = el.textContent || '';
             if (!elText.includes(searchPhrase))
                 continue;
             const elWords = (0, utils_1.countWords)(elText);
-            // Element should contain roughly the same amount of text
-            // (allow some slack for surrounding whitespace / minor extras)
             if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
                 bestSize = elWords;
                 bestMatch = el;
             }
         }
+        return bestMatch;
+    }
+    /**
+     * Find a DOM element whose text matches the schema.org text content.
+     * Used when the content scorer picked the wrong element from a feed page.
+     * Returns the element's inner HTML including sibling media (images, etc.)
+     */
+    _findContentBySchemaText(schemaText) {
+        const body = this.doc.body;
+        if (!body)
+            return '';
+        const bestMatch = this._findElementBySchemaText(body, schemaText);
         if (!bestMatch)
             return '';
         // Read the largest sibling image src BEFORE resolveRelativeUrls
@@ -260,6 +281,8 @@ class Defuddle {
                 catch { }
             }
         }
+        // Remove heading anchor links before serialization (e.g. <h2>Title<a href="#foo">#</a></h2>)
+        (0, headings_1.removeHeadingAnchors)(bestMatch);
         // Now resolve URLs in the text content
         this.resolveRelativeUrls(bestMatch);
         let html = (0, dom_1.serializeHTML)(bestMatch);
@@ -353,7 +376,8 @@ class Defuddle {
         try {
             const url = this.options.url || this.doc.URL;
             const schemaOrgData = this.getSchemaOrgData();
-            const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
+            const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
+            const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData, extractorOpts);
             if (extractor) {
                 const extracted = await extractor.extractAsync();
                 return this.getExtractorVariables(extracted.variables) || null;
@@ -368,7 +392,8 @@ class Defuddle {
         try {
             const url = this.options.url || this.doc.URL;
             const schemaOrgData = this.getSchemaOrgData();
-            const extractor = finder(this.doc, url, schemaOrgData);
+            const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
+            const extractor = finder(this.doc, url, schemaOrgData, extractorOpts);
             if (extractor) {
                 const startTime = Date.now();
                 const extracted = await extractor.extractAsync();
@@ -414,6 +439,7 @@ class Defuddle {
             removeSmallImages: true,
             removeContentPatterns: true,
             standardize: true,
+            includeReplies: 'extractors',
             ...this.options,
             ...overrideOptions
         };
@@ -435,7 +461,11 @@ class Defuddle {
         try {
             // Use site-specific extractor first, if there is one
             const url = options.url || this.doc.URL;
-            const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
+            const extractorOpts = {
+                includeReplies: options.includeReplies,
+                language: options.language,
+            };
+            const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData, extractorOpts);
             if (extractor && extractor.canExtract()) {
                 const extracted = extractor.extract();
                 return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
@@ -471,6 +501,18 @@ class Defuddle {
             if (!mainContent) {
                 mainContent = this.findMainContent(clone);
             }
+            // If we fell back to <body>, try using schema.org articleBody/text
+            // to find a more specific content element within the DOM.
+            if (mainContent && mainContent.tagName.toLowerCase() === 'body') {
+                const schemaText = this._getSchemaText(schemaOrgData);
+                if (schemaText) {
+                    const schemaContent = this._findElementBySchemaText(clone.body, schemaText);
+                    if (schemaContent) {
+                        this._log('Found content element via schema.org text');
+                        mainContent = schemaContent;
+                    }
+                }
+            }
             if (!mainContent) {
                 const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
                 const endTime = Date.now();
@@ -482,9 +524,13 @@ class Defuddle {
                     metaTags: pageMetaTags
                 };
             }
+            // Remove <wbr> elements — word break opportunity hints that carry no
+            // content but cause unwanted whitespace during standardization.
+            mainContent.querySelectorAll('wbr').forEach(el => el.remove());
             // Standardize footnotes before cleanup (CSS sidenotes use display:none)
             if (options.standardize) {
                 (0, footnotes_1.standardizeFootnotes)(mainContent);
+                (0, callouts_1.standardizeCallouts)(mainContent);
             }
             // Remove small images
             if (options.removeSmallImages) {
@@ -494,15 +540,17 @@ class Defuddle {
             if (options.removeHiddenElements) {
                 this.removeHiddenElements(clone, debugRemovals);
             }
-            // Remove non-content blocks by scoring
-            // Tries to find lists, navigation based on text content and link density
-            if (options.removeLowScoring) {
-                scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
-            }
-            // Remove clutter using selectors
+            // Remove clutter using selectors — deterministic removal of known
+            // non-content elements (nav, footer, .sidebar, etc.) by class/id.
+            // Runs before scoring so the heuristic scorer sees a cleaner DOM.
             if (options.removeExactSelectors || options.removePartialSelectors) {
                 this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
             }
+            // Remove non-content blocks by scoring — heuristic removal based
+            // on link density, text ratios, and navigation indicators.
+            if (options.removeLowScoring) {
+                scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
+            }
             // Remove elements by content patterns (read time, boilerplate, article cards)
             if (options.removeContentPatterns && mainContent) {
                 this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
@@ -556,7 +604,6 @@ class Defuddle {
             .replace(/&\w+;/g, ' ');
         return (0, utils_1.countWords)(text);
     }
-    // Make all other methods private by removing the static keyword and using private
     _log(...args) {
         if (this.debug) {
             console.log('Defuddle:', ...args);
@@ -770,13 +817,13 @@ class Defuddle {
                 // Skip code elements and elements containing code blocks
                 // where class names indicate language/syntax, not page structure
                 const tag = el.tagName;
-                if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre')) {
+                if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre') || el.closest('code, pre')) {
                     return;
                 }
                 // Get all relevant attributes and combine into a single string
                 const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
                     if (attr === 'class') {
-                        return el.className && typeof el.className === 'string' ? el.className : '';
+                        return (0, dom_1.getClassName)(el);
                     }
                     if (attr === 'id') {
                         return el.id || '';
@@ -850,8 +897,8 @@ class Defuddle {
             const attrHeight = parseInt(element.getAttribute('height') || '0');
             // Check inline style dimensions
             const style = element.getAttribute('style') || '';
-            const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
-            const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
+            const styleWidth = parseInt(style.match(STYLE_WIDTH_PATTERN)?.[1] || '0');
+            const styleHeight = parseInt(style.match(STYLE_HEIGHT_PATTERN)?.[1] || '0');
             // Use getComputedStyle and getBoundingClientRect only in browser
             let computedWidth = 0, computedHeight = 0;
             if (isBrowser) {
@@ -919,7 +966,7 @@ class Defuddle {
                 return `srcset:${dataSrcset}`;
         }
         const id = element.id || '';
-        const className = element.className || '';
+        const className = (0, dom_1.getClassName)(element);
         const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
         if (id)
             return `id:${id}`;
@@ -1035,8 +1082,8 @@ class Defuddle {
             if (current.id) {
                 selector += '#' + current.id;
             }
-            else if (current.className && typeof current.className === 'string') {
-                selector += '.' + current.className.trim().split(/\s+/).join('.');
+            else if ((0, dom_1.getClassName)(current)) {
+                selector += '.' + (0, dom_1.getClassName)(current).trim().split(/\s+/).join('.');
             }
             parts.unshift(selector);
             current = current.parentElement;
@@ -1050,9 +1097,23 @@ class Defuddle {
      * Resolve relative URLs to absolute within a DOM element
      */
     resolveRelativeUrls(element) {
-        const baseUrl = this.options.url || this.doc.URL;
-        if (!baseUrl)
+        const docUrl = this.options.url || this.doc.URL;
+        if (!docUrl)
             return;
+        // Respect <base href> for relative URL resolution, matching browser behavior
+        let baseUrl = docUrl;
+        const baseEl = this.doc.querySelector('base[href]');
+        if (baseEl) {
+            const baseHref = baseEl.getAttribute('href');
+            if (baseHref) {
+                try {
+                    baseUrl = new URL(baseHref, docUrl).href;
+                }
+                catch {
+                    // Invalid base href, fall back to document URL
+                }
+            }
+        }
         const resolve = (url) => {
             // Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
             // Normalize these before URL resolution.
@@ -1449,12 +1510,78 @@ class Defuddle {
             }
             target.remove();
         }
+        // Remove blog post metadata lists near content boundaries.
+        // These are short <ul>/<ol> elements where every item is a brief
+        // label + value pair (date, reading time, share, etc.) with no
+        // prose sentences. Detected structurally: all items are very short,
+        // none contain sentence-ending punctuation, and the total text is minimal.
+        const metadataLists = mainContent.querySelectorAll('ul, ol');
+        for (const list of metadataLists) {
+            if (!list.parentNode)
+                continue;
+            const items = Array.from(list.children).filter(el => el.tagName === 'LI');
+            if (items.length < 2 || items.length > 8)
+                continue;
+            // Must be near the start or end of content
+            const listText = list.textContent?.trim() || '';
+            const listPos = contentText.indexOf(listText);
+            const distFromEnd = contentText.length - (listPos + listText.length);
+            if (listPos > 500 && distFromEnd > 500)
+                continue;
+            // Skip lists introduced by a preceding paragraph (e.g. "Features include:")
+            // — those are content lists, not standalone metadata
+            const prevSibling = list.previousElementSibling;
+            if (prevSibling) {
+                const prevText = prevSibling.textContent?.trim() || '';
+                if (prevText.endsWith(':'))
+                    continue;
+            }
+            // Every item must be very short (label + value) with no prose
+            let isMetadata = true;
+            for (const item of items) {
+                const text = item.textContent?.trim() || '';
+                const words = (0, utils_1.countWords)(text);
+                if (words > 8) {
+                    isMetadata = false;
+                    break;
+                }
+                // Prose has sentence-ending punctuation; metadata doesn't
+                if (/[.!?]$/.test(text)) {
+                    isMetadata = false;
+                    break;
+                }
+            }
+            if (!isMetadata)
+                continue;
+            // Total text should be very short — this is metadata, not content
+            if ((0, utils_1.countWords)(listText) > 30)
+                continue;
+            // Walk up to find the container to remove (e.g. a wrapper div)
+            let target = list;
+            while (target.parentElement && target.parentElement !== mainContent) {
+                const parentText = target.parentElement.textContent?.trim() || '';
+                if (parentText !== listText)
+                    break;
+                target = target.parentElement;
+            }
+            if (this.debug && debugRemovals) {
+                debugRemovals.push({
+                    step: 'removeByContentPattern',
+                    reason: 'blog metadata list',
+                    text: (0, utils_1.textPreview)(target)
+                });
+            }
+            target.remove();
+        }
         // Remove section breadcrumbs
         // Short elements containing a link to a parent section of the current URL.
         const url = this.options.url || this.doc.URL || '';
         let urlPath = '';
+        let pageHost = '';
         try {
-            urlPath = new URL(url).pathname;
+            const parsedUrl = new URL(url);
+            urlPath = parsedUrl.pathname;
+            pageHost = parsedUrl.hostname.replace(/^www\./, '');
         }
         catch { }
         if (urlPath) {
@@ -1488,6 +1615,126 @@ class Defuddle {
                 catch { }
             }
         }
+        // Remove trailing external link lists — a heading + list of purely
+        // off-site links as the last content block (affiliate picks, product
+        // roundups, etc.). Only removed when nothing meaningful follows.
+        if (pageHost) {
+            const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6');
+            for (const heading of headings) {
+                if (!heading.parentNode)
+                    continue;
+                const list = heading.nextElementSibling;
+                if (!list || (list.tagName !== 'UL' && list.tagName !== 'OL'))
+                    continue;
+                const items = Array.from(list.children).filter(el => el.tagName === 'LI');
+                if (items.length < 2)
+                    continue;
+                // The list must be the last meaningful block — nothing after it
+                // except whitespace or empty elements. Walk up through ancestors
+                // to check siblings at each level up to mainContent.
+                let trailingContent = false;
+                let checkEl = list;
+                while (checkEl && checkEl !== mainContent) {
+                    let sibling = checkEl.nextElementSibling;
+                    while (sibling) {
+                        if ((sibling.textContent?.trim() || '').length > 0) {
+                            trailingContent = true;
+                            break;
+                        }
+                        sibling = sibling.nextElementSibling;
+                    }
+                    if (trailingContent)
+                        break;
+                    checkEl = checkEl.parentElement;
+                }
+                if (trailingContent)
+                    continue;
+                // Every list item must be primarily a link pointing off-site
+                let allExternalLinks = true;
+                for (const item of items) {
+                    const links = item.querySelectorAll('a[href]');
+                    if (links.length === 0) {
+                        allExternalLinks = false;
+                        break;
+                    }
+                    const itemText = item.textContent?.trim() || '';
+                    let linkTextLen = 0;
+                    for (const link of links) {
+                        linkTextLen += (link.textContent?.trim() || '').length;
+                        try {
+                            const linkHost = new URL(link.getAttribute('href') || '', url).hostname.replace(/^www\./, '');
+                            if (linkHost === pageHost) {
+                                allExternalLinks = false;
+                                break;
+                            }
+                        }
+                        catch { }
+                    }
+                    if (!allExternalLinks)
+                        break;
+                    if (linkTextLen < itemText.length * 0.6) {
+                        allExternalLinks = false;
+                        break;
+                    }
+                }
+                if (!allExternalLinks)
+                    continue;
+                if (this.debug && debugRemovals) {
+                    debugRemovals.push({
+                        step: 'removeByContentPattern',
+                        reason: 'trailing external link list',
+                        text: (0, utils_1.textPreview)(heading)
+                    });
+                    debugRemovals.push({
+                        step: 'removeByContentPattern',
+                        reason: 'trailing external link list',
+                        text: (0, utils_1.textPreview)(list)
+                    });
+                }
+                list.remove();
+                heading.remove();
+            }
+        }
+        // Remove trailing thin sections — the last few direct children of
+        // mainContent that contain a heading but very little prose. These are
+        // typically CTAs, newsletter prompts, or promotional sections that
+        // have been partially stripped by prior removal steps.
+        const totalWords = (0, utils_1.countWords)(mainContent.textContent || '');
+        if (totalWords > 300) {
+            // Walk backwards from the last direct child of mainContent,
+            // collecting trailing elements that are thin (empty or very short prose).
+            // Exclude SVG text (path data) from word counts — it's not prose.
+            const trailingEls = [];
+            let trailingWords = 0;
+            let child = mainContent.lastElementChild;
+            while (child) {
+                // Count prose words, excluding SVG path data which inflates word counts
+                let svgWords = 0;
+                for (const svg of child.querySelectorAll('svg')) {
+                    svgWords += (0, utils_1.countWords)(svg.textContent || '');
+                }
+                const words = (0, utils_1.countWords)(child.textContent?.trim() || '') - svgWords;
+                if (words > 25)
+                    break;
+                trailingWords += words;
+                trailingEls.push(child);
+                child = child.previousElementSibling;
+            }
+            // Must have a heading in the trailing elements and total < 15% of content.
+            // Skip if trailing elements contain content indicators (math, code, tables, images).
+            if (trailingEls.length >= 1 && trailingWords < totalWords * 0.15) {
+                const hasHeading = trailingEls.some(el => /^H[1-6]$/.test(el.tagName) || el.querySelector('h1, h2, h3, h4, h5, h6'));
+                const hasContent = trailingEls.some(el => el.querySelector(constants_1.CONTENT_ELEMENT_SELECTOR));
+                if (hasHeading && !hasContent) {
+                    for (const el of trailingEls) {
+                        if (this.debug && debugRemovals) {
+                            debugRemovals.push({ step: 'removeByContentPattern', reason: 'trailing thin section', text: (0, utils_1.textPreview)(el) });
+                        }
+                        el.remove();
+                    }
+                }
+            }
+        }
         // Remove boilerplate sentences and trailing non-content.
         // Search elements for end-of-article boilerplate, then truncate
         // from the best ancestor that has siblings to remove.