npm - @j0hanz/superfetch - Versions diffs - 2.4.13 → 2.5.0 - Mend

@j0hanz/superfetch 2.4.13 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/dist/language-detection.js +2 -0
package/dist/markdown-cleanup.js +2 -2
package/dist/transform.js +23 -48
package/package.json +1 -1

package/dist/language-detection.js CHANGED Viewed

@@ -319,6 +319,8 @@ const detector = new LanguageDetector();
  * Detect programming language from code content using heuristics.
  */
 export function detectLanguageFromCode(code) {
+    if (!code || code.trim().length === 0)
+        return undefined;
     return detector.detect(code);
 }
 /**

package/dist/markdown-cleanup.js CHANGED Viewed

@@ -398,7 +398,7 @@ function hasMarkdownSourceLine(content) {
     }
     return false;
 }
-function addSourceToMarkdownMarkdownFormat(content, url) {
+function addSourceToMarkdownAsMarkdown(content, url) {
     if (hasMarkdownSourceLine(content))
         return content;
     const lineEnding = detectLineEnding(content);
@@ -423,7 +423,7 @@ function addSourceToMarkdownMarkdownFormat(content, url) {
 export function addSourceToMarkdown(content, url) {
     const fm = frontmatter.find(content);
     if (config.transform.metadataFormat === 'markdown' && !fm) {
-        return addSourceToMarkdownMarkdownFormat(content, url);
+        return addSourceToMarkdownAsMarkdown(content, url);
     }
     if (!fm) {
         // Preserve existing behavior: always uses LF even if content uses CRLF.

package/dist/transform.js CHANGED Viewed

@@ -684,12 +684,12 @@ function tryTransformRawContent(params) {
 /* -------------------------------------------------------------------------------------------------
  * Quality gates + content source resolution
  * ------------------------------------------------------------------------------------------------- */
-const MIN_CONTENT_RATIO = 0.3;
+const MIN_CONTENT_RATIO = 0.15;
 const MIN_HTML_LENGTH_FOR_GATE = 100;
-const MIN_HEADING_RETENTION_RATIO = 0.7;
-const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
+const MIN_HEADING_RETENTION_RATIO = 0.3;
+const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
 const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
-const MAX_TRUNCATED_LINE_RATIO = 0.5;
+const MAX_TRUNCATED_LINE_RATIO = 0.95;
 function needsDocumentWrapper(html) {
     const trimmed = html.trim().toLowerCase();
     return (!trimmed.startsWith('<!doctype') &&
@@ -780,8 +780,8 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
     return metadata;
 }
 const CONTENT_ROOT_SELECTORS = [
-    'main',
     'article',
+    'main',
     '[role="main"]',
     '#content',
     '#main-content',
@@ -808,10 +808,9 @@ function findContentRoot(document) {
     }
     return undefined;
 }
-function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
+function shouldUseArticleContent(article, originalHtmlOrDocument) {
     const articleLength = article.textContent.length;
     const originalLength = getVisibleTextLength(originalHtmlOrDocument);
-    const safeUrl = url.substring(0, 80);
     let articleDocument = null;
     const getArticleDocument = () => {
         if (articleDocument)
@@ -821,69 +820,45 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
     };
     if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
         const ratio = articleLength / originalLength;
-        if (ratio < MIN_CONTENT_RATIO) {
-            logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
-                url: safeUrl,
-                articleLength,
-            });
+        if (ratio < MIN_CONTENT_RATIO)
             return false;
-        }
     }
     const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
     if (originalHeadings > 0) {
         const articleHeadings = countHeadingsDom(getArticleDocument());
         const retentionRatio = articleHeadings / originalHeadings;
-        if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
-            logDebug('Quality gate: Readability broke heading structure, using full HTML', {
-                url: safeUrl,
-                originalHeadings,
-                articleHeadings,
-            });
+        if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
             return false;
-        }
     }
     const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
     if (originalCodeBlocks > 0) {
         const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
         const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
-        logDebug('Code block retention check', {
-            url: safeUrl,
-            originalCodeBlocks,
-            articleCodeBlocks,
-            codeRetentionRatio,
-        });
-        if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
-            logDebug('Quality gate: Readability removed code blocks, using full HTML', {
-                url: safeUrl,
-                originalCodeBlocks,
-                articleCodeBlocks,
-            });
+        if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
             return false;
-        }
-    }
-    if (hasTruncatedSentences(article.textContent)) {
-        logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', {
-            url: safeUrl,
-        });
-        return false;
     }
-    return true;
+    return !hasTruncatedSentences(article.textContent);
 }
 function buildContentSource(params) {
     const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, } = params;
     const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
     if (useArticleContent && article) {
-        return { sourceHtml: article.content, title: article.title, metadata };
+        // Apply noise removal to Readability-extracted content to remove
+        // author bylines, social share buttons, and other boilerplate
+        // that Readability may have included in the article content
+        const cleanedArticleHtml = removeNoiseFromHtml(article.content, undefined, url);
+        return {
+            sourceHtml: cleanedArticleHtml,
+            title: article.title,
+            metadata,
+            skipNoiseRemoval: true, // Already cleaned
+        };
     }
     if (document) {
-        removeNoiseFromHtml(html, document, url);
-        const cleanedDoc = document;
+        const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
+        const { document: cleanedDoc } = parseHTML(cleanedHtml);
         const contentRoot = findContentRoot(cleanedDoc);
         if (contentRoot) {
-            logDebug('Using content root fallback instead of full HTML', {
-                url: url.substring(0, 80),
-                contentLength: contentRoot.length,
-            });
             return {
                 sourceHtml: contentRoot,
                 title: extractedMeta.title,
@@ -905,7 +880,7 @@ function resolveContentSource(params) {
         ...(params.signal ? { signal: params.signal } : {}),
     });
     const useArticleContent = article
-        ? shouldUseArticleContent(article, document, params.url)
+        ? shouldUseArticleContent(article, document)
         : false;
     return buildContentSource({
         html: params.html,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@j0hanz/superfetch",
-  "version": "2.4.13",
+  "version": "2.5.0",
   "mcpName": "io.github.j0hanz/superfetch",
   "description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
   "type": "module",