@j0hanz/superfetch 2.4.13 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -319,6 +319,8 @@ const detector = new LanguageDetector();
319
319
  * Detect programming language from code content using heuristics.
320
320
  */
321
321
  export function detectLanguageFromCode(code) {
322
+ if (!code || code.trim().length === 0)
323
+ return undefined;
322
324
  return detector.detect(code);
323
325
  }
324
326
  /**
@@ -398,7 +398,7 @@ function hasMarkdownSourceLine(content) {
398
398
  }
399
399
  return false;
400
400
  }
401
- function addSourceToMarkdownMarkdownFormat(content, url) {
401
+ function addSourceToMarkdownAsMarkdown(content, url) {
402
402
  if (hasMarkdownSourceLine(content))
403
403
  return content;
404
404
  const lineEnding = detectLineEnding(content);
@@ -423,7 +423,7 @@ function addSourceToMarkdownMarkdownFormat(content, url) {
423
423
  export function addSourceToMarkdown(content, url) {
424
424
  const fm = frontmatter.find(content);
425
425
  if (config.transform.metadataFormat === 'markdown' && !fm) {
426
- return addSourceToMarkdownMarkdownFormat(content, url);
426
+ return addSourceToMarkdownAsMarkdown(content, url);
427
427
  }
428
428
  if (!fm) {
429
429
  // Preserve existing behavior: always uses LF even if content uses CRLF.
package/dist/transform.js CHANGED
@@ -684,12 +684,12 @@ function tryTransformRawContent(params) {
684
684
  /* -------------------------------------------------------------------------------------------------
685
685
  * Quality gates + content source resolution
686
686
  * ------------------------------------------------------------------------------------------------- */
687
- const MIN_CONTENT_RATIO = 0.3;
687
+ const MIN_CONTENT_RATIO = 0.15;
688
688
  const MIN_HTML_LENGTH_FOR_GATE = 100;
689
- const MIN_HEADING_RETENTION_RATIO = 0.7;
690
- const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
689
+ const MIN_HEADING_RETENTION_RATIO = 0.3;
690
+ const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
691
691
  const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
692
- const MAX_TRUNCATED_LINE_RATIO = 0.5;
692
+ const MAX_TRUNCATED_LINE_RATIO = 0.95;
693
693
  function needsDocumentWrapper(html) {
694
694
  const trimmed = html.trim().toLowerCase();
695
695
  return (!trimmed.startsWith('<!doctype') &&
@@ -780,8 +780,8 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
780
780
  return metadata;
781
781
  }
782
782
  const CONTENT_ROOT_SELECTORS = [
783
- 'main',
784
783
  'article',
784
+ 'main',
785
785
  '[role="main"]',
786
786
  '#content',
787
787
  '#main-content',
@@ -808,10 +808,9 @@ function findContentRoot(document) {
808
808
  }
809
809
  return undefined;
810
810
  }
811
- function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
811
+ function shouldUseArticleContent(article, originalHtmlOrDocument) {
812
812
  const articleLength = article.textContent.length;
813
813
  const originalLength = getVisibleTextLength(originalHtmlOrDocument);
814
- const safeUrl = url.substring(0, 80);
815
814
  let articleDocument = null;
816
815
  const getArticleDocument = () => {
817
816
  if (articleDocument)
@@ -821,69 +820,45 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
821
820
  };
822
821
  if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
823
822
  const ratio = articleLength / originalLength;
824
- if (ratio < MIN_CONTENT_RATIO) {
825
- logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
826
- url: safeUrl,
827
- articleLength,
828
- });
823
+ if (ratio < MIN_CONTENT_RATIO)
829
824
  return false;
830
- }
831
825
  }
832
826
  const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
833
827
  if (originalHeadings > 0) {
834
828
  const articleHeadings = countHeadingsDom(getArticleDocument());
835
829
  const retentionRatio = articleHeadings / originalHeadings;
836
- if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
837
- logDebug('Quality gate: Readability broke heading structure, using full HTML', {
838
- url: safeUrl,
839
- originalHeadings,
840
- articleHeadings,
841
- });
830
+ if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
842
831
  return false;
843
- }
844
832
  }
845
833
  const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
846
834
  if (originalCodeBlocks > 0) {
847
835
  const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
848
836
  const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
849
- logDebug('Code block retention check', {
850
- url: safeUrl,
851
- originalCodeBlocks,
852
- articleCodeBlocks,
853
- codeRetentionRatio,
854
- });
855
- if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
856
- logDebug('Quality gate: Readability removed code blocks, using full HTML', {
857
- url: safeUrl,
858
- originalCodeBlocks,
859
- articleCodeBlocks,
860
- });
837
+ if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
861
838
  return false;
862
- }
863
- }
864
- if (hasTruncatedSentences(article.textContent)) {
865
- logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', {
866
- url: safeUrl,
867
- });
868
- return false;
869
839
  }
870
- return true;
840
+ return !hasTruncatedSentences(article.textContent);
871
841
  }
872
842
  function buildContentSource(params) {
873
843
  const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, } = params;
874
844
  const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
875
845
  if (useArticleContent && article) {
876
- return { sourceHtml: article.content, title: article.title, metadata };
846
+ // Apply noise removal to Readability-extracted content to remove
847
+ // author bylines, social share buttons, and other boilerplate
848
+ // that Readability may have included in the article content
849
+ const cleanedArticleHtml = removeNoiseFromHtml(article.content, undefined, url);
850
+ return {
851
+ sourceHtml: cleanedArticleHtml,
852
+ title: article.title,
853
+ metadata,
854
+ skipNoiseRemoval: true, // Already cleaned
855
+ };
877
856
  }
878
857
  if (document) {
879
- removeNoiseFromHtml(html, document, url);
880
- const cleanedDoc = document;
858
+ const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
859
+ const { document: cleanedDoc } = parseHTML(cleanedHtml);
881
860
  const contentRoot = findContentRoot(cleanedDoc);
882
861
  if (contentRoot) {
883
- logDebug('Using content root fallback instead of full HTML', {
884
- url: url.substring(0, 80),
885
- contentLength: contentRoot.length,
886
- });
887
862
  return {
888
863
  sourceHtml: contentRoot,
889
864
  title: extractedMeta.title,
@@ -905,7 +880,7 @@ function resolveContentSource(params) {
905
880
  ...(params.signal ? { signal: params.signal } : {}),
906
881
  });
907
882
  const useArticleContent = article
908
- ? shouldUseArticleContent(article, document, params.url)
883
+ ? shouldUseArticleContent(article, document)
909
884
  : false;
910
885
  return buildContentSource({
911
886
  html: params.html,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/superfetch",
3
- "version": "2.4.13",
3
+ "version": "2.5.0",
4
4
  "mcpName": "io.github.j0hanz/superfetch",
5
5
  "description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
6
6
  "type": "module",