@j0hanz/superfetch 2.4.13 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/language-detection.js +2 -0
- package/dist/markdown-cleanup.js +2 -2
- package/dist/transform.js +23 -48
- package/package.json +1 -1
|
@@ -319,6 +319,8 @@ const detector = new LanguageDetector();
|
|
|
319
319
|
* Detect programming language from code content using heuristics.
|
|
320
320
|
*/
|
|
321
321
|
export function detectLanguageFromCode(code) {
|
|
322
|
+
if (!code || code.trim().length === 0)
|
|
323
|
+
return undefined;
|
|
322
324
|
return detector.detect(code);
|
|
323
325
|
}
|
|
324
326
|
/**
|
package/dist/markdown-cleanup.js
CHANGED
|
@@ -398,7 +398,7 @@ function hasMarkdownSourceLine(content) {
|
|
|
398
398
|
}
|
|
399
399
|
return false;
|
|
400
400
|
}
|
|
401
|
-
function addSourceToMarkdownMarkdownFormat(content, url) {
|
|
401
|
+
function addSourceToMarkdownAsMarkdown(content, url) {
|
|
402
402
|
if (hasMarkdownSourceLine(content))
|
|
403
403
|
return content;
|
|
404
404
|
const lineEnding = detectLineEnding(content);
|
|
@@ -423,7 +423,7 @@ function addSourceToMarkdownMarkdownFormat(content, url) {
|
|
|
423
423
|
export function addSourceToMarkdown(content, url) {
|
|
424
424
|
const fm = frontmatter.find(content);
|
|
425
425
|
if (config.transform.metadataFormat === 'markdown' && !fm) {
|
|
426
|
-
return addSourceToMarkdownMarkdownFormat(content, url);
|
|
426
|
+
return addSourceToMarkdownAsMarkdown(content, url);
|
|
427
427
|
}
|
|
428
428
|
if (!fm) {
|
|
429
429
|
// Preserve existing behavior: always uses LF even if content uses CRLF.
|
package/dist/transform.js
CHANGED
|
@@ -684,12 +684,12 @@ function tryTransformRawContent(params) {
|
|
|
684
684
|
/* -------------------------------------------------------------------------------------------------
|
|
685
685
|
* Quality gates + content source resolution
|
|
686
686
|
* ------------------------------------------------------------------------------------------------- */
|
|
687
|
-
const MIN_CONTENT_RATIO = 0.
|
|
687
|
+
const MIN_CONTENT_RATIO = 0.15;
|
|
688
688
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
689
|
-
const MIN_HEADING_RETENTION_RATIO = 0.
|
|
690
|
-
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.
|
|
689
|
+
const MIN_HEADING_RETENTION_RATIO = 0.3;
|
|
690
|
+
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
|
|
691
691
|
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
|
|
692
|
-
const MAX_TRUNCATED_LINE_RATIO = 0.
|
|
692
|
+
const MAX_TRUNCATED_LINE_RATIO = 0.95;
|
|
693
693
|
function needsDocumentWrapper(html) {
|
|
694
694
|
const trimmed = html.trim().toLowerCase();
|
|
695
695
|
return (!trimmed.startsWith('<!doctype') &&
|
|
@@ -780,8 +780,8 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
780
780
|
return metadata;
|
|
781
781
|
}
|
|
782
782
|
const CONTENT_ROOT_SELECTORS = [
|
|
783
|
-
'main',
|
|
784
783
|
'article',
|
|
784
|
+
'main',
|
|
785
785
|
'[role="main"]',
|
|
786
786
|
'#content',
|
|
787
787
|
'#main-content',
|
|
@@ -808,10 +808,9 @@ function findContentRoot(document) {
|
|
|
808
808
|
}
|
|
809
809
|
return undefined;
|
|
810
810
|
}
|
|
811
|
-
function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
811
|
+
function shouldUseArticleContent(article, originalHtmlOrDocument) {
|
|
812
812
|
const articleLength = article.textContent.length;
|
|
813
813
|
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
814
|
-
const safeUrl = url.substring(0, 80);
|
|
815
814
|
let articleDocument = null;
|
|
816
815
|
const getArticleDocument = () => {
|
|
817
816
|
if (articleDocument)
|
|
@@ -821,69 +820,45 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
|
821
820
|
};
|
|
822
821
|
if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
|
|
823
822
|
const ratio = articleLength / originalLength;
|
|
824
|
-
if (ratio < MIN_CONTENT_RATIO)
|
|
825
|
-
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
826
|
-
url: safeUrl,
|
|
827
|
-
articleLength,
|
|
828
|
-
});
|
|
823
|
+
if (ratio < MIN_CONTENT_RATIO)
|
|
829
824
|
return false;
|
|
830
|
-
}
|
|
831
825
|
}
|
|
832
826
|
const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
|
|
833
827
|
if (originalHeadings > 0) {
|
|
834
828
|
const articleHeadings = countHeadingsDom(getArticleDocument());
|
|
835
829
|
const retentionRatio = articleHeadings / originalHeadings;
|
|
836
|
-
if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
|
|
837
|
-
logDebug('Quality gate: Readability broke heading structure, using full HTML', {
|
|
838
|
-
url: safeUrl,
|
|
839
|
-
originalHeadings,
|
|
840
|
-
articleHeadings,
|
|
841
|
-
});
|
|
830
|
+
if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
|
|
842
831
|
return false;
|
|
843
|
-
}
|
|
844
832
|
}
|
|
845
833
|
const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
|
|
846
834
|
if (originalCodeBlocks > 0) {
|
|
847
835
|
const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
|
|
848
836
|
const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
|
|
849
|
-
|
|
850
|
-
url: safeUrl,
|
|
851
|
-
originalCodeBlocks,
|
|
852
|
-
articleCodeBlocks,
|
|
853
|
-
codeRetentionRatio,
|
|
854
|
-
});
|
|
855
|
-
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
|
|
856
|
-
logDebug('Quality gate: Readability removed code blocks, using full HTML', {
|
|
857
|
-
url: safeUrl,
|
|
858
|
-
originalCodeBlocks,
|
|
859
|
-
articleCodeBlocks,
|
|
860
|
-
});
|
|
837
|
+
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
|
|
861
838
|
return false;
|
|
862
|
-
}
|
|
863
|
-
}
|
|
864
|
-
if (hasTruncatedSentences(article.textContent)) {
|
|
865
|
-
logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', {
|
|
866
|
-
url: safeUrl,
|
|
867
|
-
});
|
|
868
|
-
return false;
|
|
869
839
|
}
|
|
870
|
-
return
|
|
840
|
+
return !hasTruncatedSentences(article.textContent);
|
|
871
841
|
}
|
|
872
842
|
function buildContentSource(params) {
|
|
873
843
|
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, } = params;
|
|
874
844
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
875
845
|
if (useArticleContent && article) {
|
|
876
|
-
|
|
846
|
+
// Apply noise removal to Readability-extracted content to remove
|
|
847
|
+
// author bylines, social share buttons, and other boilerplate
|
|
848
|
+
// that Readability may have included in the article content
|
|
849
|
+
const cleanedArticleHtml = removeNoiseFromHtml(article.content, undefined, url);
|
|
850
|
+
return {
|
|
851
|
+
sourceHtml: cleanedArticleHtml,
|
|
852
|
+
title: article.title,
|
|
853
|
+
metadata,
|
|
854
|
+
skipNoiseRemoval: true, // Already cleaned
|
|
855
|
+
};
|
|
877
856
|
}
|
|
878
857
|
if (document) {
|
|
879
|
-
removeNoiseFromHtml(html,
|
|
880
|
-
const cleanedDoc =
|
|
858
|
+
const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
|
|
859
|
+
const { document: cleanedDoc } = parseHTML(cleanedHtml);
|
|
881
860
|
const contentRoot = findContentRoot(cleanedDoc);
|
|
882
861
|
if (contentRoot) {
|
|
883
|
-
logDebug('Using content root fallback instead of full HTML', {
|
|
884
|
-
url: url.substring(0, 80),
|
|
885
|
-
contentLength: contentRoot.length,
|
|
886
|
-
});
|
|
887
862
|
return {
|
|
888
863
|
sourceHtml: contentRoot,
|
|
889
864
|
title: extractedMeta.title,
|
|
@@ -905,7 +880,7 @@ function resolveContentSource(params) {
|
|
|
905
880
|
...(params.signal ? { signal: params.signal } : {}),
|
|
906
881
|
});
|
|
907
882
|
const useArticleContent = article
|
|
908
|
-
? shouldUseArticleContent(article, document
|
|
883
|
+
? shouldUseArticleContent(article, document)
|
|
909
884
|
: false;
|
|
910
885
|
return buildContentSource({
|
|
911
886
|
html: params.html,
|
package/package.json
CHANGED