@j0hanz/fetch-url-mcp 1.9.3 → 1.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/code-lang.d.ts +4 -0
- package/dist/lib/code-lang.d.ts.map +1 -0
- package/dist/lib/code-lang.js +315 -0
- package/dist/lib/dom-prep.d.ts +4 -0
- package/dist/lib/dom-prep.d.ts.map +1 -0
- package/dist/lib/dom-prep.js +606 -0
- package/dist/lib/md-cleanup.d.ts +13 -0
- package/dist/lib/md-cleanup.d.ts.map +1 -0
- package/dist/lib/md-cleanup.js +391 -0
- package/dist/lib/md-metadata.d.ts +6 -0
- package/dist/lib/md-metadata.d.ts.map +1 -0
- package/dist/lib/md-metadata.js +186 -0
- package/dist/transform/html-translators.js +1 -1
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +160 -176
- package/package.json +1 -1
- package/dist/lib/content.d.ts +0 -18
- package/dist/lib/content.d.ts.map +0 -1
- package/dist/lib/content.js +0 -1433
|
@@ -3,11 +3,14 @@ import diagnosticsChannel from 'node:diagnostics_channel';
|
|
|
3
3
|
import { performance } from 'node:perf_hooks';
|
|
4
4
|
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
5
5
|
import { parseHTML } from 'linkedom';
|
|
6
|
-
import {
|
|
6
|
+
import { detectLanguageFromCode, extractLanguageFromClassName, } from '../lib/code-lang.js';
|
|
7
7
|
import { config } from '../lib/core.js';
|
|
8
8
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
|
|
9
|
+
import { prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
|
|
9
10
|
import { isRawTextContentUrl } from '../lib/http.js';
|
|
10
|
-
import {
|
|
11
|
+
import { cleanupMarkdownArtifacts, processFencedContent, } from '../lib/md-cleanup.js';
|
|
12
|
+
import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from '../lib/md-metadata.js';
|
|
13
|
+
import { throwIfAborted } from '../lib/utils.js';
|
|
11
14
|
import { FetchError, getErrorMessage, toError } from '../lib/utils.js';
|
|
12
15
|
import { isObject } from '../lib/utils.js';
|
|
13
16
|
import { translateHtmlFragmentToMarkdown } from './html-translators.js';
|
|
@@ -34,7 +37,6 @@ function decodeInput(input, encoding) {
|
|
|
34
37
|
function asError(value) {
|
|
35
38
|
return value instanceof Error ? value : undefined;
|
|
36
39
|
}
|
|
37
|
-
const abortPolicy = { throwIfAborted, createAbortError };
|
|
38
40
|
function isWhitespaceChar(code) {
|
|
39
41
|
return code === 9 || code === 10 || code === 12 || code === 13 || code === 32;
|
|
40
42
|
}
|
|
@@ -306,7 +308,7 @@ function extractArticle(document, url, signal) {
|
|
|
306
308
|
return null;
|
|
307
309
|
}
|
|
308
310
|
const checkAbort = (stage) => {
|
|
309
|
-
|
|
311
|
+
throwIfAborted(signal, url, stage);
|
|
310
312
|
};
|
|
311
313
|
try {
|
|
312
314
|
const doc = document;
|
|
@@ -391,29 +393,43 @@ function applyBaseUri(document, url) {
|
|
|
391
393
|
});
|
|
392
394
|
}
|
|
393
395
|
}
|
|
396
|
+
function createEmptyExtractionContext() {
|
|
397
|
+
const { document } = parseHTML('<html></html>');
|
|
398
|
+
return { article: null, metadata: {}, document };
|
|
399
|
+
}
|
|
400
|
+
function extractEarlyMetadataIfNeeded(html, url) {
|
|
401
|
+
if (!willTruncate(html))
|
|
402
|
+
return null;
|
|
403
|
+
return stageTracker.run(url, 'extract:early-metadata', () => extractMetadataFromHead(html, url));
|
|
404
|
+
}
|
|
405
|
+
function parseExtractionDocument(html, url, inputTruncated) {
|
|
406
|
+
const { html: limitedHtml, truncated } = truncateHtml(html, inputTruncated);
|
|
407
|
+
const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
|
|
408
|
+
return { document, truncated };
|
|
409
|
+
}
|
|
410
|
+
function extractMergedMetadata(html, url, document) {
|
|
411
|
+
const earlyMetadata = extractEarlyMetadataIfNeeded(html, url);
|
|
412
|
+
const lateMetadata = stageTracker.run(url, 'extract:metadata', () => extractMetadata(document, url));
|
|
413
|
+
return mergeMetadata(earlyMetadata, lateMetadata);
|
|
414
|
+
}
|
|
415
|
+
function extractArticleIfRequested(document, url, options) {
|
|
416
|
+
if (!options.extractArticle)
|
|
417
|
+
return null;
|
|
418
|
+
return stageTracker.run(url, 'extract:article', () => extractArticle(document, url, options.signal));
|
|
419
|
+
}
|
|
394
420
|
function extractContentContext(html, url, options) {
|
|
395
421
|
if (!isValidInput(html, url)) {
|
|
396
|
-
|
|
397
|
-
return { article: null, metadata: {}, document };
|
|
422
|
+
return createEmptyExtractionContext();
|
|
398
423
|
}
|
|
399
424
|
try {
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
? stageTracker.run(url, 'extract:early-metadata', () => extractMetadataFromHead(html, url))
|
|
404
|
-
: null;
|
|
405
|
-
const { html: limitedHtml, truncated } = truncateHtml(html, options.inputTruncated);
|
|
406
|
-
const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
|
|
407
|
-
abortPolicy.throwIfAborted(options.signal, url, 'extract:parsed');
|
|
425
|
+
throwIfAborted(options.signal, url, 'extract:begin');
|
|
426
|
+
const { document, truncated } = parseExtractionDocument(html, url, options.inputTruncated);
|
|
427
|
+
throwIfAborted(options.signal, url, 'extract:parsed');
|
|
408
428
|
applyBaseUri(document, url);
|
|
409
|
-
const
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
const article = options.extractArticle
|
|
414
|
-
? stageTracker.run(url, 'extract:article', () => extractArticle(document, url, options.signal))
|
|
415
|
-
: null;
|
|
416
|
-
abortPolicy.throwIfAborted(options.signal, url, 'extract:article');
|
|
429
|
+
const metadata = extractMergedMetadata(html, url, document);
|
|
430
|
+
throwIfAborted(options.signal, url, 'extract:metadata');
|
|
431
|
+
const article = extractArticleIfRequested(document, url, options);
|
|
432
|
+
throwIfAborted(options.signal, url, 'extract:article');
|
|
417
433
|
return {
|
|
418
434
|
article,
|
|
419
435
|
metadata,
|
|
@@ -424,10 +440,9 @@ function extractContentContext(html, url, options) {
|
|
|
424
440
|
catch (error) {
|
|
425
441
|
if (error instanceof FetchError)
|
|
426
442
|
throw error;
|
|
427
|
-
|
|
443
|
+
throwIfAborted(options.signal, url, 'extract:error');
|
|
428
444
|
logError('Failed to extract content', asError(error));
|
|
429
|
-
|
|
430
|
-
return { article: null, metadata: {}, document };
|
|
445
|
+
return createEmptyExtractionContext();
|
|
431
446
|
}
|
|
432
447
|
}
|
|
433
448
|
export function extractContent(html, url, options = {
|
|
@@ -436,8 +451,6 @@ export function extractContent(html, url, options = {
|
|
|
436
451
|
const result = extractContentContext(html, url, options);
|
|
437
452
|
return { article: result.article, metadata: result.metadata };
|
|
438
453
|
}
|
|
439
|
-
const ABORT_CHECK_LINE_INTERVAL = 500;
|
|
440
|
-
const CR_CHAR_CODE = 13;
|
|
441
454
|
function resolveRelativeHref(href, baseUrl, origin) {
|
|
442
455
|
const trimmedHref = href.trim();
|
|
443
456
|
if (!trimmedHref || containsWhitespace(trimmedHref))
|
|
@@ -501,7 +514,6 @@ function isAbsoluteOrSpecialUrl(href) {
|
|
|
501
514
|
return true;
|
|
502
515
|
return URL.canParse(trimmedHref);
|
|
503
516
|
}
|
|
504
|
-
const FENCE_LINE_PATTERN = /^\s*(`{3,}|~{3,})/;
|
|
505
517
|
function resolveRelativeUrlsInSegment(markdown, baseUrl, origin) {
|
|
506
518
|
let cursor = 0;
|
|
507
519
|
let output = '';
|
|
@@ -527,71 +539,20 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
|
527
539
|
}
|
|
528
540
|
if (!markdown)
|
|
529
541
|
return markdown;
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
let lastIndex = 0;
|
|
535
|
-
let lineCount = 0;
|
|
536
|
-
while (lastIndex < len) {
|
|
537
|
-
if (++lineCount % ABORT_CHECK_LINE_INTERVAL === 0) {
|
|
538
|
-
abortPolicy.throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
|
|
539
|
-
}
|
|
540
|
-
// Extract next line (handling CR+LF)
|
|
541
|
-
let nextIndex = markdown.indexOf('\n', lastIndex);
|
|
542
|
-
const isLastLine = nextIndex === -1;
|
|
543
|
-
if (isLastLine)
|
|
544
|
-
nextIndex = len;
|
|
545
|
-
const lineWithNewline = isLastLine
|
|
546
|
-
? markdown.slice(lastIndex)
|
|
547
|
-
: markdown.slice(lastIndex, nextIndex + 1);
|
|
548
|
-
const lineEnd = !isLastLine &&
|
|
549
|
-
nextIndex > lastIndex &&
|
|
550
|
-
markdown.charCodeAt(nextIndex - 1) === CR_CHAR_CODE
|
|
551
|
-
? nextIndex - 1
|
|
552
|
-
: isLastLine
|
|
553
|
-
? len
|
|
554
|
-
: nextIndex;
|
|
555
|
-
const trimmed = markdown.slice(lastIndex, lineEnd).trimStart();
|
|
556
|
-
if (fenceMarker) {
|
|
557
|
-
// Inside a code fence — pass through without URL resolution
|
|
558
|
-
output += lineWithNewline;
|
|
559
|
-
if (trimmed.startsWith(fenceMarker) &&
|
|
560
|
-
trimmed.slice(fenceMarker.length).trim() === '') {
|
|
561
|
-
fenceMarker = null;
|
|
562
|
-
}
|
|
563
|
-
}
|
|
564
|
-
else {
|
|
565
|
-
const fenceMatch = FENCE_LINE_PATTERN.exec(markdown.slice(lastIndex, lineEnd));
|
|
566
|
-
if (fenceMatch?.[1]) {
|
|
567
|
-
// Entering a code fence — flush buffered content first
|
|
568
|
-
if (buffer) {
|
|
569
|
-
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
570
|
-
buffer = '';
|
|
571
|
-
}
|
|
572
|
-
output += lineWithNewline;
|
|
573
|
-
fenceMarker = fenceMatch[1];
|
|
574
|
-
}
|
|
575
|
-
else {
|
|
576
|
-
buffer += lineWithNewline;
|
|
577
|
-
}
|
|
578
|
-
}
|
|
579
|
-
lastIndex = isLastLine ? len : nextIndex + 1;
|
|
580
|
-
}
|
|
581
|
-
if (buffer) {
|
|
582
|
-
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
583
|
-
}
|
|
584
|
-
return output;
|
|
542
|
+
return processFencedContent(markdown, (text) => {
|
|
543
|
+
throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
|
|
544
|
+
return resolveRelativeUrlsInSegment(text, baseUrl, origin);
|
|
545
|
+
});
|
|
585
546
|
}
|
|
586
547
|
function translateHtmlToMarkdown(params) {
|
|
587
548
|
const { html, url, signal, document, skipNoiseRemoval } = params;
|
|
588
|
-
|
|
549
|
+
throwIfAborted(signal, url, 'markdown:begin');
|
|
589
550
|
const cleanedHtml = skipNoiseRemoval
|
|
590
551
|
? html
|
|
591
552
|
: stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url, signal));
|
|
592
|
-
|
|
553
|
+
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
593
554
|
const content = stageTracker.run(url, 'markdown:translate', () => translateHtmlFragmentToMarkdown(cleanedHtml));
|
|
594
|
-
|
|
555
|
+
throwIfAborted(signal, url, 'markdown:translated');
|
|
595
556
|
const cleaned = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
|
|
596
557
|
return url ? resolveRelativeUrls(cleaned, url, signal) : cleaned;
|
|
597
558
|
}
|
|
@@ -937,57 +898,74 @@ export const TransformHeuristics = {
|
|
|
937
898
|
findPrimaryHeading,
|
|
938
899
|
isGithubRepositoryRootUrl,
|
|
939
900
|
};
|
|
940
|
-
|
|
941
|
-
|
|
901
|
+
const ARTICLE_INTERACTIVE_SELECTOR = 'button,[role="tab"],[role="tabpanel"],[aria-controls]';
|
|
902
|
+
function buildArticleDocument(article) {
|
|
903
|
+
return parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`).document;
|
|
904
|
+
}
|
|
905
|
+
function hasSufficientArticleContentRatio(article, document) {
|
|
942
906
|
const originalLength = getVisibleTextLength(document);
|
|
943
|
-
if (originalLength
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
907
|
+
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
908
|
+
return true;
|
|
909
|
+
return article.textContent.length / originalLength >= MIN_CONTENT_RATIO;
|
|
910
|
+
}
|
|
911
|
+
function retainsEnoughHeadings(articleDoc, document) {
|
|
949
912
|
const originalHeadings = countMatchingElements(document, 'h1,h2,h3,h4,h5,h6');
|
|
913
|
+
if (originalHeadings === 0)
|
|
914
|
+
return true;
|
|
950
915
|
const articleHeadings = countMatchingElements(articleDoc, 'h1,h2,h3,h4,h5,h6');
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
return false;
|
|
955
|
-
}
|
|
916
|
+
return articleHeadings / originalHeadings >= MIN_HEADING_RETENTION_RATIO;
|
|
917
|
+
}
|
|
918
|
+
function retainsEnoughCodeBlocks(articleDoc, document) {
|
|
956
919
|
const originalCodeBlocks = countMatchingElements(document, 'pre');
|
|
957
|
-
if (originalCodeBlocks
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
920
|
+
if (originalCodeBlocks === 0)
|
|
921
|
+
return true;
|
|
922
|
+
const articleCodeBlocks = countMatchingElements(articleDoc, 'pre');
|
|
923
|
+
return (articleCodeBlocks / originalCodeBlocks >= MIN_CODE_BLOCK_RETENTION_RATIO);
|
|
924
|
+
}
|
|
925
|
+
function retainsEnoughTables(articleDoc, document) {
|
|
963
926
|
const originalTables = countMatchingElements(document, 'table');
|
|
964
|
-
if (originalTables
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
927
|
+
if (originalTables === 0)
|
|
928
|
+
return true;
|
|
929
|
+
const articleTables = countMatchingElements(articleDoc, 'table');
|
|
930
|
+
return articleTables / originalTables >= MIN_TABLE_RETENTION_RATIO;
|
|
931
|
+
}
|
|
932
|
+
function retainsEnoughImages(articleDoc, document) {
|
|
970
933
|
const originalImages = countMatchingElements(document, 'img');
|
|
971
|
-
if (originalImages
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
const
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
934
|
+
if (originalImages < MIN_IMAGE_ELEMENTS_FOR_GATE)
|
|
935
|
+
return true;
|
|
936
|
+
const articleImages = countMatchingElements(articleDoc, 'img');
|
|
937
|
+
return articleImages / originalImages >= MIN_IMAGE_RETENTION_RATIO;
|
|
938
|
+
}
|
|
939
|
+
function retainsEnoughInteractiveElements(articleDoc, document) {
|
|
940
|
+
const originalInteractive = countMatchingElements(document, ARTICLE_INTERACTIVE_SELECTOR);
|
|
941
|
+
if (originalInteractive < MIN_INTERACTIVE_ELEMENTS_FOR_GATE)
|
|
942
|
+
return true;
|
|
943
|
+
const articleInteractive = countMatchingElements(articleDoc, ARTICLE_INTERACTIVE_SELECTOR);
|
|
944
|
+
return (articleInteractive / originalInteractive >= MIN_INTERACTIVE_RETENTION_RATIO);
|
|
945
|
+
}
|
|
946
|
+
function hasAcceptableEmptySectionRatio(articleDoc) {
|
|
947
|
+
const articleHeadings = countMatchingElements(articleDoc, 'h1,h2,h3,h4,h5,h6');
|
|
948
|
+
if (articleHeadings < MIN_HEADINGS_FOR_EMPTY_SECTION_GATE)
|
|
949
|
+
return true;
|
|
950
|
+
const emptySectionRatio = countEmptyHeadingSections(articleDoc) / articleHeadings;
|
|
951
|
+
return emptySectionRatio <= MAX_EMPTY_SECTION_RATIO;
|
|
952
|
+
}
|
|
953
|
+
function shouldUseArticleContent(article, document) {
|
|
954
|
+
if (!hasSufficientArticleContentRatio(article, document))
|
|
955
|
+
return false;
|
|
956
|
+
const articleDoc = buildArticleDocument(article);
|
|
957
|
+
if (!retainsEnoughHeadings(articleDoc, document))
|
|
958
|
+
return false;
|
|
959
|
+
if (!retainsEnoughCodeBlocks(articleDoc, document))
|
|
960
|
+
return false;
|
|
961
|
+
if (!retainsEnoughTables(articleDoc, document))
|
|
962
|
+
return false;
|
|
963
|
+
if (!retainsEnoughImages(articleDoc, document))
|
|
964
|
+
return false;
|
|
965
|
+
if (!retainsEnoughInteractiveElements(articleDoc, document))
|
|
966
|
+
return false;
|
|
967
|
+
if (!hasAcceptableEmptySectionRatio(articleDoc))
|
|
968
|
+
return false;
|
|
991
969
|
return !hasTruncatedSentences(article.textContent);
|
|
992
970
|
}
|
|
993
971
|
function buildContentSource(params) {
|
|
@@ -1339,6 +1317,33 @@ function resolveContentSource(params) {
|
|
|
1339
1317
|
...(params.signal ? { signal: params.signal } : {}),
|
|
1340
1318
|
});
|
|
1341
1319
|
}
|
|
1320
|
+
function shouldStripGithubPrimaryHeading(context, url) {
|
|
1321
|
+
return (context.primaryHeading !== undefined &&
|
|
1322
|
+
TransformHeuristics.isGithubRepositoryRootUrl(url));
|
|
1323
|
+
}
|
|
1324
|
+
function maybeStripGithubPrimaryHeading(markdown, context, url) {
|
|
1325
|
+
if (!shouldStripGithubPrimaryHeading(context, url))
|
|
1326
|
+
return markdown;
|
|
1327
|
+
return stripLeadingHeading(markdown, context.primaryHeading ?? '');
|
|
1328
|
+
}
|
|
1329
|
+
function buildSyntheticTitlePrefix(url, favicon) {
|
|
1330
|
+
if (!favicon)
|
|
1331
|
+
return ' ';
|
|
1332
|
+
let alt = '';
|
|
1333
|
+
try {
|
|
1334
|
+
alt = new URL(url).hostname;
|
|
1335
|
+
}
|
|
1336
|
+
catch {
|
|
1337
|
+
/* skip */
|
|
1338
|
+
}
|
|
1339
|
+
return `  `;
|
|
1340
|
+
}
|
|
1341
|
+
function maybePrependSyntheticTitle(markdown, context, url) {
|
|
1342
|
+
if (!context.title || /^(#{1,6})\s/.test(markdown.trimStart())) {
|
|
1343
|
+
return markdown;
|
|
1344
|
+
}
|
|
1345
|
+
return `#${buildSyntheticTitlePrefix(url, context.favicon)}${context.title}\n\n${markdown}`;
|
|
1346
|
+
}
|
|
1342
1347
|
function buildMarkdownFromContext(context, url, signal) {
|
|
1343
1348
|
let content = stageTracker.run(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
|
|
1344
1349
|
url,
|
|
@@ -1346,25 +1351,8 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
1346
1351
|
...(context.document ? { document: context.document } : {}),
|
|
1347
1352
|
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
1348
1353
|
}));
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
content = stripLeadingHeading(content, context.primaryHeading);
|
|
1352
|
-
}
|
|
1353
|
-
if (context.title && !/^(#{1,6})\s/.test(content.trimStart())) {
|
|
1354
|
-
const icon = context.favicon;
|
|
1355
|
-
let prefix = ' ';
|
|
1356
|
-
if (icon) {
|
|
1357
|
-
let alt = '';
|
|
1358
|
-
try {
|
|
1359
|
-
alt = new URL(url).hostname;
|
|
1360
|
-
}
|
|
1361
|
-
catch {
|
|
1362
|
-
/* skip */
|
|
1363
|
-
}
|
|
1364
|
-
prefix = `  `;
|
|
1365
|
-
}
|
|
1366
|
-
content = `#${prefix}${context.title}\n\n${content}`;
|
|
1367
|
-
}
|
|
1354
|
+
content = maybeStripGithubPrimaryHeading(content, context, url);
|
|
1355
|
+
content = maybePrependSyntheticTitle(content, context, url);
|
|
1368
1356
|
content = supplementMarkdownFromNextFlight(content, context.originalHtml);
|
|
1369
1357
|
content = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
|
|
1370
1358
|
return {
|
|
@@ -1422,10 +1410,24 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
|
1422
1410
|
const signal = buildTransformSignal(options.signal);
|
|
1423
1411
|
const totalStage = stageTracker.start(url, 'transform:total');
|
|
1424
1412
|
try {
|
|
1425
|
-
|
|
1413
|
+
throwIfAborted(signal, url, 'transform:begin');
|
|
1426
1414
|
validateBinaryContent(html, url);
|
|
1427
|
-
const result =
|
|
1428
|
-
|
|
1415
|
+
const result = stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
|
|
1416
|
+
html,
|
|
1417
|
+
url,
|
|
1418
|
+
includeMetadata: options.includeMetadata,
|
|
1419
|
+
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1420
|
+
})) ??
|
|
1421
|
+
(() => {
|
|
1422
|
+
const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
|
|
1423
|
+
html,
|
|
1424
|
+
url,
|
|
1425
|
+
includeMetadata: options.includeMetadata,
|
|
1426
|
+
...(signal ? { signal } : {}),
|
|
1427
|
+
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1428
|
+
}));
|
|
1429
|
+
return buildMarkdownFromContext(context, url, signal);
|
|
1430
|
+
})();
|
|
1429
1431
|
stageTracker.end(totalStage, { truncated: result.truncated });
|
|
1430
1432
|
return result;
|
|
1431
1433
|
}
|
|
@@ -1439,24 +1441,6 @@ function validateBinaryContent(html, url) {
|
|
|
1439
1441
|
throw new FetchError('Content appears to be binary data (high replacement character ratio or null bytes)', url, 415, { reason: 'binary_content_detected', stage: 'transform:validate' });
|
|
1440
1442
|
}
|
|
1441
1443
|
}
|
|
1442
|
-
function tryRawContentPipeline(html, url, options) {
|
|
1443
|
-
return stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
|
|
1444
|
-
html,
|
|
1445
|
-
url,
|
|
1446
|
-
includeMetadata: options.includeMetadata,
|
|
1447
|
-
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1448
|
-
}));
|
|
1449
|
-
}
|
|
1450
|
-
function tryHtmlContentPipeline(html, url, options, signal) {
|
|
1451
|
-
const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
|
|
1452
|
-
html,
|
|
1453
|
-
url,
|
|
1454
|
-
includeMetadata: options.includeMetadata,
|
|
1455
|
-
...(signal ? { signal } : {}),
|
|
1456
|
-
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1457
|
-
}));
|
|
1458
|
-
return buildMarkdownFromContext(context, url, signal);
|
|
1459
|
-
}
|
|
1460
1444
|
export function getTransformPoolStats() {
|
|
1461
1445
|
return getWorkerPoolStats();
|
|
1462
1446
|
}
|
|
@@ -1466,7 +1450,7 @@ export async function shutdownTransformWorkerPool() {
|
|
|
1466
1450
|
function transformInputInProcess(htmlOrBuffer, url, options) {
|
|
1467
1451
|
return transformHtmlToMarkdownInProcess(decodeInput(htmlOrBuffer, options.encoding), url, options);
|
|
1468
1452
|
}
|
|
1469
|
-
function
|
|
1453
|
+
function workerTransformOptions(options) {
|
|
1470
1454
|
return {
|
|
1471
1455
|
includeMetadata: options.includeMetadata,
|
|
1472
1456
|
...(options.signal ? { signal: options.signal } : {}),
|
|
@@ -1479,10 +1463,10 @@ async function transformWithWorkerPool(htmlOrBuffer, url, options) {
|
|
|
1479
1463
|
return transformInputInProcess(htmlOrBuffer, url, options);
|
|
1480
1464
|
}
|
|
1481
1465
|
if (typeof htmlOrBuffer === 'string') {
|
|
1482
|
-
return pool.transform(htmlOrBuffer, url,
|
|
1466
|
+
return pool.transform(htmlOrBuffer, url, workerTransformOptions(options));
|
|
1483
1467
|
}
|
|
1484
1468
|
return pool.transform(htmlOrBuffer, url, {
|
|
1485
|
-
...
|
|
1469
|
+
...workerTransformOptions(options),
|
|
1486
1470
|
...(options.encoding ? { encoding: options.encoding } : {}),
|
|
1487
1471
|
});
|
|
1488
1472
|
}
|
|
@@ -1494,7 +1478,7 @@ function resolveWorkerFallback(error, htmlOrBuffer, url, options) {
|
|
|
1494
1478
|
});
|
|
1495
1479
|
return transformInputInProcess(htmlOrBuffer, url, options);
|
|
1496
1480
|
}
|
|
1497
|
-
|
|
1481
|
+
throwIfAborted(options.signal, url, 'transform:worker-fallback');
|
|
1498
1482
|
if (error instanceof FetchError)
|
|
1499
1483
|
throw error;
|
|
1500
1484
|
if (!(error instanceof Error))
|
|
@@ -1521,7 +1505,7 @@ async function runWorkerTransformWithFallback(htmlOrBuffer, url, options) {
|
|
|
1521
1505
|
async function transformInputToMarkdown(htmlOrBuffer, url, options) {
|
|
1522
1506
|
const totalStage = stageTracker.start(url, 'transform:total');
|
|
1523
1507
|
try {
|
|
1524
|
-
|
|
1508
|
+
throwIfAborted(options.signal, url, 'transform:begin');
|
|
1525
1509
|
const result = await runWorkerTransformWithFallback(htmlOrBuffer, url, options);
|
|
1526
1510
|
stageTracker.end(totalStage, { truncated: result.truncated });
|
|
1527
1511
|
return result;
|
package/package.json
CHANGED
package/dist/lib/content.d.ts
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
import { type MetadataBlock } from '../transform/types.js';
|
|
2
|
-
export declare function serializeDocumentForMarkdown(document: Document, fallback: string): string;
|
|
3
|
-
export declare function prepareDocumentForMarkdown(document: Document, baseUrl?: string, signal?: AbortSignal): void;
|
|
4
|
-
export declare function removeNoiseFromHtml(html: string, document?: Document, baseUrl?: string, signal?: AbortSignal): string;
|
|
5
|
-
export declare function extractLanguageFromClassName(className: string): string | undefined;
|
|
6
|
-
export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
|
|
7
|
-
export declare function detectLanguageFromCode(code: string): string | undefined;
|
|
8
|
-
interface CleanupOptions {
|
|
9
|
-
signal?: AbortSignal;
|
|
10
|
-
url?: string;
|
|
11
|
-
}
|
|
12
|
-
export declare function cleanupMarkdownArtifacts(content: string, options?: CleanupOptions): string;
|
|
13
|
-
export declare function extractTitleFromRawMarkdown(content: string): string | undefined;
|
|
14
|
-
export declare function addSourceToMarkdown(content: string, url: string): string;
|
|
15
|
-
export declare function isRawTextContent(content: string): boolean;
|
|
16
|
-
export declare function buildMetadataFooter(metadata?: MetadataBlock, fallbackUrl?: string): string;
|
|
17
|
-
export {};
|
|
18
|
-
//# sourceMappingURL=content.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAwjB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AA2DD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAiBN;AA0BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAwQD,wBAAgB,4BAA4B,CAC1C,SAAS,EAAE,MAAM,GAChB,MAAM,GAAG,SAAS,CAuBpB;AAqBD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAqBvE;AAsDD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAqTD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6CR;AAgGD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAuCxE;AAmBD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
|