@j0hanz/fetch-url-mcp 1.9.2 → 1.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/code-lang.d.ts +4 -0
- package/dist/lib/code-lang.d.ts.map +1 -0
- package/dist/lib/code-lang.js +315 -0
- package/dist/lib/dom-prep.d.ts +4 -0
- package/dist/lib/dom-prep.d.ts.map +1 -0
- package/dist/lib/dom-prep.js +606 -0
- package/dist/lib/md-cleanup.d.ts +13 -0
- package/dist/lib/md-cleanup.d.ts.map +1 -0
- package/dist/lib/md-cleanup.js +391 -0
- package/dist/lib/md-metadata.d.ts +6 -0
- package/dist/lib/md-metadata.d.ts.map +1 -0
- package/dist/lib/md-metadata.js +186 -0
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +9 -6
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +510 -160
- package/package.json +1 -1
- package/dist/lib/content.d.ts +0 -17
- package/dist/lib/content.d.ts.map +0 -1
- package/dist/lib/content.js +0 -1399
|
@@ -3,11 +3,14 @@ import diagnosticsChannel from 'node:diagnostics_channel';
|
|
|
3
3
|
import { performance } from 'node:perf_hooks';
|
|
4
4
|
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
5
5
|
import { parseHTML } from 'linkedom';
|
|
6
|
-
import {
|
|
6
|
+
import { detectLanguageFromCode, extractLanguageFromClassName, } from '../lib/code-lang.js';
|
|
7
7
|
import { config } from '../lib/core.js';
|
|
8
8
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
|
|
9
|
+
import { prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
|
|
9
10
|
import { isRawTextContentUrl } from '../lib/http.js';
|
|
10
|
-
import {
|
|
11
|
+
import { cleanupMarkdownArtifacts, processFencedContent, } from '../lib/md-cleanup.js';
|
|
12
|
+
import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from '../lib/md-metadata.js';
|
|
13
|
+
import { throwIfAborted } from '../lib/utils.js';
|
|
11
14
|
import { FetchError, getErrorMessage, toError } from '../lib/utils.js';
|
|
12
15
|
import { isObject } from '../lib/utils.js';
|
|
13
16
|
import { translateHtmlFragmentToMarkdown } from './html-translators.js';
|
|
@@ -34,7 +37,6 @@ function decodeInput(input, encoding) {
|
|
|
34
37
|
function asError(value) {
|
|
35
38
|
return value instanceof Error ? value : undefined;
|
|
36
39
|
}
|
|
37
|
-
const abortPolicy = { throwIfAborted, createAbortError };
|
|
38
40
|
function isWhitespaceChar(code) {
|
|
39
41
|
return code === 9 || code === 10 || code === 12 || code === 13 || code === 32;
|
|
40
42
|
}
|
|
@@ -284,20 +286,29 @@ function resolveCollapsedTextLengthUpTo(text, max) {
|
|
|
284
286
|
return length;
|
|
285
287
|
}
|
|
286
288
|
function preserveAlertElements(doc) {
|
|
287
|
-
const alerts = doc.querySelectorAll('[role="alert"], .admonition,
|
|
289
|
+
const alerts = doc.querySelectorAll('[role="alert"], .admonition, [class*="callout"]');
|
|
288
290
|
for (const el of alerts) {
|
|
289
291
|
const bq = doc.createElement('blockquote');
|
|
290
292
|
bq.innerHTML = el.innerHTML;
|
|
291
293
|
el.replaceWith(bq);
|
|
292
294
|
}
|
|
293
295
|
}
|
|
296
|
+
function preserveCodeLanguageAttributes(doc) {
|
|
297
|
+
for (const el of doc.querySelectorAll('pre, code')) {
|
|
298
|
+
if (el.getAttribute('data-language'))
|
|
299
|
+
continue;
|
|
300
|
+
const lang = extractLanguageFromClassName(el.getAttribute('class') ?? '');
|
|
301
|
+
if (lang)
|
|
302
|
+
el.setAttribute('data-language', lang);
|
|
303
|
+
}
|
|
304
|
+
}
|
|
294
305
|
function extractArticle(document, url, signal) {
|
|
295
306
|
if (!isReadabilityCompatible(document)) {
|
|
296
307
|
logWarn('Document not compatible with Readability');
|
|
297
308
|
return null;
|
|
298
309
|
}
|
|
299
310
|
const checkAbort = (stage) => {
|
|
300
|
-
|
|
311
|
+
throwIfAborted(signal, url, stage);
|
|
301
312
|
};
|
|
302
313
|
try {
|
|
303
314
|
const doc = document;
|
|
@@ -321,6 +332,10 @@ function extractArticle(document, url, signal) {
|
|
|
321
332
|
? doc.cloneNode(true)
|
|
322
333
|
: doc;
|
|
323
334
|
preserveAlertElements(readabilityDoc);
|
|
335
|
+
preserveCodeLanguageAttributes(readabilityDoc);
|
|
336
|
+
for (const el of readabilityDoc.querySelectorAll('[class*="breadcrumb"],[class*="pagination"]')) {
|
|
337
|
+
el.remove();
|
|
338
|
+
}
|
|
324
339
|
checkAbort('extract:article:parse');
|
|
325
340
|
const reader = new Readability(readabilityDoc, {
|
|
326
341
|
maxElemsToParse: MAX_READABILITY_ELEMENTS,
|
|
@@ -378,29 +393,43 @@ function applyBaseUri(document, url) {
|
|
|
378
393
|
});
|
|
379
394
|
}
|
|
380
395
|
}
|
|
396
|
+
function createEmptyExtractionContext() {
|
|
397
|
+
const { document } = parseHTML('<html></html>');
|
|
398
|
+
return { article: null, metadata: {}, document };
|
|
399
|
+
}
|
|
400
|
+
function extractEarlyMetadataIfNeeded(html, url) {
|
|
401
|
+
if (!willTruncate(html))
|
|
402
|
+
return null;
|
|
403
|
+
return stageTracker.run(url, 'extract:early-metadata', () => extractMetadataFromHead(html, url));
|
|
404
|
+
}
|
|
405
|
+
function parseExtractionDocument(html, url, inputTruncated) {
|
|
406
|
+
const { html: limitedHtml, truncated } = truncateHtml(html, inputTruncated);
|
|
407
|
+
const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
|
|
408
|
+
return { document, truncated };
|
|
409
|
+
}
|
|
410
|
+
function extractMergedMetadata(html, url, document) {
|
|
411
|
+
const earlyMetadata = extractEarlyMetadataIfNeeded(html, url);
|
|
412
|
+
const lateMetadata = stageTracker.run(url, 'extract:metadata', () => extractMetadata(document, url));
|
|
413
|
+
return mergeMetadata(earlyMetadata, lateMetadata);
|
|
414
|
+
}
|
|
415
|
+
function extractArticleIfRequested(document, url, options) {
|
|
416
|
+
if (!options.extractArticle)
|
|
417
|
+
return null;
|
|
418
|
+
return stageTracker.run(url, 'extract:article', () => extractArticle(document, url, options.signal));
|
|
419
|
+
}
|
|
381
420
|
function extractContentContext(html, url, options) {
|
|
382
421
|
if (!isValidInput(html, url)) {
|
|
383
|
-
|
|
384
|
-
return { article: null, metadata: {}, document };
|
|
422
|
+
return createEmptyExtractionContext();
|
|
385
423
|
}
|
|
386
424
|
try {
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
? stageTracker.run(url, 'extract:early-metadata', () => extractMetadataFromHead(html, url))
|
|
391
|
-
: null;
|
|
392
|
-
const { html: limitedHtml, truncated } = truncateHtml(html, options.inputTruncated);
|
|
393
|
-
const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
|
|
394
|
-
abortPolicy.throwIfAborted(options.signal, url, 'extract:parsed');
|
|
425
|
+
throwIfAborted(options.signal, url, 'extract:begin');
|
|
426
|
+
const { document, truncated } = parseExtractionDocument(html, url, options.inputTruncated);
|
|
427
|
+
throwIfAborted(options.signal, url, 'extract:parsed');
|
|
395
428
|
applyBaseUri(document, url);
|
|
396
|
-
const
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
const article = options.extractArticle
|
|
401
|
-
? stageTracker.run(url, 'extract:article', () => extractArticle(document, url, options.signal))
|
|
402
|
-
: null;
|
|
403
|
-
abortPolicy.throwIfAborted(options.signal, url, 'extract:article');
|
|
429
|
+
const metadata = extractMergedMetadata(html, url, document);
|
|
430
|
+
throwIfAborted(options.signal, url, 'extract:metadata');
|
|
431
|
+
const article = extractArticleIfRequested(document, url, options);
|
|
432
|
+
throwIfAborted(options.signal, url, 'extract:article');
|
|
404
433
|
return {
|
|
405
434
|
article,
|
|
406
435
|
metadata,
|
|
@@ -411,10 +440,9 @@ function extractContentContext(html, url, options) {
|
|
|
411
440
|
catch (error) {
|
|
412
441
|
if (error instanceof FetchError)
|
|
413
442
|
throw error;
|
|
414
|
-
|
|
443
|
+
throwIfAborted(options.signal, url, 'extract:error');
|
|
415
444
|
logError('Failed to extract content', asError(error));
|
|
416
|
-
|
|
417
|
-
return { article: null, metadata: {}, document };
|
|
445
|
+
return createEmptyExtractionContext();
|
|
418
446
|
}
|
|
419
447
|
}
|
|
420
448
|
export function extractContent(html, url, options = {
|
|
@@ -423,8 +451,6 @@ export function extractContent(html, url, options = {
|
|
|
423
451
|
const result = extractContentContext(html, url, options);
|
|
424
452
|
return { article: result.article, metadata: result.metadata };
|
|
425
453
|
}
|
|
426
|
-
const ABORT_CHECK_LINE_INTERVAL = 500;
|
|
427
|
-
const CR_CHAR_CODE = 13;
|
|
428
454
|
function resolveRelativeHref(href, baseUrl, origin) {
|
|
429
455
|
const trimmedHref = href.trim();
|
|
430
456
|
if (!trimmedHref || containsWhitespace(trimmedHref))
|
|
@@ -488,7 +514,6 @@ function isAbsoluteOrSpecialUrl(href) {
|
|
|
488
514
|
return true;
|
|
489
515
|
return URL.canParse(trimmedHref);
|
|
490
516
|
}
|
|
491
|
-
const FENCE_LINE_PATTERN = /^\s*(`{3,}|~{3,})/;
|
|
492
517
|
function resolveRelativeUrlsInSegment(markdown, baseUrl, origin) {
|
|
493
518
|
let cursor = 0;
|
|
494
519
|
let output = '';
|
|
@@ -514,71 +539,20 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
|
|
|
514
539
|
}
|
|
515
540
|
if (!markdown)
|
|
516
541
|
return markdown;
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
let lastIndex = 0;
|
|
522
|
-
let lineCount = 0;
|
|
523
|
-
while (lastIndex < len) {
|
|
524
|
-
if (++lineCount % ABORT_CHECK_LINE_INTERVAL === 0) {
|
|
525
|
-
abortPolicy.throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
|
|
526
|
-
}
|
|
527
|
-
// Extract next line (handling CR+LF)
|
|
528
|
-
let nextIndex = markdown.indexOf('\n', lastIndex);
|
|
529
|
-
const isLastLine = nextIndex === -1;
|
|
530
|
-
if (isLastLine)
|
|
531
|
-
nextIndex = len;
|
|
532
|
-
const lineWithNewline = isLastLine
|
|
533
|
-
? markdown.slice(lastIndex)
|
|
534
|
-
: markdown.slice(lastIndex, nextIndex + 1);
|
|
535
|
-
const lineEnd = !isLastLine &&
|
|
536
|
-
nextIndex > lastIndex &&
|
|
537
|
-
markdown.charCodeAt(nextIndex - 1) === CR_CHAR_CODE
|
|
538
|
-
? nextIndex - 1
|
|
539
|
-
: isLastLine
|
|
540
|
-
? len
|
|
541
|
-
: nextIndex;
|
|
542
|
-
const trimmed = markdown.slice(lastIndex, lineEnd).trimStart();
|
|
543
|
-
if (fenceMarker) {
|
|
544
|
-
// Inside a code fence — pass through without URL resolution
|
|
545
|
-
output += lineWithNewline;
|
|
546
|
-
if (trimmed.startsWith(fenceMarker) &&
|
|
547
|
-
trimmed.slice(fenceMarker.length).trim() === '') {
|
|
548
|
-
fenceMarker = null;
|
|
549
|
-
}
|
|
550
|
-
}
|
|
551
|
-
else {
|
|
552
|
-
const fenceMatch = FENCE_LINE_PATTERN.exec(markdown.slice(lastIndex, lineEnd));
|
|
553
|
-
if (fenceMatch?.[1]) {
|
|
554
|
-
// Entering a code fence — flush buffered content first
|
|
555
|
-
if (buffer) {
|
|
556
|
-
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
557
|
-
buffer = '';
|
|
558
|
-
}
|
|
559
|
-
output += lineWithNewline;
|
|
560
|
-
fenceMarker = fenceMatch[1];
|
|
561
|
-
}
|
|
562
|
-
else {
|
|
563
|
-
buffer += lineWithNewline;
|
|
564
|
-
}
|
|
565
|
-
}
|
|
566
|
-
lastIndex = isLastLine ? len : nextIndex + 1;
|
|
567
|
-
}
|
|
568
|
-
if (buffer) {
|
|
569
|
-
output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
|
|
570
|
-
}
|
|
571
|
-
return output;
|
|
542
|
+
return processFencedContent(markdown, (text) => {
|
|
543
|
+
throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
|
|
544
|
+
return resolveRelativeUrlsInSegment(text, baseUrl, origin);
|
|
545
|
+
});
|
|
572
546
|
}
|
|
573
547
|
function translateHtmlToMarkdown(params) {
|
|
574
548
|
const { html, url, signal, document, skipNoiseRemoval } = params;
|
|
575
|
-
|
|
549
|
+
throwIfAborted(signal, url, 'markdown:begin');
|
|
576
550
|
const cleanedHtml = skipNoiseRemoval
|
|
577
551
|
? html
|
|
578
552
|
: stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url, signal));
|
|
579
|
-
|
|
553
|
+
throwIfAborted(signal, url, 'markdown:cleaned');
|
|
580
554
|
const content = stageTracker.run(url, 'markdown:translate', () => translateHtmlFragmentToMarkdown(cleanedHtml));
|
|
581
|
-
|
|
555
|
+
throwIfAborted(signal, url, 'markdown:translated');
|
|
582
556
|
const cleaned = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
|
|
583
557
|
return url ? resolveRelativeUrls(cleaned, url, signal) : cleaned;
|
|
584
558
|
}
|
|
@@ -654,6 +628,13 @@ const MIN_CONTENT_RATIO = 0.15;
|
|
|
654
628
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
655
629
|
const MIN_HEADING_RETENTION_RATIO = 0.3;
|
|
656
630
|
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
|
|
631
|
+
const MIN_TABLE_RETENTION_RATIO = 0.5;
|
|
632
|
+
const MIN_IMAGE_RETENTION_RATIO = 0.2;
|
|
633
|
+
const MIN_INTERACTIVE_RETENTION_RATIO = 0.1;
|
|
634
|
+
const MIN_INTERACTIVE_ELEMENTS_FOR_GATE = 6;
|
|
635
|
+
const MIN_IMAGE_ELEMENTS_FOR_GATE = 4;
|
|
636
|
+
const MIN_HEADINGS_FOR_EMPTY_SECTION_GATE = 5;
|
|
637
|
+
const MAX_EMPTY_SECTION_RATIO = 0.05;
|
|
657
638
|
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
|
|
658
639
|
const MAX_TRUNCATED_LINE_RATIO = 0.95;
|
|
659
640
|
function needsDocumentWrapper(html) {
|
|
@@ -679,13 +660,6 @@ function resolveHtmlDocument(htmlOrDocument) {
|
|
|
679
660
|
return parseHTML('<!DOCTYPE html><html><body></body></html>').document;
|
|
680
661
|
}
|
|
681
662
|
}
|
|
682
|
-
function countTagsInString(html, regex) {
|
|
683
|
-
let count = 0;
|
|
684
|
-
while (regex.exec(html) !== null) {
|
|
685
|
-
count++;
|
|
686
|
-
}
|
|
687
|
-
return count;
|
|
688
|
-
}
|
|
689
663
|
function stripNonVisibleNodes(root) {
|
|
690
664
|
for (const el of root.querySelectorAll('script,style,noscript')) {
|
|
691
665
|
el.remove();
|
|
@@ -868,6 +842,43 @@ function findPrimaryHeading(document) {
|
|
|
868
842
|
}
|
|
869
843
|
return undefined;
|
|
870
844
|
}
|
|
845
|
+
function countMatchingElements(root, selector) {
|
|
846
|
+
return root.querySelectorAll(selector).length;
|
|
847
|
+
}
|
|
848
|
+
function getHeadingLevel(heading) {
|
|
849
|
+
const match = /^H([1-6])$/.exec(heading.tagName);
|
|
850
|
+
if (!match)
|
|
851
|
+
return null;
|
|
852
|
+
return Number.parseInt(match[1] ?? '', 10);
|
|
853
|
+
}
|
|
854
|
+
function hasSectionContent(heading) {
|
|
855
|
+
const level = getHeadingLevel(heading);
|
|
856
|
+
if (level === null)
|
|
857
|
+
return false;
|
|
858
|
+
let current = heading.nextElementSibling;
|
|
859
|
+
while (current) {
|
|
860
|
+
const currentLevel = getHeadingLevel(current);
|
|
861
|
+
if (currentLevel !== null && currentLevel <= level)
|
|
862
|
+
return false;
|
|
863
|
+
const text = current.textContent.trim();
|
|
864
|
+
if (text.length > 0)
|
|
865
|
+
return true;
|
|
866
|
+
if (current.querySelector('img,table,pre,code,ul,ol,figure,blockquote')) {
|
|
867
|
+
return true;
|
|
868
|
+
}
|
|
869
|
+
current = current.nextElementSibling;
|
|
870
|
+
}
|
|
871
|
+
return false;
|
|
872
|
+
}
|
|
873
|
+
function countEmptyHeadingSections(root) {
|
|
874
|
+
let emptyCount = 0;
|
|
875
|
+
const headings = root.querySelectorAll('h1,h2,h3,h4,h5,h6');
|
|
876
|
+
for (const heading of headings) {
|
|
877
|
+
if (!hasSectionContent(heading))
|
|
878
|
+
emptyCount += 1;
|
|
879
|
+
}
|
|
880
|
+
return emptyCount;
|
|
881
|
+
}
|
|
871
882
|
function isGithubRepositoryRootUrl(url) {
|
|
872
883
|
let parsed;
|
|
873
884
|
try {
|
|
@@ -887,28 +898,74 @@ export const TransformHeuristics = {
|
|
|
887
898
|
findPrimaryHeading,
|
|
888
899
|
isGithubRepositoryRootUrl,
|
|
889
900
|
};
|
|
890
|
-
|
|
891
|
-
|
|
901
|
+
const ARTICLE_INTERACTIVE_SELECTOR = 'button,[role="tab"],[role="tabpanel"],[aria-controls]';
|
|
902
|
+
function buildArticleDocument(article) {
|
|
903
|
+
return parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`).document;
|
|
904
|
+
}
|
|
905
|
+
function hasSufficientArticleContentRatio(article, document) {
|
|
892
906
|
const originalLength = getVisibleTextLength(document);
|
|
893
|
-
if (originalLength
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
const originalHeadings = document
|
|
899
|
-
if (originalHeadings
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
const originalCodeBlocks = document
|
|
906
|
-
if (originalCodeBlocks
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
907
|
+
if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
|
|
908
|
+
return true;
|
|
909
|
+
return article.textContent.length / originalLength >= MIN_CONTENT_RATIO;
|
|
910
|
+
}
|
|
911
|
+
function retainsEnoughHeadings(articleDoc, document) {
|
|
912
|
+
const originalHeadings = countMatchingElements(document, 'h1,h2,h3,h4,h5,h6');
|
|
913
|
+
if (originalHeadings === 0)
|
|
914
|
+
return true;
|
|
915
|
+
const articleHeadings = countMatchingElements(articleDoc, 'h1,h2,h3,h4,h5,h6');
|
|
916
|
+
return articleHeadings / originalHeadings >= MIN_HEADING_RETENTION_RATIO;
|
|
917
|
+
}
|
|
918
|
+
function retainsEnoughCodeBlocks(articleDoc, document) {
|
|
919
|
+
const originalCodeBlocks = countMatchingElements(document, 'pre');
|
|
920
|
+
if (originalCodeBlocks === 0)
|
|
921
|
+
return true;
|
|
922
|
+
const articleCodeBlocks = countMatchingElements(articleDoc, 'pre');
|
|
923
|
+
return (articleCodeBlocks / originalCodeBlocks >= MIN_CODE_BLOCK_RETENTION_RATIO);
|
|
924
|
+
}
|
|
925
|
+
function retainsEnoughTables(articleDoc, document) {
|
|
926
|
+
const originalTables = countMatchingElements(document, 'table');
|
|
927
|
+
if (originalTables === 0)
|
|
928
|
+
return true;
|
|
929
|
+
const articleTables = countMatchingElements(articleDoc, 'table');
|
|
930
|
+
return articleTables / originalTables >= MIN_TABLE_RETENTION_RATIO;
|
|
931
|
+
}
|
|
932
|
+
function retainsEnoughImages(articleDoc, document) {
|
|
933
|
+
const originalImages = countMatchingElements(document, 'img');
|
|
934
|
+
if (originalImages < MIN_IMAGE_ELEMENTS_FOR_GATE)
|
|
935
|
+
return true;
|
|
936
|
+
const articleImages = countMatchingElements(articleDoc, 'img');
|
|
937
|
+
return articleImages / originalImages >= MIN_IMAGE_RETENTION_RATIO;
|
|
938
|
+
}
|
|
939
|
+
function retainsEnoughInteractiveElements(articleDoc, document) {
|
|
940
|
+
const originalInteractive = countMatchingElements(document, ARTICLE_INTERACTIVE_SELECTOR);
|
|
941
|
+
if (originalInteractive < MIN_INTERACTIVE_ELEMENTS_FOR_GATE)
|
|
942
|
+
return true;
|
|
943
|
+
const articleInteractive = countMatchingElements(articleDoc, ARTICLE_INTERACTIVE_SELECTOR);
|
|
944
|
+
return (articleInteractive / originalInteractive >= MIN_INTERACTIVE_RETENTION_RATIO);
|
|
945
|
+
}
|
|
946
|
+
function hasAcceptableEmptySectionRatio(articleDoc) {
|
|
947
|
+
const articleHeadings = countMatchingElements(articleDoc, 'h1,h2,h3,h4,h5,h6');
|
|
948
|
+
if (articleHeadings < MIN_HEADINGS_FOR_EMPTY_SECTION_GATE)
|
|
949
|
+
return true;
|
|
950
|
+
const emptySectionRatio = countEmptyHeadingSections(articleDoc) / articleHeadings;
|
|
951
|
+
return emptySectionRatio <= MAX_EMPTY_SECTION_RATIO;
|
|
952
|
+
}
|
|
953
|
+
function shouldUseArticleContent(article, document) {
|
|
954
|
+
if (!hasSufficientArticleContentRatio(article, document))
|
|
955
|
+
return false;
|
|
956
|
+
const articleDoc = buildArticleDocument(article);
|
|
957
|
+
if (!retainsEnoughHeadings(articleDoc, document))
|
|
958
|
+
return false;
|
|
959
|
+
if (!retainsEnoughCodeBlocks(articleDoc, document))
|
|
960
|
+
return false;
|
|
961
|
+
if (!retainsEnoughTables(articleDoc, document))
|
|
962
|
+
return false;
|
|
963
|
+
if (!retainsEnoughImages(articleDoc, document))
|
|
964
|
+
return false;
|
|
965
|
+
if (!retainsEnoughInteractiveElements(articleDoc, document))
|
|
966
|
+
return false;
|
|
967
|
+
if (!hasAcceptableEmptySectionRatio(articleDoc))
|
|
968
|
+
return false;
|
|
912
969
|
return !hasTruncatedSentences(article.textContent);
|
|
913
970
|
}
|
|
914
971
|
function buildContentSource(params) {
|
|
@@ -922,6 +979,7 @@ function buildContentSource(params) {
|
|
|
922
979
|
primaryHeading: document
|
|
923
980
|
? TransformHeuristics.findPrimaryHeading(document)
|
|
924
981
|
: undefined,
|
|
982
|
+
originalHtml: html,
|
|
925
983
|
};
|
|
926
984
|
if (useArticleContent && article) {
|
|
927
985
|
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
@@ -954,6 +1012,290 @@ function buildContentSource(params) {
|
|
|
954
1012
|
title: extractedMeta.title,
|
|
955
1013
|
};
|
|
956
1014
|
}
|
|
1015
|
+
const NEXT_FLIGHT_PAYLOAD_RE = /self\.__next_f\.push\(\[1,"((?:\\.|[^"\\])*)"\]\)<\/script>/gs;
|
|
1016
|
+
const TEMPLATE_ASSIGNMENT_RE = /([A-Za-z_$][\w$]*)=`([\s\S]*?)`;/g;
|
|
1017
|
+
const OBJECT_ASSIGNMENT_RE = /([A-Za-z_$][\w$]*)=\{([^{}]+)\}/g;
|
|
1018
|
+
const FLIGHT_INSTALL_RE = /commands:\{cli:"([^"]+)",npm:"([^"]+)",yarn:"([^"]+)",pnpm:"([^"]+)",bun:"([^"]+)"\}/;
|
|
1019
|
+
const FLIGHT_IMPORT_RE = /commands:\{main:'([^']+)',individual:'([^']+)'\}/;
|
|
1020
|
+
const FLIGHT_DEMO_RE = /title:"([^"]+)",files:([A-Za-z_$][\w$]*)\.([A-Za-z_$][\w$]*)/g;
|
|
1021
|
+
const FLIGHT_API_RE = /children:"([^"]+)"\}\),`\\n`,\(0,e\.jsx\)\(o,\{data:\[([\s\S]*?)\]\}\)/g;
|
|
1022
|
+
const FLIGHT_API_ROW_RE = /attribute:"([^"]+)",type:"([^"]+)",description:"([^"]*)",default:"([^"]*)"/g;
|
|
1023
|
+
const FLIGHT_MERMAID_SECTION_RE = /_jsx\(Heading,\{\s*level:"[1-6]",\s*id:"[^"]+",\s*children:"((?:\\.|[^"\\])*)"\s*\}\)(?:(?!_jsx\(Heading,\{)[\s\S]){0,12000}?_jsx\(Mermaid,\{\s*chart:"((?:\\.|[^"\\])*)"\s*\}\)/g;
|
|
1024
|
+
function decodeHtmlEntities(value) {
|
|
1025
|
+
return value
|
|
1026
|
+
.replace(/&#39;|&apos;/g, "'")
|
|
1027
|
+
.replace(/&quot;/g, '"')
|
|
1028
|
+
.replace(/&amp;/g, '&')
|
|
1029
|
+
.replace(/&lt;/g, '<')
|
|
1030
|
+
.replace(/&gt;/g, '>')
|
|
1031
|
+
}
|
|
1032
|
+
function decodeFlightStringValue(value) {
|
|
1033
|
+
try {
|
|
1034
|
+
return JSON.parse(`"${value}"`);
|
|
1035
|
+
}
|
|
1036
|
+
catch {
|
|
1037
|
+
return decodeHtmlEntities(value);
|
|
1038
|
+
}
|
|
1039
|
+
}
|
|
1040
|
+
function decodeNextFlightPayloads(html) {
|
|
1041
|
+
const payloads = [];
|
|
1042
|
+
for (const match of html.matchAll(NEXT_FLIGHT_PAYLOAD_RE)) {
|
|
1043
|
+
const rawPayload = match[1];
|
|
1044
|
+
if (!rawPayload)
|
|
1045
|
+
continue;
|
|
1046
|
+
try {
|
|
1047
|
+
payloads.push(JSON.parse(`"${rawPayload}"`));
|
|
1048
|
+
}
|
|
1049
|
+
catch {
|
|
1050
|
+
// Ignore malformed payload fragments and continue with the rest.
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
return payloads;
|
|
1054
|
+
}
|
|
1055
|
+
function parseFlightObjectRefs(text) {
|
|
1056
|
+
const templateMap = new Map();
|
|
1057
|
+
const aliasMap = new Map();
|
|
1058
|
+
const objectMaps = new Map();
|
|
1059
|
+
for (const match of text.matchAll(TEMPLATE_ASSIGNMENT_RE)) {
|
|
1060
|
+
const name = match[1];
|
|
1061
|
+
const code = match[2];
|
|
1062
|
+
if (name && code)
|
|
1063
|
+
templateMap.set(name, decodeHtmlEntities(code));
|
|
1064
|
+
}
|
|
1065
|
+
for (const match of text.matchAll(OBJECT_ASSIGNMENT_RE)) {
|
|
1066
|
+
const objectName = match[1];
|
|
1067
|
+
const body = match[2]?.trim() ?? '';
|
|
1068
|
+
if (!objectName || !body)
|
|
1069
|
+
continue;
|
|
1070
|
+
const spreadMatch = /^\.\.\.([A-Za-z_$][\w$]*)$/.exec(body);
|
|
1071
|
+
if (spreadMatch?.[1]) {
|
|
1072
|
+
aliasMap.set(objectName, spreadMatch[1]);
|
|
1073
|
+
continue;
|
|
1074
|
+
}
|
|
1075
|
+
const entries = new Map();
|
|
1076
|
+
for (const part of body.split(',')) {
|
|
1077
|
+
const entryMatch = /(?:"([^"]+)"|([A-Za-z_$][\w$]*)):([A-Za-z_$][\w$]*)$/.exec(part.trim());
|
|
1078
|
+
const key = entryMatch?.[1] ?? entryMatch?.[2];
|
|
1079
|
+
const value = entryMatch?.[3];
|
|
1080
|
+
if (key && value)
|
|
1081
|
+
entries.set(key, value);
|
|
1082
|
+
}
|
|
1083
|
+
if (entries.size > 0)
|
|
1084
|
+
objectMaps.set(objectName, entries);
|
|
1085
|
+
}
|
|
1086
|
+
return { templateMap, aliasMap, objectMaps };
|
|
1087
|
+
}
|
|
1088
|
+
function resolveFlightCodeRef(name, refs, seen = new Set()) {
|
|
1089
|
+
if (!name || seen.has(name))
|
|
1090
|
+
return undefined;
|
|
1091
|
+
seen.add(name);
|
|
1092
|
+
const direct = refs.templateMap.get(name);
|
|
1093
|
+
if (direct)
|
|
1094
|
+
return direct;
|
|
1095
|
+
const alias = refs.aliasMap.get(name);
|
|
1096
|
+
if (alias)
|
|
1097
|
+
return resolveFlightCodeRef(alias, refs, seen);
|
|
1098
|
+
const objectMap = refs.objectMaps.get(name);
|
|
1099
|
+
if (!objectMap)
|
|
1100
|
+
return undefined;
|
|
1101
|
+
for (const ref of objectMap.values()) {
|
|
1102
|
+
const resolved = resolveFlightCodeRef(ref, refs, seen);
|
|
1103
|
+
if (resolved)
|
|
1104
|
+
return resolved;
|
|
1105
|
+
}
|
|
1106
|
+
return undefined;
|
|
1107
|
+
}
|
|
1108
|
+
function escapeMarkdownTableCell(value) {
|
|
1109
|
+
const normalized = decodeHtmlEntities(value).replace(/\s+/g, ' ').trim();
|
|
1110
|
+
return (normalized || '-').replace(/\|/g, '\\|');
|
|
1111
|
+
}
|
|
1112
|
+
function buildMarkdownTable(rows) {
|
|
1113
|
+
if (rows.length === 0)
|
|
1114
|
+
return '';
|
|
1115
|
+
const lines = [
|
|
1116
|
+
'| Prop | Type | Description | Default |',
|
|
1117
|
+
'| ---- | ---- | ----------- | ------- |',
|
|
1118
|
+
];
|
|
1119
|
+
for (const row of rows) {
|
|
1120
|
+
lines.push(`| ${escapeMarkdownTableCell(row.attribute)} | ${escapeMarkdownTableCell(row.type)} | ${escapeMarkdownTableCell(row.description)} | ${escapeMarkdownTableCell(row.defaultValue)} |`);
|
|
1121
|
+
}
|
|
1122
|
+
return lines.join('\n');
|
|
1123
|
+
}
|
|
1124
|
+
function buildCodeBlock(code) {
|
|
1125
|
+
const trimmed = code.trim();
|
|
1126
|
+
if (!trimmed)
|
|
1127
|
+
return '';
|
|
1128
|
+
const language = detectLanguageFromCode(trimmed) ?? 'tsx';
|
|
1129
|
+
return `\`\`\`${language}\n${trimmed}\n\`\`\``;
|
|
1130
|
+
}
|
|
1131
|
+
function buildMermaidBlock(chart) {
|
|
1132
|
+
const normalized = decodeFlightStringValue(chart).trim();
|
|
1133
|
+
if (!normalized)
|
|
1134
|
+
return '';
|
|
1135
|
+
return `\`\`\`mermaid\n${normalized}\n\`\`\``;
|
|
1136
|
+
}
|
|
1137
|
+
function normalizeSupplementHeadingText(value) {
|
|
1138
|
+
return value
|
|
1139
|
+
.replace(/\[([^\]]+)\]\([^)]*\)/g, '$1')
|
|
1140
|
+
.replace(/\s+/g, ' ')
|
|
1141
|
+
.trim()
|
|
1142
|
+
.toLowerCase();
|
|
1143
|
+
}
|
|
1144
|
+
function getMarkdownHeadingInfo(line) {
|
|
1145
|
+
const match = /^(#{1,6})\s+(.+?)\s*$/.exec(line.trim());
|
|
1146
|
+
if (!match)
|
|
1147
|
+
return null;
|
|
1148
|
+
return {
|
|
1149
|
+
level: match[1]?.length ?? 0,
|
|
1150
|
+
title: normalizeSupplementHeadingText(match[2] ?? ''),
|
|
1151
|
+
};
|
|
1152
|
+
}
|
|
1153
|
+
function findMarkdownSection(lines, title) {
|
|
1154
|
+
const target = normalizeSupplementHeadingText(title);
|
|
1155
|
+
for (let i = 0; i < lines.length; i += 1) {
|
|
1156
|
+
const heading = getMarkdownHeadingInfo(lines[i] ?? '');
|
|
1157
|
+
if (heading?.title !== target)
|
|
1158
|
+
continue;
|
|
1159
|
+
let end = lines.length;
|
|
1160
|
+
for (let j = i + 1; j < lines.length; j += 1) {
|
|
1161
|
+
const nextLine = lines[j];
|
|
1162
|
+
const nextHeading = nextLine !== undefined ? getMarkdownHeadingInfo(nextLine) : null;
|
|
1163
|
+
if (nextHeading && nextHeading.level <= heading.level) {
|
|
1164
|
+
end = j;
|
|
1165
|
+
break;
|
|
1166
|
+
}
|
|
1167
|
+
}
|
|
1168
|
+
return { start: i, end };
|
|
1169
|
+
}
|
|
1170
|
+
return null;
|
|
1171
|
+
}
|
|
1172
|
+
function getSectionBody(lines, section) {
|
|
1173
|
+
return lines
|
|
1174
|
+
.slice(section.start + 1, section.end)
|
|
1175
|
+
.join('\n')
|
|
1176
|
+
.trim();
|
|
1177
|
+
}
|
|
1178
|
+
function replaceMarkdownSection(lines, title, body) {
|
|
1179
|
+
const section = findMarkdownSection(lines, title);
|
|
1180
|
+
if (!section)
|
|
1181
|
+
return false;
|
|
1182
|
+
const replacement = body.trim().length > 0 ? ['', ...body.trim().split('\n'), ''] : [''];
|
|
1183
|
+
lines.splice(section.start + 1, section.end - section.start - 1, ...replacement);
|
|
1184
|
+
return true;
|
|
1185
|
+
}
|
|
1186
|
+
function appendMarkdownSection(lines, title, body) {
|
|
1187
|
+
const section = findMarkdownSection(lines, title);
|
|
1188
|
+
if (!section)
|
|
1189
|
+
return false;
|
|
1190
|
+
const bodyText = getSectionBody(lines, section);
|
|
1191
|
+
if (bodyText.includes('```'))
|
|
1192
|
+
return false;
|
|
1193
|
+
const nextBody = bodyText ? `${bodyText}\n\n${body.trim()}` : body.trim();
|
|
1194
|
+
return replaceMarkdownSection(lines, title, nextBody);
|
|
1195
|
+
}
|
|
1196
|
+
function extractNextFlightSupplement(originalHtml) {
|
|
1197
|
+
const payloads = decodeNextFlightPayloads(originalHtml);
|
|
1198
|
+
if (payloads.length === 0)
|
|
1199
|
+
return null;
|
|
1200
|
+
const text = payloads.join('\n');
|
|
1201
|
+
const refs = parseFlightObjectRefs(text);
|
|
1202
|
+
const installMatch = FLIGHT_INSTALL_RE.exec(text);
|
|
1203
|
+
const importMatch = FLIGHT_IMPORT_RE.exec(text);
|
|
1204
|
+
const apiTables = new Map();
|
|
1205
|
+
for (const match of text.matchAll(FLIGHT_API_RE)) {
|
|
1206
|
+
const title = match[1];
|
|
1207
|
+
const rawRows = match[2] ?? '';
|
|
1208
|
+
if (!title)
|
|
1209
|
+
continue;
|
|
1210
|
+
const rows = [];
|
|
1211
|
+
for (const rowMatch of rawRows.matchAll(FLIGHT_API_ROW_RE)) {
|
|
1212
|
+
const attribute = rowMatch[1];
|
|
1213
|
+
const type = rowMatch[2];
|
|
1214
|
+
const description = rowMatch[3];
|
|
1215
|
+
const defaultValue = rowMatch[4];
|
|
1216
|
+
if (!attribute ||
|
|
1217
|
+
!type ||
|
|
1218
|
+
description === undefined ||
|
|
1219
|
+
defaultValue === undefined) {
|
|
1220
|
+
continue;
|
|
1221
|
+
}
|
|
1222
|
+
rows.push({ attribute, type, description, defaultValue });
|
|
1223
|
+
}
|
|
1224
|
+
const table = buildMarkdownTable(rows);
|
|
1225
|
+
if (table)
|
|
1226
|
+
apiTables.set(title, table);
|
|
1227
|
+
}
|
|
1228
|
+
const mermaidDiagrams = new Map();
|
|
1229
|
+
for (const match of text.matchAll(FLIGHT_MERMAID_SECTION_RE)) {
|
|
1230
|
+
const title = match[1] ? decodeFlightStringValue(match[1]).trim() : '';
|
|
1231
|
+
const chart = match[2] ? buildMermaidBlock(match[2]) : '';
|
|
1232
|
+
if (title && chart)
|
|
1233
|
+
mermaidDiagrams.set(title, chart);
|
|
1234
|
+
}
|
|
1235
|
+
const demoCodeBlocks = new Map();
|
|
1236
|
+
for (const match of text.matchAll(FLIGHT_DEMO_RE)) {
|
|
1237
|
+
const title = match[1];
|
|
1238
|
+
const objectName = match[2];
|
|
1239
|
+
const key = match[3];
|
|
1240
|
+
const ref = objectName
|
|
1241
|
+
? refs.objectMaps.get(objectName)?.get(key ?? '')
|
|
1242
|
+
: undefined;
|
|
1243
|
+
const code = resolveFlightCodeRef(ref, refs);
|
|
1244
|
+
const codeBlock = code ? buildCodeBlock(code) : '';
|
|
1245
|
+
if (title && codeBlock)
|
|
1246
|
+
demoCodeBlocks.set(title, codeBlock);
|
|
1247
|
+
}
|
|
1248
|
+
return {
|
|
1249
|
+
...(installMatch ? { installationCommands: installMatch.slice(1) } : {}),
|
|
1250
|
+
...(importMatch ? { importCommands: importMatch.slice(1) } : {}),
|
|
1251
|
+
apiTables,
|
|
1252
|
+
demoCodeBlocks,
|
|
1253
|
+
mermaidDiagrams,
|
|
1254
|
+
};
|
|
1255
|
+
}
|
|
1256
|
+
/**
 * Merges the supplement produced by `extractNextFlightSupplement(originalHtml)`
 * into `markdown`.
 *
 * When no supplement is produced, `markdown` is returned as-is. Otherwise the
 * Markdown is split on `\n`, edited in place through the section helpers, and
 * joined back with `\n`. Steps run in this fixed order:
 *   1. "Installation" - if that section exists and its body shows no
 *      package-manager install/add command, append a code block holding the
 *      install commands (one per line).
 *   2. "Import" - if that section exists and its body has no `import {`,
 *      append a code block holding the import commands (blank-line separated).
 *   3. API tables - replace each titled section with its table.
 *   4. Mermaid diagrams - for every titled section that exists and does not
 *      already contain a mermaid fence, keep the current body and add the
 *      diagram after it.
 *   5. Demo code blocks - append each code block to its titled section.
 *
 * @param {string} markdown - Markdown to enrich.
 * @param {string} originalHtml - HTML handed to `extractNextFlightSupplement`.
 * @returns {string} The enriched Markdown, or `markdown` when there is no supplement.
 */
function supplementMarkdownFromNextFlight(markdown, originalHtml) {
    const supplement = extractNextFlightSupplement(originalHtml);
    if (!supplement) {
        return markdown;
    }
    const lines = markdown.split('\n');
    // Shared shape of the "Installation" and "Import" steps: only touch a
    // section that exists and does not already show the expected content.
    const appendCommandsUnlessPresent = (heading, commands, alreadyPresent, separator) => {
        if (!commands?.length) {
            return;
        }
        const section = findMarkdownSection(lines, heading);
        if (!section) {
            return;
        }
        if (alreadyPresent.test(getSectionBody(lines, section))) {
            return;
        }
        appendMarkdownSection(lines, heading, buildCodeBlock(commands.join(separator)));
    };
    appendCommandsUnlessPresent('Installation', supplement.installationCommands, /(npm|pnpm|yarn|bun|npx)\s+(install|add)/, '\n');
    appendCommandsUnlessPresent('Import', supplement.importCommands, /import\s+\{/, '\n\n');
    for (const [heading, table] of supplement.apiTables) {
        replaceMarkdownSection(lines, heading, table);
    }
    for (const [heading, diagram] of supplement.mermaidDiagrams) {
        const section = findMarkdownSection(lines, heading);
        if (section) {
            const existingBody = getSectionBody(lines, section);
            // Leave sections that already carry a mermaid fence untouched.
            if (!existingBody.includes('```mermaid')) {
                let updatedBody = diagram;
                if (existingBody) {
                    updatedBody = `${existingBody}\n\n${diagram}`;
                }
                replaceMarkdownSection(lines, heading, updatedBody);
            }
        }
    }
    for (const [heading, snippet] of supplement.demoCodeBlocks) {
        appendMarkdownSection(lines, heading, snippet);
    }
    return lines.join('\n');
}
|
|
957
1299
|
function resolveContentSource(params) {
|
|
958
1300
|
const { article, metadata: extractedMeta, document, truncated, } = extractContentContext(params.html, params.url, {
|
|
959
1301
|
extractArticle: true,
|
|
@@ -975,6 +1317,33 @@ function resolveContentSource(params) {
|
|
|
975
1317
|
...(params.signal ? { signal: params.signal } : {}),
|
|
976
1318
|
});
|
|
977
1319
|
}
|
|
1320
|
+
/**
 * Decides whether the primary heading should be stripped for this page.
 *
 * Yields `false` when `context.primaryHeading` is `undefined`; otherwise
 * yields whatever `TransformHeuristics.isGithubRepositoryRootUrl(url)` returns.
 *
 * @param {{ primaryHeading?: unknown }} context
 * @param {string} url
 */
function shouldStripGithubPrimaryHeading(context, url) {
    if (context.primaryHeading === undefined) {
        return false;
    }
    return TransformHeuristics.isGithubRepositoryRootUrl(url);
}
|
|
1324
|
+
/**
 * Removes the leading primary heading from `markdown` when
 * `shouldStripGithubPrimaryHeading(context, url)` says so; otherwise returns
 * `markdown` unchanged.
 *
 * A nullish `context.primaryHeading` is passed to `stripLeadingHeading` as ''.
 *
 * @param {string} markdown
 * @param {{ primaryHeading?: string | null }} context
 * @param {string} url
 * @returns {string}
 */
function maybeStripGithubPrimaryHeading(markdown, context, url) {
    return shouldStripGithubPrimaryHeading(context, url)
        ? stripLeadingHeading(markdown, context.primaryHeading ?? '')
        : markdown;
}
|
|
1329
|
+
/**
 * Builds the text placed between the leading `#` and the title of a
 * synthetic heading.
 *
 * Without a favicon the result is a single space. With one, the result is a
 * Markdown image (`![alt](favicon)`) padded by one space on each side, where
 * `alt` is the hostname parsed from `url` (empty when `url` cannot be parsed).
 *
 * Whitespace and parentheses in the favicon URL are percent-encoded: left raw
 * they can end the Markdown link destination early and corrupt the heading.
 * Favicon URLs without those characters are emitted unchanged.
 *
 * @param {string} url - Page URL; only its hostname is used (as alt text).
 * @param {string | undefined} favicon - Favicon URL, if one is known.
 * @returns {string} Prefix to insert right after the heading's `#`.
 */
function buildSyntheticTitlePrefix(url, favicon) {
    if (!favicon)
        return ' ';
    let alt = '';
    try {
        alt = new URL(url).hostname;
    }
    catch {
        // Best effort: an unparseable page URL just leaves the alt text empty.
    }
    // encodeURIComponent leaves parentheses alone, so they are mapped by hand.
    const destination = String(favicon).replace(/[()\s]/g, (ch) => {
        if (ch === '(')
            return '%28';
        if (ch === ')')
            return '%29';
        return encodeURIComponent(ch);
    });
    return `  `;
}
|
|
1341
|
+
/**
 * Prepends a level-1 heading built from `context.title` when the Markdown
 * does not already open with an ATX heading.
 *
 * `markdown` is returned unchanged when `context.title` is falsy, or when the
 * text (ignoring leading whitespace) starts with one to six `#` followed by
 * whitespace. Otherwise the result is `#`, the prefix from
 * `buildSyntheticTitlePrefix(url, context.favicon)`, the title, a blank line,
 * and the original Markdown.
 *
 * @param {string} markdown
 * @param {{ title?: string, favicon?: string }} context
 * @param {string} url
 * @returns {string}
 */
function maybePrependSyntheticTitle(markdown, context, url) {
    const heading = context.title;
    if (!heading) {
        return markdown;
    }
    const opensWithHeading = /^(#{1,6})\s/.test(markdown.trimStart());
    if (opensWithHeading) {
        return markdown;
    }
    const prefix = buildSyntheticTitlePrefix(url, context.favicon);
    return `#${prefix}${heading}\n\n${markdown}`;
}
|
|
978
1347
|
function buildMarkdownFromContext(context, url, signal) {
|
|
979
1348
|
let content = stageTracker.run(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
|
|
980
1349
|
url,
|
|
@@ -982,25 +1351,10 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
982
1351
|
...(context.document ? { document: context.document } : {}),
|
|
983
1352
|
...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
|
|
984
1353
|
}));
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
}
|
|
989
|
-
if (context.title && !/^(#{1,6})\s/.test(content.trimStart())) {
|
|
990
|
-
const icon = context.favicon;
|
|
991
|
-
let prefix = ' ';
|
|
992
|
-
if (icon) {
|
|
993
|
-
let alt = '';
|
|
994
|
-
try {
|
|
995
|
-
alt = new URL(url).hostname;
|
|
996
|
-
}
|
|
997
|
-
catch {
|
|
998
|
-
/* skip */
|
|
999
|
-
}
|
|
1000
|
-
prefix = `  `;
|
|
1001
|
-
}
|
|
1002
|
-
content = `#${prefix}${context.title}\n\n${content}`;
|
|
1003
|
-
}
|
|
1354
|
+
content = maybeStripGithubPrimaryHeading(content, context, url);
|
|
1355
|
+
content = maybePrependSyntheticTitle(content, context, url);
|
|
1356
|
+
content = supplementMarkdownFromNextFlight(content, context.originalHtml);
|
|
1357
|
+
content = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
|
|
1004
1358
|
return {
|
|
1005
1359
|
markdown: content,
|
|
1006
1360
|
title: context.title,
|
|
@@ -1056,10 +1410,24 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
|
|
|
1056
1410
|
const signal = buildTransformSignal(options.signal);
|
|
1057
1411
|
const totalStage = stageTracker.start(url, 'transform:total');
|
|
1058
1412
|
try {
|
|
1059
|
-
|
|
1413
|
+
throwIfAborted(signal, url, 'transform:begin');
|
|
1060
1414
|
validateBinaryContent(html, url);
|
|
1061
|
-
const result =
|
|
1062
|
-
|
|
1415
|
+
const result = stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
|
|
1416
|
+
html,
|
|
1417
|
+
url,
|
|
1418
|
+
includeMetadata: options.includeMetadata,
|
|
1419
|
+
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1420
|
+
})) ??
|
|
1421
|
+
(() => {
|
|
1422
|
+
const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
|
|
1423
|
+
html,
|
|
1424
|
+
url,
|
|
1425
|
+
includeMetadata: options.includeMetadata,
|
|
1426
|
+
...(signal ? { signal } : {}),
|
|
1427
|
+
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1428
|
+
}));
|
|
1429
|
+
return buildMarkdownFromContext(context, url, signal);
|
|
1430
|
+
})();
|
|
1063
1431
|
stageTracker.end(totalStage, { truncated: result.truncated });
|
|
1064
1432
|
return result;
|
|
1065
1433
|
}
|
|
@@ -1073,24 +1441,6 @@ function validateBinaryContent(html, url) {
|
|
|
1073
1441
|
throw new FetchError('Content appears to be binary data (high replacement character ratio or null bytes)', url, 415, { reason: 'binary_content_detected', stage: 'transform:validate' });
|
|
1074
1442
|
}
|
|
1075
1443
|
}
|
|
1076
|
-
function tryRawContentPipeline(html, url, options) {
|
|
1077
|
-
return stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
|
|
1078
|
-
html,
|
|
1079
|
-
url,
|
|
1080
|
-
includeMetadata: options.includeMetadata,
|
|
1081
|
-
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1082
|
-
}));
|
|
1083
|
-
}
|
|
1084
|
-
function tryHtmlContentPipeline(html, url, options, signal) {
|
|
1085
|
-
const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
|
|
1086
|
-
html,
|
|
1087
|
-
url,
|
|
1088
|
-
includeMetadata: options.includeMetadata,
|
|
1089
|
-
...(signal ? { signal } : {}),
|
|
1090
|
-
...(options.inputTruncated ? { inputTruncated: true } : {}),
|
|
1091
|
-
}));
|
|
1092
|
-
return buildMarkdownFromContext(context, url, signal);
|
|
1093
|
-
}
|
|
1094
1444
|
/**
 * Public accessor for the transform worker pool statistics.
 *
 * @returns Whatever `getWorkerPoolStats()` reports, passed through untouched.
 */
export function getTransformPoolStats() {
    const poolStats = getWorkerPoolStats();
    return poolStats;
}
|
|
@@ -1100,7 +1450,7 @@ export async function shutdownTransformWorkerPool() {
|
|
|
1100
1450
|
/**
 * Runs the in-process transform on input that may still need decoding.
 *
 * The input is first passed through `decodeInput` together with
 * `options.encoding`; the decoded result is then handed to
 * `transformHtmlToMarkdownInProcess` along with `url` and the same `options`.
 *
 * @param htmlOrBuffer - Input forwarded to `decodeInput`.
 * @param {string} url
 * @param {{ encoding?: string }} options - Forwarded unchanged to the transform.
 * @returns The result of `transformHtmlToMarkdownInProcess`.
 */
function transformInputInProcess(htmlOrBuffer, url, options) {
    const decodedHtml = decodeInput(htmlOrBuffer, options.encoding);
    return transformHtmlToMarkdownInProcess(decodedHtml, url, options);
}
|
|
1103
|
-
function
|
|
1453
|
+
function workerTransformOptions(options) {
|
|
1104
1454
|
return {
|
|
1105
1455
|
includeMetadata: options.includeMetadata,
|
|
1106
1456
|
...(options.signal ? { signal: options.signal } : {}),
|
|
@@ -1113,10 +1463,10 @@ async function transformWithWorkerPool(htmlOrBuffer, url, options) {
|
|
|
1113
1463
|
return transformInputInProcess(htmlOrBuffer, url, options);
|
|
1114
1464
|
}
|
|
1115
1465
|
if (typeof htmlOrBuffer === 'string') {
|
|
1116
|
-
return pool.transform(htmlOrBuffer, url,
|
|
1466
|
+
return pool.transform(htmlOrBuffer, url, workerTransformOptions(options));
|
|
1117
1467
|
}
|
|
1118
1468
|
return pool.transform(htmlOrBuffer, url, {
|
|
1119
|
-
...
|
|
1469
|
+
...workerTransformOptions(options),
|
|
1120
1470
|
...(options.encoding ? { encoding: options.encoding } : {}),
|
|
1121
1471
|
});
|
|
1122
1472
|
}
|
|
@@ -1128,7 +1478,7 @@ function resolveWorkerFallback(error, htmlOrBuffer, url, options) {
|
|
|
1128
1478
|
});
|
|
1129
1479
|
return transformInputInProcess(htmlOrBuffer, url, options);
|
|
1130
1480
|
}
|
|
1131
|
-
|
|
1481
|
+
throwIfAborted(options.signal, url, 'transform:worker-fallback');
|
|
1132
1482
|
if (error instanceof FetchError)
|
|
1133
1483
|
throw error;
|
|
1134
1484
|
if (!(error instanceof Error))
|
|
@@ -1155,7 +1505,7 @@ async function runWorkerTransformWithFallback(htmlOrBuffer, url, options) {
|
|
|
1155
1505
|
async function transformInputToMarkdown(htmlOrBuffer, url, options) {
|
|
1156
1506
|
const totalStage = stageTracker.start(url, 'transform:total');
|
|
1157
1507
|
try {
|
|
1158
|
-
|
|
1508
|
+
throwIfAborted(options.signal, url, 'transform:begin');
|
|
1159
1509
|
const result = await runWorkerTransformWithFallback(htmlOrBuffer, url, options);
|
|
1160
1510
|
stageTracker.end(totalStage, { truncated: result.truncated });
|
|
1161
1511
|
return result;
|