@j0hanz/fetch-url-mcp 1.9.2 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,11 +3,14 @@ import diagnosticsChannel from 'node:diagnostics_channel';
3
3
  import { performance } from 'node:perf_hooks';
4
4
  import { isProbablyReaderable, Readability } from '@mozilla/readability';
5
5
  import { parseHTML } from 'linkedom';
6
- import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isRawTextContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/content.js';
6
+ import { detectLanguageFromCode, extractLanguageFromClassName, } from '../lib/code-lang.js';
7
7
  import { config } from '../lib/core.js';
8
8
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
9
+ import { prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
9
10
  import { isRawTextContentUrl } from '../lib/http.js';
10
- import { createAbortError, throwIfAborted } from '../lib/utils.js';
11
+ import { cleanupMarkdownArtifacts, processFencedContent, } from '../lib/md-cleanup.js';
12
+ import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from '../lib/md-metadata.js';
13
+ import { throwIfAborted } from '../lib/utils.js';
11
14
  import { FetchError, getErrorMessage, toError } from '../lib/utils.js';
12
15
  import { isObject } from '../lib/utils.js';
13
16
  import { translateHtmlFragmentToMarkdown } from './html-translators.js';
@@ -34,7 +37,6 @@ function decodeInput(input, encoding) {
34
37
  function asError(value) {
35
38
  return value instanceof Error ? value : undefined;
36
39
  }
37
- const abortPolicy = { throwIfAborted, createAbortError };
38
40
  function isWhitespaceChar(code) {
39
41
  return code === 9 || code === 10 || code === 12 || code === 13 || code === 32;
40
42
  }
@@ -284,20 +286,29 @@ function resolveCollapsedTextLengthUpTo(text, max) {
284
286
  return length;
285
287
  }
286
288
  function preserveAlertElements(doc) {
287
- const alerts = doc.querySelectorAll('[role="alert"], .admonition, .callout');
289
+ const alerts = doc.querySelectorAll('[role="alert"], .admonition, [class*="callout"]');
288
290
  for (const el of alerts) {
289
291
  const bq = doc.createElement('blockquote');
290
292
  bq.innerHTML = el.innerHTML;
291
293
  el.replaceWith(bq);
292
294
  }
293
295
  }
296
/**
 * Copies a language hint from the `class` attribute of every <pre>/<code>
 * element into `data-language`.
 *
 * Elements that already carry a non-empty `data-language` are left untouched,
 * as are elements whose class name yields no language.
 *
 * @param {Document} doc - Document whose code elements are annotated in place.
 */
function preserveCodeLanguageAttributes(doc) {
    const codeNodes = doc.querySelectorAll('pre, code');
    for (const node of codeNodes) {
        const alreadyTagged = Boolean(node.getAttribute('data-language'));
        if (!alreadyTagged) {
            const className = node.getAttribute('class') ?? '';
            const detected = extractLanguageFromClassName(className);
            if (detected) {
                node.setAttribute('data-language', detected);
            }
        }
    }
}
294
305
  function extractArticle(document, url, signal) {
295
306
  if (!isReadabilityCompatible(document)) {
296
307
  logWarn('Document not compatible with Readability');
297
308
  return null;
298
309
  }
299
310
  const checkAbort = (stage) => {
300
- abortPolicy.throwIfAborted(signal, url, stage);
311
+ throwIfAborted(signal, url, stage);
301
312
  };
302
313
  try {
303
314
  const doc = document;
@@ -321,6 +332,10 @@ function extractArticle(document, url, signal) {
321
332
  ? doc.cloneNode(true)
322
333
  : doc;
323
334
  preserveAlertElements(readabilityDoc);
335
+ preserveCodeLanguageAttributes(readabilityDoc);
336
+ for (const el of readabilityDoc.querySelectorAll('[class*="breadcrumb"],[class*="pagination"]')) {
337
+ el.remove();
338
+ }
324
339
  checkAbort('extract:article:parse');
325
340
  const reader = new Readability(readabilityDoc, {
326
341
  maxElemsToParse: MAX_READABILITY_ELEMENTS,
@@ -378,29 +393,43 @@ function applyBaseUri(document, url) {
378
393
  });
379
394
  }
380
395
  }
396
/**
 * Builds the fallback extraction result: no article, empty metadata and a
 * freshly parsed `<html></html>` document.
 *
 * @returns {{ article: null, metadata: object, document: Document }}
 */
function createEmptyExtractionContext() {
    const blank = parseHTML('<html></html>');
    return {
        article: null,
        metadata: {},
        document: blank.document,
    };
}
400
/**
 * Reads metadata from the raw HTML before it gets truncated.
 *
 * The head extraction only runs (as the `extract:early-metadata` stage) when
 * `willTruncate(html)` reports that the input will be cut; otherwise `null`
 * is returned.
 *
 * @param {string} html - Raw, untruncated HTML.
 * @param {string} url - URL used for stage tracking and metadata extraction.
 * @returns {object | null} Early metadata, or `null` when no truncation is expected.
 */
function extractEarlyMetadataIfNeeded(html, url) {
    const readHead = () => extractMetadataFromHead(html, url);
    return willTruncate(html)
        ? stageTracker.run(url, 'extract:early-metadata', readHead)
        : null;
}
405
/**
 * Truncates the HTML via `truncateHtml` and parses the result inside the
 * `extract:parse` stage.
 *
 * @param {string} html - Raw HTML input.
 * @param {string} url - URL used for stage tracking.
 * @param {boolean | undefined} inputTruncated - Forwarded unchanged to `truncateHtml`.
 * @returns {{ document: Document, truncated: boolean }} Parsed document plus the
 *   truncation flag reported by `truncateHtml`.
 */
function parseExtractionDocument(html, url, inputTruncated) {
    const limited = truncateHtml(html, inputTruncated);
    const parsed = stageTracker.run(url, 'extract:parse', () => parseHTML(limited.html));
    return { document: parsed.document, truncated: limited.truncated };
}
410
/**
 * Combines metadata read from the raw HTML (pre-truncation, may be `null`)
 * with metadata read from the parsed document (`extract:metadata` stage).
 *
 * @param {string} html - Raw, untruncated HTML.
 * @param {string} url - URL used for stage tracking and metadata extraction.
 * @param {Document} document - Parsed (possibly truncated) document.
 * @returns {object} Result of `mergeMetadata(early, late)`.
 */
function extractMergedMetadata(html, url, document) {
    const fromRawHtml = extractEarlyMetadataIfNeeded(html, url);
    const readDocument = () => extractMetadata(document, url);
    const fromDocument = stageTracker.run(url, 'extract:metadata', readDocument);
    return mergeMetadata(fromRawHtml, fromDocument);
}
415
/**
 * Runs article extraction (as the `extract:article` stage) only when
 * `options.extractArticle` is truthy.
 *
 * @param {Document} document - Parsed document.
 * @param {string} url - URL used for stage tracking and extraction.
 * @param {{ extractArticle?: boolean, signal?: AbortSignal }} options
 * @returns {object | null} Whatever `extractArticle` returns, or `null` when not requested.
 */
function extractArticleIfRequested(document, url, options) {
    if (options.extractArticle) {
        const runExtraction = () => extractArticle(document, url, options.signal);
        return stageTracker.run(url, 'extract:article', runExtraction);
    }
    return null;
}
381
420
  function extractContentContext(html, url, options) {
382
421
  if (!isValidInput(html, url)) {
383
- const { document } = parseHTML('<html></html>');
384
- return { article: null, metadata: {}, document };
422
+ return createEmptyExtractionContext();
385
423
  }
386
424
  try {
387
- abortPolicy.throwIfAborted(options.signal, url, 'extract:begin');
388
- // F2: Extract metadata from <head> BEFORE truncation to preserve it
389
- const earlyMetadata = willTruncate(html)
390
- ? stageTracker.run(url, 'extract:early-metadata', () => extractMetadataFromHead(html, url))
391
- : null;
392
- const { html: limitedHtml, truncated } = truncateHtml(html, options.inputTruncated);
393
- const { document } = stageTracker.run(url, 'extract:parse', () => parseHTML(limitedHtml));
394
- abortPolicy.throwIfAborted(options.signal, url, 'extract:parsed');
425
+ throwIfAborted(options.signal, url, 'extract:begin');
426
+ const { document, truncated } = parseExtractionDocument(html, url, options.inputTruncated);
427
+ throwIfAborted(options.signal, url, 'extract:parsed');
395
428
  applyBaseUri(document, url);
396
- const lateMetadata = stageTracker.run(url, 'extract:metadata', () => extractMetadata(document, url));
397
- abortPolicy.throwIfAborted(options.signal, url, 'extract:metadata');
398
- // Merge early (pre-truncation) with late (post-truncation) metadata
399
- const metadata = mergeMetadata(earlyMetadata, lateMetadata);
400
- const article = options.extractArticle
401
- ? stageTracker.run(url, 'extract:article', () => extractArticle(document, url, options.signal))
402
- : null;
403
- abortPolicy.throwIfAborted(options.signal, url, 'extract:article');
429
+ const metadata = extractMergedMetadata(html, url, document);
430
+ throwIfAborted(options.signal, url, 'extract:metadata');
431
+ const article = extractArticleIfRequested(document, url, options);
432
+ throwIfAborted(options.signal, url, 'extract:article');
404
433
  return {
405
434
  article,
406
435
  metadata,
@@ -411,10 +440,9 @@ function extractContentContext(html, url, options) {
411
440
  catch (error) {
412
441
  if (error instanceof FetchError)
413
442
  throw error;
414
- abortPolicy.throwIfAborted(options.signal, url, 'extract:error');
443
+ throwIfAborted(options.signal, url, 'extract:error');
415
444
  logError('Failed to extract content', asError(error));
416
- const { document } = parseHTML('<html></html>');
417
- return { article: null, metadata: {}, document };
445
+ return createEmptyExtractionContext();
418
446
  }
419
447
  }
420
448
  export function extractContent(html, url, options = {
@@ -423,8 +451,6 @@ export function extractContent(html, url, options = {
423
451
  const result = extractContentContext(html, url, options);
424
452
  return { article: result.article, metadata: result.metadata };
425
453
  }
426
- const ABORT_CHECK_LINE_INTERVAL = 500;
427
- const CR_CHAR_CODE = 13;
428
454
  function resolveRelativeHref(href, baseUrl, origin) {
429
455
  const trimmedHref = href.trim();
430
456
  if (!trimmedHref || containsWhitespace(trimmedHref))
@@ -488,7 +514,6 @@ function isAbsoluteOrSpecialUrl(href) {
488
514
  return true;
489
515
  return URL.canParse(trimmedHref);
490
516
  }
491
- const FENCE_LINE_PATTERN = /^\s*(`{3,}|~{3,})/;
492
517
  function resolveRelativeUrlsInSegment(markdown, baseUrl, origin) {
493
518
  let cursor = 0;
494
519
  let output = '';
@@ -514,71 +539,20 @@ function resolveRelativeUrls(markdown, baseUrl, signal) {
514
539
  }
515
540
  if (!markdown)
516
541
  return markdown;
517
- let output = '';
518
- let buffer = '';
519
- let fenceMarker = null;
520
- const len = markdown.length;
521
- let lastIndex = 0;
522
- let lineCount = 0;
523
- while (lastIndex < len) {
524
- if (++lineCount % ABORT_CHECK_LINE_INTERVAL === 0) {
525
- abortPolicy.throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
526
- }
527
- // Extract next line (handling CR+LF)
528
- let nextIndex = markdown.indexOf('\n', lastIndex);
529
- const isLastLine = nextIndex === -1;
530
- if (isLastLine)
531
- nextIndex = len;
532
- const lineWithNewline = isLastLine
533
- ? markdown.slice(lastIndex)
534
- : markdown.slice(lastIndex, nextIndex + 1);
535
- const lineEnd = !isLastLine &&
536
- nextIndex > lastIndex &&
537
- markdown.charCodeAt(nextIndex - 1) === CR_CHAR_CODE
538
- ? nextIndex - 1
539
- : isLastLine
540
- ? len
541
- : nextIndex;
542
- const trimmed = markdown.slice(lastIndex, lineEnd).trimStart();
543
- if (fenceMarker) {
544
- // Inside a code fence — pass through without URL resolution
545
- output += lineWithNewline;
546
- if (trimmed.startsWith(fenceMarker) &&
547
- trimmed.slice(fenceMarker.length).trim() === '') {
548
- fenceMarker = null;
549
- }
550
- }
551
- else {
552
- const fenceMatch = FENCE_LINE_PATTERN.exec(markdown.slice(lastIndex, lineEnd));
553
- if (fenceMatch?.[1]) {
554
- // Entering a code fence — flush buffered content first
555
- if (buffer) {
556
- output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
557
- buffer = '';
558
- }
559
- output += lineWithNewline;
560
- fenceMarker = fenceMatch[1];
561
- }
562
- else {
563
- buffer += lineWithNewline;
564
- }
565
- }
566
- lastIndex = isLastLine ? len : nextIndex + 1;
567
- }
568
- if (buffer) {
569
- output += resolveRelativeUrlsInSegment(buffer, baseUrl, origin);
570
- }
571
- return output;
542
+ return processFencedContent(markdown, (text) => {
543
+ throwIfAborted(signal, baseUrl, 'markdown:resolve-urls');
544
+ return resolveRelativeUrlsInSegment(text, baseUrl, origin);
545
+ });
572
546
  }
573
547
  function translateHtmlToMarkdown(params) {
574
548
  const { html, url, signal, document, skipNoiseRemoval } = params;
575
- abortPolicy.throwIfAborted(signal, url, 'markdown:begin');
549
+ throwIfAborted(signal, url, 'markdown:begin');
576
550
  const cleanedHtml = skipNoiseRemoval
577
551
  ? html
578
552
  : stageTracker.run(url, 'markdown:noise', () => removeNoiseFromHtml(html, document, url, signal));
579
- abortPolicy.throwIfAborted(signal, url, 'markdown:cleaned');
553
+ throwIfAborted(signal, url, 'markdown:cleaned');
580
554
  const content = stageTracker.run(url, 'markdown:translate', () => translateHtmlFragmentToMarkdown(cleanedHtml));
581
- abortPolicy.throwIfAborted(signal, url, 'markdown:translated');
555
+ throwIfAborted(signal, url, 'markdown:translated');
582
556
  const cleaned = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
583
557
  return url ? resolveRelativeUrls(cleaned, url, signal) : cleaned;
584
558
  }
@@ -654,6 +628,13 @@ const MIN_CONTENT_RATIO = 0.15;
654
628
  const MIN_HTML_LENGTH_FOR_GATE = 100;
655
629
  const MIN_HEADING_RETENTION_RATIO = 0.3;
656
630
  const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
631
// Minimum articleTables / originalTables ratio accepted by retainsEnoughTables.
+ const MIN_TABLE_RETENTION_RATIO = 0.5;
632
// Minimum articleImages / originalImages ratio accepted by retainsEnoughImages.
+ const MIN_IMAGE_RETENTION_RATIO = 0.2;
633
// Minimum retained share of interactive elements accepted by retainsEnoughInteractiveElements.
+ const MIN_INTERACTIVE_RETENTION_RATIO = 0.1;
634
// The interactive-element check is skipped when the original has fewer matches than this.
+ const MIN_INTERACTIVE_ELEMENTS_FOR_GATE = 6;
635
// The image check is skipped when the original has fewer <img> elements than this.
+ const MIN_IMAGE_ELEMENTS_FOR_GATE = 4;
636
// The empty-section check is skipped when the article has fewer headings than this.
+ const MIN_HEADINGS_FOR_EMPTY_SECTION_GATE = 5;
637
// Largest emptySections / articleHeadings ratio accepted by hasAcceptableEmptySectionRatio.
+ const MAX_EMPTY_SECTION_RATIO = 0.05;
657
638
  const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
658
639
  const MAX_TRUNCATED_LINE_RATIO = 0.95;
659
640
  function needsDocumentWrapper(html) {
@@ -679,13 +660,6 @@ function resolveHtmlDocument(htmlOrDocument) {
679
660
  return parseHTML('<!DOCTYPE html><html><body></body></html>').document;
680
661
  }
681
662
  }
682
- function countTagsInString(html, regex) {
683
- let count = 0;
684
- while (regex.exec(html) !== null) {
685
- count++;
686
- }
687
- return count;
688
- }
689
663
  function stripNonVisibleNodes(root) {
690
664
  for (const el of root.querySelectorAll('script,style,noscript')) {
691
665
  el.remove();
@@ -868,6 +842,43 @@ function findPrimaryHeading(document) {
868
842
  }
869
843
  return undefined;
870
844
  }
845
/**
 * Counts the elements under `root` that match a CSS selector.
 *
 * @param {{ querySelectorAll(selector: string): ArrayLike<unknown> }} root
 * @param {string} selector - CSS selector passed to `querySelectorAll`.
 * @returns {number} Number of matches.
 */
function countMatchingElements(root, selector) {
    const matches = root.querySelectorAll(selector);
    return matches.length;
}
848
/**
 * Returns the numeric level (1-6) of a heading element, based on an exact,
 * upper-case `H1`..`H6` tag name.
 *
 * @param {{ tagName: string }} heading - Element to inspect.
 * @returns {number | null} The level, or `null` when the tag is not H1-H6.
 */
function getHeadingLevel(heading) {
    const tagMatch = /^H([1-6])$/.exec(heading.tagName);
    return tagMatch ? Number.parseInt(tagMatch[1] ?? '', 10) : null;
}
854
/**
 * Reports whether a heading is followed by any content before the next
 * heading of the same or a higher level.
 *
 * A following sibling counts as content when it has non-whitespace text, or
 * when it is, or contains, one of the structural elements in
 * `CONTENT_SELECTOR`. Checking the sibling itself (not only its descendants)
 * matters for text-less elements such as a bare <img> placed directly after
 * the heading, which `querySelector` alone would never report.
 *
 * @param {Element} heading - Candidate heading element.
 * @returns {boolean} `false` when `heading` is not H1-H6 or its section is empty.
 */
function hasSectionContent(heading) {
    const CONTENT_SELECTOR = 'img,table,pre,code,ul,ol,figure,blockquote';
    const level = getHeadingLevel(heading);
    if (level === null)
        return false;
    for (let node = heading.nextElementSibling; node; node = node.nextElementSibling) {
        const nodeLevel = getHeadingLevel(node);
        // A heading of the same or higher rank closes the section.
        if (nodeLevel !== null && nodeLevel <= level)
            return false;
        if (node.textContent.trim().length > 0)
            return true;
        if (node.matches(CONTENT_SELECTOR) || node.querySelector(CONTENT_SELECTOR))
            return true;
    }
    return false;
}
873
/**
 * Counts the h1-h6 headings under `root` for which `hasSectionContent`
 * reports no content.
 *
 * @param {Document | Element} root - Tree to scan.
 * @returns {number} Number of headings with an empty section.
 */
function countEmptyHeadingSections(root) {
    const headings = Array.from(root.querySelectorAll('h1,h2,h3,h4,h5,h6'));
    return headings.filter((heading) => !hasSectionContent(heading)).length;
}
871
882
  function isGithubRepositoryRootUrl(url) {
872
883
  let parsed;
873
884
  try {
@@ -887,28 +898,74 @@ export const TransformHeuristics = {
887
898
  findPrimaryHeading,
888
899
  isGithubRepositoryRootUrl,
889
900
  };
890
- function shouldUseArticleContent(article, document) {
891
- const articleLength = article.textContent.length;
901
// Selector for interactive widgets (buttons, ARIA tabs and tab panels, anything with aria-controls); counted by retainsEnoughInteractiveElements.
+ const ARTICLE_INTERACTIVE_SELECTOR = 'button,[role="tab"],[role="tabpanel"],[aria-controls]';
902
/**
 * Parses the extracted article HTML into its own document by wrapping
 * `article.content` in a minimal HTML shell.
 *
 * @param {{ content: string }} article - Extracted article.
 * @returns {Document} Document whose body holds the article markup.
 */
function buildArticleDocument(article) {
    const wrappedHtml = `<!DOCTYPE html><html><body>${article.content}</body></html>`;
    const { document: articleDocument } = parseHTML(wrappedHtml);
    return articleDocument;
}
905
+ function hasSufficientArticleContentRatio(article, document) {
892
906
  const originalLength = getVisibleTextLength(document);
893
- if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
894
- const ratio = articleLength / originalLength;
895
- if (ratio < MIN_CONTENT_RATIO)
896
- return false;
897
- }
898
- const originalHeadings = document.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
899
- if (originalHeadings > 0) {
900
- const articleHeadings = countTagsInString(article.content, /<h[1-6]\b/gi);
901
- const retentionRatio = articleHeadings / originalHeadings;
902
- if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
903
- return false;
904
- }
905
- const originalCodeBlocks = document.querySelectorAll('pre').length;
906
- if (originalCodeBlocks > 0) {
907
- const articleCodeBlocks = countTagsInString(article.content, /<pre\b/gi);
908
- const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
909
- if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
910
- return false;
911
- }
907
+ if (originalLength < MIN_HTML_LENGTH_FOR_GATE)
908
+ return true;
909
+ return article.textContent.length / originalLength >= MIN_CONTENT_RATIO;
910
+ }
911
/**
 * Checks that the article kept at least `MIN_HEADING_RETENTION_RATIO` of the
 * original document's h1-h6 headings. Passes when the original has none.
 *
 * @param {Document} articleDoc - Parsed article content.
 * @param {Document} document - Original document.
 * @returns {boolean}
 */
function retainsEnoughHeadings(articleDoc, document) {
    const headingSelector = 'h1,h2,h3,h4,h5,h6';
    const before = countMatchingElements(document, headingSelector);
    if (before > 0) {
        const after = countMatchingElements(articleDoc, headingSelector);
        return after / before >= MIN_HEADING_RETENTION_RATIO;
    }
    return true;
}
918
/**
 * Checks that the article kept at least `MIN_CODE_BLOCK_RETENTION_RATIO` of
 * the original document's <pre> elements. Passes when the original has none.
 *
 * @param {Document} articleDoc - Parsed article content.
 * @param {Document} document - Original document.
 * @returns {boolean}
 */
function retainsEnoughCodeBlocks(articleDoc, document) {
    const before = countMatchingElements(document, 'pre');
    if (before > 0) {
        const after = countMatchingElements(articleDoc, 'pre');
        return after / before >= MIN_CODE_BLOCK_RETENTION_RATIO;
    }
    return true;
}
925
/**
 * Checks that the article kept at least `MIN_TABLE_RETENTION_RATIO` of the
 * original document's <table> elements. Passes when the original has none.
 *
 * @param {Document} articleDoc - Parsed article content.
 * @param {Document} document - Original document.
 * @returns {boolean}
 */
function retainsEnoughTables(articleDoc, document) {
    const before = countMatchingElements(document, 'table');
    if (before > 0) {
        const after = countMatchingElements(articleDoc, 'table');
        return after / before >= MIN_TABLE_RETENTION_RATIO;
    }
    return true;
}
932
/**
 * Checks that the article kept at least `MIN_IMAGE_RETENTION_RATIO` of the
 * original document's <img> elements. The check is skipped (passes) when the
 * original has fewer than `MIN_IMAGE_ELEMENTS_FOR_GATE` images.
 *
 * @param {Document} articleDoc - Parsed article content.
 * @param {Document} document - Original document.
 * @returns {boolean}
 */
function retainsEnoughImages(articleDoc, document) {
    const before = countMatchingElements(document, 'img');
    if (before >= MIN_IMAGE_ELEMENTS_FOR_GATE) {
        const after = countMatchingElements(articleDoc, 'img');
        return after / before >= MIN_IMAGE_RETENTION_RATIO;
    }
    return true;
}
939
/**
 * Checks that the article kept at least `MIN_INTERACTIVE_RETENTION_RATIO` of
 * the elements matching `ARTICLE_INTERACTIVE_SELECTOR`. The check is skipped
 * (passes) when the original has fewer than
 * `MIN_INTERACTIVE_ELEMENTS_FOR_GATE` such elements.
 *
 * @param {Document} articleDoc - Parsed article content.
 * @param {Document} document - Original document.
 * @returns {boolean}
 */
function retainsEnoughInteractiveElements(articleDoc, document) {
    const before = countMatchingElements(document, ARTICLE_INTERACTIVE_SELECTOR);
    if (before >= MIN_INTERACTIVE_ELEMENTS_FOR_GATE) {
        const after = countMatchingElements(articleDoc, ARTICLE_INTERACTIVE_SELECTOR);
        return after / before >= MIN_INTERACTIVE_RETENTION_RATIO;
    }
    return true;
}
946
/**
 * Checks that the share of headings with no section content does not exceed
 * `MAX_EMPTY_SECTION_RATIO`. The check is skipped (passes) when the article
 * has fewer than `MIN_HEADINGS_FOR_EMPTY_SECTION_GATE` headings.
 *
 * @param {Document} articleDoc - Parsed article content.
 * @returns {boolean}
 */
function hasAcceptableEmptySectionRatio(articleDoc) {
    const headingCount = countMatchingElements(articleDoc, 'h1,h2,h3,h4,h5,h6');
    if (headingCount >= MIN_HEADINGS_FOR_EMPTY_SECTION_GATE) {
        const emptyCount = countEmptyHeadingSections(articleDoc);
        return emptyCount / headingCount <= MAX_EMPTY_SECTION_RATIO;
    }
    return true;
}
953
/**
 * Decides whether the extracted article is a faithful enough
 * representation of the page to be used instead of the full document.
 *
 * The text-length ratio is checked first; only if it passes is the article
 * parsed and the structural checks (headings, code blocks, tables, images,
 * interactive elements, empty sections) evaluated in that order, stopping at
 * the first failure. Finally the article text must not look truncated.
 *
 * @param {{ content: string, textContent: string }} article - Extracted article.
 * @param {Document} document - Original document.
 * @returns {boolean}
 */
function shouldUseArticleContent(article, document) {
    if (!hasSufficientArticleContentRatio(article, document)) {
        return false;
    }
    const articleDoc = buildArticleDocument(article);
    const structureIntact = retainsEnoughHeadings(articleDoc, document) &&
        retainsEnoughCodeBlocks(articleDoc, document) &&
        retainsEnoughTables(articleDoc, document) &&
        retainsEnoughImages(articleDoc, document) &&
        retainsEnoughInteractiveElements(articleDoc, document) &&
        hasAcceptableEmptySectionRatio(articleDoc);
    return structureIntact && !hasTruncatedSentences(article.textContent);
}
914
971
  function buildContentSource(params) {
@@ -922,6 +979,7 @@ function buildContentSource(params) {
922
979
  primaryHeading: document
923
980
  ? TransformHeuristics.findPrimaryHeading(document)
924
981
  : undefined,
982
+ originalHtml: html,
925
983
  };
926
984
  if (useArticleContent && article) {
927
985
  const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
@@ -954,6 +1012,290 @@ function buildContentSource(params) {
954
1012
  title: extractedMeta.title,
955
1013
  };
956
1014
  }
1015
// Matches an inline `self.__next_f.push([1,"..."])</script>` call; group 1 is the still-escaped string payload.
+ const NEXT_FLIGHT_PAYLOAD_RE = /self\.__next_f\.push\(\[1,"((?:\\.|[^"\\])*)"\]\)<\/script>/gs;
1016
// Matches `name=` followed by a template literal and `;`; group 1 is the identifier, group 2 the literal's body.
+ const TEMPLATE_ASSIGNMENT_RE = /([A-Za-z_$][\w$]*)=`([\s\S]*?)`;/g;
1017
// Matches `name={...}` with a flat (brace-free) body; group 1 is the identifier, group 2 the body.
+ const OBJECT_ASSIGNMENT_RE = /([A-Za-z_$][\w$]*)=\{([^{}]+)\}/g;
1018
// Matches a `commands:{cli,npm,yarn,pnpm,bun}` object; groups 1-5 are the five command strings in that order.
+ const FLIGHT_INSTALL_RE = /commands:\{cli:"([^"]+)",npm:"([^"]+)",yarn:"([^"]+)",pnpm:"([^"]+)",bun:"([^"]+)"\}/;
1019
// Matches a `commands:{main,individual}` object with single-quoted values; groups 1-2 are the two strings.
+ const FLIGHT_IMPORT_RE = /commands:\{main:'([^']+)',individual:'([^']+)'\}/;
1020
// Matches `title:"...",files:obj.key`; groups are title, object identifier and property name.
+ const FLIGHT_DEMO_RE = /title:"([^"]+)",files:([A-Za-z_$][\w$]*)\.([A-Za-z_$][\w$]*)/g;
1021
// Matches a `children:"title"` element followed by a `(0,e.jsx)(o,{data:[...]})` call; group 1 is the title, group 2 the raw rows.
// NOTE(review): depends on the literal identifiers `e` and `o` appearing in the payload — presumably minifier output; confirm it is stable.
+ const FLIGHT_API_RE = /children:"([^"]+)"\}\),`\\n`,\(0,e\.jsx\)\(o,\{data:\[([\s\S]*?)\]\}\)/g;
1022
// Matches one row object inside the data array; groups are attribute, type, description, default (the last two may be empty).
+ const FLIGHT_API_ROW_RE = /attribute:"([^"]+)",type:"([^"]+)",description:"([^"]*)",default:"([^"]*)"/g;
1023
// Matches a `_jsx(Heading,{...children:"title"})` followed, within at most 12000 characters and before any other Heading, by `_jsx(Mermaid,{chart:"..."})`; groups are title and chart.
+ const FLIGHT_MERMAID_SECTION_RE = /_jsx\(Heading,\{\s*level:"[1-6]",\s*id:"[^"]+",\s*children:"((?:\\.|[^"\\])*)"\s*\}\)(?:(?!_jsx\(Heading,\{)[\s\S]){0,12000}?_jsx\(Mermaid,\{\s*chart:"((?:\\.|[^"\\])*)"\s*\}\)/g;
1024
/**
 * Decodes the small set of HTML entities handled here: `&#39;`, `&#x27;`,
 * `&quot;`, `&lt;`, `&gt;` and `&amp;`.
 *
 * Each entity is decoded exactly once. `&amp;` is handled last so that text
 * which was escaped twice (for example `&amp;lt;`) becomes `&lt;` rather than
 * being decoded a second time into `<`.
 *
 * @param {string} value - Text that may contain HTML entities.
 * @returns {string} The text with the supported entities replaced.
 */
function decodeHtmlEntities(value) {
    return value
        .replace(/&#39;|&#x27;/g, "'")
        .replace(/&quot;/g, '"')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        // Must stay last: decoding `&amp;` earlier would create new entities
        // for the following replacements to decode again.
        .replace(/&amp;/g, '&');
}
1032
/**
 * Decodes a string captured from a Flight payload.
 *
 * The value is first interpreted as the inside of a JSON string literal
 * (resolving escapes such as `\n` or `\u0041`). If that fails to parse, the
 * value is passed through `decodeHtmlEntities` instead.
 *
 * @param {string} value - Captured text, without the surrounding quotes.
 * @returns {string} The decoded text.
 */
function decodeFlightStringValue(value) {
    let decoded;
    try {
        decoded = JSON.parse(`"${value}"`);
    }
    catch {
        decoded = decodeHtmlEntities(value);
    }
    return decoded;
}
1040
/**
 * Collects every payload string matched by `NEXT_FLIGHT_PAYLOAD_RE` in the
 * HTML and decodes each one as a JSON string literal.
 *
 * @param {string} html - Raw page HTML.
 * @returns {string[]} Decoded payloads in document order; empty and
 *   unparseable fragments are skipped.
 */
function decodeNextFlightPayloads(html) {
    const decoded = [];
    for (const [, escapedPayload] of html.matchAll(NEXT_FLIGHT_PAYLOAD_RE)) {
        if (escapedPayload) {
            try {
                decoded.push(JSON.parse(`"${escapedPayload}"`));
            }
            catch {
                // Ignore malformed payload fragments and continue with the rest.
            }
        }
    }
    return decoded;
}
1055
/**
 * Indexes the assignments found in the decoded Flight text.
 *
 * - `templateMap`: identifier -> entity-decoded template-literal body.
 * - `aliasMap`: identifier -> identifier, for objects whose whole body is a
 *   single spread (`a={...b}`).
 * - `objectMaps`: identifier -> Map of property name -> referenced identifier,
 *   for flat objects whose entries end in `key:identifier`.
 *
 * @param {string} text - Joined, decoded Flight payloads.
 * @returns {{ templateMap: Map<string, string>, aliasMap: Map<string, string>,
 *   objectMaps: Map<string, Map<string, string>> }}
 */
function parseFlightObjectRefs(text) {
    const templateMap = new Map();
    const aliasMap = new Map();
    const objectMaps = new Map();
    for (const [, identifier, source] of text.matchAll(TEMPLATE_ASSIGNMENT_RE)) {
        if (identifier && source) {
            templateMap.set(identifier, decodeHtmlEntities(source));
        }
    }
    for (const [, identifier, rawBody] of text.matchAll(OBJECT_ASSIGNMENT_RE)) {
        const body = rawBody?.trim() ?? '';
        if (!identifier || !body) {
            continue;
        }
        const spreadTarget = /^\.\.\.([A-Za-z_$][\w$]*)$/.exec(body)?.[1];
        if (spreadTarget) {
            aliasMap.set(identifier, spreadTarget);
            continue;
        }
        const entries = new Map();
        for (const segment of body.split(',')) {
            const parsed = /(?:"([^"]+)"|([A-Za-z_$][\w$]*)):([A-Za-z_$][\w$]*)$/.exec(segment.trim());
            const key = parsed?.[1] ?? parsed?.[2];
            const target = parsed?.[3];
            if (key && target) {
                entries.set(key, target);
            }
        }
        if (entries.size > 0) {
            objectMaps.set(identifier, entries);
        }
    }
    return { templateMap, aliasMap, objectMaps };
}
1088
/**
 * Follows an identifier through the parsed reference maps until it reaches
 * template-literal source code.
 *
 * Lookup order for each name: direct template, then alias, then the members
 * of an object map (first member that resolves wins). `seen` guards against
 * reference cycles.
 *
 * @param {string | undefined} name - Identifier to resolve.
 * @param {{ templateMap: Map<string, string>, aliasMap: Map<string, string>,
 *   objectMaps: Map<string, Map<string, string>> }} refs - Parsed references.
 * @param {Set<string>} [seen] - Names already visited in this resolution.
 * @returns {string | undefined} The resolved code, or `undefined`.
 */
function resolveFlightCodeRef(name, refs, seen = new Set()) {
    if (!name || seen.has(name)) {
        return undefined;
    }
    seen.add(name);
    const template = refs.templateMap.get(name);
    if (template) {
        return template;
    }
    const aliasTarget = refs.aliasMap.get(name);
    if (aliasTarget) {
        return resolveFlightCodeRef(aliasTarget, refs, seen);
    }
    const members = refs.objectMaps.get(name)?.values() ?? [];
    for (const member of members) {
        const code = resolveFlightCodeRef(member, refs, seen);
        if (code) {
            return code;
        }
    }
    return undefined;
}
1108
/**
 * Prepares a value for use inside a Markdown table cell: entities are
 * decoded, whitespace runs collapse to single spaces, an empty result becomes
 * `-`, and pipe characters are backslash-escaped.
 *
 * @param {string} value - Raw cell text.
 * @returns {string} Text safe to place between table pipes.
 */
function escapeMarkdownTableCell(value) {
    const collapsed = decodeHtmlEntities(value).replace(/\s+/g, ' ').trim();
    const cell = collapsed === '' ? '-' : collapsed;
    return cell.replace(/\|/g, '\\|');
}
1112
/**
 * Renders API rows as a four-column Markdown table
 * (Prop / Type / Description / Default).
 *
 * @param {Array<{ attribute: string, type: string, description: string,
 *   defaultValue: string }>} rows - Rows to render.
 * @returns {string} The table, or an empty string when there are no rows.
 */
function buildMarkdownTable(rows) {
    if (rows.length === 0) {
        return '';
    }
    const header = [
        '| Prop | Type | Description | Default |',
        '| ---- | ---- | ----------- | ------- |',
    ];
    const body = rows.map((row) => {
        const cells = [row.attribute, row.type, row.description, row.defaultValue]
            .map((cell) => escapeMarkdownTableCell(cell));
        return `| ${cells.join(' | ')} |`;
    });
    return [...header, ...body].join('\n');
}
1124
/**
 * Wraps code in a fenced Markdown code block.
 *
 * The language tag comes from `detectLanguageFromCode`, falling back to
 * `tsx`. The fence is normally three backticks; when the code itself contains
 * a run of three or more backticks the fence is made one backtick longer than
 * the longest run, so the embedded run cannot close the block early.
 *
 * @param {string} code - Source code to wrap.
 * @returns {string} The fenced block, or an empty string for blank input.
 */
function buildCodeBlock(code) {
    const trimmed = code.trim();
    if (!trimmed)
        return '';
    const language = detectLanguageFromCode(trimmed) ?? 'tsx';
    let longestRun = 0;
    for (const run of trimmed.matchAll(/`+/g)) {
        longestRun = Math.max(longestRun, run[0].length);
    }
    const fence = '`'.repeat(Math.max(3, longestRun + 1));
    return `${fence}${language}\n${trimmed}\n${fence}`;
}
1131
/**
 * Decodes a captured Mermaid chart definition and wraps it in a
 * ```` ```mermaid ```` fenced block.
 *
 * @param {string} chart - Chart source as captured from the Flight payload.
 * @returns {string} The fenced block, or an empty string when the decoded
 *   chart is blank.
 */
function buildMermaidBlock(chart) {
    const source = decodeFlightStringValue(chart).trim();
    return source ? `\`\`\`mermaid\n${source}\n\`\`\`` : '';
}
1137
/**
 * Normalizes heading text for comparison: Markdown links are reduced to
 * their label, whitespace runs collapse to one space, and the result is
 * trimmed and lower-cased.
 *
 * @param {string} value - Heading text.
 * @returns {string} Comparison key.
 */
function normalizeSupplementHeadingText(value) {
    const withoutLinks = value.replace(/\[([^\]]+)\]\([^)]*\)/g, '$1');
    const singleSpaced = withoutLinks.replace(/\s+/g, ' ');
    return singleSpaced.trim().toLowerCase();
}
1144
/**
 * Parses a Markdown ATX heading line (`#` to `######` followed by text).
 *
 * @param {string} line - A single Markdown line; surrounding whitespace is ignored.
 * @returns {{ level: number, title: string } | null} Heading level and
 *   normalized title, or `null` when the line is not a heading.
 */
function getMarkdownHeadingInfo(line) {
    const parsed = /^(#{1,6})\s+(.+?)\s*$/.exec(line.trim());
    if (parsed === null) {
        return null;
    }
    const [, hashes, rawTitle] = parsed;
    return {
        level: hashes?.length ?? 0,
        title: normalizeSupplementHeadingText(rawTitle ?? ''),
    };
}
1153
/**
 * Locates the first Markdown section whose heading matches `title`
 * (compared after `normalizeSupplementHeadingText`).
 *
 * The section starts at the heading line and ends just before the next
 * heading of the same or a higher level, or at the end of the document.
 * Lines inside fenced code blocks (``` or ~~~) are never treated as
 * headings, so a shell comment such as `# install deps` inside a code sample
 * can neither match the title nor cut a section short.
 *
 * @param {string[]} lines - Markdown document split into lines.
 * @param {string} title - Heading text to look for.
 * @returns {{ start: number, end: number } | null} `start` is the heading's
 *   index, `end` is exclusive; `null` when no heading matches.
 */
function findMarkdownSection(lines, title) {
    const target = normalizeSupplementHeadingText(title);
    let openFence = null; // { char, length } while inside a fenced block
    let found = null; // { start, level } once the target heading is seen
    for (let i = 0; i < lines.length; i += 1) {
        const line = lines[i] ?? '';
        const fenceMatch = /^\s*(`{3,}|~{3,})(.*)$/.exec(line);
        if (openFence) {
            // A closing fence uses the same character, is at least as long as
            // the opening one and carries no info string.
            const marker = fenceMatch?.[1] ?? '';
            if (marker.startsWith(openFence.char) &&
                marker.length >= openFence.length &&
                (fenceMatch?.[2] ?? '').trim() === '') {
                openFence = null;
            }
            continue;
        }
        if (fenceMatch?.[1]) {
            openFence = { char: fenceMatch[1].charAt(0), length: fenceMatch[1].length };
            continue;
        }
        const heading = getMarkdownHeadingInfo(line);
        if (!heading)
            continue;
        if (found) {
            if (heading.level <= found.level)
                return { start: found.start, end: i };
            continue;
        }
        if (heading.title === target)
            found = { start: i, level: heading.level };
    }
    return found ? { start: found.start, end: lines.length } : null;
}
1172
/**
 * Returns the text of a section without its heading line.
 *
 * @param {string[]} lines - Markdown document split into lines.
 * @param {{ start: number, end: number }} section - Heading index and
 *   exclusive end index.
 * @returns {string} The section's lines joined with newlines and trimmed.
 */
function getSectionBody(lines, section) {
    const { start, end } = section;
    const bodyLines = lines.slice(start + 1, end);
    return bodyLines.join('\n').trim();
}
1178
/**
 * Replaces the body of the section titled `title`, mutating `lines` in place.
 *
 * A non-blank body is inserted surrounded by one empty line on each side; a
 * blank body leaves a single empty line under the heading.
 *
 * @param {string[]} lines - Markdown document split into lines (mutated).
 * @param {string} title - Heading of the section to replace.
 * @param {string} body - New section body.
 * @returns {boolean} `true` when the section was found and replaced.
 */
function replaceMarkdownSection(lines, title, body) {
    const section = findMarkdownSection(lines, title);
    if (!section) {
        return false;
    }
    const trimmedBody = body.trim();
    const replacement = trimmedBody.length > 0
        ? ['', ...trimmedBody.split('\n'), '']
        : [''];
    const firstBodyLine = section.start + 1;
    lines.splice(firstBodyLine, section.end - firstBodyLine, ...replacement);
    return true;
}
1186
/**
 * Appends `body` to the end of the section titled `title`, mutating `lines`.
 *
 * Nothing is appended when the section is missing or already contains a
 * fenced code block (a ``` sequence).
 *
 * @param {string[]} lines - Markdown document split into lines (mutated).
 * @param {string} title - Heading of the section to extend.
 * @param {string} body - Markdown to append.
 * @returns {boolean} `true` when the section was updated.
 */
function appendMarkdownSection(lines, title, body) {
    const section = findMarkdownSection(lines, title);
    if (!section) {
        return false;
    }
    const existing = getSectionBody(lines, section);
    const alreadyHasFence = existing.includes('```');
    if (alreadyHasFence) {
        return false;
    }
    const addition = body.trim();
    const combined = existing ? `${existing}\n\n${addition}` : addition;
    return replaceMarkdownSection(lines, title, combined);
}
1196
/**
 * Mines the Next.js Flight payloads embedded in a page for content that can
 * supplement the generated Markdown.
 *
 * @param {string} originalHtml - Raw page HTML.
 * @returns {null | {
 *   installationCommands?: string[],
 *   importCommands?: string[],
 *   apiTables: Map<string, string>,
 *   demoCodeBlocks: Map<string, string>,
 *   mermaidDiagrams: Map<string, string>,
 * }} `null` when the page has no decodable payloads. Otherwise the command
 *   lists (present only when their pattern matched) and three maps keyed by
 *   section title holding ready-to-insert Markdown.
 */
function extractNextFlightSupplement(originalHtml) {
    const payloads = decodeNextFlightPayloads(originalHtml);
    if (payloads.length === 0) {
        return null;
    }
    const text = payloads.join('\n');
    const refs = parseFlightObjectRefs(text);
    const installMatch = FLIGHT_INSTALL_RE.exec(text);
    const importMatch = FLIGHT_IMPORT_RE.exec(text);

    // API tables: one Markdown table per titled data array.
    const apiTables = new Map();
    for (const [, title, rawRows = ''] of text.matchAll(FLIGHT_API_RE)) {
        if (!title) {
            continue;
        }
        const rows = [];
        for (const [, attribute, type, description, defaultValue] of rawRows.matchAll(FLIGHT_API_ROW_RE)) {
            const isComplete = attribute &&
                type &&
                description !== undefined &&
                defaultValue !== undefined;
            if (isComplete) {
                rows.push({ attribute, type, description, defaultValue });
            }
        }
        const table = buildMarkdownTable(rows);
        if (table) {
            apiTables.set(title, table);
        }
    }

    // Mermaid diagrams keyed by the heading they follow.
    const mermaidDiagrams = new Map();
    for (const [, rawTitle, rawChart] of text.matchAll(FLIGHT_MERMAID_SECTION_RE)) {
        const title = rawTitle ? decodeFlightStringValue(rawTitle).trim() : '';
        const chart = rawChart ? buildMermaidBlock(rawChart) : '';
        if (title && chart) {
            mermaidDiagrams.set(title, chart);
        }
    }

    // Demo code: resolve `files:obj.key` references to template source.
    const demoCodeBlocks = new Map();
    for (const [, title, objectName, key] of text.matchAll(FLIGHT_DEMO_RE)) {
        const ref = objectName
            ? refs.objectMaps.get(objectName)?.get(key ?? '')
            : undefined;
        const code = resolveFlightCodeRef(ref, refs);
        const codeBlock = code ? buildCodeBlock(code) : '';
        if (title && codeBlock) {
            demoCodeBlocks.set(title, codeBlock);
        }
    }

    const supplement = {};
    if (installMatch) {
        supplement.installationCommands = installMatch.slice(1);
    }
    if (importMatch) {
        supplement.importCommands = importMatch.slice(1);
    }
    supplement.apiTables = apiTables;
    supplement.demoCodeBlocks = demoCodeBlocks;
    supplement.mermaidDiagrams = mermaidDiagrams;
    return supplement;
}
1256
/**
 * Enriches generated Markdown with content recovered from the page's
 * Next.js Flight payloads.
 *
 * - "Installation" / "Import" sections get a code block of the recovered
 *   commands when they do not already show an install command / an
 *   `import {` statement.
 * - Sections matching an API table title have their body replaced by the table.
 * - Sections matching a Mermaid title get the diagram appended unless one is
 *   already present.
 * - Sections matching a demo title get the demo code appended.
 *
 * @param {string} markdown - Markdown produced so far.
 * @param {string | undefined} originalHtml - Raw page HTML; when it is missing
 *   or not a string the Markdown is returned unchanged instead of throwing.
 * @returns {string} The supplemented Markdown.
 */
function supplementMarkdownFromNextFlight(markdown, originalHtml) {
    // Guard: callers may build a context without the original HTML.
    if (typeof originalHtml !== 'string' || originalHtml.length === 0)
        return markdown;
    const supplement = extractNextFlightSupplement(originalHtml);
    if (!supplement)
        return markdown;
    const lines = markdown.split('\n');
    if (supplement.installationCommands?.length) {
        const installationSection = findMarkdownSection(lines, 'Installation');
        if (installationSection) {
            const installBody = getSectionBody(lines, installationSection);
            if (!/(npm|pnpm|yarn|bun|npx)\s+(install|add)/.test(installBody)) {
                appendMarkdownSection(lines, 'Installation', buildCodeBlock(supplement.installationCommands.join('\n')));
            }
        }
    }
    if (supplement.importCommands?.length) {
        const importSection = findMarkdownSection(lines, 'Import');
        if (importSection) {
            const importBody = getSectionBody(lines, importSection);
            if (!/import\s+\{/.test(importBody)) {
                appendMarkdownSection(lines, 'Import', buildCodeBlock(supplement.importCommands.join('\n\n')));
            }
        }
    }
    for (const [title, table] of supplement.apiTables) {
        replaceMarkdownSection(lines, title, table);
    }
    for (const [title, mermaidBlock] of supplement.mermaidDiagrams) {
        const section = findMarkdownSection(lines, title);
        if (!section)
            continue;
        const sectionBody = getSectionBody(lines, section);
        if (sectionBody.includes('```mermaid'))
            continue;
        const nextBody = sectionBody
            ? `${sectionBody}\n\n${mermaidBlock}`
            : mermaidBlock;
        replaceMarkdownSection(lines, title, nextBody);
    }
    for (const [title, codeBlock] of supplement.demoCodeBlocks) {
        appendMarkdownSection(lines, title, codeBlock);
    }
    return lines.join('\n');
}
957
1299
  function resolveContentSource(params) {
958
1300
  const { article, metadata: extractedMeta, document, truncated, } = extractContentContext(params.html, params.url, {
959
1301
  extractArticle: true,
@@ -975,6 +1317,33 @@ function resolveContentSource(params) {
975
1317
  ...(params.signal ? { signal: params.signal } : {}),
976
1318
  });
977
1319
  }
1320
/**
 * Tells whether the leading heading should be stripped: the context must
 * define a primary heading and the URL must be a GitHub repository root.
 *
 * @param {{ primaryHeading?: string }} context - Content context.
 * @param {string} url - Page URL.
 * @returns {boolean}
 */
function shouldStripGithubPrimaryHeading(context, url) {
    if (context.primaryHeading === undefined) {
        return false;
    }
    return TransformHeuristics.isGithubRepositoryRootUrl(url);
}
1324
/**
 * Removes the leading primary heading from the Markdown when
 * `shouldStripGithubPrimaryHeading` approves; otherwise returns the Markdown
 * unchanged.
 *
 * @param {string} markdown - Generated Markdown.
 * @param {{ primaryHeading?: string }} context - Content context.
 * @param {string} url - Page URL.
 * @returns {string}
 */
function maybeStripGithubPrimaryHeading(markdown, context, url) {
    return shouldStripGithubPrimaryHeading(context, url)
        ? stripLeadingHeading(markdown, context.primaryHeading ?? '')
        : markdown;
}
1329
/**
 * Builds the text placed between `#` and the title of a synthetic heading.
 *
 * Without a favicon this is a single space. With one it is a Markdown image
 * whose alt text is the URL's hostname (empty when the URL cannot be parsed),
 * padded by a space on each side.
 *
 * @param {string} url - Page URL.
 * @param {string | undefined} favicon - Favicon URL, if any.
 * @returns {string}
 */
function buildSyntheticTitlePrefix(url, favicon) {
    if (!favicon) {
        return ' ';
    }
    const hostname = (() => {
        try {
            return new URL(url).hostname;
        }
        catch {
            /* skip */
            return '';
        }
    })();
    return ` ![${hostname}](${favicon}) `;
}
1341
/**
 * Prepends a level-1 heading built from `context.title` when the Markdown
 * does not already start with a heading.
 *
 * @param {string} markdown - Generated Markdown.
 * @param {{ title?: string, favicon?: string }} context - Content context.
 * @param {string} url - Page URL, used for the favicon's alt text.
 * @returns {string} The Markdown, with a synthetic title when one was needed.
 */
function maybePrependSyntheticTitle(markdown, context, url) {
    if (!context.title) {
        return markdown;
    }
    const startsWithHeading = /^(#{1,6})\s/.test(markdown.trimStart());
    if (startsWithHeading) {
        return markdown;
    }
    const prefix = buildSyntheticTitlePrefix(url, context.favicon);
    return `#${prefix}${context.title}\n\n${markdown}`;
}
978
1347
  function buildMarkdownFromContext(context, url, signal) {
979
1348
  let content = stageTracker.run(url, 'transform:markdown', () => htmlToMarkdown(context.sourceHtml, context.metadata, {
980
1349
  url,
@@ -982,25 +1351,10 @@ function buildMarkdownFromContext(context, url, signal) {
982
1351
  ...(context.document ? { document: context.document } : {}),
983
1352
  ...(context.skipNoiseRemoval ? { skipNoiseRemoval: true } : {}),
984
1353
  }));
985
- if (context.primaryHeading &&
986
- TransformHeuristics.isGithubRepositoryRootUrl(url)) {
987
- content = stripLeadingHeading(content, context.primaryHeading);
988
- }
989
- if (context.title && !/^(#{1,6})\s/.test(content.trimStart())) {
990
- const icon = context.favicon;
991
- let prefix = ' ';
992
- if (icon) {
993
- let alt = '';
994
- try {
995
- alt = new URL(url).hostname;
996
- }
997
- catch {
998
- /* skip */
999
- }
1000
- prefix = ` ![${alt}](${icon}) `;
1001
- }
1002
- content = `#${prefix}${context.title}\n\n${content}`;
1003
- }
1354
+ content = maybeStripGithubPrimaryHeading(content, context, url);
1355
+ content = maybePrependSyntheticTitle(content, context, url);
1356
+ content = supplementMarkdownFromNextFlight(content, context.originalHtml);
1357
+ content = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
1004
1358
  return {
1005
1359
  markdown: content,
1006
1360
  title: context.title,
@@ -1056,10 +1410,24 @@ export function transformHtmlToMarkdownInProcess(html, url, options) {
1056
1410
  const signal = buildTransformSignal(options.signal);
1057
1411
  const totalStage = stageTracker.start(url, 'transform:total');
1058
1412
  try {
1059
- abortPolicy.throwIfAborted(signal, url, 'transform:begin');
1413
+ throwIfAborted(signal, url, 'transform:begin');
1060
1414
  validateBinaryContent(html, url);
1061
- const result = tryRawContentPipeline(html, url, options) ??
1062
- tryHtmlContentPipeline(html, url, options, signal);
1415
+ const result = stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
1416
+ html,
1417
+ url,
1418
+ includeMetadata: options.includeMetadata,
1419
+ ...(options.inputTruncated ? { inputTruncated: true } : {}),
1420
+ })) ??
1421
+ (() => {
1422
+ const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
1423
+ html,
1424
+ url,
1425
+ includeMetadata: options.includeMetadata,
1426
+ ...(signal ? { signal } : {}),
1427
+ ...(options.inputTruncated ? { inputTruncated: true } : {}),
1428
+ }));
1429
+ return buildMarkdownFromContext(context, url, signal);
1430
+ })();
1063
1431
  stageTracker.end(totalStage, { truncated: result.truncated });
1064
1432
  return result;
1065
1433
  }
@@ -1073,24 +1441,6 @@ function validateBinaryContent(html, url) {
1073
1441
  throw new FetchError('Content appears to be binary data (high replacement character ratio or null bytes)', url, 415, { reason: 'binary_content_detected', stage: 'transform:validate' });
1074
1442
  }
1075
1443
  }
1076
- function tryRawContentPipeline(html, url, options) {
1077
- return stageTracker.run(url, 'transform:raw', () => tryTransformRawContent({
1078
- html,
1079
- url,
1080
- includeMetadata: options.includeMetadata,
1081
- ...(options.inputTruncated ? { inputTruncated: true } : {}),
1082
- }));
1083
- }
1084
- function tryHtmlContentPipeline(html, url, options, signal) {
1085
- const context = stageTracker.run(url, 'transform:extract', () => resolveContentSource({
1086
- html,
1087
- url,
1088
- includeMetadata: options.includeMetadata,
1089
- ...(signal ? { signal } : {}),
1090
- ...(options.inputTruncated ? { inputTruncated: true } : {}),
1091
- }));
1092
- return buildMarkdownFromContext(context, url, signal);
1093
- }
1094
1444
  export function getTransformPoolStats() {
1095
1445
  return getWorkerPoolStats();
1096
1446
  }
@@ -1100,7 +1450,7 @@ export async function shutdownTransformWorkerPool() {
1100
1450
  function transformInputInProcess(htmlOrBuffer, url, options) {
1101
1451
  return transformHtmlToMarkdownInProcess(decodeInput(htmlOrBuffer, options.encoding), url, options);
1102
1452
  }
1103
- function buildWorkerTransformOptions(options) {
1453
+ function workerTransformOptions(options) {
1104
1454
  return {
1105
1455
  includeMetadata: options.includeMetadata,
1106
1456
  ...(options.signal ? { signal: options.signal } : {}),
@@ -1113,10 +1463,10 @@ async function transformWithWorkerPool(htmlOrBuffer, url, options) {
1113
1463
  return transformInputInProcess(htmlOrBuffer, url, options);
1114
1464
  }
1115
1465
  if (typeof htmlOrBuffer === 'string') {
1116
- return pool.transform(htmlOrBuffer, url, buildWorkerTransformOptions(options));
1466
+ return pool.transform(htmlOrBuffer, url, workerTransformOptions(options));
1117
1467
  }
1118
1468
  return pool.transform(htmlOrBuffer, url, {
1119
- ...buildWorkerTransformOptions(options),
1469
+ ...workerTransformOptions(options),
1120
1470
  ...(options.encoding ? { encoding: options.encoding } : {}),
1121
1471
  });
1122
1472
  }
@@ -1128,7 +1478,7 @@ function resolveWorkerFallback(error, htmlOrBuffer, url, options) {
1128
1478
  });
1129
1479
  return transformInputInProcess(htmlOrBuffer, url, options);
1130
1480
  }
1131
- abortPolicy.throwIfAborted(options.signal, url, 'transform:worker-fallback');
1481
+ throwIfAborted(options.signal, url, 'transform:worker-fallback');
1132
1482
  if (error instanceof FetchError)
1133
1483
  throw error;
1134
1484
  if (!(error instanceof Error))
@@ -1155,7 +1505,7 @@ async function runWorkerTransformWithFallback(htmlOrBuffer, url, options) {
1155
1505
  async function transformInputToMarkdown(htmlOrBuffer, url, options) {
1156
1506
  const totalStage = stageTracker.start(url, 'transform:total');
1157
1507
  try {
1158
- abortPolicy.throwIfAborted(options.signal, url, 'transform:begin');
1508
+ throwIfAborted(options.signal, url, 'transform:begin');
1159
1509
  const result = await runWorkerTransformWithFallback(htmlOrBuffer, url, options);
1160
1510
  stageTracker.end(totalStage, { truncated: result.truncated });
1161
1511
  return result;