@j0hanz/fetch-url-mcp 1.9.2 → 1.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import { type MetadataBlock } from '../transform/types.js';
2
2
  export declare function serializeDocumentForMarkdown(document: Document, fallback: string): string;
3
3
  export declare function prepareDocumentForMarkdown(document: Document, baseUrl?: string, signal?: AbortSignal): void;
4
4
  export declare function removeNoiseFromHtml(html: string, document?: Document, baseUrl?: string, signal?: AbortSignal): string;
5
+ export declare function extractLanguageFromClassName(className: string): string | undefined;
5
6
  export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
6
7
  export declare function detectLanguageFromCode(code: string): string | undefined;
7
8
  interface CleanupOptions {
@@ -1 +1 @@
1
- {"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAujB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AAuCD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAiBN;AA0BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAoTD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAqBvE;AAoDD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAqSD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6CR;AAgGD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAuCxE;AAmBD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
1
+ {"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAwjB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AA2DD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAiBN;AA0BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAwQD,wBAAgB,4BAA4B,CAC1C,SAAS,EAAE,MAAM,GAChB,MAAM,GAAG,SAAS,CAuBpB;AAqBD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAqBvE;AAsDD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAqTD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6CR;AAgGD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAuCxE;AAmBD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
@@ -41,7 +41,7 @@ const NOISE_PATTERNS = [
41
41
  /[\s"']role\s*=\s*['"]?(?:navigation|banner|complementary|contentinfo|tree|menubar|menu)['"]?/i,
42
42
  /[\s"'](?:aria-hidden\s*=\s*['"]?true['"]?|hidden)/i,
43
43
  /[\s"'](?:banner|promo|announcement|cta|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast)\b/i,
44
- /[\s"'](?:fixed|sticky|z-50|z-4|isolate|breadcrumb|pagination)\b/i,
44
+ /[\s"'](?:fixed|sticky|z-50|z-4|isolate|breadcrumbs?|pagination)\b/i,
45
45
  ];
46
46
  const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
47
47
  const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50)|isolate)\b/;
@@ -102,6 +102,7 @@ const PROMO_TOKENS_ALWAYS = [
102
102
  'sponsor',
103
103
  'recommend',
104
104
  'breadcrumb',
105
+ 'breadcrumbs',
105
106
  'pagination',
106
107
  'pager',
107
108
  'taglist',
@@ -118,7 +119,7 @@ const PROMO_TOKENS_BY_CATEGORY = {
118
119
  };
119
120
  // Noise selector configurations
120
121
  const BASE_NOISE_SELECTORS = {
121
- navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
122
+ navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"],[class*="breadcrumb"]',
122
123
  cookieBanners: '[role="dialog"]',
123
124
  hidden: '[style*="display: none"],[style*="display:none"],[style*="visibility: hidden"],[style*="visibility:hidden"],[hidden],[aria-hidden="true"]',
124
125
  };
@@ -504,17 +505,31 @@ function mayContainNoise(html) {
504
505
  : `${html.substring(0, NOISE_SCAN_LIMIT)}\n${html.substring(html.length - NOISE_SCAN_LIMIT)}`;
505
506
  return NOISE_PATTERNS.some((re) => re.test(sample));
506
507
  }
508
+ function surfaceHiddenTabPanels(document) {
509
+ const panels = document.querySelectorAll('[data-slot="tabContent"], [role="tabpanel"]');
510
+ for (const panel of panels) {
511
+ const style = panel.getAttribute('style') ?? '';
512
+ if (/display\s*:\s*none/i.test(style)) {
513
+ panel.setAttribute('style', style.replace(/display\s*:\s*none\s*;?/gi, '').trim());
514
+ }
515
+ panel.removeAttribute('hidden');
516
+ }
517
+ }
507
518
  function stripTabTriggers(document) {
519
+ surfaceHiddenTabPanels(document);
508
520
  const tabs = document.querySelectorAll('button[role="tab"]');
509
521
  for (let i = tabs.length - 1; i >= 0; i--) {
510
522
  tabs[i]?.remove();
511
523
  }
512
524
  }
513
525
  function escapeTableCellPipes(document) {
514
- const codes = document.querySelectorAll('td code, th code');
515
- for (const code of codes) {
516
- if (code.textContent.includes('|')) {
517
- code.textContent = code.textContent.replace(/\|/g, '\\|');
526
+ const cells = document.querySelectorAll('td, th');
527
+ for (const cell of cells) {
528
+ for (const node of cell.childNodes) {
529
+ const text = node.textContent;
530
+ if (node.nodeType === 3 && text?.includes('|')) {
531
+ node.textContent = text.replace(/\|/g, '\\|');
532
+ }
518
533
  }
519
534
  }
520
535
  }
@@ -820,7 +835,7 @@ const LANGUAGES = [
820
835
  match: (ctx) => ctx.trimmedStart.startsWith('{') || ctx.trimmedStart.startsWith('['),
821
836
  },
822
837
  ];
823
- function extractLanguageFromClassName(className) {
838
+ export function extractLanguageFromClassName(className) {
824
839
  if (!className)
825
840
  return undefined;
826
841
  // Split by whitespace and check for language indicators
@@ -895,12 +910,13 @@ const REGEX = {
895
910
  HEADING_MARKER: /^#{1,6}\s/m,
896
911
  HEADING_STRICT: /^#{1,6}\s+/m,
897
912
  EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/,
913
+ ANCHOR_ONLY_HEADING: /^#{1,6}\s+\[[^\]]+\]\(#[^)]+\)\s*$/,
898
914
  FENCE_START: /^\s*(`{3,}|~{3,})/,
899
915
  LIST_MARKER: /^(?:[-*+])\s/m,
900
916
  TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/,
901
- TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents)\s*$/i,
917
+ TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents|on this page)\s*$/i,
902
918
  HTML_DOC_START: /^(<!doctype|<html)/i,
903
- COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??)\s*$/gim,
919
+ COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??|\[Back to top\]\(#[^)]*\)|\[\s*\]\(https?:\/\/[^)]*\))\s*$/gim,
904
920
  ZERO_WIDTH_ANCHOR: /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g,
905
921
  CONCATENATED_PROPS: /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g,
906
922
  DOUBLE_NEWLINE_REDUCER: /\n{3,}/g,
@@ -948,6 +964,9 @@ function hasFollowingContent(lines, startIndex) {
948
964
  }
949
965
  return false;
950
966
  }
967
+ function stripAnchorOnlyHeading(line) {
968
+ return line.replace(/^(#{1,6})\s+\[([^\]]+)\]\(#[^)]+\)\s*$/, '$1 $2');
969
+ }
951
970
  function isTitleCaseOrKeyword(trimmed) {
952
971
  // Quick check for length to avoid regex on long strings
953
972
  if (trimmed.length > MAX_LINE_LENGTH)
@@ -1094,6 +1113,11 @@ function preprocessLines(lines, options) {
1094
1113
  const trimmed = line.trim();
1095
1114
  if (REGEX.EMPTY_HEADING_LINE.test(trimmed))
1096
1115
  continue;
1116
+ if (REGEX.ANCHOR_ONLY_HEADING.test(trimmed)) {
1117
+ if (!hasFollowingContent(lines, i))
1118
+ continue;
1119
+ line = stripAnchorOnlyHeading(trimmed);
1120
+ }
1097
1121
  const tocSkip = shouldSkipAsToc(lines, i, trimmed, removeToc, options);
1098
1122
  if (tocSkip !== null) {
1099
1123
  skipUntil = tocSkip;
@@ -1127,6 +1151,16 @@ function removeSkipLinks(text) {
1127
1151
  .replace(REGEX.ZERO_WIDTH_ANCHOR, '')
1128
1152
  .replace(REGEX.COMBINED_LINE_REMOVALS, '');
1129
1153
  }
1154
+ function normalizeInlineCodeTokens(text) {
1155
+ return text.replace(/`([^`\n]+)`/g, (match, inner) => {
1156
+ const trimmed = inner.trim();
1157
+ if (trimmed === inner || /\s/.test(trimmed))
1158
+ return match;
1159
+ if (!/^[*A-Za-z0-9_./:-]+$/.test(trimmed))
1160
+ return match;
1161
+ return `\`${trimmed}\``;
1162
+ });
1163
+ }
1130
1164
  function normalizeMarkdownSpacing(text) {
1131
1165
  let result = text
1132
1166
  .replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
@@ -1136,8 +1170,8 @@ function normalizeMarkdownSpacing(text) {
1136
1170
  .replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
1137
1171
  .replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
1138
1172
  .replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
1139
- // Trim leading whitespace inside inline code spans
1140
- result = result.replace(/(?<=\s|^)`\s+([^`]+)`/gm, '`$1`');
1173
+ // Trim whitespace around token-like inline code spans.
1174
+ result = normalizeInlineCodeTokens(result);
1141
1175
  // Unescape backticks inside markdown link text
1142
1176
  result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
1143
1177
  result = result.replace(/\[([^\]]*<[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/</g, '\\<').replace(/>/g, '\\>')}](${url})`);
@@ -1 +1 @@
1
- {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AA4fA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
1
+ {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAigBA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
@@ -255,11 +255,13 @@ const GFM_ALERT_MAP = new Map([
255
255
  ['danger', 'CAUTION'],
256
256
  ['important', 'IMPORTANT'],
257
257
  ]);
258
+ const ADMONITION_TOKEN_RE = /^(?:note|tip|hint|info|warning|warn|danger|caution|important)$/i;
258
259
  function resolveGfmAlertType(className) {
259
- const lower = className.toLowerCase();
260
- for (const [key, type] of GFM_ALERT_MAP) {
261
- if (lower.includes(key))
262
- return type;
260
+ const tokens = className.toLowerCase().split(/\s+/);
261
+ for (const token of tokens) {
262
+ const mapped = GFM_ALERT_MAP.get(token);
263
+ if (mapped)
264
+ return mapped;
263
265
  }
264
266
  return undefined;
265
267
  }
@@ -278,11 +280,12 @@ function buildDivTranslator(ctx) {
278
280
  postprocess: ({ content }) => `\n\n\`\`\`mermaid\n${content.trim()}\n\`\`\`\n\n`,
279
281
  };
280
282
  }
283
+ const classTokens = className.split(/\s+/);
281
284
  const isAdmonition = className.includes('admonition') ||
282
285
  className.includes('callout') ||
283
286
  className.includes('custom-block') ||
284
287
  getAttribute('role') === 'alert' ||
285
- /\b(note|tip|info|warning|danger|caution|important)\b/i.test(className);
288
+ classTokens.some((t) => ADMONITION_TOKEN_RE.test(t));
286
289
  if (isAdmonition) {
287
290
  return {
288
291
  postprocess: ({ content }) => {
@@ -1 +1 @@
1
- {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAuCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAuJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAuVD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA+OD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAwJD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAiED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAsCD,iBAAS,eAAe,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAc/D;AAED,iBAAS,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAYlE;AAED,iBAAS,yBAAyB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAcvD;AAED,eAAO,MAAM,mBAAmB;;;;CAItB,CAAC;AAiQX,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAmBzB;AA+CD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAsH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAA
uB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
1
+ {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAyCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAuJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAwWD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA+OD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuJD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAiED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAuCD,iBAAS,eAAe,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAc/D;AAED,iBAAS,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAYlE;AA6CD,iBAAS,yBAAyB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAcvD;AAED,eAAO,MAAM,mBAAmB;;;;CAItB,CAAC;AAurBX,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAmBzB;AA+CD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAsH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uB
AAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
@@ -3,7 +3,7 @@ import diagnosticsChannel from 'node:diagnostics_channel';
3
3
  import { performance } from 'node:perf_hooks';
4
4
  import { isProbablyReaderable, Readability } from '@mozilla/readability';
5
5
  import { parseHTML } from 'linkedom';
6
- import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isRawTextContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/content.js';
6
+ import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, detectLanguageFromCode, extractLanguageFromClassName, extractTitleFromRawMarkdown, isRawTextContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/content.js';
7
7
  import { config } from '../lib/core.js';
8
8
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
9
9
  import { isRawTextContentUrl } from '../lib/http.js';
@@ -284,13 +284,22 @@ function resolveCollapsedTextLengthUpTo(text, max) {
284
284
  return length;
285
285
  }
286
286
  function preserveAlertElements(doc) {
287
- const alerts = doc.querySelectorAll('[role="alert"], .admonition, .callout');
287
+ const alerts = doc.querySelectorAll('[role="alert"], .admonition, [class*="callout"]');
288
288
  for (const el of alerts) {
289
289
  const bq = doc.createElement('blockquote');
290
290
  bq.innerHTML = el.innerHTML;
291
291
  el.replaceWith(bq);
292
292
  }
293
293
  }
294
+ function preserveCodeLanguageAttributes(doc) {
295
+ for (const el of doc.querySelectorAll('pre, code')) {
296
+ if (el.getAttribute('data-language'))
297
+ continue;
298
+ const lang = extractLanguageFromClassName(el.getAttribute('class') ?? '');
299
+ if (lang)
300
+ el.setAttribute('data-language', lang);
301
+ }
302
+ }
294
303
  function extractArticle(document, url, signal) {
295
304
  if (!isReadabilityCompatible(document)) {
296
305
  logWarn('Document not compatible with Readability');
@@ -321,6 +330,10 @@ function extractArticle(document, url, signal) {
321
330
  ? doc.cloneNode(true)
322
331
  : doc;
323
332
  preserveAlertElements(readabilityDoc);
333
+ preserveCodeLanguageAttributes(readabilityDoc);
334
+ for (const el of readabilityDoc.querySelectorAll('[class*="breadcrumb"],[class*="pagination"]')) {
335
+ el.remove();
336
+ }
324
337
  checkAbort('extract:article:parse');
325
338
  const reader = new Readability(readabilityDoc, {
326
339
  maxElemsToParse: MAX_READABILITY_ELEMENTS,
@@ -654,6 +667,13 @@ const MIN_CONTENT_RATIO = 0.15;
654
667
  const MIN_HTML_LENGTH_FOR_GATE = 100;
655
668
  const MIN_HEADING_RETENTION_RATIO = 0.3;
656
669
  const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
670
+ const MIN_TABLE_RETENTION_RATIO = 0.5;
671
+ const MIN_IMAGE_RETENTION_RATIO = 0.2;
672
+ const MIN_INTERACTIVE_RETENTION_RATIO = 0.1;
673
+ const MIN_INTERACTIVE_ELEMENTS_FOR_GATE = 6;
674
+ const MIN_IMAGE_ELEMENTS_FOR_GATE = 4;
675
+ const MIN_HEADINGS_FOR_EMPTY_SECTION_GATE = 5;
676
+ const MAX_EMPTY_SECTION_RATIO = 0.05;
657
677
  const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
658
678
  const MAX_TRUNCATED_LINE_RATIO = 0.95;
659
679
  function needsDocumentWrapper(html) {
@@ -679,13 +699,6 @@ function resolveHtmlDocument(htmlOrDocument) {
679
699
  return parseHTML('<!DOCTYPE html><html><body></body></html>').document;
680
700
  }
681
701
  }
682
- function countTagsInString(html, regex) {
683
- let count = 0;
684
- while (regex.exec(html) !== null) {
685
- count++;
686
- }
687
- return count;
688
- }
689
702
  function stripNonVisibleNodes(root) {
690
703
  for (const el of root.querySelectorAll('script,style,noscript')) {
691
704
  el.remove();
@@ -868,6 +881,43 @@ function findPrimaryHeading(document) {
868
881
  }
869
882
  return undefined;
870
883
  }
884
+ function countMatchingElements(root, selector) {
885
+ return root.querySelectorAll(selector).length;
886
+ }
887
+ function getHeadingLevel(heading) {
888
+ const match = /^H([1-6])$/.exec(heading.tagName);
889
+ if (!match)
890
+ return null;
891
+ return Number.parseInt(match[1] ?? '', 10);
892
+ }
893
+ function hasSectionContent(heading) {
894
+ const level = getHeadingLevel(heading);
895
+ if (level === null)
896
+ return false;
897
+ let current = heading.nextElementSibling;
898
+ while (current) {
899
+ const currentLevel = getHeadingLevel(current);
900
+ if (currentLevel !== null && currentLevel <= level)
901
+ return false;
902
+ const text = current.textContent.trim();
903
+ if (text.length > 0)
904
+ return true;
905
+ if (current.querySelector('img,table,pre,code,ul,ol,figure,blockquote')) {
906
+ return true;
907
+ }
908
+ current = current.nextElementSibling;
909
+ }
910
+ return false;
911
+ }
912
+ function countEmptyHeadingSections(root) {
913
+ let emptyCount = 0;
914
+ const headings = root.querySelectorAll('h1,h2,h3,h4,h5,h6');
915
+ for (const heading of headings) {
916
+ if (!hasSectionContent(heading))
917
+ emptyCount += 1;
918
+ }
919
+ return emptyCount;
920
+ }
871
921
  function isGithubRepositoryRootUrl(url) {
872
922
  let parsed;
873
923
  try {
@@ -895,20 +945,49 @@ function shouldUseArticleContent(article, document) {
895
945
  if (ratio < MIN_CONTENT_RATIO)
896
946
  return false;
897
947
  }
898
- const originalHeadings = document.querySelectorAll('h1,h2,h3,h4,h5,h6').length;
948
+ const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
949
+ const originalHeadings = countMatchingElements(document, 'h1,h2,h3,h4,h5,h6');
950
+ const articleHeadings = countMatchingElements(articleDoc, 'h1,h2,h3,h4,h5,h6');
899
951
  if (originalHeadings > 0) {
900
- const articleHeadings = countTagsInString(article.content, /<h[1-6]\b/gi);
901
952
  const retentionRatio = articleHeadings / originalHeadings;
902
953
  if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
903
954
  return false;
904
955
  }
905
- const originalCodeBlocks = document.querySelectorAll('pre').length;
956
+ const originalCodeBlocks = countMatchingElements(document, 'pre');
906
957
  if (originalCodeBlocks > 0) {
907
- const articleCodeBlocks = countTagsInString(article.content, /<pre\b/gi);
958
+ const articleCodeBlocks = countMatchingElements(articleDoc, 'pre');
908
959
  const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
909
960
  if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
910
961
  return false;
911
962
  }
963
+ const originalTables = countMatchingElements(document, 'table');
964
+ if (originalTables > 0) {
965
+ const articleTables = countMatchingElements(articleDoc, 'table');
966
+ const tableRetentionRatio = articleTables / originalTables;
967
+ if (tableRetentionRatio < MIN_TABLE_RETENTION_RATIO)
968
+ return false;
969
+ }
970
+ const originalImages = countMatchingElements(document, 'img');
971
+ if (originalImages >= MIN_IMAGE_ELEMENTS_FOR_GATE) {
972
+ const articleImages = countMatchingElements(articleDoc, 'img');
973
+ const imageRetentionRatio = articleImages / originalImages;
974
+ if (imageRetentionRatio < MIN_IMAGE_RETENTION_RATIO)
975
+ return false;
976
+ }
977
+ const interactiveSelector = 'button,[role="tab"],[role="tabpanel"],[aria-controls]';
978
+ const originalInteractive = countMatchingElements(document, interactiveSelector);
979
+ if (originalInteractive >= MIN_INTERACTIVE_ELEMENTS_FOR_GATE) {
980
+ const articleInteractive = countMatchingElements(articleDoc, interactiveSelector);
981
+ const interactiveRetentionRatio = articleInteractive / originalInteractive;
982
+ if (interactiveRetentionRatio < MIN_INTERACTIVE_RETENTION_RATIO) {
983
+ return false;
984
+ }
985
+ }
986
+ if (articleHeadings >= MIN_HEADINGS_FOR_EMPTY_SECTION_GATE) {
987
+ const emptySectionRatio = countEmptyHeadingSections(articleDoc) / articleHeadings;
988
+ if (emptySectionRatio > MAX_EMPTY_SECTION_RATIO)
989
+ return false;
990
+ }
912
991
  return !hasTruncatedSentences(article.textContent);
913
992
  }
914
993
  function buildContentSource(params) {
@@ -922,6 +1001,7 @@ function buildContentSource(params) {
922
1001
  primaryHeading: document
923
1002
  ? TransformHeuristics.findPrimaryHeading(document)
924
1003
  : undefined,
1004
+ originalHtml: html,
925
1005
  };
926
1006
  if (useArticleContent && article) {
927
1007
  const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
@@ -954,6 +1034,290 @@ function buildContentSource(params) {
954
1034
  title: extractedMeta.title,
955
1035
  };
956
1036
  }
1037
+ const NEXT_FLIGHT_PAYLOAD_RE = /self\.__next_f\.push\(\[1,"((?:\\.|[^"\\])*)"\]\)<\/script>/gs;
1038
+ const TEMPLATE_ASSIGNMENT_RE = /([A-Za-z_$][\w$]*)=`([\s\S]*?)`;/g;
1039
+ const OBJECT_ASSIGNMENT_RE = /([A-Za-z_$][\w$]*)=\{([^{}]+)\}/g;
1040
+ const FLIGHT_INSTALL_RE = /commands:\{cli:"([^"]+)",npm:"([^"]+)",yarn:"([^"]+)",pnpm:"([^"]+)",bun:"([^"]+)"\}/;
1041
+ const FLIGHT_IMPORT_RE = /commands:\{main:'([^']+)',individual:'([^']+)'\}/;
1042
+ const FLIGHT_DEMO_RE = /title:"([^"]+)",files:([A-Za-z_$][\w$]*)\.([A-Za-z_$][\w$]*)/g;
1043
+ const FLIGHT_API_RE = /children:"([^"]+)"\}\),`\\n`,\(0,e\.jsx\)\(o,\{data:\[([\s\S]*?)\]\}\)/g;
1044
+ const FLIGHT_API_ROW_RE = /attribute:"([^"]+)",type:"([^"]+)",description:"([^"]*)",default:"([^"]*)"/g;
1045
+ const FLIGHT_MERMAID_SECTION_RE = /_jsx\(Heading,\{\s*level:"[1-6]",\s*id:"[^"]+",\s*children:"((?:\\.|[^"\\])*)"\s*\}\)(?:(?!_jsx\(Heading,\{)[\s\S]){0,12000}?_jsx\(Mermaid,\{\s*chart:"((?:\\.|[^"\\])*)"\s*\}\)/g;
1046
+ function decodeHtmlEntities(value) {
1047
+ return value
1048
+ .replace(/&#39;|&#x27;/g, "'")
1049
+ .replace(/&quot;/g, '"')
1050
+ .replace(/&amp;/g, '&')
1051
+ .replace(/&lt;/g, '<')
1052
+ .replace(/&gt;/g, '>');
1053
+ }
1054
+ function decodeFlightStringValue(value) {
1055
+ try {
1056
+ return JSON.parse(`"${value}"`);
1057
+ }
1058
+ catch {
1059
+ return decodeHtmlEntities(value);
1060
+ }
1061
+ }
1062
+ function decodeNextFlightPayloads(html) {
1063
+ const payloads = [];
1064
+ for (const match of html.matchAll(NEXT_FLIGHT_PAYLOAD_RE)) {
1065
+ const rawPayload = match[1];
1066
+ if (!rawPayload)
1067
+ continue;
1068
+ try {
1069
+ payloads.push(JSON.parse(`"${rawPayload}"`));
1070
+ }
1071
+ catch {
1072
+ // Ignore malformed payload fragments and continue with the rest.
1073
+ }
1074
+ }
1075
+ return payloads;
1076
+ }
1077
+ function parseFlightObjectRefs(text) {
1078
+ const templateMap = new Map();
1079
+ const aliasMap = new Map();
1080
+ const objectMaps = new Map();
1081
+ for (const match of text.matchAll(TEMPLATE_ASSIGNMENT_RE)) {
1082
+ const name = match[1];
1083
+ const code = match[2];
1084
+ if (name && code)
1085
+ templateMap.set(name, decodeHtmlEntities(code));
1086
+ }
1087
+ for (const match of text.matchAll(OBJECT_ASSIGNMENT_RE)) {
1088
+ const objectName = match[1];
1089
+ const body = match[2]?.trim() ?? '';
1090
+ if (!objectName || !body)
1091
+ continue;
1092
+ const spreadMatch = /^\.\.\.([A-Za-z_$][\w$]*)$/.exec(body);
1093
+ if (spreadMatch?.[1]) {
1094
+ aliasMap.set(objectName, spreadMatch[1]);
1095
+ continue;
1096
+ }
1097
+ const entries = new Map();
1098
+ for (const part of body.split(',')) {
1099
+ const entryMatch = /(?:"([^"]+)"|([A-Za-z_$][\w$]*)):([A-Za-z_$][\w$]*)$/.exec(part.trim());
1100
+ const key = entryMatch?.[1] ?? entryMatch?.[2];
1101
+ const value = entryMatch?.[3];
1102
+ if (key && value)
1103
+ entries.set(key, value);
1104
+ }
1105
+ if (entries.size > 0)
1106
+ objectMaps.set(objectName, entries);
1107
+ }
1108
+ return { templateMap, aliasMap, objectMaps };
1109
+ }
1110
+ function resolveFlightCodeRef(name, refs, seen = new Set()) {
1111
+ if (!name || seen.has(name))
1112
+ return undefined;
1113
+ seen.add(name);
1114
+ const direct = refs.templateMap.get(name);
1115
+ if (direct)
1116
+ return direct;
1117
+ const alias = refs.aliasMap.get(name);
1118
+ if (alias)
1119
+ return resolveFlightCodeRef(alias, refs, seen);
1120
+ const objectMap = refs.objectMaps.get(name);
1121
+ if (!objectMap)
1122
+ return undefined;
1123
+ for (const ref of objectMap.values()) {
1124
+ const resolved = resolveFlightCodeRef(ref, refs, seen);
1125
+ if (resolved)
1126
+ return resolved;
1127
+ }
1128
+ return undefined;
1129
+ }
1130
+ function escapeMarkdownTableCell(value) {
1131
+ const normalized = decodeHtmlEntities(value).replace(/\s+/g, ' ').trim();
1132
+ return (normalized || '-').replace(/\|/g, '\\|');
1133
+ }
1134
+ function buildMarkdownTable(rows) {
1135
+ if (rows.length === 0)
1136
+ return '';
1137
+ const lines = [
1138
+ '| Prop | Type | Description | Default |',
1139
+ '| ---- | ---- | ----------- | ------- |',
1140
+ ];
1141
+ for (const row of rows) {
1142
+ lines.push(`| ${escapeMarkdownTableCell(row.attribute)} | ${escapeMarkdownTableCell(row.type)} | ${escapeMarkdownTableCell(row.description)} | ${escapeMarkdownTableCell(row.defaultValue)} |`);
1143
+ }
1144
+ return lines.join('\n');
1145
+ }
1146
+ function buildCodeBlock(code) {
1147
+ const trimmed = code.trim();
1148
+ if (!trimmed)
1149
+ return '';
1150
+ const language = detectLanguageFromCode(trimmed) ?? 'tsx';
1151
+ return `\`\`\`${language}\n${trimmed}\n\`\`\``;
1152
+ }
1153
+ function buildMermaidBlock(chart) {
1154
+ const normalized = decodeFlightStringValue(chart).trim();
1155
+ if (!normalized)
1156
+ return '';
1157
+ return `\`\`\`mermaid\n${normalized}\n\`\`\``;
1158
+ }
1159
+ function normalizeSupplementHeadingText(value) {
1160
+ return value
1161
+ .replace(/\[([^\]]+)\]\([^)]*\)/g, '$1')
1162
+ .replace(/\s+/g, ' ')
1163
+ .trim()
1164
+ .toLowerCase();
1165
+ }
1166
+ function getMarkdownHeadingInfo(line) {
1167
+ const match = /^(#{1,6})\s+(.+?)\s*$/.exec(line.trim());
1168
+ if (!match)
1169
+ return null;
1170
+ return {
1171
+ level: match[1]?.length ?? 0,
1172
+ title: normalizeSupplementHeadingText(match[2] ?? ''),
1173
+ };
1174
+ }
1175
+ function findMarkdownSection(lines, title) {
1176
+ const target = normalizeSupplementHeadingText(title);
1177
+ for (let i = 0; i < lines.length; i += 1) {
1178
+ const heading = getMarkdownHeadingInfo(lines[i] ?? '');
1179
+ if (heading?.title !== target)
1180
+ continue;
1181
+ let end = lines.length;
1182
+ for (let j = i + 1; j < lines.length; j += 1) {
1183
+ const nextLine = lines[j];
1184
+ const nextHeading = nextLine !== undefined ? getMarkdownHeadingInfo(nextLine) : null;
1185
+ if (nextHeading && nextHeading.level <= heading.level) {
1186
+ end = j;
1187
+ break;
1188
+ }
1189
+ }
1190
+ return { start: i, end };
1191
+ }
1192
+ return null;
1193
+ }
1194
+ function getSectionBody(lines, section) {
1195
+ return lines
1196
+ .slice(section.start + 1, section.end)
1197
+ .join('\n')
1198
+ .trim();
1199
+ }
1200
+ function replaceMarkdownSection(lines, title, body) {
1201
+ const section = findMarkdownSection(lines, title);
1202
+ if (!section)
1203
+ return false;
1204
+ const replacement = body.trim().length > 0 ? ['', ...body.trim().split('\n'), ''] : [''];
1205
+ lines.splice(section.start + 1, section.end - section.start - 1, ...replacement);
1206
+ return true;
1207
+ }
1208
/**
 * Appends `body` to the end of the section titled `title`.
 *
 * Nothing is changed when the section is missing or when its current body
 * already contains a ``` fence.
 *
 * @param lines Markdown document split into lines; mutated on success.
 * @param title Heading text of the section to extend.
 * @param body Text to add after the existing body (separated by a blank line).
 * @returns `true` when the section was rewritten, otherwise `false`.
 */
function appendMarkdownSection(lines, title, body) {
    const located = findMarkdownSection(lines, title);
    if (!located) {
        return false;
    }
    const existing = getSectionBody(lines, located);
    if (existing.includes('```')) {
        return false;
    }
    const addition = body.trim();
    let combined = addition;
    if (existing) {
        combined = `${existing}\n\n${addition}`;
    }
    return replaceMarkdownSection(lines, title, combined);
}
1218
/**
 * Collects supplementary content from the flight payloads found in the
 * original HTML.
 *
 * The decoded payloads are joined with newlines and scanned for:
 *  - an install match and an import match (capture groups are returned as-is),
 *  - API tables, keyed by title and rendered with `buildMarkdownTable`,
 *  - mermaid sections, keyed by decoded title,
 *  - demo code, resolved through the parsed object references and rendered
 *    with `buildCodeBlock`.
 *
 * @param originalHtml HTML handed to `decodeNextFlightPayloads`.
 * @returns `null` when no payloads were decoded; otherwise an object with
 *   `apiTables`, `demoCodeBlocks` and `mermaidDiagrams` maps plus, only when
 *   the corresponding pattern matched, `installationCommands` and
 *   `importCommands`.
 */
function extractNextFlightSupplement(originalHtml) {
    const payloads = decodeNextFlightPayloads(originalHtml);
    if (payloads.length === 0) {
        return null;
    }
    const flightText = payloads.join('\n');
    const refs = parseFlightObjectRefs(flightText);
    const installMatch = FLIGHT_INSTALL_RE.exec(flightText);
    const importMatch = FLIGHT_IMPORT_RE.exec(flightText);

    // API tables: one Markdown table per titled match.
    const apiTables = new Map();
    for (const apiMatch of flightText.matchAll(FLIGHT_API_RE)) {
        const tableTitle = apiMatch[1];
        const rawRows = apiMatch[2] ?? '';
        if (!tableTitle) {
            continue;
        }
        const rows = [];
        for (const cells of rawRows.matchAll(FLIGHT_API_ROW_RE)) {
            const attribute = cells[1];
            const type = cells[2];
            const description = cells[3];
            const defaultValue = cells[4];
            // Attribute and type must be non-empty; the other two cells may be
            // empty strings but must have participated in the match.
            const complete = Boolean(attribute) &&
                Boolean(type) &&
                description !== undefined &&
                defaultValue !== undefined;
            if (complete) {
                rows.push({ attribute, type, description, defaultValue });
            }
        }
        const renderedTable = buildMarkdownTable(rows);
        if (renderedTable) {
            apiTables.set(tableTitle, renderedTable);
        }
    }

    // Mermaid diagrams keyed by their decoded section title.
    const mermaidDiagrams = new Map();
    for (const mermaidMatch of flightText.matchAll(FLIGHT_MERMAID_SECTION_RE)) {
        const rawTitle = mermaidMatch[1];
        const rawChart = mermaidMatch[2];
        const diagramTitle = rawTitle ? decodeFlightStringValue(rawTitle).trim() : '';
        const diagram = rawChart ? buildMermaidBlock(rawChart) : '';
        if (diagramTitle && diagram) {
            mermaidDiagrams.set(diagramTitle, diagram);
        }
    }

    // Demo code blocks resolved through the parsed object references.
    const demoCodeBlocks = new Map();
    for (const demoMatch of flightText.matchAll(FLIGHT_DEMO_RE)) {
        const demoTitle = demoMatch[1];
        const objectName = demoMatch[2];
        const key = demoMatch[3];
        const ref = objectName
            ? refs.objectMaps.get(objectName)?.get(key ?? '')
            : undefined;
        const code = resolveFlightCodeRef(ref, refs);
        const rendered = code ? buildCodeBlock(code) : '';
        if (demoTitle && rendered) {
            demoCodeBlocks.set(demoTitle, rendered);
        }
    }

    // The command lists are present only when their pattern matched.
    const installPart = installMatch
        ? { installationCommands: installMatch.slice(1) }
        : {};
    const importPart = importMatch
        ? { importCommands: importMatch.slice(1) }
        : {};
    return {
        ...installPart,
        ...importPart,
        apiTables,
        demoCodeBlocks,
        mermaidDiagrams,
    };
}
1278
/**
 * Enriches converted Markdown with content recovered from the flight payloads
 * of the original HTML.
 *
 * Steps, in order:
 *  1. Append an install code block to the "Installation" section unless its
 *     body already shows a package-manager install/add command.
 *  2. Append an import code block to the "Import" section unless its body
 *     already contains `import {`.
 *  3. Replace the body of every section that has a recovered API table.
 *  4. Add each mermaid diagram to its section unless one is already there.
 *  5. Append each demo code block to its section.
 *
 * @param markdown Markdown produced from the page.
 * @param originalHtml HTML the Markdown was derived from.
 * @returns The supplemented Markdown, or `markdown` unchanged when
 *   `extractNextFlightSupplement` yields nothing.
 */
function supplementMarkdownFromNextFlight(markdown, originalHtml) {
    const supplement = extractNextFlightSupplement(originalHtml);
    if (!supplement) {
        return markdown;
    }
    const lines = markdown.split('\n');
    const { installationCommands, importCommands } = supplement;

    if (installationCommands?.length) {
        const installSection = findMarkdownSection(lines, 'Installation');
        if (installSection &&
            !/(npm|pnpm|yarn|bun|npx)\s+(install|add)/.test(getSectionBody(lines, installSection))) {
            const installBlock = buildCodeBlock(installationCommands.join('\n'));
            appendMarkdownSection(lines, 'Installation', installBlock);
        }
    }

    if (importCommands?.length) {
        const importSection = findMarkdownSection(lines, 'Import');
        if (importSection &&
            !/import\s+\{/.test(getSectionBody(lines, importSection))) {
            const importBlock = buildCodeBlock(importCommands.join('\n\n'));
            appendMarkdownSection(lines, 'Import', importBlock);
        }
    }

    supplement.apiTables.forEach((table, title) => {
        replaceMarkdownSection(lines, title, table);
    });

    for (const [title, mermaidBlock] of supplement.mermaidDiagrams) {
        const target = findMarkdownSection(lines, title);
        if (!target) {
            continue;
        }
        const current = getSectionBody(lines, target);
        // Skip sections that already carry a mermaid diagram.
        if (current.includes('```mermaid')) {
            continue;
        }
        const merged = current ? `${current}\n\n${mermaidBlock}` : mermaidBlock;
        replaceMarkdownSection(lines, title, merged);
    }

    supplement.demoCodeBlocks.forEach((codeBlock, title) => {
        appendMarkdownSection(lines, title, codeBlock);
    });

    return lines.join('\n');
}
957
1321
  function resolveContentSource(params) {
958
1322
  const { article, metadata: extractedMeta, document, truncated, } = extractContentContext(params.html, params.url, {
959
1323
  extractArticle: true,
@@ -1001,6 +1365,8 @@ function buildMarkdownFromContext(context, url, signal) {
1001
1365
  }
1002
1366
  content = `#${prefix}${context.title}\n\n${content}`;
1003
1367
  }
1368
+ content = supplementMarkdownFromNextFlight(content, context.originalHtml);
1369
+ content = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
1004
1370
  return {
1005
1371
  markdown: content,
1006
1372
  title: context.title,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/fetch-url-mcp",
3
- "version": "1.9.2",
3
+ "version": "1.9.3",
4
4
  "mcpName": "io.github.j0hanz/fetch-url-mcp",
5
5
  "description": "A web content fetcher MCP server that converts HTML to clean, AI and human readable markdown.",
6
6
  "type": "module",