@j0hanz/fetch-url-mcp 1.9.2 → 1.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/content.d.ts +1 -0
- package/dist/lib/content.d.ts.map +1 -1
- package/dist/lib/content.js +45 -11
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +8 -5
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +379 -13
- package/package.json +1 -1
package/dist/lib/content.d.ts
CHANGED
|
@@ -2,6 +2,7 @@ import { type MetadataBlock } from '../transform/types.js';
|
|
|
2
2
|
export declare function serializeDocumentForMarkdown(document: Document, fallback: string): string;
|
|
3
3
|
export declare function prepareDocumentForMarkdown(document: Document, baseUrl?: string, signal?: AbortSignal): void;
|
|
4
4
|
export declare function removeNoiseFromHtml(html: string, document?: Document, baseUrl?: string, signal?: AbortSignal): string;
|
|
5
|
+
export declare function extractLanguageFromClassName(className: string): string | undefined;
|
|
5
6
|
export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
|
|
6
7
|
export declare function detectLanguageFromCode(code: string): string | undefined;
|
|
7
8
|
interface CleanupOptions {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;
|
|
1
|
+
{"version":3,"file":"content.d.ts","sourceRoot":"","sources":["../../src/lib/content.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,aAAa,EAAE,MAAM,uBAAuB,CAAC;AAwjB3D,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CAQR;AA2DD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAiBN;AA0BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR;AAwQD,wBAAgB,4BAA4B,CAC1C,SAAS,EAAE,MAAM,GAChB,MAAM,GAAG,SAAS,CAuBpB;AAqBD,wBAAgB,6BAA6B,CAC3C,SAAS,EAAE,MAAM,EACjB,QAAQ,EAAE,MAAM,GACf,MAAM,GAAG,SAAS,CAKpB;AACD,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS,CAqBvE;AAsDD,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAqTD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CA6CR;AAgGD,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAuCxE;AAmBD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
|
package/dist/lib/content.js
CHANGED
|
@@ -41,7 +41,7 @@ const NOISE_PATTERNS = [
|
|
|
41
41
|
/[\s"']role\s*=\s*['"]?(?:navigation|banner|complementary|contentinfo|tree|menubar|menu)['"]?/i,
|
|
42
42
|
/[\s"'](?:aria-hidden\s*=\s*['"]?true['"]?|hidden)/i,
|
|
43
43
|
/[\s"'](?:banner|promo|announcement|cta|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast)\b/i,
|
|
44
|
-
/[\s"'](?:fixed|sticky|z-50|z-4|isolate|
|
|
44
|
+
/[\s"'](?:fixed|sticky|z-50|z-4|isolate|breadcrumbs?|pagination)\b/i,
|
|
45
45
|
];
|
|
46
46
|
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
47
47
|
const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50)|isolate)\b/;
|
|
@@ -102,6 +102,7 @@ const PROMO_TOKENS_ALWAYS = [
|
|
|
102
102
|
'sponsor',
|
|
103
103
|
'recommend',
|
|
104
104
|
'breadcrumb',
|
|
105
|
+
'breadcrumbs',
|
|
105
106
|
'pagination',
|
|
106
107
|
'pager',
|
|
107
108
|
'taglist',
|
|
@@ -118,7 +119,7 @@ const PROMO_TOKENS_BY_CATEGORY = {
|
|
|
118
119
|
};
|
|
119
120
|
// Noise selector configurations
|
|
120
121
|
const BASE_NOISE_SELECTORS = {
|
|
121
|
-
navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
|
|
122
|
+
navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"],[class*="breadcrumb"]',
|
|
122
123
|
cookieBanners: '[role="dialog"]',
|
|
123
124
|
hidden: '[style*="display: none"],[style*="display:none"],[style*="visibility: hidden"],[style*="visibility:hidden"],[hidden],[aria-hidden="true"]',
|
|
124
125
|
};
|
|
@@ -504,17 +505,31 @@ function mayContainNoise(html) {
|
|
|
504
505
|
: `${html.substring(0, NOISE_SCAN_LIMIT)}\n${html.substring(html.length - NOISE_SCAN_LIMIT)}`;
|
|
505
506
|
return NOISE_PATTERNS.some((re) => re.test(sample));
|
|
506
507
|
}
|
|
508
|
+
/**
 * Un-hides tab panels in place.
 *
 * For every element matching `[data-slot="tabContent"]` or `[role="tabpanel"]`:
 * - any inline `display: none` declaration is stripped from its `style` attribute
 *   (the remaining style text is trimmed and written back), and
 * - the `hidden` attribute is removed.
 *
 * @param document DOM document whose tab panels should be made visible.
 */
function surfaceHiddenTabPanels(document) {
    const TAB_PANEL_SELECTOR = '[data-slot="tabContent"], [role="tabpanel"]';
    Array.from(document.querySelectorAll(TAB_PANEL_SELECTOR)).forEach((tabPanel) => {
        const inlineStyle = tabPanel.getAttribute('style') ?? '';
        const isDisplayNone = /display\s*:\s*none/i.test(inlineStyle);
        if (isDisplayNone) {
            // Drop only the display declaration; other inline styles are kept.
            const visibleStyle = inlineStyle.replace(/display\s*:\s*none\s*;?/gi, '').trim();
            tabPanel.setAttribute('style', visibleStyle);
        }
        tabPanel.removeAttribute('hidden');
    });
}
|
|
507
518
|
/**
 * Removes tab trigger buttons (`button[role="tab"]`) from the document.
 *
 * Tab panels are un-hidden first via `surfaceHiddenTabPanels`, then the
 * trigger buttons are removed, walking the match list from last to first.
 *
 * @param document DOM document to mutate.
 */
function stripTabTriggers(document) {
    surfaceHiddenTabPanels(document);
    const triggers = document.querySelectorAll('button[role="tab"]');
    let index = triggers.length;
    while (index-- > 0) {
        const trigger = triggers[index];
        if (trigger !== undefined && trigger !== null) {
            trigger.remove();
        }
    }
}
|
|
513
525
|
/**
 * Backslash-escapes `|` characters in table cells.
 *
 * Only text nodes (nodeType 3) that are direct children of a `td`/`th`
 * are rewritten; element children are left untouched.
 *
 * @param document DOM document whose table cells are processed in place.
 */
function escapeTableCellPipes(document) {
    const TEXT_NODE = 3;
    for (const tableCell of document.querySelectorAll('td, th')) {
        for (const child of tableCell.childNodes) {
            if (child.nodeType !== TEXT_NODE) {
                continue;
            }
            const content = child.textContent;
            if (!content?.includes('|')) {
                continue;
            }
            child.textContent = content.split('|').join('\\|');
        }
    }
}
|
|
@@ -820,7 +835,7 @@ const LANGUAGES = [
|
|
|
820
835
|
match: (ctx) => ctx.trimmedStart.startsWith('{') || ctx.trimmedStart.startsWith('['),
|
|
821
836
|
},
|
|
822
837
|
];
|
|
823
|
-
function extractLanguageFromClassName(className) {
|
|
838
|
+
export function extractLanguageFromClassName(className) {
|
|
824
839
|
if (!className)
|
|
825
840
|
return undefined;
|
|
826
841
|
// Split by whitespace and check for language indicators
|
|
@@ -895,12 +910,13 @@ const REGEX = {
|
|
|
895
910
|
HEADING_MARKER: /^#{1,6}\s/m,
|
|
896
911
|
HEADING_STRICT: /^#{1,6}\s+/m,
|
|
897
912
|
EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/,
|
|
913
|
+
ANCHOR_ONLY_HEADING: /^#{1,6}\s+\[[^\]]+\]\(#[^)]+\)\s*$/,
|
|
898
914
|
FENCE_START: /^\s*(`{3,}|~{3,})/,
|
|
899
915
|
LIST_MARKER: /^(?:[-*+])\s/m,
|
|
900
916
|
TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/,
|
|
901
|
-
TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents)\s*$/i,
|
|
917
|
+
TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents|on this page)\s*$/i,
|
|
902
918
|
HTML_DOC_START: /^(<!doctype|<html)/i,
|
|
903
|
-
COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful
|
|
919
|
+
COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??|\[Back to top\]\(#[^)]*\)|\[\s*\]\(https?:\/\/[^)]*\))\s*$/gim,
|
|
904
920
|
ZERO_WIDTH_ANCHOR: /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g,
|
|
905
921
|
CONCATENATED_PROPS: /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g,
|
|
906
922
|
DOUBLE_NEWLINE_REDUCER: /\n{3,}/g,
|
|
@@ -948,6 +964,9 @@ function hasFollowingContent(lines, startIndex) {
|
|
|
948
964
|
}
|
|
949
965
|
return false;
|
|
950
966
|
}
|
|
967
|
+
/**
 * Turns a heading whose whole text is a same-page link, e.g.
 * `## [Title](#title)`, into a plain heading (`## Title`).
 * Lines that do not have exactly that shape are returned unchanged.
 *
 * @param line A single markdown line.
 * @returns The heading with the anchor link unwrapped, or the input line.
 */
function stripAnchorOnlyHeading(line) {
    const anchorHeading = /^(#{1,6})\s+\[([^\]]+)\]\(#[^)]+\)\s*$/;
    return line.replace(anchorHeading, (_whole, hashes, label) => `${hashes} ${label}`);
}
|
|
951
970
|
function isTitleCaseOrKeyword(trimmed) {
|
|
952
971
|
// Quick check for length to avoid regex on long strings
|
|
953
972
|
if (trimmed.length > MAX_LINE_LENGTH)
|
|
@@ -1094,6 +1113,11 @@ function preprocessLines(lines, options) {
|
|
|
1094
1113
|
const trimmed = line.trim();
|
|
1095
1114
|
if (REGEX.EMPTY_HEADING_LINE.test(trimmed))
|
|
1096
1115
|
continue;
|
|
1116
|
+
if (REGEX.ANCHOR_ONLY_HEADING.test(trimmed)) {
|
|
1117
|
+
if (!hasFollowingContent(lines, i))
|
|
1118
|
+
continue;
|
|
1119
|
+
line = stripAnchorOnlyHeading(trimmed);
|
|
1120
|
+
}
|
|
1097
1121
|
const tocSkip = shouldSkipAsToc(lines, i, trimmed, removeToc, options);
|
|
1098
1122
|
if (tocSkip !== null) {
|
|
1099
1123
|
skipUntil = tocSkip;
|
|
@@ -1127,6 +1151,16 @@ function removeSkipLinks(text) {
|
|
|
1127
1151
|
.replace(REGEX.ZERO_WIDTH_ANCHOR, '')
|
|
1128
1152
|
.replace(REGEX.COMBINED_LINE_REMOVALS, '');
|
|
1129
1153
|
}
|
|
1154
|
+
/**
 * Trims padding inside single-line inline code spans that hold one
 * token-like value (letters, digits and `* _ . / : -` only).
 *
 * A span is rewritten only when it has leading/trailing whitespace and
 * its trimmed content contains no whitespace and only token characters;
 * every other span is returned exactly as written.
 *
 * @param text Markdown text.
 * @returns Text with padded token spans tightened (e.g. "` foo `" -> "`foo`").
 */
function normalizeInlineCodeTokens(text) {
    const tokenLike = /^[*A-Za-z0-9_./:-]+$/;
    return text.replace(/`([^`\n]+)`/g, (span, body) => {
        const core = body.trim();
        const hasOuterPadding = core !== body;
        const isSingleToken = !/\s/.test(core) && tokenLike.test(core);
        return hasOuterPadding && isSingleToken ? `\`${core}\`` : span;
    });
}
|
|
1130
1164
|
function normalizeMarkdownSpacing(text) {
|
|
1131
1165
|
let result = text
|
|
1132
1166
|
.replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
|
|
@@ -1136,8 +1170,8 @@ function normalizeMarkdownSpacing(text) {
|
|
|
1136
1170
|
.replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
|
|
1137
1171
|
.replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
|
|
1138
1172
|
.replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
|
|
1139
|
-
// Trim
|
|
1140
|
-
result = result
|
|
1173
|
+
// Trim whitespace around token-like inline code spans.
|
|
1174
|
+
result = normalizeInlineCodeTokens(result);
|
|
1141
1175
|
// Unescape backticks inside markdown link text
|
|
1142
1176
|
result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
|
|
1143
1177
|
result = result.replace(/\[([^\]]*<[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/</g, '\\<').replace(/>/g, '\\>')}](${url})`);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAigBA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
|
|
@@ -255,11 +255,13 @@ const GFM_ALERT_MAP = new Map([
|
|
|
255
255
|
['danger', 'CAUTION'],
|
|
256
256
|
['important', 'IMPORTANT'],
|
|
257
257
|
]);
|
|
258
|
+
const ADMONITION_TOKEN_RE = /^(?:note|tip|hint|info|warning|warn|danger|caution|important)$/i;
|
|
258
259
|
/**
 * Looks up the GFM alert type for an element's class attribute.
 *
 * The class string is lower-cased and split on whitespace; the first
 * token with an entry in `GFM_ALERT_MAP` decides the result.
 *
 * @param className Raw `class` attribute value.
 * @returns The mapped alert type, or `undefined` when no token matches.
 */
function resolveGfmAlertType(className) {
    const alertType = className
        .toLowerCase()
        .split(/\s+/)
        .map((classToken) => GFM_ALERT_MAP.get(classToken))
        .find((candidate) => Boolean(candidate));
    return alertType;
}
|
|
@@ -278,11 +280,12 @@ function buildDivTranslator(ctx) {
|
|
|
278
280
|
postprocess: ({ content }) => `\n\n\`\`\`mermaid\n${content.trim()}\n\`\`\`\n\n`,
|
|
279
281
|
};
|
|
280
282
|
}
|
|
283
|
+
const classTokens = className.split(/\s+/);
|
|
281
284
|
const isAdmonition = className.includes('admonition') ||
|
|
282
285
|
className.includes('callout') ||
|
|
283
286
|
className.includes('custom-block') ||
|
|
284
287
|
getAttribute('role') === 'alert' ||
|
|
285
|
-
|
|
288
|
+
classTokens.some((t) => ADMONITION_TOKEN_RE.test(t));
|
|
286
289
|
if (isAdmonition) {
|
|
287
290
|
return {
|
|
288
291
|
postprocess: ({ content }) => {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAyCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAuJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAwWD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA+OD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuJD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAiED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAuCD,iBAAS,eAAe,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAc/D;AAED,iBAAS,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAYlE;AA6CD,iBAAS,yBAAyB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAcvD;AAED,eAAO,MAAM,mBAAmB;;;;CAItB,CAAC;AAurBX,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAmBzB;AA+CD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAsH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAA
uB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|
|
@@ -3,7 +3,7 @@ import diagnosticsChannel from 'node:diagnostics_channel';
|
|
|
3
3
|
import { performance } from 'node:perf_hooks';
|
|
4
4
|
import { isProbablyReaderable, Readability } from '@mozilla/readability';
|
|
5
5
|
import { parseHTML } from 'linkedom';
|
|
6
|
-
import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, extractTitleFromRawMarkdown, isRawTextContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/content.js';
|
|
6
|
+
import { addSourceToMarkdown, buildMetadataFooter, cleanupMarkdownArtifacts, detectLanguageFromCode, extractLanguageFromClassName, extractTitleFromRawMarkdown, isRawTextContent, prepareDocumentForMarkdown, removeNoiseFromHtml, serializeDocumentForMarkdown, } from '../lib/content.js';
|
|
7
7
|
import { config } from '../lib/core.js';
|
|
8
8
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
|
|
9
9
|
import { isRawTextContentUrl } from '../lib/http.js';
|
|
@@ -284,13 +284,22 @@ function resolveCollapsedTextLengthUpTo(text, max) {
|
|
|
284
284
|
return length;
|
|
285
285
|
}
|
|
286
286
|
/**
 * Replaces alert-like elements with `<blockquote>` elements that carry
 * the same inner HTML.
 *
 * Matches `[role="alert"]`, `.admonition`, and any element whose class
 * attribute contains "callout".
 *
 * @param doc DOM document to mutate.
 */
function preserveAlertElements(doc) {
    const ALERT_SELECTOR = '[role="alert"], .admonition, [class*="callout"]';
    Array.from(doc.querySelectorAll(ALERT_SELECTOR)).forEach((alertEl) => {
        const quote = doc.createElement('blockquote');
        quote.innerHTML = alertEl.innerHTML;
        alertEl.replaceWith(quote);
    });
}
|
|
294
|
+
/**
 * Copies the language encoded in a `pre`/`code` element's class name
 * into a `data-language` attribute.
 *
 * Elements that already have a non-empty `data-language` are skipped,
 * as are elements whose class yields no language.
 *
 * @param doc DOM document to mutate.
 */
function preserveCodeLanguageAttributes(doc) {
    Array.from(doc.querySelectorAll('pre, code')).forEach((codeEl) => {
        const alreadyTagged = Boolean(codeEl.getAttribute('data-language'));
        if (alreadyTagged) {
            return;
        }
        const classAttr = codeEl.getAttribute('class') ?? '';
        const detected = extractLanguageFromClassName(classAttr);
        if (detected) {
            codeEl.setAttribute('data-language', detected);
        }
    });
}
|
|
294
303
|
function extractArticle(document, url, signal) {
|
|
295
304
|
if (!isReadabilityCompatible(document)) {
|
|
296
305
|
logWarn('Document not compatible with Readability');
|
|
@@ -321,6 +330,10 @@ function extractArticle(document, url, signal) {
|
|
|
321
330
|
? doc.cloneNode(true)
|
|
322
331
|
: doc;
|
|
323
332
|
preserveAlertElements(readabilityDoc);
|
|
333
|
+
preserveCodeLanguageAttributes(readabilityDoc);
|
|
334
|
+
for (const el of readabilityDoc.querySelectorAll('[class*="breadcrumb"],[class*="pagination"]')) {
|
|
335
|
+
el.remove();
|
|
336
|
+
}
|
|
324
337
|
checkAbort('extract:article:parse');
|
|
325
338
|
const reader = new Readability(readabilityDoc, {
|
|
326
339
|
maxElemsToParse: MAX_READABILITY_ELEMENTS,
|
|
@@ -654,6 +667,13 @@ const MIN_CONTENT_RATIO = 0.15;
|
|
|
654
667
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
655
668
|
const MIN_HEADING_RETENTION_RATIO = 0.3;
|
|
656
669
|
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
|
|
670
|
+
const MIN_TABLE_RETENTION_RATIO = 0.5;
|
|
671
|
+
const MIN_IMAGE_RETENTION_RATIO = 0.2;
|
|
672
|
+
const MIN_INTERACTIVE_RETENTION_RATIO = 0.1;
|
|
673
|
+
const MIN_INTERACTIVE_ELEMENTS_FOR_GATE = 6;
|
|
674
|
+
const MIN_IMAGE_ELEMENTS_FOR_GATE = 4;
|
|
675
|
+
const MIN_HEADINGS_FOR_EMPTY_SECTION_GATE = 5;
|
|
676
|
+
const MAX_EMPTY_SECTION_RATIO = 0.05;
|
|
657
677
|
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
|
|
658
678
|
const MAX_TRUNCATED_LINE_RATIO = 0.95;
|
|
659
679
|
function needsDocumentWrapper(html) {
|
|
@@ -679,13 +699,6 @@ function resolveHtmlDocument(htmlOrDocument) {
|
|
|
679
699
|
return parseHTML('<!DOCTYPE html><html><body></body></html>').document;
|
|
680
700
|
}
|
|
681
701
|
}
|
|
682
|
-
/**
 * Counts the matches of `regex` in `html`.
 *
 * The count does not depend on the caller's regex state: a private global
 * copy of the pattern is scanned, so a non-global regex, a stale
 * `lastIndex`, or a zero-length match can neither hang the loop nor skew
 * the result. The caller's regex object is not mutated.
 *
 * @param html  Text to scan.
 * @param regex Pattern to count (with or without the `g` flag).
 * @returns Number of matches found in `html`.
 */
function countTagsInString(html, regex) {
    const flags = regex.flags.includes('g') ? regex.flags : `${regex.flags}g`;
    // A fresh RegExp always starts scanning at index 0.
    const scanner = new RegExp(regex.source, flags);
    return Array.from(html.matchAll(scanner)).length;
}
|
|
689
702
|
function stripNonVisibleNodes(root) {
|
|
690
703
|
for (const el of root.querySelectorAll('script,style,noscript')) {
|
|
691
704
|
el.remove();
|
|
@@ -868,6 +881,43 @@ function findPrimaryHeading(document) {
|
|
|
868
881
|
}
|
|
869
882
|
return undefined;
|
|
870
883
|
}
|
|
884
|
+
/**
 * Returns how many elements under `root` match `selector`.
 *
 * @param root     Node exposing `querySelectorAll`.
 * @param selector CSS selector to count.
 */
function countMatchingElements(root, selector) {
    const { length } = root.querySelectorAll(selector);
    return length;
}
|
|
887
|
+
/**
 * Reads the heading level from an element's tag name.
 *
 * @param heading Element-like object with a `tagName`.
 * @returns 1-6 when `tagName` is exactly `H1`..`H6` (upper-case), otherwise `null`.
 */
function getHeadingLevel(heading) {
    const digit = /^H([1-6])$/.exec(heading.tagName)?.[1];
    return digit === undefined ? null : Number.parseInt(digit, 10);
}
|
|
893
|
+
/**
 * Reports whether a heading is followed by any content before the next
 * heading of the same or a higher level.
 *
 * Following element siblings are visited in order. The walk stops with
 * `false` at a heading whose level is <= this heading's level, and with
 * `true` at the first sibling that either has non-whitespace text or is /
 * contains one of: img, table, pre, code, ul, ol, figure, blockquote.
 *
 * The sibling itself is tested with `matches` as well as its descendants:
 * `querySelector` never returns the element it is called on, so a bare
 * `<img>` (or an empty `<table>`/`<pre>`) directly after the heading would
 * otherwise be treated as "no content".
 *
 * @param heading Heading element to inspect.
 * @returns `true` when the section has content; `false` when it is empty or
 *          `heading` is not an H1-H6 element.
 */
function hasSectionContent(heading) {
    const level = getHeadingLevel(heading);
    if (level === null)
        return false;
    const CONTENT_SELECTOR = 'img,table,pre,code,ul,ol,figure,blockquote';
    for (let current = heading.nextElementSibling; current; current = current.nextElementSibling) {
        const currentLevel = getHeadingLevel(current);
        if (currentLevel !== null && currentLevel <= level)
            return false;
        const text = (current.textContent ?? '').trim();
        if (text.length > 0)
            return true;
        const isContentElement = typeof current.matches === 'function' && current.matches(CONTENT_SELECTOR);
        if (isContentElement || current.querySelector(CONTENT_SELECTOR)) {
            return true;
        }
    }
    return false;
}
|
|
912
|
+
/**
 * Counts the h1-h6 elements under `root` for which `hasSectionContent`
 * reports no content.
 *
 * @param root Node exposing `querySelectorAll`.
 * @returns Number of headings with an empty section.
 */
function countEmptyHeadingSections(root) {
    const allHeadings = Array.from(root.querySelectorAll('h1,h2,h3,h4,h5,h6'));
    return allHeadings.filter((headingEl) => !hasSectionContent(headingEl)).length;
}
|
|
871
921
|
function isGithubRepositoryRootUrl(url) {
|
|
872
922
|
let parsed;
|
|
873
923
|
try {
|
|
@@ -895,20 +945,49 @@ function shouldUseArticleContent(article, document) {
|
|
|
895
945
|
if (ratio < MIN_CONTENT_RATIO)
|
|
896
946
|
return false;
|
|
897
947
|
}
|
|
898
|
-
const
|
|
948
|
+
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
949
|
+
const originalHeadings = countMatchingElements(document, 'h1,h2,h3,h4,h5,h6');
|
|
950
|
+
const articleHeadings = countMatchingElements(articleDoc, 'h1,h2,h3,h4,h5,h6');
|
|
899
951
|
if (originalHeadings > 0) {
|
|
900
|
-
const articleHeadings = countTagsInString(article.content, /<h[1-6]\b/gi);
|
|
901
952
|
const retentionRatio = articleHeadings / originalHeadings;
|
|
902
953
|
if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
|
|
903
954
|
return false;
|
|
904
955
|
}
|
|
905
|
-
const originalCodeBlocks = document
|
|
956
|
+
const originalCodeBlocks = countMatchingElements(document, 'pre');
|
|
906
957
|
if (originalCodeBlocks > 0) {
|
|
907
|
-
const articleCodeBlocks =
|
|
958
|
+
const articleCodeBlocks = countMatchingElements(articleDoc, 'pre');
|
|
908
959
|
const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
|
|
909
960
|
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
|
|
910
961
|
return false;
|
|
911
962
|
}
|
|
963
|
+
const originalTables = countMatchingElements(document, 'table');
|
|
964
|
+
if (originalTables > 0) {
|
|
965
|
+
const articleTables = countMatchingElements(articleDoc, 'table');
|
|
966
|
+
const tableRetentionRatio = articleTables / originalTables;
|
|
967
|
+
if (tableRetentionRatio < MIN_TABLE_RETENTION_RATIO)
|
|
968
|
+
return false;
|
|
969
|
+
}
|
|
970
|
+
const originalImages = countMatchingElements(document, 'img');
|
|
971
|
+
if (originalImages >= MIN_IMAGE_ELEMENTS_FOR_GATE) {
|
|
972
|
+
const articleImages = countMatchingElements(articleDoc, 'img');
|
|
973
|
+
const imageRetentionRatio = articleImages / originalImages;
|
|
974
|
+
if (imageRetentionRatio < MIN_IMAGE_RETENTION_RATIO)
|
|
975
|
+
return false;
|
|
976
|
+
}
|
|
977
|
+
const interactiveSelector = 'button,[role="tab"],[role="tabpanel"],[aria-controls]';
|
|
978
|
+
const originalInteractive = countMatchingElements(document, interactiveSelector);
|
|
979
|
+
if (originalInteractive >= MIN_INTERACTIVE_ELEMENTS_FOR_GATE) {
|
|
980
|
+
const articleInteractive = countMatchingElements(articleDoc, interactiveSelector);
|
|
981
|
+
const interactiveRetentionRatio = articleInteractive / originalInteractive;
|
|
982
|
+
if (interactiveRetentionRatio < MIN_INTERACTIVE_RETENTION_RATIO) {
|
|
983
|
+
return false;
|
|
984
|
+
}
|
|
985
|
+
}
|
|
986
|
+
if (articleHeadings >= MIN_HEADINGS_FOR_EMPTY_SECTION_GATE) {
|
|
987
|
+
const emptySectionRatio = countEmptyHeadingSections(articleDoc) / articleHeadings;
|
|
988
|
+
if (emptySectionRatio > MAX_EMPTY_SECTION_RATIO)
|
|
989
|
+
return false;
|
|
990
|
+
}
|
|
912
991
|
return !hasTruncatedSentences(article.textContent);
|
|
913
992
|
}
|
|
914
993
|
function buildContentSource(params) {
|
|
@@ -922,6 +1001,7 @@ function buildContentSource(params) {
|
|
|
922
1001
|
primaryHeading: document
|
|
923
1002
|
? TransformHeuristics.findPrimaryHeading(document)
|
|
924
1003
|
: undefined,
|
|
1004
|
+
originalHtml: html,
|
|
925
1005
|
};
|
|
926
1006
|
if (useArticleContent && article) {
|
|
927
1007
|
const { document: articleDoc } = parseHTML(`<!DOCTYPE html><html><body>${article.content}</body></html>`);
|
|
@@ -954,6 +1034,290 @@ function buildContentSource(params) {
|
|
|
954
1034
|
title: extractedMeta.title,
|
|
955
1035
|
};
|
|
956
1036
|
}
|
|
1037
|
+
const NEXT_FLIGHT_PAYLOAD_RE = /self\.__next_f\.push\(\[1,"((?:\\.|[^"\\])*)"\]\)<\/script>/gs;
|
|
1038
|
+
const TEMPLATE_ASSIGNMENT_RE = /([A-Za-z_$][\w$]*)=`([\s\S]*?)`;/g;
|
|
1039
|
+
const OBJECT_ASSIGNMENT_RE = /([A-Za-z_$][\w$]*)=\{([^{}]+)\}/g;
|
|
1040
|
+
const FLIGHT_INSTALL_RE = /commands:\{cli:"([^"]+)",npm:"([^"]+)",yarn:"([^"]+)",pnpm:"([^"]+)",bun:"([^"]+)"\}/;
|
|
1041
|
+
const FLIGHT_IMPORT_RE = /commands:\{main:'([^']+)',individual:'([^']+)'\}/;
|
|
1042
|
+
const FLIGHT_DEMO_RE = /title:"([^"]+)",files:([A-Za-z_$][\w$]*)\.([A-Za-z_$][\w$]*)/g;
|
|
1043
|
+
const FLIGHT_API_RE = /children:"([^"]+)"\}\),`\\n`,\(0,e\.jsx\)\(o,\{data:\[([\s\S]*?)\]\}\)/g;
|
|
1044
|
+
const FLIGHT_API_ROW_RE = /attribute:"([^"]+)",type:"([^"]+)",description:"([^"]*)",default:"([^"]*)"/g;
|
|
1045
|
+
const FLIGHT_MERMAID_SECTION_RE = /_jsx\(Heading,\{\s*level:"[1-6]",\s*id:"[^"]+",\s*children:"((?:\\.|[^"\\])*)"\s*\}\)(?:(?!_jsx\(Heading,\{)[\s\S]){0,12000}?_jsx\(Mermaid,\{\s*chart:"((?:\\.|[^"\\])*)"\s*\}\)/g;
|
|
1046
|
+
/**
 * Decodes the small set of HTML entities handled by this module:
 * apostrophe (numeric 39 and `apos`), `quot`, `amp`, `lt` and `gt`.
 *
 * All entities are replaced in one pass, so text produced by decoding an
 * ampersand entity is never re-scanned (an escaped entity is unescaped
 * exactly one level). Any other `&...;` sequence is left untouched.
 *
 * @param value Text that may contain HTML entities.
 * @returns The text with the supported entities replaced by their characters.
 */
function decodeHtmlEntities(value) {
    // Keyed by the entity name between "&" and ";".
    const ENTITY_CHARS = {
        '#39': "'",
        apos: "'",
        quot: '"',
        amp: '&',
        lt: '<',
        gt: '>',
    };
    return value.replace(/&(#39|apos|quot|amp|lt|gt);/g, (_entity, name) => ENTITY_CHARS[name]);
}
|
|
1054
|
+
/**
 * Decodes a string captured from a flight payload.
 *
 * The value is first parsed as the body of a JSON string literal (which
 * resolves escapes such as `\n` and `\"`). If that parse throws, the value
 * is passed through `decodeHtmlEntities` instead.
 *
 * @param value Raw captured text (without surrounding quotes).
 * @returns The decoded string.
 */
function decodeFlightStringValue(value) {
    let decoded;
    try {
        decoded = JSON.parse(`"${value}"`);
    }
    catch {
        // Not a valid JSON string body; fall back to entity decoding.
        decoded = decodeHtmlEntities(value);
    }
    return decoded;
}
|
|
1062
|
+
/**
 * Collects the string payloads matched by `NEXT_FLIGHT_PAYLOAD_RE` in the HTML.
 *
 * Each captured payload is decoded as the body of a JSON string literal.
 * Empty captures and payloads that fail to parse are skipped.
 *
 * @param html Raw page HTML.
 * @returns Decoded payload strings, in document order.
 */
function decodeNextFlightPayloads(html) {
    const decodedPayloads = [];
    for (const [, escapedPayload] of html.matchAll(NEXT_FLIGHT_PAYLOAD_RE)) {
        if (!escapedPayload) {
            continue;
        }
        let payload;
        try {
            payload = JSON.parse(`"${escapedPayload}"`);
        }
        catch {
            // Malformed fragment: skip it and keep processing the rest.
            continue;
        }
        decodedPayloads.push(payload);
    }
    return decodedPayloads;
}
|
|
1077
|
+
/**
 * Indexes identifier assignments found in decoded flight payload text.
 *
 * Three lookups are built:
 * - `templateMap`: name -> template-literal body (entity-decoded) for
 *   matches of `TEMPLATE_ASSIGNMENT_RE`;
 * - `aliasMap`: name -> other name, for objects whose body is only a
 *   spread (`x={...y}`);
 * - `objectMaps`: name -> Map(key -> referenced identifier) for objects
 *   whose comma-separated parts end in `key:identifier`.
 *
 * @param text Joined payload text.
 * @returns `{ templateMap, aliasMap, objectMaps }`.
 */
function parseFlightObjectRefs(text) {
    const templateMap = new Map();
    for (const [, identifier, source] of text.matchAll(TEMPLATE_ASSIGNMENT_RE)) {
        if (identifier && source) {
            templateMap.set(identifier, decodeHtmlEntities(source));
        }
    }
    const aliasMap = new Map();
    const objectMaps = new Map();
    const spreadOnly = /^\.\.\.([A-Za-z_$][\w$]*)$/;
    const keyToRef = /(?:"([^"]+)"|([A-Za-z_$][\w$]*)):([A-Za-z_$][\w$]*)$/;
    for (const [, objectName, rawBody] of text.matchAll(OBJECT_ASSIGNMENT_RE)) {
        const body = rawBody?.trim() ?? '';
        if (!objectName || !body) {
            continue;
        }
        // `{...other}` is recorded as an alias rather than an object map.
        const spreadTarget = spreadOnly.exec(body)?.[1];
        if (spreadTarget) {
            aliasMap.set(objectName, spreadTarget);
            continue;
        }
        const entries = new Map();
        body.split(',').forEach((segment) => {
            const parsed = keyToRef.exec(segment.trim());
            const key = parsed?.[1] ?? parsed?.[2];
            const ref = parsed?.[3];
            if (key && ref) {
                entries.set(key, ref);
            }
        });
        if (entries.size > 0) {
            objectMaps.set(objectName, entries);
        }
    }
    return { templateMap, aliasMap, objectMaps };
}
|
|
1110
|
+
/**
 * Resolves an identifier to template code using the maps produced by
 * `parseFlightObjectRefs`.
 *
 * Lookup order for each name: direct template, then alias (followed
 * recursively), then the values of an object map in insertion order
 * (first one that resolves wins). Names already visited are not
 * revisited, which also terminates reference cycles.
 *
 * @param name Identifier to resolve.
 * @param refs `{ templateMap, aliasMap, objectMaps }`.
 * @param seen Set of names already visited (shared across the recursion).
 * @returns The template code, or `undefined` when nothing resolves.
 */
function resolveFlightCodeRef(name, refs, seen = new Set()) {
    const visit = (ref) => {
        if (!ref || seen.has(ref)) {
            return undefined;
        }
        seen.add(ref);
        const template = refs.templateMap.get(ref);
        if (template) {
            return template;
        }
        const aliasTarget = refs.aliasMap.get(ref);
        if (aliasTarget) {
            return visit(aliasTarget);
        }
        const members = refs.objectMaps.get(ref);
        if (!members) {
            return undefined;
        }
        for (const member of members.values()) {
            const code = visit(member);
            if (code) {
                return code;
            }
        }
        return undefined;
    };
    return visit(name);
}
|
|
1130
|
+
/**
 * Prepares a value for use inside a markdown table cell.
 *
 * HTML entities are decoded, whitespace runs collapse to single spaces,
 * the result is trimmed, an empty result becomes `-`, and every `|` is
 * backslash-escaped.
 *
 * @param value Raw cell text.
 * @returns Single-line, pipe-safe cell text.
 */
function escapeMarkdownTableCell(value) {
    const collapsed = decodeHtmlEntities(value).replace(/\s+/g, ' ').trim();
    const cellText = collapsed === '' ? '-' : collapsed;
    return cellText.split('|').join('\\|');
}
|
|
1134
|
+
/**
 * Renders API rows as a four-column markdown table
 * (Prop, Type, Description, Default).
 *
 * @param rows Objects with `attribute`, `type`, `description`, `defaultValue`.
 * @returns The table text, or an empty string when `rows` is empty.
 */
function buildMarkdownTable(rows) {
    if (rows.length === 0) {
        return '';
    }
    const headerLines = [
        '| Prop | Type | Description | Default |',
        '| ---- | ---- | ----------- | ------- |',
    ];
    const bodyLines = rows.map((row) => {
        const cells = [row.attribute, row.type, row.description, row.defaultValue]
            .map((cell) => escapeMarkdownTableCell(cell));
        return `| ${cells.join(' | ')} |`;
    });
    return [...headerLines, ...bodyLines].join('\n');
}
|
|
1146
|
+
/**
 * Wraps code in a fenced markdown block.
 *
 * The fence language comes from `detectLanguageFromCode`, falling back to
 * `tsx` when no language is detected.
 *
 * @param code Source text.
 * @returns The fenced block, or an empty string when `code` is blank.
 */
function buildCodeBlock(code) {
    const source = code.trim();
    if (source.length === 0) {
        return '';
    }
    const fenceLanguage = detectLanguageFromCode(source) ?? 'tsx';
    return ['```' + fenceLanguage, source, '```'].join('\n');
}
|
|
1153
|
+
/**
 * Wraps a chart definition in a fenced `mermaid` block.
 *
 * The chart is decoded with `decodeFlightStringValue` and trimmed first.
 *
 * @param chart Raw (still escaped) chart text.
 * @returns The fenced block, or an empty string when the decoded chart is blank.
 */
function buildMermaidBlock(chart) {
    const diagram = decodeFlightStringValue(chart).trim();
    if (diagram.length === 0) {
        return '';
    }
    return ['```mermaid', diagram, '```'].join('\n');
}
|
|
1159
|
+
/**
 * Produces a comparison key for heading text.
 *
 * Markdown links are reduced to their label, whitespace runs collapse to
 * one space, the result is trimmed and lower-cased.
 *
 * @param value Heading text.
 * @returns Normalized key.
 */
function normalizeSupplementHeadingText(value) {
    const withoutLinks = value.replace(/\[([^\]]+)\]\([^)]*\)/g, '$1');
    const singleSpaced = withoutLinks.replace(/\s+/g, ' ');
    return singleSpaced.trim().toLowerCase();
}
|
|
1166
|
+
/**
 * Parses an ATX heading line (`#` to `######` followed by text).
 *
 * @param line A markdown line (surrounding whitespace is ignored).
 * @returns `{ level, title }` where `title` is normalized with
 *          `normalizeSupplementHeadingText`, or `null` for non-heading lines.
 */
function getMarkdownHeadingInfo(line) {
    const parsed = /^(#{1,6})\s+(.+?)\s*$/.exec(line.trim());
    if (parsed === null) {
        return null;
    }
    const [, hashes, rawTitle] = parsed;
    return {
        level: hashes?.length ?? 0,
        title: normalizeSupplementHeadingText(rawTitle ?? ''),
    };
}
|
|
1175
|
+
/**
 * Locates the section introduced by the first heading whose normalized
 * text equals `title`.
 *
 * The section runs from the heading line (`start`) up to, but not
 * including, the next heading of the same or a higher level (`end`), or
 * to the end of `lines` when there is none.
 *
 * Lines inside fenced code blocks are never treated as headings, so a
 * `# comment` in a shell snippet can neither start a section nor cut one
 * short in the middle of a fence.
 *
 * @param lines Markdown document split into lines.
 * @param title Heading text to look for (compared after normalization).
 * @returns `{ start, end }` line indexes, or `null` when no heading matches.
 */
function findMarkdownSection(lines, title) {
    const target = normalizeSupplementHeadingText(title);
    const headings = [];
    // Marker of the currently open fence, or null when outside a fence.
    let openFence = null;
    for (let i = 0; i < lines.length; i += 1) {
        const line = lines[i] ?? '';
        const fenceMatch = /^\s*(`{3,}|~{3,})(.*)$/.exec(line);
        if (fenceMatch) {
            const marker = fenceMatch[1] ?? '';
            if (openFence === null) {
                openFence = marker;
            }
            else if (marker[0] === openFence[0] &&
                marker.length >= openFence.length &&
                (fenceMatch[2] ?? '').trim() === '') {
                // A closing fence uses the same character, is at least as
                // long as the opener, and carries no info string.
                openFence = null;
            }
            continue;
        }
        if (openFence !== null)
            continue;
        const heading = getMarkdownHeadingInfo(line);
        if (heading)
            headings.push({ index: i, level: heading.level, title: heading.title });
    }
    const position = headings.findIndex((heading) => heading.title === target);
    if (position === -1)
        return null;
    const found = headings[position];
    const boundary = headings
        .slice(position + 1)
        .find((heading) => heading.level <= found.level);
    return { start: found.index, end: boundary ? boundary.index : lines.length };
}
|
|
1194
|
+
/**
 * Returns the text of a section without its heading line.
 *
 * @param lines   Markdown document split into lines.
 * @param section `{ start, end }` where `start` is the heading index and
 *                `end` is exclusive.
 * @returns The lines between heading and `end`, newline-joined and trimmed.
 */
function getSectionBody(lines, section) {
    const { start, end } = section;
    const bodyLines = lines.slice(start + 1, end);
    return bodyLines.join('\n').trim();
}
|
|
1200
|
+
/**
 * Replaces the body of the section titled `title`, mutating `lines`.
 *
 * The heading line is kept. A non-blank `body` is inserted surrounded by
 * one empty line on each side; a blank `body` leaves a single empty line.
 *
 * @param lines Markdown document split into lines (modified in place).
 * @param title Heading text of the section to replace.
 * @param body  New section body.
 * @returns `true` when the section was found and replaced, else `false`.
 */
function replaceMarkdownSection(lines, title, body) {
    const located = findMarkdownSection(lines, title);
    if (!located) {
        return false;
    }
    const trimmedBody = body.trim();
    const replacementLines = trimmedBody.length > 0
        ? ['', ...trimmedBody.split('\n'), '']
        : [''];
    const firstBodyIndex = located.start + 1;
    const bodyLineCount = located.end - located.start - 1;
    lines.splice(firstBodyIndex, bodyLineCount, ...replacementLines);
    return true;
}
|
|
1208
|
+
/**
 * Appends `body` to the section titled `title`, mutating `lines`.
 *
 * Nothing is changed when the section does not exist or its current body
 * already contains a code fence marker (three backticks). Otherwise the
 * new text is added after the existing body, separated by a blank line.
 *
 * @param lines Markdown document split into lines (modified in place).
 * @param title Heading text of the target section.
 * @param body  Text to append.
 * @returns `true` when the section was updated, else `false`.
 */
function appendMarkdownSection(lines, title, body) {
    const located = findMarkdownSection(lines, title);
    if (!located) {
        return false;
    }
    const existing = getSectionBody(lines, located);
    if (existing.includes('```')) {
        return false;
    }
    const addition = body.trim();
    const combined = existing ? `${existing}\n\n${addition}` : addition;
    return replaceMarkdownSection(lines, title, combined);
}
|
|
1218
|
+
/**
 * Collects supplemental content from the payloads that
 * `decodeNextFlightPayloads` extracts from the original HTML.
 *
 * The payloads are joined with newlines and scanned with the module-level
 * `FLIGHT_*` regular expressions to gather:
 *  - installation and import commands (capture groups of the first match),
 *  - API tables keyed by title, rendered with `buildMarkdownTable`,
 *  - Mermaid diagrams keyed by decoded title, rendered with
 *    `buildMermaidBlock`,
 *  - demo code blocks keyed by title, resolved through the parsed object
 *    references and rendered with `buildCodeBlock`.
 *
 * @param originalHtml HTML handed to `decodeNextFlightPayloads`.
 * @returns `null` when there are no payloads; otherwise an object with
 *   `apiTables`, `demoCodeBlocks` and `mermaidDiagrams` maps plus optional
 *   `installationCommands` / `importCommands` string arrays (omitted when no
 *   usable command was captured).
 */
function extractNextFlightSupplement(originalHtml) {
    const payloads = decodeNextFlightPayloads(originalHtml);
    if (payloads.length === 0)
        return null;
    const text = payloads.join('\n');
    const refs = parseFlightObjectRefs(text);
    // Capture groups that did not participate in a match are `undefined`;
    // joined later they would become blank lines inside the emitted code
    // block, so only non-blank captures are kept.
    const capturedCommands = (match) => match
        ? match
            .slice(1)
            .filter((value) => typeof value === 'string' && value.trim() !== '')
        : [];
    const installationCommands = capturedCommands(FLIGHT_INSTALL_RE.exec(text));
    const importCommands = capturedCommands(FLIGHT_IMPORT_RE.exec(text));
    const apiTables = new Map();
    for (const match of text.matchAll(FLIGHT_API_RE)) {
        const title = match[1];
        const rawRows = match[2] ?? '';
        if (!title)
            continue;
        const rows = [];
        for (const rowMatch of rawRows.matchAll(FLIGHT_API_ROW_RE)) {
            const attribute = rowMatch[1];
            const type = rowMatch[2];
            const description = rowMatch[3];
            const defaultValue = rowMatch[4];
            // Attribute and type must be non-empty; description and default
            // may be empty strings but must have been captured.
            if (!attribute ||
                !type ||
                description === undefined ||
                defaultValue === undefined) {
                continue;
            }
            rows.push({ attribute, type, description, defaultValue });
        }
        const table = buildMarkdownTable(rows);
        if (table)
            apiTables.set(title, table);
    }
    const mermaidDiagrams = new Map();
    for (const match of text.matchAll(FLIGHT_MERMAID_SECTION_RE)) {
        const title = match[1] ? decodeFlightStringValue(match[1]).trim() : '';
        const chart = match[2] ? buildMermaidBlock(match[2]) : '';
        if (title && chart)
            mermaidDiagrams.set(title, chart);
    }
    const demoCodeBlocks = new Map();
    for (const match of text.matchAll(FLIGHT_DEMO_RE)) {
        const title = match[1];
        const objectName = match[2];
        const key = match[3];
        // Look the demo's code reference up in the named object map.
        const ref = objectName
            ? refs.objectMaps.get(objectName)?.get(key ?? '')
            : undefined;
        const code = resolveFlightCodeRef(ref, refs);
        const codeBlock = code ? buildCodeBlock(code) : '';
        if (title && codeBlock)
            demoCodeBlocks.set(title, codeBlock);
    }
    return {
        ...(installationCommands.length > 0 ? { installationCommands } : {}),
        ...(importCommands.length > 0 ? { importCommands } : {}),
        apiTables,
        demoCodeBlocks,
        mermaidDiagrams,
    };
}
|
|
1278
|
+
/**
 * Enriches generated Markdown with content recovered by
 * `extractNextFlightSupplement` from the original HTML.
 *
 * Steps, in order:
 *  1. Append install commands to the "Installation" section unless its body
 *     already shows a package-manager install/add command.
 *  2. Append import commands to the "Import" section unless its body already
 *     shows an `import {` statement.
 *  3. Replace the body of each section that has an API table.
 *  4. Append each Mermaid diagram to its section unless the section already
 *     contains a mermaid fence.
 *  5. Append each demo code block to its section.
 *
 * Sections that do not exist in the Markdown are left alone.
 *
 * @param markdown Markdown produced so far.
 * @param originalHtml HTML passed to `extractNextFlightSupplement`.
 * @returns The supplemented Markdown, or `markdown` unchanged when no
 *   supplement is available.
 */
function supplementMarkdownFromNextFlight(markdown, originalHtml) {
    const supplement = extractNextFlightSupplement(originalHtml);
    if (!supplement) {
        return markdown;
    }
    const lines = markdown.split('\n');
    // Shared flow for the Installation and Import sections: skip when there
    // are no commands, no section, or the section already shows the command.
    const appendCommandsUnlessPresent = (title, commands, separator, alreadyPresent) => {
        if (!commands?.length) {
            return;
        }
        const section = findMarkdownSection(lines, title);
        if (!section) {
            return;
        }
        if (alreadyPresent.test(getSectionBody(lines, section))) {
            return;
        }
        appendMarkdownSection(lines, title, buildCodeBlock(commands.join(separator)));
    };
    appendCommandsUnlessPresent('Installation', supplement.installationCommands, '\n', /(npm|pnpm|yarn|bun|npx)\s+(install|add)/);
    appendCommandsUnlessPresent('Import', supplement.importCommands, '\n\n', /import\s+\{/);
    supplement.apiTables.forEach((table, title) => {
        replaceMarkdownSection(lines, title, table);
    });
    supplement.mermaidDiagrams.forEach((mermaidBlock, title) => {
        const section = findMarkdownSection(lines, title);
        if (!section) {
            return;
        }
        const existing = getSectionBody(lines, section);
        if (existing.includes('```mermaid')) {
            return;
        }
        const combined = existing ? `${existing}\n\n${mermaidBlock}` : mermaidBlock;
        replaceMarkdownSection(lines, title, combined);
    });
    supplement.demoCodeBlocks.forEach((codeBlock, title) => {
        appendMarkdownSection(lines, title, codeBlock);
    });
    return lines.join('\n');
}
|
|
957
1321
|
function resolveContentSource(params) {
|
|
958
1322
|
const { article, metadata: extractedMeta, document, truncated, } = extractContentContext(params.html, params.url, {
|
|
959
1323
|
extractArticle: true,
|
|
@@ -1001,6 +1365,8 @@ function buildMarkdownFromContext(context, url, signal) {
|
|
|
1001
1365
|
}
|
|
1002
1366
|
content = `#${prefix}${context.title}\n\n${content}`;
|
|
1003
1367
|
}
|
|
1368
|
+
content = supplementMarkdownFromNextFlight(content, context.originalHtml);
|
|
1369
|
+
content = cleanupMarkdownArtifacts(content, signal ? { signal, url } : { url });
|
|
1004
1370
|
return {
|
|
1005
1371
|
markdown: content,
|
|
1006
1372
|
title: context.title,
|
package/package.json
CHANGED