@j0hanz/fetch-url-mcp 1.9.2 → 1.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/code-lang.d.ts +4 -0
- package/dist/lib/code-lang.d.ts.map +1 -0
- package/dist/lib/code-lang.js +315 -0
- package/dist/lib/dom-prep.d.ts +4 -0
- package/dist/lib/dom-prep.d.ts.map +1 -0
- package/dist/lib/dom-prep.js +606 -0
- package/dist/lib/md-cleanup.d.ts +13 -0
- package/dist/lib/md-cleanup.d.ts.map +1 -0
- package/dist/lib/md-cleanup.js +391 -0
- package/dist/lib/md-metadata.d.ts +6 -0
- package/dist/lib/md-metadata.d.ts.map +1 -0
- package/dist/lib/md-metadata.js +186 -0
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +9 -6
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +510 -160
- package/package.json +1 -1
- package/dist/lib/content.d.ts +0 -17
- package/dist/lib/content.d.ts.map +0 -1
- package/dist/lib/content.js +0 -1399
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
import { config } from './core.js';
|
|
2
|
+
import { throwIfAborted } from './utils.js';
|
|
3
|
+
// Character codes used for cheap first/last-character checks on a line.
const ASCII_EXCLAMATION = '!'.charCodeAt(0);
const ASCII_HASH = '#'.charCodeAt(0);
const ASCII_ASTERISK = '*'.charCodeAt(0);
const ASCII_PLUS = '+'.charCodeAt(0);
const ASCII_DASH = '-'.charCodeAt(0);
const ASCII_PERIOD = '.'.charCodeAt(0);
const ASCII_DIGIT_0 = '0'.charCodeAt(0);
const ASCII_DIGIT_9 = '9'.charCodeAt(0);
const ASCII_QUESTION = '?'.charCodeAt(0);
const ASCII_BRACKET_OPEN = '['.charCodeAt(0);

// Title-case heuristic: word-count window and minimum number of capitalized words.
const TITLE_MIN_WORDS = 2;
const TITLE_MAX_WORDS = 10;
const TITLE_MIN_CAPITALIZED = 2;

// Upper bound (exclusive, relative to the start index) for the "is there content below?" scan.
const HAS_FOLLOWING_LOOKAHEAD = 10;

// Maximum number of replacement passes when splitting concatenated properties.
const PROPERTY_FIX_MAX_PASSES = 5;

// Lines longer than this are never considered heading candidates.
const MAX_LINE_LENGTH = 80;

// Opening code fence: optional indentation, then 3+ backticks or 3+ tildes (captured).
const FENCE_PATTERN = /^\s*(`{3,}|~{3,})/;
|
|
20
|
+
// Shared patterns for the markdown cleanup passes.
// NOTE: entries carrying the `g` flag are only passed to String#replace in this
// module; calling `.test()`/`.exec()` on them would advance `lastIndex`.
const REGEX = {
    // --- Headings -----------------------------------------------------------
    // A heading marker ("#" x1-6 followed by whitespace) at the start of any line.
    HEADING_MARKER: /^#{1,6}\s/m,
    HEADING_STRICT: /^#{1,6}\s+/m,
    // A heading marker with nothing but spaces/tabs/NBSP after it.
    EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/,
    // A heading whose entire text is a single in-page anchor link.
    ANCHOR_ONLY_HEADING: /^#{1,6}\s+\[[^\]]+\]\(#[^)]+\)\s*$/,
    // --- Fences and lists ---------------------------------------------------
    FENCE_START: FENCE_PATTERN,
    LIST_MARKER: /^(?:[-*+])\s/m,
    // --- Table of contents --------------------------------------------------
    // A "- [text](#anchor)" list item.
    TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/,
    TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents|on this page)\s*$/i,
    HTML_DOC_START: /^(<!doctype|<html)/i,
    // --- Navigation / boilerplate lines -------------------------------------
    // Whole lines made of skip links, "Was this page helpful", "Back to top" or empty-text links.
    COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??|\[Back to top\]\(#[^)]*\)|\[\s*\]\(https?:\/\/[^)]*\))\s*$/gim,
    // In-page anchor links whose text is empty, whitespace or zero-width spaces.
    ZERO_WIDTH_ANCHOR: /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g,
    // `key: "value"nextKey:` with no separator between the quoted value and the next key.
    CONCATENATED_PROPS: /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g,
    // --- Spacing ------------------------------------------------------------
    DOUBLE_NEWLINE_REDUCER: /\n{3,}/g,
    SOURCE_KEY: /^source:\s/im,
    // A heading line immediately followed by a non-empty line.
    HEADING_SPACING: /(^#{1,6}\s[^\n]*)\n([^\n])/gm,
    // A one-word heading glued to an opening backtick fence.
    HEADING_CODE_BLOCK: /(^#{1,6}\s+\w+)```/gm,
    // Two links with nothing between them: "](url)[".
    SPACING_LINK_FIX: /\]\(([^)]+)\)\[/g,
    // A link or inline code span directly followed by an alphanumeric character.
    SPACING_ADJ_COMBINED: /(?:\]\([^)]+\)|`[^`]+`)(?=[A-Za-z0-9])/g,
    SPACING_CODE_DASH: /(`[^`]+`)\s*\\-\s*/g,
    // Backslash-escaped "[", "]" or ".".
    SPACING_ESCAPES: /\\([[\].])/g,
    // A non-list, non-indented line directly followed by a list item.
    SPACING_LIST_NUM_COMBINED: /^((?![-*+] |\d+\. |[ \t]).+)\n((?:[-*+]|\d+\.) )/gm,
    // List items whose content is only punctuation tokens (-, *, +, |, /).
    PUNCT_ONLY_LIST_ARTIFACT: /^(?:[-*+]|\d+\.)\s*(?:\\[-*+|/]|[-*+|/])(?:\s+(?:\\[-*+|/]|[-*+|/]))*\s*$/gm,
    // A space-indented list marker (bullet or "N.").
    NESTED_LIST_INDENT: /^( +)((?:[-*+])|\d+\.)\s/gm,
    // Either a backtick-delimited code span (kept by the caller) or a /* ... */ comment,
    // optionally with backslash-escaped asterisks.
    TYPEDOC_COMMENT: /(`+)(?:(?!\1)[\s\S])*?\1|\s?\/\\?\*[\s\S]*?\\?\*\//g,
};
|
|
46
|
+
// Configured single-word heading keywords, lower-cased with the configured locale
// so lookups can lower-case the candidate the same way.
const HEADING_KEYWORDS = new Set(
    config.markdownCleanup.headingKeywords.map(
        (value) => value.toLocaleLowerCase(config.i18n.locale),
    ),
);

// Lines such as "Note: ..." / "Example: ..." that are promoted to headings.
const SPECIAL_PREFIXES = /^(?:example|note|tip|warning|important|caution):\s+\S/i;

// Table-of-contents detection limits.
const TOC_SCAN_LIMIT = 20; // scan bound, relative to the heading index
const TOC_MAX_NON_EMPTY = 12; // stop counting after this many non-empty lines
const TOC_LINK_RATIO_THRESHOLD = 0.8; // link share must be strictly above this

// Line prefixes that mark TypeDoc-generated artifact lines.
const TYPEDOC_PREFIXES = [
    'Defined in:',
    'Returns:',
    'Since:',
    'See also:',
];
|
|
57
|
+
/**
 * Build a stage-reporting abort check bound to the caller's options.
 *
 * @param {{ signal?: unknown, url?: string } | undefined} options - Cleanup options.
 * @returns {(stage: string) => void} Function that forwards the signal, the URL
 *   (empty string when absent) and the given stage to `throwIfAborted`.
 */
function createAbortChecker(options) {
    const abortSignal = options?.signal;
    const sourceUrl = options?.url ?? '';
    return function checkAbort(stage) {
        throwIfAborted(abortSignal, sourceUrl, stage);
    };
}
|
|
64
|
+
/**
 * Report whether a line is missing or contains only whitespace.
 *
 * @param {string | undefined} line
 * @returns {boolean}
 */
function isBlank(line) {
    if (line === undefined) {
        return true;
    }
    return line.trim().length === 0;
}
|
|
67
|
+
/**
 * Check whether a non-blank line follows `startIndex` within a bounded window.
 *
 * The window ends (exclusively) at `startIndex + HAS_FOLLOWING_LOOKAHEAD` so the
 * scan stays cheap on very large inputs.
 *
 * @param {Array<string | undefined>} lines
 * @param {number} startIndex
 * @returns {boolean}
 */
function hasFollowingContent(lines, startIndex) {
    const windowEnd = Math.min(lines.length, startIndex + HAS_FOLLOWING_LOOKAHEAD);
    let cursor = startIndex + 1;
    while (cursor < windowEnd) {
        if (!isBlank(lines[cursor])) {
            return true;
        }
        cursor += 1;
    }
    return false;
}
|
|
75
|
+
/**
 * Turn a heading made only of an in-page anchor link into a plain heading,
 * e.g. "## [Intro](#intro)" becomes "## Intro". Other lines are returned as-is.
 *
 * @param {string} line
 * @returns {string}
 */
function stripAnchorOnlyHeading(line) {
    const parts = /^(#{1,6})\s+\[([^\]]+)\]\(#[^)]+\)\s*$/.exec(line);
    if (parts === null) {
        return line;
    }
    const [, hashes, label] = parts;
    return `${hashes} ${label}`;
}
|
|
78
|
+
/**
 * Heuristic: does this trimmed line look like a heading?
 *
 * - Lines longer than MAX_LINE_LENGTH never qualify.
 * - A single word qualifies only if it starts with A-Z and is a configured keyword.
 * - Otherwise every word must be either "Capitalized" (A-Z followed by a-z only)
 *   or one of a small set of connector words, with at least
 *   TITLE_MIN_CAPITALIZED capitalized words in total.
 *
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {boolean}
 */
function isTitleCaseOrKeyword(trimmed) {
    if (trimmed.length > MAX_LINE_LENGTH) {
        return false;
    }
    // Single word: keyword lookup only.
    if (!trimmed.includes(' ')) {
        return /^[A-Z]/.test(trimmed) &&
            HEADING_KEYWORDS.has(trimmed.toLocaleLowerCase(config.i18n.locale));
    }
    const tokens = trimmed.split(/\s+/);
    if (tokens.length < TITLE_MIN_WORDS || tokens.length > TITLE_MAX_WORDS) {
        return false;
    }
    let capitalized = 0;
    for (const token of tokens) {
        if (!token) {
            continue;
        }
        if (/^[A-Z][a-z]*$/.test(token)) {
            capitalized += 1;
            continue;
        }
        // Anything that is neither capitalized nor a connector disqualifies the line.
        if (!/^(?:and|or|the|of|in|for|to|a)$/i.test(token)) {
            return false;
        }
    }
    return capitalized >= TITLE_MIN_CAPITALIZED;
}
|
|
106
|
+
/**
 * Decide which heading prefix, if any, an orphan line should receive.
 *
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {'## ' | '### ' | null} "### " for "Example: ..." lines, "## " for the
 *   other special prefixes and for title-case/keyword lines, otherwise null.
 */
function getHeadingPrefix(trimmed) {
    if (trimmed.length > MAX_LINE_LENGTH) {
        return null;
    }
    // Fast path: only run the markup regexes when the first character could start
    // a heading, list item, ordered item or link.
    const leadCode = trimmed.charCodeAt(0);
    const mayBeMarkup = leadCode === ASCII_HASH ||
        leadCode === ASCII_DASH ||
        leadCode === ASCII_ASTERISK ||
        leadCode === ASCII_PLUS ||
        leadCode === ASCII_BRACKET_OPEN ||
        (leadCode >= ASCII_DIGIT_0 && leadCode <= ASCII_DIGIT_9);
    if (mayBeMarkup) {
        const alreadyMarkup = REGEX.HEADING_MARKER.test(trimmed) ||
            REGEX.LIST_MARKER.test(trimmed) ||
            /^\d+\.\s/.test(trimmed) ||
            /^\[.*\]\(.*\)$/.test(trimmed);
        if (alreadyMarkup) {
            return null;
        }
    }
    if (SPECIAL_PREFIXES.test(trimmed)) {
        return /^example:\s/i.test(trimmed) ? '### ' : '## ';
    }
    // Lines ending in sentence punctuation are treated as prose, not headings.
    const tailCode = trimmed.charCodeAt(trimmed.length - 1);
    const endsSentence = tailCode === ASCII_PERIOD ||
        tailCode === ASCII_EXCLAMATION ||
        tailCode === ASCII_QUESTION;
    if (endsSentence) {
        return null;
    }
    return isTitleCaseOrKeyword(trimmed) ? '## ' : null;
}
|
|
134
|
+
/**
 * Count link and non-link lines in the block that follows a TOC heading.
 *
 * Blank lines are ignored. Counting stops at the next heading, after
 * TOC_MAX_NON_EMPTY counted lines, or at index `headingIndex + TOC_SCAN_LIMIT`.
 *
 * @param {Array<string | undefined>} lines
 * @param {number} headingIndex - Index of the candidate TOC heading.
 * @returns {{ total: number, linkCount: number, nonLinkCount: number }}
 */
function getTocBlockStats(lines, headingIndex) {
    const scanEnd = Math.min(lines.length, headingIndex + TOC_SCAN_LIMIT);
    let total = 0;
    let linkCount = 0;
    for (let idx = headingIndex + 1; idx < scanEnd; idx += 1) {
        const raw = lines[idx];
        if (!raw) {
            continue;
        }
        const text = raw.trim();
        if (!text) {
            continue;
        }
        if (REGEX.HEADING_MARKER.test(text)) {
            break;
        }
        total += 1;
        if (REGEX.TOC_LINK.test(text)) {
            linkCount += 1;
        }
        if (total >= TOC_MAX_NON_EMPTY) {
            break;
        }
    }
    // Every counted line is either a TOC link or not, so the remainder is the non-link count.
    return { total, linkCount, nonLinkCount: total - linkCount };
}
|
|
158
|
+
/**
 * Find where a table-of-contents block ends.
 *
 * @param {Array<string | undefined>} lines
 * @param {number} startIndex - First index to inspect.
 * @returns {number} Index of the first non-blank line that is not a TOC link,
 *   or `lines.length` when no such line exists.
 */
function skipTocLines(lines, startIndex) {
    let cursor = startIndex;
    while (cursor < lines.length) {
        const raw = lines[cursor];
        if (raw !== undefined) {
            const text = raw.trim();
            if (text && !REGEX.TOC_LINK.test(text)) {
                return cursor;
            }
        }
        cursor += 1;
    }
    return lines.length;
}
|
|
171
|
+
/**
 * Detect TypeDoc artifact lines such as "Defined in: **`file.ts`**".
 *
 * The first prefix from TYPEDOC_PREFIXES that the trimmed line starts with
 * decides the outcome: the remainder must start with "**`" and contain "`**".
 *
 * @param {string} line
 * @returns {boolean}
 */
function isTypeDocArtifactLine(line) {
    const text = line.trim();
    const matchedPrefix = TYPEDOC_PREFIXES.find((prefix) => text.startsWith(prefix));
    if (matchedPrefix === undefined) {
        return false;
    }
    const remainder = text.slice(matchedPrefix.length).trimStart();
    return remainder.startsWith('**`') && remainder.includes('`**');
}
|
|
183
|
+
/**
 * Promote a stand-alone line to a heading when it looks like one.
 *
 * A line is an orphan when it is the first line or the previous line is
 * missing/blank. Lines without a special prefix ("Note: ...", "Example: ...")
 * are promoted only if content follows them.
 *
 * @param {Array<string | undefined>} lines
 * @param {number} i - Index of the current line.
 * @param {string} trimmed - Current line, trimmed.
 * @returns {string | null} The heading line, or null when nothing is promoted.
 */
function tryPromoteOrphan(lines, i, trimmed) {
    if (i !== 0) {
        const previous = lines[i - 1];
        if (previous && previous.trim().length !== 0) {
            return null;
        }
    }
    const prefix = getHeadingPrefix(trimmed);
    if (!prefix) {
        return null;
    }
    const needsFollowingContent = !SPECIAL_PREFIXES.test(trimmed);
    if (needsFollowingContent && !hasFollowingContent(lines, i)) {
        return null;
    }
    return prefix + trimmed;
}
|
|
196
|
+
/**
 * Decide whether the line at `i` opens a table-of-contents block to drop.
 *
 * @param {Array<string | undefined>} lines
 * @param {number} i - Index of the candidate TOC heading.
 * @param {string} trimmed - Candidate line, trimmed.
 * @param {boolean} removeToc - Whether TOC removal is enabled.
 * @param {{ signal?: unknown, url?: string } | undefined} options
 * @returns {number | null} Index at which processing should resume, or null
 *   when the line is not a removable TOC heading.
 */
function shouldSkipAsToc(lines, i, trimmed, removeToc, options) {
    const isTocHeading = removeToc && REGEX.TOC_HEADING.test(trimmed);
    if (!isTocHeading) {
        return null;
    }
    const stats = getTocBlockStats(lines, i);
    if (stats.total === 0 || stats.nonLinkCount > 0) {
        return null;
    }
    // NOTE(review): with nonLinkCount already required to be 0 this ratio looks
    // like it can only be 1; kept to preserve the original checks exactly.
    if (stats.linkCount / stats.total <= TOC_LINK_RATIO_THRESHOLD) {
        return null;
    }
    throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:toc');
    return skipTocLines(lines, i + 1);
}
|
|
208
|
+
/**
 * Normalize a single line before TOC/orphan handling.
 *
 * @param {Array<string | undefined>} lines
 * @param {number} i - Index of the current line.
 * @param {string} trimmed - Current line, trimmed.
 * @param {string} line - Current line, untrimmed.
 * @returns {string | null} null to drop the line (empty heading, or anchor-only
 *   heading with no content after it); the de-anchored heading for anchor-only
 *   headings; otherwise the original line.
 */
function normalizePreprocessLine(lines, i, trimmed, line) {
    if (REGEX.EMPTY_HEADING_LINE.test(trimmed)) {
        return null;
    }
    if (REGEX.ANCHOR_ONLY_HEADING.test(trimmed)) {
        return hasFollowingContent(lines, i) ? stripAnchorOnlyHeading(trimmed) : null;
    }
    return line;
}
|
|
217
|
+
/**
 * Apply TOC-block detection using the configured `removeTocBlocks` switch.
 *
 * @returns {number | null} Index at which to resume, or null when no TOC block is skipped.
 */
function maybeSkipTocBlock(lines, i, trimmed, options) {
    const removeToc = config.markdownCleanup.removeTocBlocks;
    return shouldSkipAsToc(lines, i, trimmed, removeToc, options);
}
|
|
220
|
+
/**
 * Promote an orphan line to a heading when the feature is enabled.
 *
 * @param {Array<string | undefined>} lines
 * @param {number} i - Index of the current line.
 * @param {string} trimmed - Current line, trimmed.
 * @param {(stage: string) => void} checkAbort - Abort check invoked before promotion.
 * @returns {string | null} Promoted heading line, or null.
 */
function maybePromoteOrphanHeading(lines, i, trimmed, checkAbort) {
    const enabled = config.markdownCleanup.promoteOrphanHeadings;
    if (enabled && trimmed.length !== 0) {
        checkAbort('markdown:cleanup:promote');
        return tryPromoteOrphan(lines, i, trimmed);
    }
    return null;
}
|
|
227
|
+
/**
 * Line-level cleanup pass: drops empty/dangling headings, skips table-of-contents
 * blocks and promotes orphan lines to headings.
 *
 * @param {Array<string | undefined>} lines - Lines of one non-fenced segment.
 * @param {{ signal?: unknown, url?: string } | undefined} options
 * @returns {string} The surviving lines joined with "\n".
 */
function preprocessLines(lines, options) {
    const checkAbort = createAbortChecker(options);
    const kept = [];
    let cursor = 0;
    while (cursor < lines.length) {
        const index = cursor;
        cursor += 1;
        const rawLine = lines[index];
        if (rawLine === undefined) {
            continue;
        }
        const trimmed = rawLine.trim();
        const normalized = normalizePreprocessLine(lines, index, trimmed, rawLine);
        if (normalized === null) {
            continue;
        }
        const resumeAt = maybeSkipTocBlock(lines, index, trimmed, options);
        if (resumeAt !== null) {
            // Drop the TOC heading and jump to the first line after the block;
            // never move backwards.
            cursor = Math.max(cursor, resumeAt);
            continue;
        }
        const promoted = maybePromoteOrphanHeading(lines, index, trimmed, checkAbort);
        kept.push(promoted ?? normalized);
    }
    return kept.join('\n');
}
|
|
251
|
+
/**
 * Clean one non-fenced segment: line-level preprocessing followed by the
 * whole-text regex passes.
 *
 * @param {string[]} lines
 * @param {{ signal?: unknown, url?: string } | undefined} options
 * @returns {string} Cleaned text, or '' for an empty line list.
 */
function processTextBuffer(lines, options) {
    if (lines.length === 0) {
        return '';
    }
    return applyGlobalRegexes(preprocessLines(lines, options), options);
}
|
|
257
|
+
/**
 * Remove TypeDoc artifact lines and comment blocks from the text.
 *
 * Backtick-delimited code spans are matched by the same pattern so that
 * comment-like text inside them is left untouched.
 *
 * @param {string} text
 * @returns {string}
 */
function removeTypeDocArtifacts(text) {
    const keptLines = [];
    for (const line of text.split('\n')) {
        if (!isTypeDocArtifactLine(line)) {
            keptLines.push(line);
        }
    }
    const keepCodeSpans = (match) => (match.startsWith('`') ? match : '');
    return keptLines.join('\n').replace(REGEX.TYPEDOC_COMMENT, keepCodeSpans);
}
|
|
264
|
+
/**
 * Strip empty in-page anchors and navigation boilerplate lines
 * (skip links, "Was this page helpful", "Back to top", empty-text links).
 *
 * @param {string} text
 * @returns {string}
 */
function removeSkipLinks(text) {
    const withoutEmptyAnchors = text.replace(REGEX.ZERO_WIDTH_ANCHOR, '');
    return withoutEmptyAnchors.replace(REGEX.COMBINED_LINE_REMOVALS, '');
}
|
|
269
|
+
/**
 * Move leading/trailing whitespace out of single-line inline code spans,
 * e.g. "` foo `" becomes " `foo` ". Spans without any alphanumeric
 * character, or without surrounding whitespace, are left unchanged.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeInlineCodeTokens(text) {
    return text.replace(/`([^`\n]+)`/g, (span, inner) => {
        const core = inner.trim();
        const nothingToTrim = core === inner;
        if (nothingToTrim || !/[A-Za-z0-9]/.test(core)) {
            return span;
        }
        const pieces = /^(\s*)(.*?)(\s*)$/.exec(inner);
        if (pieces === null) {
            return span;
        }
        const lead = pieces[1] ?? '';
        const body = pieces[2] ?? '';
        const tail = pieces[3] ?? '';
        return `${lead}\`${body}\`${tail}`;
    });
}
|
|
282
|
+
/**
 * Normalize spacing artifacts left by the HTML-to-markdown conversion.
 *
 * Passes, in order: separate adjacent links, add a space after links/code spans
 * glued to text, fix escaped dashes after code, drop needless escapes, separate
 * paragraphs from following lists, remove punctuation-only list items, collapse
 * runs of blank lines, add a space after sentence punctuation glued to an
 * uppercase letter, tidy inline code spans, fix backticks and angle brackets in
 * link text, and finally normalize nested list indentation.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeMarkdownSpacing(text) {
    let result = text
        .replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
        .replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
        .replace(REGEX.SPACING_CODE_DASH, '$1 - ')
        .replace(REGEX.SPACING_ESCAPES, '$1')
        .replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
        .replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
        .replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
    // Fix missing spaces after sentence-ending punctuation followed by uppercase.
    // Inline code spans and link destinations are matched first and returned
    // unchanged, so identifiers such as `React.Component` and URLs such as
    // "(https://host/API.Reference)" are not split apart.
    result = result.replace(/`[^`\n]+`|\]\([^)\n]*\)|([.!?:;])([A-Z])/g, (match, punct, upper) => (punct === undefined ? match : `${punct} ${upper}`));
    // Trim whitespace around token-like inline code spans.
    result = normalizeInlineCodeTokens(result);
    // Unescape backticks inside markdown link text.
    result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
    // Escape angle brackets inside markdown link text.
    result = result.replace(/\[([^\]]*<[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/</g, '\\<').replace(/>/g, '\\>')}](${url})`);
    return normalizeNestedListIndentation(result);
}
|
|
300
|
+
/**
 * Split `key: "value"nextKey:` runs so each property starts its own paragraph.
 *
 * One replacement pass consumes the following key, so chains need several
 * passes; the loop stops at a fixed point or after PROPERTY_FIX_MAX_PASSES.
 *
 * @param {string} text
 * @returns {string}
 */
function fixConcatenatedProperties(text) {
    let current = text;
    let passesLeft = PROPERTY_FIX_MAX_PASSES;
    while (passesLeft > 0) {
        const separated = current.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
        if (separated === current) {
            return current;
        }
        current = separated;
        passesLeft -= 1;
    }
    return current;
}
|
|
310
|
+
/**
 * Run the whole-text cleanup passes on one non-fenced segment, checking for
 * abort before each stage.
 *
 * Stages: NBSP replacement, heading spacing, optional TypeDoc artifact removal,
 * optional skip-link removal, spacing normalization, concatenated-property fix.
 *
 * @param {string} text
 * @param {{ signal?: unknown, url?: string } | undefined} options
 * @returns {string}
 */
function applyGlobalRegexes(text, options) {
    const checkAbort = createAbortChecker(options);
    let output = text.replace(/\u00A0/g, ' ');

    checkAbort('markdown:cleanup:headings');
    output = output.replace(REGEX.HEADING_SPACING, '$1\n\n$2');
    output = output.replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```');

    if (config.markdownCleanup.removeTypeDocComments) {
        checkAbort('markdown:cleanup:typedoc');
        output = removeTypeDocArtifacts(output);
    }

    if (config.markdownCleanup.removeSkipLinks) {
        checkAbort('markdown:cleanup:skip-links');
        output = removeSkipLinks(output);
    }

    checkAbort('markdown:cleanup:spacing');
    output = normalizeMarkdownSpacing(output);

    checkAbort('markdown:cleanup:properties');
    return fixConcatenatedProperties(output);
}
|
|
330
|
+
/**
 * Double the indentation of space-indented list items whose indent is an even
 * number of at least two spaces; other indents are left untouched.
 * The whitespace after the marker is emitted as a single space.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeNestedListIndentation(text) {
    return text.replace(REGEX.NESTED_LIST_INDENT, (whole, indent, bullet) => {
        const width = indent.length;
        const isEvenIndent = width >= 2 && width % 2 === 0;
        if (!isEvenIndent) {
            return whole;
        }
        // (width / 2) levels at four spaces each.
        return `${' '.repeat(width * 2)}${bullet} `;
    });
}
|
|
339
|
+
/**
 * Iterate over markdown content, splitting it into fenced (code) and
 * non-fenced segments. Fenced lines pass through unchanged; consecutive
 * non-fenced lines are joined with "\n" and handed to `processTextSegment`.
 *
 * A fence is closed by a line that, after optional indentation, consists of
 * the same fence character repeated at least as many times as in the opening
 * fence, followed only by whitespace. An unterminated fence extends to the end
 * of the content.
 *
 * @param {string} content - Markdown text; "\n" and "\r\n" line breaks are accepted.
 * @param {(text: string) => string} processTextSegment - Transform for non-fenced text.
 * @returns {string} The processed segments joined with "\n".
 */
export function processFencedContent(content, processTextSegment) {
    const segments = [];
    let pendingText = [];
    let fenceMarker = null;
    const flushPendingText = () => {
        if (pendingText.length > 0) {
            segments.push(processTextSegment(pendingText.join('\n')));
            pendingText = [];
        }
    };
    // A closing fence may be longer than the opening one, so count the run of
    // fence characters instead of requiring an exact match.
    const closesFence = (line, marker) => {
        const candidate = line.trimStart();
        const fenceChar = marker[0];
        let runLength = 0;
        while (candidate[runLength] === fenceChar) {
            runLength += 1;
        }
        return runLength >= marker.length && candidate.slice(runLength).trim() === '';
    };
    for (const line of content.split(/\r?\n/)) {
        if (fenceMarker !== null) {
            // Inside a fence: copy verbatim, including the closing fence line.
            segments.push(line);
            if (closesFence(line, fenceMarker)) {
                fenceMarker = null;
            }
            continue;
        }
        const openingMarker = FENCE_PATTERN.exec(line)?.[1] ?? null;
        if (!openingMarker) {
            pendingText.push(line);
            continue;
        }
        flushPendingText();
        segments.push(line);
        fenceMarker = openingMarker;
    }
    flushPendingText();
    return segments.join('\n');
}
|
|
380
|
+
/**
 * Remove a single short plain-text line at the very start of the text when it
 * is followed (after optional blank lines) by an H1 or H2 heading.
 *
 * The leading line may be 1-40 characters long and must not contain markdown
 * structure characters (#, >, |, backtick, -, *, +, digits, brackets, parentheses).
 *
 * @param {string} text
 * @returns {string}
 */
function stripLeadingBreadcrumbNoise(text) {
    const leading = /^([^\n#>|`\-*+\d[\]()]{1,40})\n(\s*\n)?(?=#{1,2}\s)/.exec(text);
    if (leading === null) {
        return text;
    }
    return text.slice(leading[0].length);
}
|
|
385
|
+
/**
 * Entry point of the markdown cleanup: cleans every non-fenced segment, leaves
 * fenced code untouched, trims the result and strips a leading breadcrumb line.
 *
 * @param {string} content - Markdown to clean; falsy input yields ''.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Signal and
 *   URL are forwarded to `throwIfAborted` together with a stage label.
 * @returns {string}
 */
export function cleanupMarkdownArtifacts(content, options) {
    if (!content) {
        return '';
    }
    throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:begin');
    const cleanSegment = (segment) => processTextBuffer(segment.split('\n'), options);
    const cleaned = processFencedContent(content, cleanSegment);
    return stripLeadingBreadcrumbNoise(cleaned.trim());
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { MetadataBlock } from '../transform/types.js';
|
|
2
|
+
export declare function extractTitleFromRawMarkdown(content: string): string | undefined;
|
|
3
|
+
export declare function addSourceToMarkdown(content: string, url: string): string;
|
|
4
|
+
export declare function isRawTextContent(content: string): boolean;
|
|
5
|
+
export declare function buildMetadataFooter(metadata?: MetadataBlock, fallbackUrl?: string): string;
|
|
6
|
+
//# sourceMappingURL=md-metadata.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"md-metadata.d.ts","sourceRoot":"","sources":["../../src/lib/md-metadata.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;AA4G3D,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAuCxE;AAmBD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import { config } from './core.js';
|
|
2
|
+
// Title scanning only starts new lines within the first BODY_SCAN_LIMIT characters.
const BODY_SCAN_LIMIT = 500;
// More than this many common HTML tags means the content is treated as HTML.
const HTML_TAG_DENSITY_LIMIT = 5;

// A heading marker ("#" x1-6 plus whitespace) at the start of any line.
const HEADING_MARKER = /^#{1,6}\s/m;
const HEADING_STRICT = /^#{1,6}\s+/m;
// A "source:" key at the start of any line (case-insensitive).
const SOURCE_KEY = /^source:\s/im;
// Content that begins with an HTML doctype or <html> tag.
const HTML_DOC_START = /^(<!doctype|<html)/i;
// A bullet list marker at the start of any line.
const LIST_MARKER = /^(?:[-*+])\s/m;
|
|
9
|
+
/**
 * Pick the line ending to use when inserting text into `content`.
 *
 * @param {string} content
 * @returns {'\r\n' | '\n'} CRLF if the content contains any CRLF, otherwise LF.
 */
function getLineEnding(content) {
    if (content.includes('\r\n')) {
        return '\r\n';
    }
    return '\n';
}
|
|
12
|
+
/**
 * Parse a leading front matter block delimited by "---" lines.
 *
 * The opening fence must be the very first line. The closing fence must be a
 * line consisting of exactly "---", terminated by the same line ending as the
 * opening fence or by the end of the content. Entries are simple `key: value`
 * lines; keys are lower-cased, values are trimmed and stripped of one pair of
 * matching surrounding quotes. Empty values are ignored; later duplicates win.
 *
 * @param {string} content
 * @returns {{
 *   range: { start: number, end: number, linesStart: number, linesEnd: number, lineEnding: string },
 *   entries: Map<string, string>
 * } | null} null when there is no well-formed front matter block. `linesStart`
 *   and `linesEnd` delimit the body between the fences; `end` is the offset
 *   just past the closing fence line.
 */
function parseFrontmatter(content) {
    const len = content.length;
    if (len < 4) {
        return null;
    }
    let lineEnding = null;
    if (content.startsWith('---\n')) {
        lineEnding = '\n';
    }
    else if (content.startsWith('---\r\n')) {
        lineEnding = '\r\n';
    }
    if (lineEnding === null) {
        return null;
    }
    const fenceLen = 3 + lineEnding.length;
    // Locate the closing fence. It has to start a line and fill that line, so a
    // "---" that merely appears inside a value does not end the block.
    let closeIndex = -1;
    let closeEnd = -1;
    let searchFrom = fenceLen;
    while (searchFrom < len) {
        const candidate = content.indexOf('---', searchFrom);
        if (candidate === -1) {
            break;
        }
        const afterDashes = candidate + 3;
        const startsLine = content.startsWith(lineEnding, candidate - lineEnding.length);
        const endsAtEof = afterDashes === len;
        const endsLine = endsAtEof || content.startsWith(lineEnding, afterDashes);
        if (startsLine && endsLine) {
            closeIndex = candidate;
            closeEnd = endsAtEof ? len : afterDashes + lineEnding.length;
            break;
        }
        searchFrom = candidate + 1;
    }
    if (closeIndex === -1) {
        return null;
    }
    const range = {
        start: 0,
        end: closeEnd,
        linesStart: fenceLen,
        linesEnd: closeIndex,
        lineEnding,
    };
    // Parse key-value entries in one pass.
    const entries = new Map();
    const fmBody = content.slice(range.linesStart, range.linesEnd);
    for (const rawLine of fmBody.split(lineEnding)) {
        const line = rawLine.trim();
        const colonIdx = line.indexOf(':');
        // Lines without a key before the colon are not entries.
        if (colonIdx <= 0) {
            continue;
        }
        const key = line.slice(0, colonIdx).trim().toLowerCase();
        let value = line.slice(colonIdx + 1).trim();
        // Strip surrounding quotes.
        const first = value.charAt(0);
        const last = value.charAt(value.length - 1);
        if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
            value = value.slice(1, -1).trim();
        }
        if (value) {
            entries.set(key, value);
        }
    }
    return { range, entries };
}
|
|
65
|
+
/**
 * Derive a title from the first non-blank line of the content.
 *
 * Only lines that start within the first BODY_SCAN_LIMIT characters are
 * considered, and only the first non-blank one decides the outcome.
 *
 * @param {string} content
 * @returns {string | undefined} The heading text when that line is a markdown
 *   heading with non-empty text, otherwise undefined.
 */
function scanBodyForTitle(content) {
    const scanEnd = Math.min(content.length, BODY_SCAN_LIMIT);
    let lineStart = 0;
    while (lineStart < scanEnd) {
        const newlineAt = content.indexOf('\n', lineStart);
        const lineEnd = newlineAt === -1 ? content.length : newlineAt;
        // trim() also removes a trailing "\r" left over from CRLF input.
        const text = content.slice(lineStart, lineEnd).trim();
        if (text !== '') {
            if (!HEADING_STRICT.test(text)) {
                return undefined;
            }
            return text.replace(HEADING_MARKER, '').trim() || undefined;
        }
        lineStart = lineEnd + 1;
    }
    return undefined;
}
|
|
87
|
+
/**
 * Extract a document title from raw markdown.
 *
 * Front matter `title` (then `name`) wins; otherwise the first non-blank body
 * line is used when it is a heading.
 *
 * @param {string} content
 * @returns {string | undefined}
 */
export function extractTitleFromRawMarkdown(content) {
    const frontmatter = parseFrontmatter(content);
    const declared = frontmatter?.entries.get('title') ?? frontmatter?.entries.get('name');
    return declared || scanBodyForTitle(content);
}
|
|
96
|
+
/**
 * Record the source URL in a markdown document, unless one is already present.
 *
 * - Existing front matter: a quoted `source:` entry is appended to it.
 * - No front matter, metadata format "markdown": a "Source: <url>" line is
 *   inserted after the first heading line, or prepended when there is no heading.
 * - No front matter, other formats: a new front matter block is prepended.
 *
 * @param {string} content
 * @param {string} url
 * @returns {string}
 */
export function addSourceToMarkdown(content, url) {
    const frontmatter = parseFrontmatter(content);
    const useMarkdownFormat = config.transform.metadataFormat === 'markdown';

    if (frontmatter) {
        const { linesStart, linesEnd, lineEnding: fmLineEnding } = frontmatter.range;
        if (SOURCE_KEY.test(content.slice(linesStart, linesEnd))) {
            return content;
        }
        const entry = `source: "${url.replace(/"/g, '\\"')}"${fmLineEnding}`;
        return content.slice(0, linesEnd) + entry + content.slice(linesEnd);
    }

    const lineEnding = getLineEnding(content);
    if (!useMarkdownFormat) {
        const quotedUrl = url.replace(/"/g, '\\"');
        return `---${lineEnding}source: "${quotedUrl}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
    }

    if (SOURCE_KEY.test(content)) {
        return content;
    }
    // The first heading of any level anchors the insertion point.
    const headingMatch = HEADING_MARKER.exec(content);
    if (!headingMatch) {
        return `Source: ${url}${lineEnding}${lineEnding}${content}`;
    }
    const headingLineEnd = content.indexOf(lineEnding, headingMatch.index);
    const insertAt = headingLineEnd === -1
        ? content.length
        : headingLineEnd + lineEnding.length;
    const sourceLine = `${lineEnding}Source: ${url}${lineEnding}`;
    return content.slice(0, insertAt) + sourceLine + content.slice(insertAt);
}
|
|
127
|
+
// endregion
|
|
128
|
+
// region Content Detection & Metadata Footer
|
|
129
|
+
/**
 * Count occurrences of common HTML tags, stopping as soon as the count
 * exceeds `limit`.
 *
 * @param {string} content
 * @param {number} limit - Values <= 0 disable counting.
 * @returns {number} 0 when `limit` <= 0, otherwise the number of matches,
 *   capped at `limit + 1`.
 */
function countCommonTags(content, limit) {
    if (limit <= 0) {
        return 0;
    }
    const tagPattern = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
    let seen = 0;
    for (const _match of content.matchAll(tagPattern)) {
        seen += 1;
        if (seen > limit) {
            break;
        }
    }
    return seen;
}
|
|
141
|
+
/**
 * Heuristically decide whether content is raw markdown/text rather than HTML.
 *
 * @param {string} content
 * @returns {boolean} false for HTML documents or tag-dense content; true for
 *   content with front matter, or with a heading, bullet list item or "```".
 */
export function isRawTextContent(content) {
    const trimmed = content.trim();
    if (HTML_DOC_START.test(trimmed)) {
        return false;
    }
    if (parseFrontmatter(trimmed) !== null) {
        return true;
    }
    const tooManyTags = countCommonTags(content, HTML_TAG_DENSITY_LIMIT) > HTML_TAG_DENSITY_LIMIT;
    if (tooManyTags) {
        return false;
    }
    if (HEADING_MARKER.test(content)) {
        return true;
    }
    if (LIST_MARKER.test(content)) {
        return true;
    }
    return content.includes('```');
}
|
|
154
|
+
/**
 * Format a fetch timestamp as a numeric date in the configured locale.
 *
 * @param {string} value - Anything accepted by the Date constructor.
 * @returns {string} The formatted date (2-digit day and month, numeric year),
 *   or `value` unchanged when it cannot be parsed as a date.
 */
function formatFetchedAt(value) {
    const parsed = new Date(value);
    if (Number.isNaN(parsed.getTime())) {
        return value;
    }
    const dateOptions = { day: '2-digit', month: '2-digit', year: 'numeric' };
    return new Intl.DateTimeFormat(config.i18n.locale, dateOptions).format(parsed);
}
|
|
165
|
+
/**
 * Build the markdown footer that summarises page metadata.
 *
 * The footer starts with a "---" rule and a blank line, followed by one line
 * of " | "-separated italic parts (title, author, source link, fetch date) and,
 * when present, a `<sub>` description line.
 *
 * @param {{ title?: string, author?: string, url?: string, fetchedAt?: string, description?: string } | undefined} metadata
 * @param {string} [fallbackUrl] - Used when `metadata.url` is empty.
 * @returns {string} '' when no metadata is given.
 */
export function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata) {
        return '';
    }
    const sourceUrl = metadata.url || fallbackUrl;
    const parts = [
        metadata.title && `_${metadata.title}_`,
        metadata.author && `_${metadata.author}_`,
        sourceUrl && `[_Original Source_](${sourceUrl})`,
        metadata.fetchedAt && `_${formatFetchedAt(metadata.fetchedAt)}_`,
    ].filter(Boolean);
    const footer = ['---', ''];
    if (parts.length > 0) {
        footer.push(` ${parts.join(' | ')}`);
    }
    if (metadata.description) {
        footer.push(` <sub>${metadata.description}</sub>`);
    }
    return footer.join('\n');
}
|
|
186
|
+
// endregion
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAigBA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
2
|
-
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../lib/
|
|
2
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../lib/code-lang.js';
|
|
3
3
|
import { isLikeNode, isObject } from '../lib/utils.js';
|
|
4
4
|
// ---------------------------------------------------------------------------
|
|
5
5
|
// Shared constant
|
|
@@ -255,11 +255,13 @@ const GFM_ALERT_MAP = new Map([
|
|
|
255
255
|
['danger', 'CAUTION'],
|
|
256
256
|
['important', 'IMPORTANT'],
|
|
257
257
|
]);
|
|
258
|
+
const ADMONITION_TOKEN_RE = /^(?:note|tip|hint|info|warning|warn|danger|caution|important)$/i;
|
|
258
259
|
function resolveGfmAlertType(className) {
|
|
259
|
-
const
|
|
260
|
-
for (const
|
|
261
|
-
|
|
262
|
-
|
|
260
|
+
const tokens = className.toLowerCase().split(/\s+/);
|
|
261
|
+
for (const token of tokens) {
|
|
262
|
+
const mapped = GFM_ALERT_MAP.get(token);
|
|
263
|
+
if (mapped)
|
|
264
|
+
return mapped;
|
|
263
265
|
}
|
|
264
266
|
return undefined;
|
|
265
267
|
}
|
|
@@ -278,11 +280,12 @@ function buildDivTranslator(ctx) {
|
|
|
278
280
|
postprocess: ({ content }) => `\n\n\`\`\`mermaid\n${content.trim()}\n\`\`\`\n\n`,
|
|
279
281
|
};
|
|
280
282
|
}
|
|
283
|
+
const classTokens = className.split(/\s+/);
|
|
281
284
|
const isAdmonition = className.includes('admonition') ||
|
|
282
285
|
className.includes('callout') ||
|
|
283
286
|
className.includes('custom-block') ||
|
|
284
287
|
getAttribute('role') === 'alert' ||
|
|
285
|
-
|
|
288
|
+
classTokens.some((t) => ADMONITION_TOKEN_RE.test(t));
|
|
286
289
|
if (isAdmonition) {
|
|
287
290
|
return {
|
|
288
291
|
postprocess: ({ content }) => {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAgDA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAqJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAwYD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA8KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuJD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAiED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAuCD,iBAAS,eAAe,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAc/D;AAED,iBAAS,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAYlE;AA6CD,iBAAS,yBAAyB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAcvD;AAED,eAAO,MAAM,mBAAmB;;;;CAItB,CAAC;AA2vBX,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAqCzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAkH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAu
B,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|