@j0hanz/fetch-url-mcp 1.9.2 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,391 @@
1
+ import { config } from './core.js';
2
+ import { throwIfAborted } from './utils.js';
3
// ASCII code points compared against `charCodeAt()` results in getHeadingPrefix.
const ASCII_HASH = 35;
const ASCII_ASTERISK = 42;
const ASCII_PLUS = 43;
const ASCII_DASH = 45;
const ASCII_PERIOD = 46;
const ASCII_DIGIT_0 = 48;
const ASCII_DIGIT_9 = 57;
const ASCII_EXCLAMATION = 33;
const ASCII_QUESTION = 63;
const ASCII_BRACKET_OPEN = 91;

// Word-count bounds and capitalized-word minimum used by isTitleCaseOrKeyword.
const TITLE_MIN_WORDS = 2;
const TITLE_MAX_WORDS = 10;
const TITLE_MIN_CAPITALIZED = 2;

// Upper bound (exclusive offset from the start index) for hasFollowingContent.
const HAS_FOLLOWING_LOOKAHEAD = 10;
// Maximum number of replace passes in fixConcatenatedProperties.
const PROPERTY_FIX_MAX_PASSES = 5;
// Lines longer than this are never considered heading candidates.
const MAX_LINE_LENGTH = 80;

// Opening code fence: optional indentation, then 3+ backticks or 3+ tildes.
const FENCE_PATTERN = /^\s*(`{3,}|~{3,})/;

// Patterns shared by the cleanup passes in this module. Entries that no
// function in this module read (HEADING_STRICT, FENCE_START, HTML_DOC_START,
// SOURCE_KEY) were removed; the object is module-private.
const REGEX = {
    HEADING_MARKER: /^#{1,6}\s/m,
    EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/,
    ANCHOR_ONLY_HEADING: /^#{1,6}\s+\[[^\]]+\]\(#[^)]+\)\s*$/,
    LIST_MARKER: /^(?:[-*+])\s/m,
    TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/,
    TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents|on this page)\s*$/i,
    COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??|\[Back to top\]\(#[^)]*\)|\[\s*\]\(https?:\/\/[^)]*\))\s*$/gim,
    ZERO_WIDTH_ANCHOR: /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g,
    CONCATENATED_PROPS: /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g,
    DOUBLE_NEWLINE_REDUCER: /\n{3,}/g,
    HEADING_SPACING: /(^#{1,6}\s[^\n]*)\n([^\n])/gm,
    HEADING_CODE_BLOCK: /(^#{1,6}\s+\w+)```/gm,
    SPACING_LINK_FIX: /\]\(([^)]+)\)\[/g,
    SPACING_ADJ_COMBINED: /(?:\]\([^)]+\)|`[^`]+`)(?=[A-Za-z0-9])/g,
    SPACING_CODE_DASH: /(`[^`]+`)\s*\\-\s*/g,
    SPACING_ESCAPES: /\\([[\].])/g,
    SPACING_LIST_NUM_COMBINED: /^((?![-*+] |\d+\. |[ \t]).+)\n((?:[-*+]|\d+\.) )/gm,
    PUNCT_ONLY_LIST_ARTIFACT: /^(?:[-*+]|\d+\.)\s*(?:\\[-*+|/]|[-*+|/])(?:\s+(?:\\[-*+|/]|[-*+|/]))*\s*$/gm,
    NESTED_LIST_INDENT: /^( +)((?:[-*+])|\d+\.)\s/gm,
    TYPEDOC_COMMENT: /(`+)(?:(?!\1)[\s\S])*?\1|\s?\/\\?\*[\s\S]*?\\?\*\//g,
};

// Configured heading keywords, lower-cased with the configured locale so that
// lookups can lower-case their input the same way.
const HEADING_KEYWORDS = new Set(config.markdownCleanup.headingKeywords.map((value) => value.toLocaleLowerCase(config.i18n.locale)));
// Label-style line openers ("Note: ...") that are promoted to headings.
const SPECIAL_PREFIXES = /^(?:example|note|tip|warning|important|caution):\s+\S/i;

// Table-of-contents detection limits used by getTocBlockStats / shouldSkipAsToc.
const TOC_SCAN_LIMIT = 20;
const TOC_MAX_NON_EMPTY = 12;
const TOC_LINK_RATIO_THRESHOLD = 0.8;

// Line prefixes checked by isTypeDocArtifactLine.
const TYPEDOC_PREFIXES = [
    'Defined in:',
    'Returns:',
    'Since:',
    'See also:',
];
57
/**
 * Create a callback that forwards a stage label to `throwIfAborted`, together
 * with the signal and URL captured from `options` (URL falls back to '').
 */
function createAbortChecker(options) {
    const abortSignal = options?.signal;
    const sourceUrl = options?.url ?? '';
    return function checkAbort(stage) {
        throwIfAborted(abortSignal, sourceUrl, stage);
    };
}
64
/** True when the line is missing (`undefined`) or contains only whitespace. */
function isBlank(line) {
    if (line === undefined) {
        return true;
    }
    return line.trim() === '';
}
67
/**
 * Report whether a non-blank line appears shortly after `startIndex`.
 * The scan covers indices startIndex + 1 up to (but excluding)
 * startIndex + HAS_FOLLOWING_LOOKAHEAD, so large inputs are never fully walked.
 */
function hasFollowingContent(lines, startIndex) {
    const stop = Math.min(lines.length, startIndex + HAS_FOLLOWING_LOOKAHEAD);
    let cursor = startIndex + 1;
    while (cursor < stop) {
        if (!isBlank(lines[cursor])) {
            return true;
        }
        cursor += 1;
    }
    return false;
}
75
/**
 * Turn a heading whose whole text is a same-page link (`## [Text](#id)`)
 * into a plain heading (`## Text`). Other lines are returned unchanged.
 */
function stripAnchorOnlyHeading(line) {
    const anchorHeading = /^(#{1,6})\s+\[([^\]]+)\]\(#[^)]+\)\s*$/;
    return line.replace(anchorHeading, (_whole, hashes, label) => `${hashes} ${label}`);
}
78
/**
 * Heuristic: does this trimmed line look like a heading?
 *
 * - Lines longer than MAX_LINE_LENGTH never qualify.
 * - A line without a space qualifies only if it starts with A-Z and its
 *   lower-cased form is in HEADING_KEYWORDS.
 * - Otherwise every word must be either Capitalized (`[A-Z][a-z]*`) or one of
 *   a small set of connector words, the word count must be within
 *   TITLE_MIN_WORDS..TITLE_MAX_WORDS, and at least TITLE_MIN_CAPITALIZED
 *   words must be capitalized.
 */
function isTitleCaseOrKeyword(trimmed) {
    if (trimmed.length > MAX_LINE_LENGTH) {
        return false;
    }
    const isSingleWord = !trimmed.includes(' ');
    if (isSingleWord) {
        return /^[A-Z]/.test(trimmed)
            && HEADING_KEYWORDS.has(trimmed.toLocaleLowerCase(config.i18n.locale));
    }
    const tokens = trimmed.split(/\s+/);
    if (tokens.length < TITLE_MIN_WORDS || tokens.length > TITLE_MAX_WORDS) {
        return false;
    }
    const capitalizedWord = /^[A-Z][a-z]*$/;
    const connectorWord = /^(?:and|or|the|of|in|for|to|a)$/i;
    let capitalized = 0;
    for (const token of tokens) {
        if (!token) {
            continue;
        }
        if (capitalizedWord.test(token)) {
            capitalized += 1;
        }
        else if (!connectorWord.test(token)) {
            // Any other kind of word disqualifies the whole line.
            return false;
        }
    }
    return capitalized >= TITLE_MIN_CAPITALIZED;
}
106
/**
 * Decide which heading prefix, if any, a trimmed line should receive.
 *
 * Returns '### ' for "Example: ..." lines, '## ' for the other special
 * prefixes and for title-case/keyword lines, and null when the line is too
 * long, already is a heading / list item / standalone link, or ends with
 * '.', '!' or '?'.
 */
function getHeadingPrefix(trimmed) {
    if (trimmed.length > MAX_LINE_LENGTH) {
        return null;
    }
    // Only run the markup regexes when the first character could start markup.
    const leadCode = trimmed.charCodeAt(0);
    const leadIsDigit = leadCode >= ASCII_DIGIT_0 && leadCode <= ASCII_DIGIT_9;
    const leadIsMarkupChar = [
        ASCII_HASH,
        ASCII_DASH,
        ASCII_ASTERISK,
        ASCII_PLUS,
        ASCII_BRACKET_OPEN,
    ].includes(leadCode);
    if (leadIsMarkupChar || leadIsDigit) {
        const alreadyMarkup = [
            REGEX.HEADING_MARKER,
            REGEX.LIST_MARKER,
            /^\d+\.\s/,
            /^\[.*\]\(.*\)$/,
        ].some((pattern) => pattern.test(trimmed));
        if (alreadyMarkup) {
            return null;
        }
    }
    if (SPECIAL_PREFIXES.test(trimmed)) {
        const isExample = /^example:\s/i.test(trimmed);
        return isExample ? '### ' : '## ';
    }
    const tailCode = trimmed.charCodeAt(trimmed.length - 1);
    const endsLikeSentence = tailCode === ASCII_PERIOD
        || tailCode === ASCII_EXCLAMATION
        || tailCode === ASCII_QUESTION;
    if (endsLikeSentence) {
        return null;
    }
    return isTitleCaseOrKeyword(trimmed) ? '## ' : null;
}
134
/**
 * Count the non-empty lines that follow a candidate table-of-contents heading.
 *
 * Scanning stops at the next heading line, after TOC_MAX_NON_EMPTY counted
 * lines, or at headingIndex + TOC_SCAN_LIMIT, whichever comes first.
 *
 * @returns {{ total: number, linkCount: number, nonLinkCount: number }}
 */
function getTocBlockStats(lines, headingIndex) {
    const stats = { total: 0, linkCount: 0, nonLinkCount: 0 };
    const scanEnd = Math.min(lines.length, headingIndex + TOC_SCAN_LIMIT);
    for (let row = headingIndex + 1; row < scanEnd; row += 1) {
        const text = lines[row]?.trim();
        if (!text) {
            continue;
        }
        if (REGEX.HEADING_MARKER.test(text)) {
            break;
        }
        stats.total += 1;
        if (REGEX.TOC_LINK.test(text)) {
            stats.linkCount += 1;
        }
        else {
            stats.nonLinkCount += 1;
        }
        if (stats.total >= TOC_MAX_NON_EMPTY) {
            break;
        }
    }
    return stats;
}
158
/**
 * Starting at `startIndex`, walk past blank lines and TOC link lines and
 * return the index of the first line that is neither. Returns `lines.length`
 * when no such line exists.
 */
function skipTocLines(lines, startIndex) {
    let cursor = startIndex;
    while (cursor < lines.length) {
        const entry = lines[cursor];
        const text = entry === undefined ? '' : entry.trim();
        if (text !== '' && !REGEX.TOC_LINK.test(text)) {
            return cursor;
        }
        cursor += 1;
    }
    return lines.length;
}
171
/**
 * True when the trimmed line starts with one of TYPEDOC_PREFIXES and the
 * remainder starts with "**`" and also contains "`**". Only the first
 * matching prefix is considered.
 */
function isTypeDocArtifactLine(line) {
    const text = line.trim();
    const matchedPrefix = TYPEDOC_PREFIXES.find((candidate) => text.startsWith(candidate));
    if (matchedPrefix === undefined) {
        return false;
    }
    const remainder = text.slice(matchedPrefix.length).trimStart();
    return remainder.startsWith('**`') && remainder.includes('`**');
}
183
/**
 * Promote a standalone line to a heading when it qualifies.
 *
 * A line is a candidate only if it is the first line or the previous line is
 * missing/blank. It is promoted when getHeadingPrefix yields a prefix and the
 * line either matches SPECIAL_PREFIXES or has content following it.
 *
 * @returns {string|null} the prefixed heading text, or null to leave the line alone
 */
function tryPromoteOrphan(lines, i, trimmed) {
    const previous = lines[i - 1];
    const followsText = i !== 0 && Boolean(previous) && previous.trim().length !== 0;
    if (followsText) {
        return null;
    }
    const headingPrefix = getHeadingPrefix(trimmed);
    if (!headingPrefix) {
        return null;
    }
    if (SPECIAL_PREFIXES.test(trimmed) || hasFollowingContent(lines, i)) {
        return `${headingPrefix}${trimmed}`;
    }
    return null;
}
196
/**
 * Decide whether the line at index `i` starts a table-of-contents block that
 * should be dropped.
 *
 * @returns {number|null} the index at which processing should resume, or null
 *   when TOC removal is disabled, the line is not a TOC heading, or the
 *   following block is empty / contains non-link lines / has too few links.
 */
function shouldSkipAsToc(lines, i, trimmed, removeToc, options) {
    const isTocHeading = removeToc && REGEX.TOC_HEADING.test(trimmed);
    if (!isTocHeading) {
        return null;
    }
    const stats = getTocBlockStats(lines, i);
    const hasOnlyLinks = stats.total !== 0 && !(stats.nonLinkCount > 0);
    if (!hasOnlyLinks) {
        return null;
    }
    const linkRatio = stats.linkCount / stats.total;
    if (linkRatio <= TOC_LINK_RATIO_THRESHOLD) {
        return null;
    }
    throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:toc');
    return skipTocLines(lines, i + 1);
}
208
/**
 * Normalize a single line before further processing.
 *
 * @returns {string|null} null to drop the line (empty heading, or anchor-only
 *   heading with nothing after it), the de-linked heading for anchor-only
 *   headings, or the original `line` otherwise.
 */
function normalizePreprocessLine(lines, i, trimmed, line) {
    if (REGEX.EMPTY_HEADING_LINE.test(trimmed)) {
        return null;
    }
    const isAnchorHeading = REGEX.ANCHOR_ONLY_HEADING.test(trimmed);
    if (!isAnchorHeading) {
        return line;
    }
    return hasFollowingContent(lines, i) ? stripAnchorOnlyHeading(trimmed) : null;
}
217
/** Run the TOC check using the `removeTocBlocks` setting from config. */
function maybeSkipTocBlock(lines, i, trimmed, options) {
    const removalEnabled = config.markdownCleanup.removeTocBlocks;
    return shouldSkipAsToc(lines, i, trimmed, removalEnabled, options);
}
220
/**
 * Attempt heading promotion for a line when the `promoteOrphanHeadings`
 * setting is on and the line is non-empty; the abort check runs first.
 * Returns null when promotion is disabled, the line is empty, or the line
 * does not qualify.
 */
function maybePromoteOrphanHeading(lines, i, trimmed, checkAbort) {
    const promotionEnabled = config.markdownCleanup.promoteOrphanHeadings;
    if (promotionEnabled && trimmed.length !== 0) {
        checkAbort('markdown:cleanup:promote');
        return tryPromoteOrphan(lines, i, trimmed);
    }
    return null;
}
227
/**
 * Line-by-line pass over a non-fenced text segment: drops empty and dangling
 * anchor headings, skips detected table-of-contents blocks, and promotes
 * orphan lines to headings. Returns the surviving lines joined with '\n'.
 */
function preprocessLines(lines, options) {
    const checkAbort = createAbortChecker(options);
    const kept = [];
    let index = 0;
    while (index < lines.length) {
        const raw = lines[index];
        if (raw === undefined) {
            index += 1;
            continue;
        }
        const trimmed = raw.trim();
        const normalized = normalizePreprocessLine(lines, index, trimmed, raw);
        if (normalized === null) {
            index += 1;
            continue;
        }
        const resumeAt = maybeSkipTocBlock(lines, index, trimmed, options);
        if (resumeAt !== null) {
            // Jump past the TOC block; always advance by at least one line.
            index = Math.max(resumeAt, index + 1);
            continue;
        }
        const promoted = maybePromoteOrphanHeading(lines, index, trimmed, checkAbort);
        kept.push(promoted ?? normalized);
        index += 1;
    }
    return kept.join('\n');
}
251
/**
 * Clean one non-fenced segment given as an array of lines: run the per-line
 * preprocessing, then the whole-text regex passes. An empty array yields ''.
 */
function processTextBuffer(lines, options) {
    return lines.length === 0
        ? ''
        : applyGlobalRegexes(preprocessLines(lines, options), options);
}
257
/**
 * Remove TypeDoc residue: first drop whole lines flagged by
 * isTypeDocArtifactLine, then delete block comments matched by
 * REGEX.TYPEDOC_COMMENT while leaving backtick-delimited matches untouched.
 */
function removeTypeDocArtifacts(text) {
    const keptLines = [];
    for (const line of text.split('\n')) {
        if (!isTypeDocArtifactLine(line)) {
            keptLines.push(line);
        }
    }
    const withoutArtifactLines = keptLines.join('\n');
    return withoutArtifactLines.replace(REGEX.TYPEDOC_COMMENT, (match) => {
        // The pattern also matches code spans so that comments inside them survive.
        if (match.startsWith('`')) {
            return match;
        }
        return '';
    });
}
264
/**
 * Strip empty same-page anchors (REGEX.ZERO_WIDTH_ANCHOR) and then the
 * boilerplate lines matched by REGEX.COMBINED_LINE_REMOVALS.
 */
function removeSkipLinks(text) {
    const withoutEmptyAnchors = text.replace(REGEX.ZERO_WIDTH_ANCHOR, '');
    return withoutEmptyAnchors.replace(REGEX.COMBINED_LINE_REMOVALS, '');
}
269
/**
 * Move leading/trailing whitespace out of single-line inline code spans, e.g.
 * "` foo `" becomes " `foo` ". Spans that are already tight, or that contain
 * no ASCII letter or digit, are left untouched.
 */
function normalizeInlineCodeTokens(text) {
    const hoistPadding = (span, inner) => {
        const core = inner.trim();
        const isPadded = core !== inner;
        if (!isPadded || !/[A-Za-z0-9]/.test(core)) {
            return span;
        }
        const pieces = /^(\s*)(.*?)(\s*)$/.exec(inner);
        if (pieces === null) {
            return span;
        }
        const [, lead = '', body = '', tail = ''] = pieces;
        return `${lead}\`${body}\`${tail}`;
    };
    return text.replace(/`([^`\n]+)`/g, hoistPadding);
}
282
/**
 * Apply the spacing-related cleanup passes, in a fixed order, to a text
 * segment. Each step feeds the next; the order matters because later
 * patterns run on the output of earlier replacements.
 */
function normalizeMarkdownSpacing(text) {
    // Adjacent links, glued links/code, escaped dashes and stray escapes.
    const linksSeparated = text.replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[');
    const adjacentPadded = linksSeparated.replace(REGEX.SPACING_ADJ_COMBINED, '$& ');
    const dashesFixed = adjacentPadded.replace(REGEX.SPACING_CODE_DASH, '$1 - ');
    const unescaped = dashesFixed.replace(REGEX.SPACING_ESCAPES, '$1');
    // List separation, punctuation-only list items, and excess blank lines.
    const listsSeparated = unescaped.replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2');
    const artifactsRemoved = listsSeparated.replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '');
    const blankLinesCollapsed = artifactsRemoved.replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
    // Insert a space after punctuation that is directly followed by an uppercase letter.
    const sentencesSpaced = blankLinesCollapsed.replace(/([.!?:;])([A-Z])/g, '$1 $2');
    // Trim whitespace around token-like inline code spans.
    const codeTightened = normalizeInlineCodeTokens(sentencesSpaced);
    // Unescape backticks inside markdown link text.
    const backticksRestored = codeTightened.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => {
        const label = linkText.replace(/\\`/g, '`');
        return `[${label}](${url})`;
    });
    // Escape angle brackets inside markdown link text.
    const anglesEscaped = backticksRestored.replace(/\[([^\]]*<[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => {
        const label = linkText.replace(/</g, '\\<').replace(/>/g, '\\>');
        return `[${label}](${url})`;
    });
    return normalizeNestedListIndentation(anglesEscaped);
}
300
/**
 * Split `key: "value"nextKey:` runs onto separate paragraphs. The replacement
 * is repeated until the text stops changing or PROPERTY_FIX_MAX_PASSES
 * passes have been made.
 */
function fixConcatenatedProperties(text) {
    let current = text;
    let pass = 0;
    while (pass < PROPERTY_FIX_MAX_PASSES) {
        const updated = current.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
        if (updated === current) {
            return current;
        }
        current = updated;
        pass += 1;
    }
    return current;
}
310
/**
 * Whole-text cleanup pipeline for a non-fenced segment. Replaces
 * non-breaking spaces, fixes heading spacing, optionally removes TypeDoc
 * artifacts and skip links (per config), normalizes spacing, and finally
 * separates concatenated properties. An abort check precedes every stage.
 */
function applyGlobalRegexes(text, options) {
    const checkAbort = createAbortChecker(options);
    let output = text.replace(/\u00A0/g, ' ');

    checkAbort('markdown:cleanup:headings');
    output = output.replace(REGEX.HEADING_SPACING, '$1\n\n$2');
    output = output.replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```');

    if (config.markdownCleanup.removeTypeDocComments) {
        checkAbort('markdown:cleanup:typedoc');
        output = removeTypeDocArtifacts(output);
    }

    if (config.markdownCleanup.removeSkipLinks) {
        checkAbort('markdown:cleanup:skip-links');
        output = removeSkipLinks(output);
    }

    checkAbort('markdown:cleanup:spacing');
    output = normalizeMarkdownSpacing(output);

    checkAbort('markdown:cleanup:properties');
    return fixConcatenatedProperties(output);
}
330
/**
 * Convert nested list indentation from 2-space steps to 4-space steps.
 *
 * Only list markers indented by an even number (>= 2) of spaces are touched;
 * the indent is doubled and the whitespace after the marker is normalized to
 * a single space.
 *
 * The pattern's trailing `\s` can also match a line terminator (an empty
 * item such as "  -" at end of line). In that case the terminator is kept
 * as-is, so the following line is not merged into the list item.
 *
 * @param {string} text markdown text outside of code fences
 * @returns {string} text with re-indented nested list markers
 */
function normalizeNestedListIndentation(text) {
    return text.replace(REGEX.NESTED_LIST_INDENT, (match, spaces, marker) => {
        const count = spaces.length;
        if (count < 2 || count % 2 !== 0) {
            return match;
        }
        // The final character of the match is the single `\s` after the marker.
        const separator = match.slice(-1);
        const isLineBreak = /[\n\r\u2028\u2029]/.test(separator);
        const indent = ' '.repeat(count * 2);
        return `${indent}${marker}${isLineBreak ? separator : ' '}`;
    });
}
339
/**
 * Iterate over markdown content, splitting it into fenced (code) and
 * non-fenced segments. Fenced lines pass through unchanged; non-fenced
 * segments are joined with '\n' and handed to `processTextSegment` for
 * transformation.
 *
 * A fence opens on a line matching FENCE_PATTERN. It closes on a line that,
 * apart from surrounding whitespace, consists only of the opening fence
 * character repeated at least as many times as in the opener (a longer
 * closing fence is valid per CommonMark). An unclosed fence runs to the end.
 *
 * @param {string} content markdown text; lines may end in '\n' or '\r\n'
 * @param {(text: string) => string} processTextSegment transform for non-fenced text
 * @returns {string} processed segments and untouched fenced lines joined with '\n'
 */
export function processFencedContent(content, processTextSegment) {
    const segments = [];
    let pendingText = [];
    let openFence = null;

    const flushPendingText = () => {
        if (pendingText.length > 0) {
            segments.push(processTextSegment(pendingText.join('\n')));
            pendingText = [];
        }
    };

    const closesOpenFence = (line) => {
        const candidate = line.trim();
        return candidate.length >= openFence.length
            && candidate === openFence[0].repeat(candidate.length);
    };

    for (const line of content.split(/\r?\n/)) {
        if (openFence !== null) {
            // Inside a fence every line, including the closer, is kept verbatim.
            segments.push(line);
            if (closesOpenFence(line)) {
                openFence = null;
            }
            continue;
        }
        const opener = FENCE_PATTERN.exec(line)?.[1] ?? null;
        if (opener === null) {
            pendingText.push(line);
            continue;
        }
        flushPendingText();
        segments.push(line);
        openFence = opener;
    }
    flushPendingText();
    return segments.join('\n');
}
380
/**
 * Drop a short (1-40 character) plain-text first line when it is followed,
 * optionally after blank space, by an H1 or H2 heading. The first line must
 * not contain markdown-looking characters or digits.
 */
function stripLeadingBreadcrumbNoise(text) {
    const leadingNoise = /^([^\n#>|`\-*+\d[\]()]{1,40})\n(\s*\n)?(?=#{1,2}\s)/;
    const hit = leadingNoise.exec(text);
    // The pattern is anchored at the start, so a match always begins at index 0.
    return hit === null ? text : text.slice(hit[0].length);
}
385
/**
 * Entry point: clean markdown text while leaving fenced code blocks intact.
 *
 * Falsy content yields ''. Otherwise an abort check runs, each non-fenced
 * segment is cleaned via processTextBuffer, the result is trimmed, and a
 * leading breadcrumb-style line is removed.
 */
export function cleanupMarkdownArtifacts(content, options) {
    if (!content) {
        return '';
    }
    throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:begin');
    const cleanSegment = (segment) => processTextBuffer(segment.split('\n'), options);
    const cleaned = processFencedContent(content, cleanSegment).trim();
    return stripLeadingBreadcrumbNoise(cleaned);
}
@@ -0,0 +1,6 @@
1
+ import type { MetadataBlock } from '../transform/types.js';
2
+ export declare function extractTitleFromRawMarkdown(content: string): string | undefined;
3
+ export declare function addSourceToMarkdown(content: string, url: string): string;
4
+ export declare function isRawTextContent(content: string): boolean;
5
+ export declare function buildMetadataFooter(metadata?: MetadataBlock, fallbackUrl?: string): string;
6
+ //# sourceMappingURL=md-metadata.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"md-metadata.d.ts","sourceRoot":"","sources":["../../src/lib/md-metadata.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,uBAAuB,CAAC;AA4G3D,wBAAgB,2BAA2B,CACzC,OAAO,EAAE,MAAM,GACd,MAAM,GAAG,SAAS,CAOpB;AACD,wBAAgB,mBAAmB,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,CAuCxE;AAmBD,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAczD;AAaD,wBAAgB,mBAAmB,CACjC,QAAQ,CAAC,EAAE,aAAa,EACxB,WAAW,CAAC,EAAE,MAAM,GACnB,MAAM,CAmBR"}
@@ -0,0 +1,186 @@
1
+ import { config } from './core.js';
2
// Patterns used for title extraction, source injection and content detection.
const HEADING_MARKER = /^#{1,6}\s/m;
const HEADING_STRICT = /^#{1,6}\s+/m;
const LIST_MARKER = /^(?:[-*+])\s/m;
const SOURCE_KEY = /^source:\s/im;
const HTML_DOC_START = /^(<!doctype|<html)/i;

// scanBodyForTitle only starts reading lines that begin within this many characters.
const BODY_SCAN_LIMIT = 500;
// isRawTextContent rejects content with more common HTML tags than this.
const HTML_TAG_DENSITY_LIMIT = 5;
9
/** Return '\r\n' if the content contains any CRLF sequence, otherwise '\n'. */
function getLineEnding(content) {
    if (content.indexOf('\r\n') !== -1) {
        return '\r\n';
    }
    return '\n';
}
12
+ function parseFrontmatter(content) {
13
+ const len = content.length;
14
+ if (len < 4)
15
+ return null;
16
+ let lineEnding = null;
17
+ let fenceLen = 0;
18
+ if (content.startsWith('---\n')) {
19
+ lineEnding = '\n';
20
+ fenceLen = 4;
21
+ }
22
+ else if (content.startsWith('---\r\n')) {
23
+ lineEnding = '\r\n';
24
+ fenceLen = 5;
25
+ }
26
+ if (!lineEnding)
27
+ return null;
28
+ const fence = `---${lineEnding}`;
29
+ const closeIndex = content.indexOf(fence, fenceLen);
30
+ if (closeIndex === -1)
31
+ return null;
32
+ const range = {
33
+ start: 0,
34
+ end: closeIndex + fenceLen,
35
+ linesStart: fenceLen,
36
+ linesEnd: closeIndex,
37
+ lineEnding,
38
+ };
39
+ // Parse key-value entries in one pass
40
+ const entries = new Map();
41
+ const fmBody = content.slice(range.linesStart, range.linesEnd);
42
+ let lastIdx = 0;
43
+ while (lastIdx < fmBody.length) {
44
+ let nextIdx = fmBody.indexOf(lineEnding, lastIdx);
45
+ if (nextIdx === -1)
46
+ nextIdx = fmBody.length;
47
+ const line = fmBody.slice(lastIdx, nextIdx).trim();
48
+ const colonIdx = line.indexOf(':');
49
+ if (line && colonIdx > 0) {
50
+ const key = line.slice(0, colonIdx).trim().toLowerCase();
51
+ let value = line.slice(colonIdx + 1).trim();
52
+ // Strip surrounding quotes
53
+ const first = value.charAt(0);
54
+ const last = value.charAt(value.length - 1);
55
+ if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
56
+ value = value.slice(1, -1).trim();
57
+ }
58
+ if (value)
59
+ entries.set(key, value);
60
+ }
61
+ lastIdx = nextIdx + lineEnding.length;
62
+ }
63
+ return { range, entries };
64
+ }
65
/**
 * Look for a title in the first non-empty line of the content.
 *
 * Only lines that start within the first BODY_SCAN_LIMIT characters are
 * examined. If the first non-empty line is a markdown heading, its text
 * (without the leading hashes) is returned; any other first line, or no
 * line at all, yields undefined.
 */
function scanBodyForTitle(content) {
    const total = content.length;
    const scanEnd = Math.min(total, BODY_SCAN_LIMIT);
    let lineStart = 0;
    while (lineStart < scanEnd) {
        const newlineAt = content.indexOf('\n', lineStart);
        const lineEnd = newlineAt === -1 ? total : newlineAt;
        // trim() also removes a trailing '\r' left over from CRLF input.
        const text = content.slice(lineStart, lineEnd).trim();
        if (text) {
            if (!HEADING_STRICT.test(text)) {
                return undefined;
            }
            return text.replace(HEADING_MARKER, '').trim() || undefined;
        }
        lineStart = lineEnd + 1;
    }
    return undefined;
}
87
/**
 * Extract a title from raw markdown.
 *
 * Preference order: the frontmatter `title` entry, then the frontmatter
 * `name` entry, then a heading on the first non-empty body line.
 *
 * When frontmatter is present but carries no title, the body scan starts
 * after the frontmatter block; scanning from offset 0 would stop at the
 * opening `---` line and never reach the heading.
 *
 * @param {string} content raw markdown text
 * @returns {string|undefined} the title, or undefined when none is found
 */
export function extractTitleFromRawMarkdown(content) {
    const fm = parseFrontmatter(content);
    if (!fm) {
        return scanBodyForTitle(content);
    }
    const title = fm.entries.get('title') ?? fm.entries.get('name');
    if (title) {
        return title;
    }
    return scanBodyForTitle(content.slice(fm.range.end));
}
96
/**
 * Record the source URL in a markdown document.
 *
 * - Existing frontmatter: a `source: "<url>"` line is appended to it unless
 *   it already has a `source:` key.
 * - No frontmatter, `metadataFormat === 'markdown'`: a `Source: <url>` line
 *   is inserted after the first heading line (or prepended when there is no
 *   heading), unless the content already has a `source:` line.
 * - No frontmatter otherwise: a new frontmatter block holding the source is
 *   prepended.
 *
 * The URL is written as a YAML double-quoted scalar, so backslashes are
 * escaped before quotes; escaping only the quotes would let a backslash in
 * the URL start an unintended escape sequence.
 *
 * @param {string} content markdown text
 * @param {string} url source URL to record
 * @returns {string} content with the source added (or unchanged if present)
 */
export function addSourceToMarkdown(content, url) {
    const escapeYamlDoubleQuoted = (value) => value.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
    const fm = parseFrontmatter(content);
    const useMarkdownFormat = config.transform.metadataFormat === 'markdown';

    if (!fm) {
        const lineEnding = getLineEnding(content);
        if (!useMarkdownFormat) {
            return `---${lineEnding}source: "${escapeYamlDoubleQuoted(url)}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
        }
        if (SOURCE_KEY.test(content)) {
            return content;
        }
        // HEADING_MARKER matches a heading of any level, not only H1.
        const firstHeading = HEADING_MARKER.exec(content);
        if (!firstHeading) {
            return `Source: ${url}${lineEnding}${lineEnding}${content}`;
        }
        const lineEndIndex = content.indexOf(lineEnding, firstHeading.index);
        const insertPos = lineEndIndex === -1 ? content.length : lineEndIndex + lineEnding.length;
        const injection = `${lineEnding}Source: ${url}${lineEnding}`;
        return content.slice(0, insertPos) + injection + content.slice(insertPos);
    }

    const fmBody = content.slice(fm.range.linesStart, fm.range.linesEnd);
    if (SOURCE_KEY.test(fmBody)) {
        return content;
    }
    const injection = `source: "${escapeYamlDoubleQuoted(url)}"${fm.range.lineEnding}`;
    // Insert just before the closing fence.
    return (content.slice(0, fm.range.linesEnd) +
        injection +
        content.slice(fm.range.linesEnd));
}
127
+ // endregion
128
+ // region Content Detection & Metadata Footer
129
/**
 * Count opening occurrences of a fixed set of common HTML tags, stopping as
 * soon as the count exceeds `limit` (so the result is at most limit + 1).
 * A non-positive limit returns 0 without scanning.
 */
function countCommonTags(content, limit) {
    if (limit <= 0) {
        return 0;
    }
    const tagPattern = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
    let seen = 0;
    for (const _hit of content.matchAll(tagPattern)) {
        seen += 1;
        if (seen > limit) {
            break;
        }
    }
    return seen;
}
141
/**
 * Heuristically decide whether content is raw markdown/text rather than HTML.
 *
 * Content starting with a doctype or <html> tag is not raw text; content with
 * frontmatter is. Otherwise content with more than HTML_TAG_DENSITY_LIMIT
 * common tags is rejected, and what remains counts as raw text only if it
 * contains a heading, a list marker, or a triple-backtick fence.
 */
export function isRawTextContent(content) {
    const trimmed = content.trim();
    if (HTML_DOC_START.test(trimmed)) {
        return false;
    }
    if (parseFrontmatter(trimmed) !== null) {
        return true;
    }
    if (countCommonTags(content, HTML_TAG_DENSITY_LIMIT) > HTML_TAG_DENSITY_LIMIT) {
        return false;
    }
    const markdownSignals = [
        () => HEADING_MARKER.test(content),
        () => LIST_MARKER.test(content),
        () => content.includes('```'),
    ];
    return markdownSignals.some((hasSignal) => hasSignal());
}
154
/**
 * Format a fetch timestamp as a numeric date (2-digit day and month, numeric
 * year) in the configured locale. Values that do not parse as a date are
 * returned unchanged.
 */
function formatFetchedAt(value) {
    const parsed = new Date(value);
    if (Number.isNaN(parsed.getTime())) {
        return value;
    }
    const dateOptions = {
        day: '2-digit',
        month: '2-digit',
        year: 'numeric',
    };
    return new Intl.DateTimeFormat(config.i18n.locale, dateOptions).format(parsed);
}
165
/**
 * Build a markdown footer from a metadata block.
 *
 * The footer starts with a `---` rule and a blank line, followed by one line
 * of ' | '-separated italic parts (title, author, source link, fetch date)
 * when any are present, and a `<sub>` description line when provided.
 * Returns '' when no metadata is given. `fallbackUrl` is used when the
 * metadata has no URL.
 */
export function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata) {
        return '';
    }
    const url = metadata.url || fallbackUrl;
    const parts = [
        metadata.title ? `_${metadata.title}_` : null,
        metadata.author ? `_${metadata.author}_` : null,
        url ? `[_Original Source_](${url})` : null,
        metadata.fetchedAt ? `_${formatFetchedAt(metadata.fetchedAt)}_` : null,
    ].filter((part) => part !== null);

    const footer = ['---', ''];
    if (parts.length > 0) {
        footer.push(` ${parts.join(' | ')}`);
    }
    if (metadata.description) {
        footer.push(` <sub>${metadata.description}</sub>`);
    }
    return footer.join('\n');
}
186
+ // endregion
@@ -1 +1 @@
1
- {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AA4fA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
1
+ {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAigBA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
@@ -1,5 +1,5 @@
1
1
  import { NodeHtmlMarkdown, } from 'node-html-markdown';
2
- import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../lib/content.js';
2
+ import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../lib/code-lang.js';
3
3
  import { isLikeNode, isObject } from '../lib/utils.js';
4
4
  // ---------------------------------------------------------------------------
5
5
  // Shared constant
@@ -255,11 +255,13 @@ const GFM_ALERT_MAP = new Map([
255
255
  ['danger', 'CAUTION'],
256
256
  ['important', 'IMPORTANT'],
257
257
  ]);
258
+ const ADMONITION_TOKEN_RE = /^(?:note|tip|hint|info|warning|warn|danger|caution|important)$/i;
258
259
  function resolveGfmAlertType(className) {
259
- const lower = className.toLowerCase();
260
- for (const [key, type] of GFM_ALERT_MAP) {
261
- if (lower.includes(key))
262
- return type;
260
+ const tokens = className.toLowerCase().split(/\s+/);
261
+ for (const token of tokens) {
262
+ const mapped = GFM_ALERT_MAP.get(token);
263
+ if (mapped)
264
+ return mapped;
263
265
  }
264
266
  return undefined;
265
267
  }
@@ -278,11 +280,12 @@ function buildDivTranslator(ctx) {
278
280
  postprocess: ({ content }) => `\n\n\`\`\`mermaid\n${content.trim()}\n\`\`\`\n\n`,
279
281
  };
280
282
  }
283
+ const classTokens = className.split(/\s+/);
281
284
  const isAdmonition = className.includes('admonition') ||
282
285
  className.includes('callout') ||
283
286
  className.includes('custom-block') ||
284
287
  getAttribute('role') === 'alert' ||
285
- /\b(note|tip|info|warning|danger|caution|important)\b/i.test(className);
288
+ classTokens.some((t) => ADMONITION_TOKEN_RE.test(t));
286
289
  if (isAdmonition) {
287
290
  return {
288
291
  postprocess: ({ content }) => {
@@ -1 +1 @@
1
- {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAuCA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAuJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAuVD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA+OD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAwJD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAiED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAsCD,iBAAS,eAAe,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAc/D;AAED,iBAAS,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAYlE;AAED,iBAAS,yBAAyB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAcvD;AAED,eAAO,MAAM,mBAAmB;;;;CAItB,CAAC;AAiQX,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAmBzB;AA+CD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAsH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAA
uB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
1
+ {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AAgDA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAqCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AAqJD,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAwYD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA8KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AAuJD,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAiED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAuCD,iBAAS,eAAe,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAc/D;AAED,iBAAS,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,MAAM,GAAG,SAAS,CAYlE;AA6CD,iBAAS,yBAAyB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAcvD;AAED,eAAO,MAAM,mBAAmB;;;;CAItB,CAAC;AA2vBX,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAqCzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AAkH1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBA
AuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}