@j0hanz/fetch-url-mcp 1.9.3 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,1433 +0,0 @@
1
- import { parseHTML } from 'linkedom';
2
- import {} from '../transform/types.js';
3
- import { config, logDebug } from './core.js';
4
- import { throwIfAborted } from './utils.js';
5
- // ASCII char codes used in hot-path charCodeAt comparisons
6
- const ASCII_SPACE = 32;
7
- const ASCII_TAB = 9;
8
- const ASCII_EXCLAMATION = 33;
9
- const ASCII_HASH = 35;
10
- const ASCII_ASTERISK = 42;
11
- const ASCII_PLUS = 43;
12
- const ASCII_DASH = 45;
13
- const ASCII_PERIOD = 46;
14
- const ASCII_DIGIT_0 = 48;
15
- const ASCII_DIGIT_9 = 57;
16
- const ASCII_LT = 60;
17
- const ASCII_QUESTION = 63;
18
- const ASCII_UPPER_A = 65;
19
- const ASCII_UPPER_Z = 90;
20
- const ASCII_BRACKET_OPEN = 91;
21
- const ASCII_LOWER_A = 97;
22
- const ASCII_LOWER_Z = 122;
23
- const ASCII_UNDERSCORE = 95;
24
- const HTML_TAG_DENSITY_LIMIT = 5;
25
- const TITLE_MIN_WORDS = 2;
26
- const TITLE_MAX_WORDS = 6;
27
- const TITLE_MIN_CAPITALIZED = 2;
28
- const PROPERTY_FIX_MAX_PASSES = 3;
29
- const BODY_SCAN_LIMIT = 5000;
30
- const HAS_FOLLOWING_LOOKAHEAD = 50;
31
- const NOISE_SCAN_LIMIT = 50_000;
32
- const MIN_BODY_CONTENT_LENGTH = 100;
33
- const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
34
- const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
35
- const ABORT_CHECK_INTERVAL = 500;
36
- const NODE_FILTER_SHOW_TEXT = 4;
37
- const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
38
- const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
39
- const NOISE_PATTERNS = [
40
- /<\s*(?:script|style|noscript|iframe|nav|footer|header|form|button|input|select|textarea|svg|canvas)\b/i,
41
- /[\s"']role\s*=\s*['"]?(?:navigation|banner|complementary|contentinfo|tree|menubar|menu)['"]?/i,
42
- /[\s"'](?:aria-hidden\s*=\s*['"]?true['"]?|hidden)/i,
43
- /[\s"'](?:banner|promo|announcement|cta|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast)\b/i,
44
- /[\s"'](?:fixed|sticky|z-50|z-4|isolate|breadcrumbs?|pagination)\b/i,
45
- ];
46
- const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
47
- const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50)|isolate)\b/;
48
- const SKIP_URL_PREFIXES = [
49
- '#',
50
- 'javascript:',
51
- 'mailto:',
52
- 'tel:',
53
- 'data:',
54
- 'blob:',
55
- ];
56
- const BASE_STRUCTURAL_TAGS = new Set([
57
- 'script',
58
- 'style',
59
- 'noscript',
60
- 'iframe',
61
- 'template',
62
- 'form',
63
- 'button',
64
- 'input',
65
- 'select',
66
- 'textarea',
67
- ]);
68
- const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer']);
69
- const NAVIGATION_ROLES = new Set([
70
- 'navigation',
71
- 'banner',
72
- 'complementary',
73
- 'contentinfo',
74
- 'tree',
75
- 'menubar',
76
- 'menu',
77
- 'dialog',
78
- 'alertdialog',
79
- 'search',
80
- ]);
81
- const INTERACTIVE_CONTENT_ROLES = new Set([
82
- 'tabpanel',
83
- 'tab',
84
- 'tablist',
85
- 'dialog',
86
- 'alertdialog',
87
- 'menu',
88
- 'menuitem',
89
- 'option',
90
- 'listbox',
91
- 'combobox',
92
- 'tooltip',
93
- 'alert',
94
- ]);
95
- const PROMO_TOKENS_ALWAYS = [
96
- 'banner',
97
- 'promo',
98
- 'announcement',
99
- 'cta',
100
- 'advert',
101
- 'ads',
102
- 'sponsor',
103
- 'recommend',
104
- 'breadcrumb',
105
- 'breadcrumbs',
106
- 'pagination',
107
- 'pager',
108
- 'taglist',
109
- 'twitter-tweet',
110
- 'fb-post',
111
- 'instagram-media',
112
- 'social-embed',
113
- ];
114
- const PROMO_TOKENS_AGGRESSIVE = ['ad', 'related', 'comment'];
115
- const PROMO_TOKENS_BY_CATEGORY = {
116
- 'cookie-banners': ['cookie', 'consent', 'popup', 'modal', 'overlay', 'toast'],
117
- newsletters: ['newsletter', 'subscribe'],
118
- 'social-share': ['share', 'social'],
119
- };
120
- // Noise selector configurations
121
- const BASE_NOISE_SELECTORS = {
122
- navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"],[class*="breadcrumb"]',
123
- cookieBanners: '[role="dialog"]',
124
- hidden: '[style*="display: none"],[style*="display:none"],[style*="visibility: hidden"],[style*="visibility:hidden"],[hidden],[aria-hidden="true"]',
125
- };
126
- const NO_MATCH_REGEX = /a^/i;
127
- let cachedContext;
128
- let lastContextKey;
129
/**
 * Backslash-escapes every regular-expression metacharacter in `value` so the
 * result can be embedded in a RegExp source and match the text literally.
 *
 * @param {string} value - Raw text to escape.
 * @returns {string} The escaped text.
 */
function escapeRegexLiteral(value) {
    const metacharacters = /[.*+?^${}()|[\]\\]/g;
    return value.replace(metacharacters, (ch) => `\\${ch}`);
}
132
/**
 * Builds a case-insensitive matcher that finds any of `tokens` when the token
 * is delimited by the string edges or by a character outside `[a-z0-9]`.
 *
 * @param {Set<string>} tokens - Literal tokens to match.
 * @returns {RegExp} The matcher, or NO_MATCH_REGEX when the set is empty.
 */
function buildTokenRegex(tokens) {
    if (tokens.size === 0) {
        return NO_MATCH_REGEX;
    }
    const alternation = Array.from(tokens, escapeRegexLiteral).join('|');
    const source = `(?:^|[^a-z0-9])(?:${alternation})(?:$|[^a-z0-9])`;
    return new RegExp(source, 'i');
}
137
/**
 * Adds every entry of `tokens` to the `target` set, mutating it in place.
 *
 * @param {Set<string>} target - Set that receives the tokens.
 * @param {Iterable<string>} tokens - Tokens to add.
 */
function addTokens(target, tokens) {
    for (const entry of tokens) {
        target.add(entry);
    }
}
141
/**
 * Builds the promo-token matchers for the current configuration.
 *
 * The base matcher covers PROMO_TOKENS_ALWAYS, the token lists of each enabled
 * category, and the lower-cased, trimmed, non-empty `extraTokens`. The
 * aggressive matcher covers PROMO_TOKENS_AGGRESSIVE only when
 * `aggressiveMode` is set; otherwise it is built from an empty set.
 *
 * @param {object} currentConfig - Noise-removal config (`aggressiveMode`, `extraTokens`).
 * @param {object} flags - Category flags (`cookieBanners`, `newsletters`, `socialShare`).
 * @returns {{base: RegExp, aggressive: RegExp}} The two matchers.
 */
function getPromoMatchers(currentConfig, flags) {
    const categorySwitches = [
        [flags.cookieBanners, 'cookie-banners'],
        [flags.newsletters, 'newsletters'],
        [flags.socialShare, 'social-share'],
    ];
    const baseTokens = new Set(PROMO_TOKENS_ALWAYS);
    for (const [isOn, category] of categorySwitches) {
        if (isOn) {
            addTokens(baseTokens, PROMO_TOKENS_BY_CATEGORY[category]);
        }
    }
    for (const rawToken of currentConfig.extraTokens) {
        const normalized = rawToken.toLowerCase().trim();
        if (normalized) {
            baseTokens.add(normalized);
        }
    }
    const aggressiveTokens = new Set();
    if (currentConfig.aggressiveMode) {
        addTokens(aggressiveTokens, PROMO_TOKENS_AGGRESSIVE);
    }
    const base = buildTokenRegex(baseTokens);
    const aggressive = buildTokenRegex(aggressiveTokens);
    return { base, aggressive };
}
166
/**
 * Returns the noise-removal context derived from `config.noiseRemoval`.
 *
 * The context is cached in `cachedContext`; it is rebuilt only when the JSON
 * serialization of the relevant settings (locale, categories, extra tokens and
 * selectors, aggressive mode, svg/canvas preservation, weights) differs from
 * the key stored by the previous call.
 *
 * @returns {object} Flags, structural tag set, weights, promo matchers and
 *   the pre-joined CSS selectors used by the noise stripper.
 */
function getContext() {
    const noiseConfig = config.noiseRemoval;
    const { locale } = config.i18n;
    const contextKey = JSON.stringify({
        locale,
        enabledCategories: noiseConfig.enabledCategories,
        extraTokens: noiseConfig.extraTokens,
        extraSelectors: noiseConfig.extraSelectors,
        aggressiveMode: noiseConfig.aggressiveMode,
        preserveSvgCanvas: noiseConfig.preserveSvgCanvas,
        weights: noiseConfig.weights,
    });
    const isCacheFresh = cachedContext !== undefined && lastContextKey === contextKey;
    if (isCacheFresh) {
        return cachedContext;
    }

    // Normalize category names: lower-case, trim, then locale-aware lower-case.
    const enabledCategories = new Set();
    for (const category of noiseConfig.enabledCategories) {
        const lowered = category.toLowerCase().trim();
        const normalized = locale ? lowered.toLocaleLowerCase(locale) : lowered;
        if (normalized) {
            enabledCategories.add(normalized);
        }
    }
    const flags = {
        navFooter: enabledCategories.has('nav-footer'),
        cookieBanners: enabledCategories.has('cookie-banners'),
        newsletters: enabledCategories.has('newsletters'),
        socialShare: enabledCategories.has('social-share'),
    };

    const structuralTags = new Set(BASE_STRUCTURAL_TAGS);
    if (!noiseConfig.preserveSvgCanvas) {
        for (const tag of ['svg', 'canvas']) {
            structuralTags.add(tag);
        }
    }

    const promoMatchers = getPromoMatchers(noiseConfig, flags);

    const extraSelectors = [];
    for (const selector of noiseConfig.extraSelectors) {
        const cleaned = selector.trim();
        if (cleaned.length > 0) {
            extraSelectors.push(cleaned);
        }
    }

    // Hidden elements are always targeted; the other groups depend on flags.
    const baseSelector = [
        BASE_NOISE_SELECTORS.hidden,
        ...(flags.navFooter ? [BASE_NOISE_SELECTORS.navFooter] : []),
        ...(flags.cookieBanners ? [BASE_NOISE_SELECTORS.cookieBanners] : []),
    ].join(',');
    const candidateSelector = [
        ...structuralTags,
        ...ALWAYS_NOISE_TAGS,
        'aside',
        'header',
        '[class]',
        '[id]',
        '[role]',
        '[style]',
    ].join(',');

    cachedContext = {
        flags,
        structuralTags,
        weights: noiseConfig.weights,
        promoMatchers,
        promoEnabled: flags.cookieBanners || flags.newsletters || flags.socialShare,
        extraSelectors,
        baseSelector,
        candidateSelector,
    };
    lastContextKey = contextKey;
    return cachedContext;
}
232
/**
 * Reports whether an element looks like part of an interactive widget.
 *
 * True when `role` is in INTERACTIVE_CONTENT_ROLES, when `data-state` is
 * `inactive`/`closed` on a tag outside BASE_STRUCTURAL_TAGS, when
 * `data-orientation` is `horizontal`/`vertical`, or when the element carries
 * `data-accordion-item` or `data-radix-collection-item`.
 *
 * @param {Element} element - Element to inspect.
 * @param {string|null} role - The element's `role` attribute value.
 * @returns {boolean}
 */
function isInteractive(element, role) {
    if (role && INTERACTIVE_CONTENT_ROLES.has(role)) {
        return true;
    }
    const tagName = element.tagName.toLowerCase();
    const dataState = element.getAttribute('data-state');
    const isCollapsed = dataState === 'inactive' || dataState === 'closed';
    if (isCollapsed && !BASE_STRUCTURAL_TAGS.has(tagName)) {
        return true;
    }
    const orientation = element.getAttribute('data-orientation');
    if (orientation === 'horizontal' || orientation === 'vertical') {
        return true;
    }
    const widgetMarkers = ['data-accordion-item', 'data-radix-collection-item'];
    return widgetMarkers.some((marker) => element.hasAttribute(marker));
}
245
/**
 * Walks from `element` up through `parentElement` and reports whether the
 * element itself or any ancestor is an `<article>`, a `<main>`, or has
 * `role="main"`.
 *
 * @param {Element} element - Starting element.
 * @returns {boolean}
 */
function isWithinPrimaryContent(element) {
    for (let node = element; node; node = node.parentElement) {
        const tag = node.tagName.toLowerCase();
        if (tag === 'article' || tag === 'main' || node.getAttribute('role') === 'main') {
            return true;
        }
    }
    return false;
}
257
// Minimum links per 100 characters of trimmed text for an aside to count as navigation.
const ASIDE_NAV_LINK_DENSITY_THRESHOLD = 0.5;
// Asides with fewer links than this are never classified by link density.
const ASIDE_NAV_MIN_LINKS = 10;
/**
 * Decides whether an `<aside>` is navigation rather than content.
 *
 * An aside containing a `<nav>` is navigation. Otherwise it needs at least
 * ASIDE_NAV_MIN_LINKS `a[href]` links and either no trimmed text at all or a
 * link density (links per 100 characters) at or above the threshold.
 *
 * @param {Element} element - The aside element.
 * @returns {boolean}
 */
function isNavigationAside(element) {
    if (element.querySelector('nav')) {
        return true;
    }
    const linkCount = element.querySelectorAll('a[href]').length;
    if (linkCount < ASIDE_NAV_MIN_LINKS) {
        return false;
    }
    const visibleChars = (element.textContent || '').trim().length;
    if (visibleChars === 0) {
        return true;
    }
    const linksPerHundredChars = linkCount / (visibleChars / 100);
    return linksPerHundredChars >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
}
270
/**
 * Decides whether an element matched as noise should be kept anyway.
 *
 * - Dialogs (`role="dialog"`/`"alertdialog"`): kept when inside primary
 *   content, when their text exceeds DIALOG_MIN_CHARS_FOR_PRESERVATION, or
 *   when they contain a heading.
 * - `nav`/`footer`: kept when they contain article/main/section content or
 *   their trimmed text reaches NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION.
 * - `aside`: kept only inside primary content and when not navigation-like.
 *
 * @param {Element} element - Element under consideration.
 * @param {string} tagName - Lower-cased tag name of `element`.
 * @returns {boolean} True when the element must not be removed.
 */
function shouldPreserve(element, tagName) {
    const role = element.getAttribute('role');
    const isDialog = role === 'dialog' || role === 'alertdialog';
    if (isDialog) {
        return (isWithinPrimaryContent(element) ||
            (element.textContent || '').length > DIALOG_MIN_CHARS_FOR_PRESERVATION ||
            element.querySelector('h1,h2,h3,h4,h5,h6') !== null);
    }
    switch (tagName) {
        case 'nav':
        case 'footer': {
            if (element.querySelector('article,main,section,[role="main"]')) {
                return true;
            }
            const trimmedLength = (element.textContent || '').trim().length;
            return trimmedLength >= NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION;
        }
        case 'aside':
            // Keep asides only when they look like article content, not navigation.
            return isWithinPrimaryContent(element) && !isNavigationAside(element);
        default:
            return false;
    }
}
296
/**
 * Removes every still-attached node in `nodes` that shouldPreserve does not
 * protect, visiting the collection from last to first.
 *
 * @param {ArrayLike<Element>} nodes - Candidate elements.
 */
function removeNodes(nodes) {
    let index = nodes.length;
    while (index-- > 0) {
        const candidate = nodes[index];
        if (!candidate?.parentNode) {
            continue;
        }
        if (shouldPreserve(candidate, candidate.tagName.toLowerCase())) {
            continue;
        }
        candidate.remove();
    }
}
304
- const HIDDEN_STYLE_REGEX = /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i;
305
/**
 * Scores an element for the nav/footer category. Each rule that applies adds
 * `weights.structural` once:
 *   1. the tag is in ALWAYS_NOISE_TAGS;
 *   2. the tag is `header` and it has a navigation role or its class/id
 *      matches HEADER_NOISE_PATTERN;
 *   3. the tag is `aside`;
 *   4. the role is in NAVIGATION_ROLES, except `role="complementary"` on an aside.
 *
 * @param {string} tagName - Lower-cased tag name.
 * @param {string} className - Value of the `class` attribute ('' if absent).
 * @param {string} id - Value of the `id` attribute ('' if absent).
 * @param {string|null} role - Value of the `role` attribute.
 * @param {{structural: number}} weights - Score weights.
 * @returns {number} The accumulated score.
 */
function calculateNavFooterScore(tagName, className, id, role, weights) {
    const hasNavigationRole = Boolean(role) && NAVIGATION_ROLES.has(role);
    const isAside = tagName === 'aside';
    const rules = [
        () => ALWAYS_NOISE_TAGS.has(tagName),
        () => tagName === 'header' &&
            (hasNavigationRole || HEADER_NOISE_PATTERN.test(`${className} ${id}`)),
        () => isAside,
        () => hasNavigationRole && !(isAside && role === 'complementary'),
    ];
    let score = 0;
    for (const applies of rules) {
        if (applies()) {
            score += weights.structural;
        }
    }
    return score;
}
325
/**
 * Returns `context.weights.promo` when the element's class or id matches a
 * promo token, otherwise 0.
 *
 * When the aggressive matcher hits, the element counts as promo only if it is
 * outside primary content, and the base matcher is not consulted. When the
 * aggressive matcher misses, the base matcher decides. Always 0 when
 * `context.promoEnabled` is false.
 *
 * @param {Element} element - Element being scored.
 * @param {string} className - Value of the `class` attribute.
 * @param {string} id - Value of the `id` attribute.
 * @param {object} context - Noise-removal context.
 * @returns {number}
 */
function calculatePromoScore(element, className, id, context) {
    if (!context.promoEnabled) {
        return 0;
    }
    const { base, aggressive } = context.promoMatchers;
    const hitsAggressive = aggressive.test(className) || aggressive.test(id);
    const isPromo = hitsAggressive
        ? !isWithinPrimaryContent(element)
        : base.test(className) || base.test(id);
    return isPromo ? context.weights.promo : 0;
}
336
/**
 * Scores an element against the noise heuristics and reports whether the
 * total reaches `context.weights.threshold`.
 *
 * Contributions, in order: structural tag (skipped for interactive elements),
 * nav/footer score (only when that category is enabled), hidden state (skipped
 * for interactive elements), fixed/sticky/high-z class names, and promo tokens.
 *
 * @param {Element} element - Candidate element.
 * @param {object} context - Noise-removal context from getContext().
 * @returns {boolean} True when the element should be treated as noise.
 */
function isNoiseElement(element, context) {
    const { weights, flags, structuralTags } = context;
    const readAttribute = (name) => element.getAttribute(name);

    const tagName = element.tagName.toLowerCase();
    const className = readAttribute('class') ?? '';
    const id = readAttribute('id') ?? '';
    const role = readAttribute('role');
    const inlineStyle = readAttribute('style');

    const interactive = isInteractive(element, role);
    const hidden = element.hasAttribute('hidden') ||
        readAttribute('aria-hidden') === 'true' ||
        (inlineStyle !== null && HIDDEN_STYLE_REGEX.test(inlineStyle));

    let score = 0;
    if (!interactive && structuralTags.has(tagName)) {
        score += weights.structural;
    }
    if (flags.navFooter) {
        score += calculateNavFooterScore(tagName, className, id, role, weights);
    }
    if (hidden && !interactive) {
        score += weights.hidden;
    }
    if (FIXED_OR_HIGH_Z_PATTERN.test(className)) {
        score += weights.stickyFixed;
    }
    score += calculatePromoScore(element, className, id, context);
    return score >= weights.threshold;
}
368
/**
 * Tidies every attached h1–h6 in the document, in three passes per heading:
 *   1. removes nested `<div>`s whose class contains "absolute", whose style
 *      contains "position", or whose tabindex is "-1";
 *   2. removes `<a href="#...">` anchors that have no text once whitespace
 *      and zero-width spaces are ignored;
 *   3. strips zero-width spaces (U+200B) from the heading's text nodes.
 *
 * @param {Document} document - Document to clean in place.
 */
function cleanHeadings(document) {
    // Walk back-to-front, removing attached nodes that satisfy the predicate.
    const removeMatching = (nodes, shouldRemove) => {
        for (let idx = nodes.length - 1; idx >= 0; idx--) {
            const node = nodes[idx];
            if (node?.parentNode && shouldRemove(node)) {
                node.remove();
            }
        }
    };
    const isPositionedWrapper = (div) => {
        const classes = div.getAttribute('class') ?? '';
        const inlineStyle = div.getAttribute('style') ?? '';
        return (classes.includes('absolute') ||
            inlineStyle.includes('position') ||
            div.getAttribute('tabindex') === '-1');
    };
    const isEmptyHashAnchor = (anchor) => {
        const href = anchor.getAttribute('href') ?? '';
        const visibleText = (anchor.textContent || '').replace(/[\u200B\s]/g, '');
        return href.startsWith('#') && visibleText.length === 0;
    };

    for (const heading of document.querySelectorAll('h1,h2,h3,h4,h5,h6')) {
        if (!heading.parentNode) {
            continue;
        }
        removeMatching(heading.querySelectorAll('div'), isPositionedWrapper);
        removeMatching(heading.querySelectorAll('a'), isEmptyHashAnchor);

        const walker = document.createTreeWalker(heading, NODE_FILTER_SHOW_TEXT);
        for (let textNode = walker.nextNode(); textNode; textNode = walker.nextNode()) {
            const text = textNode.textContent;
            if (text?.includes('\u200B')) {
                textNode.textContent = text.replace(/\u200B/g, '');
            }
        }
    }
}
409
/**
 * Removes noise elements from the document in place.
 *
 * After cleaning headings, elements matching the base selector and any extra
 * selectors are removed (subject to shouldPreserve inside removeNodes). Then
 * every candidate element is visited from last to first and removed when it
 * is attached, not preserved, and scored as noise.
 *
 * @param {Document} document - Document to mutate.
 * @param {object} context - Noise-removal context from getContext().
 * @param {AbortSignal} [signal] - Checked whenever the candidate index is a
 *   multiple of ABORT_CHECK_INTERVAL.
 * @throws {Error} 'Noise removal aborted' when the signal is aborted at a checkpoint.
 */
function stripNoise(document, context, signal) {
    cleanHeadings(document);

    // Unconditional selectors first.
    removeNodes(document.querySelectorAll(context.baseSelector));
    if (context.extraSelectors.length > 0) {
        const joinedExtras = context.extraSelectors.join(',');
        removeNodes(document.querySelectorAll(joinedExtras));
    }

    // Scored candidates second.
    const candidates = document.querySelectorAll(context.candidateSelector);
    let index = candidates.length;
    while (index-- > 0) {
        const atCheckpoint = index % ABORT_CHECK_INTERVAL === 0;
        if (atCheckpoint && signal?.aborted) {
            throw new Error('Noise removal aborted');
        }
        const candidate = candidates[index];
        if (!candidate?.parentNode) {
            continue;
        }
        if (shouldPreserve(candidate, candidate.tagName.toLowerCase())) {
            continue;
        }
        if (isNoiseElement(candidate, context)) {
            candidate.remove();
        }
    }
}
435
/**
 * Rewrites one URL-bearing attribute of an attached element to an absolute
 * URL resolved against `base`.
 *
 * For `srcset`, the value is split on commas, the first whitespace-separated
 * part of each entry is resolved, and entries are re-joined with ", ". For
 * plain URL attributes, values starting with one of SKIP_URL_PREFIXES
 * (compared after trimming and lower-casing) are left alone. Values that fail
 * to resolve are kept unchanged.
 *
 * @param {Element} el - Element whose attribute is rewritten.
 * @param {string} attr - Attribute name (`href`, `src`, `srcset`).
 * @param {URL} base - Base URL for resolution.
 * @param {boolean} isSrcset - Whether `attr` holds a srcset list.
 */
function processUrlElement(el, attr, base, isSrcset) {
    if (!el.parentNode) {
        return;
    }
    const rawValue = el.getAttribute(attr);
    if (!rawValue) {
        return;
    }

    if (isSrcset) {
        const rewritten = [];
        for (const candidate of rawValue.split(',')) {
            const parts = candidate.trim().split(/\s+/);
            if (!parts[0]) {
                // Empty entry: keep the original text as-is.
                rewritten.push(candidate);
                continue;
            }
            try {
                parts[0] = new URL(parts[0], base).href;
            }
            catch {
                /* ignore */
            }
            rewritten.push(parts.join(' '));
        }
        el.setAttribute(attr, rewritten.join(', '));
        return;
    }

    const probe = rawValue.trim().toLowerCase();
    const isSkippable = SKIP_URL_PREFIXES.some((prefix) => probe.startsWith(prefix));
    if (isSkippable) {
        return;
    }
    try {
        el.setAttribute(attr, new URL(rawValue, base).href);
    }
    catch {
        /* ignore */
    }
}
471
/**
 * Resolves `a[href]`, `img[src]` and `source[srcset]` URLs in the document
 * against `baseUrlStr`. Does nothing when `baseUrlStr` is not a valid URL.
 *
 * @param {Document} document - Document to mutate.
 * @param {string} baseUrlStr - Absolute base URL.
 */
function resolveUrls(document, baseUrlStr) {
    let base;
    try {
        base = new URL(baseUrlStr);
    }
    catch {
        return;
    }
    // tag name -> [attribute, isSrcset]
    const attributeByTag = new Map([
        ['a', ['href', false]],
        ['img', ['src', false]],
        ['source', ['srcset', true]],
    ]);
    for (const el of document.querySelectorAll('a[href],img[src],source[srcset]')) {
        const target = attributeByTag.get(el.tagName.toLowerCase());
        if (target === undefined) {
            continue;
        }
        const [attr, isSrcset] = target;
        processUrlElement(el, attr, base, isSrcset);
    }
}
490
/**
 * Serializes the document to an HTML string for markdown conversion.
 *
 * Prefers `body.innerHTML`, then `documentElement.outerHTML`; each is used
 * only when its trimmed length exceeds MIN_BODY_CONTENT_LENGTH. Otherwise
 * `fallback` is returned.
 *
 * @param {Document} document - Document to serialize.
 * @param {string} fallback - Value returned when both serializations are too short.
 * @returns {string}
 */
export function serializeDocumentForMarkdown(document, fallback) {
    const isSubstantial = (markup) => markup.trim().length > MIN_BODY_CONTENT_LENGTH;
    const { innerHTML } = document.body;
    if (isSubstantial(innerHTML)) {
        return innerHTML;
    }
    const { outerHTML } = document.documentElement;
    return isSubstantial(outerHTML) ? outerHTML : fallback;
}
499
/**
 * Reports whether `html` contains a document-level marker as defined by
 * HTML_DOCUMENT_MARKERS.
 *
 * @param {string} html - Markup to inspect.
 * @returns {boolean}
 */
function isFullDocumentHtml(html) {
    const marker = HTML_DOCUMENT_MARKERS.exec(html);
    return marker !== null;
}
502
/**
 * Cheap pre-check: does any NOISE_PATTERNS regex match the markup?
 *
 * Inputs longer than NOISE_SCAN_LIMIT are sampled: only the first and last
 * NOISE_SCAN_LIMIT characters (joined by a newline) are scanned.
 *
 * @param {string} html - Markup to inspect.
 * @returns {boolean}
 */
function mayContainNoise(html) {
    let sample = html;
    if (html.length > NOISE_SCAN_LIMIT) {
        const head = html.substring(0, NOISE_SCAN_LIMIT);
        const tail = html.substring(html.length - NOISE_SCAN_LIMIT);
        sample = `${head}\n${tail}`;
    }
    for (const pattern of NOISE_PATTERNS) {
        if (pattern.test(sample)) {
            return true;
        }
    }
    return false;
}
508
/**
 * Makes tab panels visible: for every `[data-slot="tabContent"]` or
 * `[role="tabpanel"]` element, strips `display: none` declarations from the
 * inline style and removes the `hidden` attribute.
 *
 * @param {Document} document - Document to mutate.
 */
function surfaceHiddenTabPanels(document) {
    const panels = document.querySelectorAll('[data-slot="tabContent"], [role="tabpanel"]');
    for (const panel of panels) {
        const inlineStyle = panel.getAttribute('style') ?? '';
        const hidesPanel = /display\s*:\s*none/i.test(inlineStyle);
        if (hidesPanel) {
            const visibleStyle = inlineStyle.replace(/display\s*:\s*none\s*;?/gi, '').trim();
            panel.setAttribute('style', visibleStyle);
        }
        panel.removeAttribute('hidden');
    }
}
518
/**
 * Surfaces hidden tab panels, then removes every `button[role="tab"]`
 * trigger from the document (last to first).
 *
 * @param {Document} document - Document to mutate.
 */
function stripTabTriggers(document) {
    surfaceHiddenTabPanels(document);
    const triggers = document.querySelectorAll('button[role="tab"]');
    let index = triggers.length;
    while (index-- > 0) {
        triggers[index]?.remove();
    }
}
525
/**
 * Escapes `|` as `\|` in text nodes that are direct children of `td`/`th`
 * cells. Text nested inside child elements is not touched.
 *
 * @param {Document} document - Document to mutate.
 */
function escapeTableCellPipes(document) {
    const TEXT_NODE = 3;
    for (const cell of document.querySelectorAll('td, th')) {
        for (const child of cell.childNodes) {
            const content = child.textContent;
            if (child.nodeType !== TEXT_NODE || !content?.includes('|')) {
                continue;
            }
            child.textContent = content.replaceAll('|', '\\|');
        }
    }
}
536
/**
 * Inserts a single-space text node after every badge-like element whose next
 * sibling is an element node.
 *
 * @param {Document} document - Document to mutate.
 */
function separateAdjacentInlineElements(document) {
    const ELEMENT_NODE = 1;
    const badgeSelector = 'span.chakra-badge, [data-scope="badge"], [class*="badge"]';
    for (const badge of document.querySelectorAll(badgeSelector)) {
        if (badge.nextSibling?.nodeType !== ELEMENT_NODE) {
            continue;
        }
        badge.after(document.createTextNode(' '));
    }
}
545
/**
 * Prepares a parsed document for HTML-to-markdown conversion, mutating it in
 * place: surfaces hidden tab panels, strips noise, removes tab triggers,
 * separates adjacent badges, normalizes table cells and structure, and (when
 * `baseUrl` is given) resolves relative URLs.
 *
 * @param {Document} document - Document to mutate.
 * @param {string} [baseUrl] - Base URL used to absolutize links and images.
 * @param {AbortSignal} [signal] - Forwarded to the noise stripper.
 */
export function prepareDocumentForMarkdown(document, baseUrl, signal) {
    const context = getContext();
    if (config.noiseRemoval.debug) {
        logDebug('Noise removal audit enabled', {
            categories: [...(context.flags.navFooter ? ['nav-footer'] : [])],
        });
    }
    // Tab panels must be un-hidden BEFORE noise stripping: stripNoise always
    // removes `[hidden]` / `display:none` elements via the base selector, so a
    // hidden panel would otherwise be deleted before stripTabTriggers gets the
    // chance to surface it. stripTabTriggers repeats the call, which is a
    // no-op on panels that are already visible.
    surfaceHiddenTabPanels(document);
    stripNoise(document, context, signal);
    stripTabTriggers(document);
    separateAdjacentInlineElements(document);
    flattenTableCellBreaks(document);
    escapeTableCellPipes(document);
    normalizeTableStructure(document);
    if (baseUrl) {
        resolveUrls(document, baseUrl);
    }
}
561
// Some sites nest tbody/thead/tfoot inside td/th, which breaks markdown tables.
/**
 * For every table, moves any `tbody`, `thead` or `tfoot` found inside one of
 * its cells to the end of the table itself. Sections are moved one tag type
 * at a time (tbody, then thead, then tfoot) per cell.
 *
 * @param {Document} document - Document to mutate.
 */
function normalizeTableStructure(document) {
    const sectionTags = ['tbody', 'thead', 'tfoot'];
    for (const table of document.querySelectorAll('table')) {
        for (const cell of table.querySelectorAll('th, td')) {
            for (const sectionTag of sectionTags) {
                // Re-query after each move: appendChild detaches the section from the cell.
                for (let misplaced = cell.querySelector(sectionTag); misplaced; misplaced = cell.querySelector(sectionTag)) {
                    table.appendChild(misplaced);
                }
            }
        }
    }
}
575
/**
 * Replaces every `<br>` inside a `td`/`th` cell with a single space.
 *
 * @param {Document} document - Document to mutate.
 */
function flattenTableCellBreaks(document) {
    for (const cell of document.querySelectorAll('td, th')) {
        for (const lineBreak of cell.querySelectorAll('br')) {
            lineBreak.replaceWith(' ');
        }
    }
}
584
/**
 * Removes noise from an HTML string and returns the cleaned markup.
 *
 * The input is returned untouched when it shows no document marker, no noise
 * pattern and no fragment marker. Otherwise the supplied `document` (or a
 * freshly parsed one) is prepared and serialized. Any error raised along the
 * way — including the abort error thrown by the noise stripper — results in
 * the original `html` being returned.
 *
 * @param {string} html - Source markup.
 * @param {Document} [document] - Already-parsed document for `html`, if any.
 * @param {string} [baseUrl] - Base URL used to resolve relative URLs.
 * @param {AbortSignal} [signal] - Forwarded to document preparation.
 * @returns {string} Cleaned markup, or `html` itself.
 */
export function removeNoiseFromHtml(html, document, baseUrl, signal) {
    const nothingToClean = !isFullDocumentHtml(html) &&
        !mayContainNoise(html) &&
        !HTML_FRAGMENT_MARKERS.test(html);
    if (nothingToClean) {
        return html;
    }
    try {
        let doc = document;
        if (doc === undefined || doc === null) {
            doc = parseHTML(html).document;
        }
        prepareDocumentForMarkdown(doc, baseUrl, signal);
        return serializeDocumentForMarkdown(doc, html);
    }
    catch {
        return html;
    }
}
599
- // endregion
600
- // region Language Detection
601
/**
 * Wraps a code snippet and lazily derives views of it that the language
 * matchers share. Each derived view is computed on first access and cached
 * on the instance.
 */
class DetectionContext {
    /** The original snippet, unchanged. */
    code;
    _lower;
    _lines;
    _trimmedStart;
    /**
     * @param {string} code - Snippet to analyze.
     */
    constructor(code) {
        this.code = code;
    }
    /** Lower-cased snippet. */
    get lower() {
        return (this._lower ??= this.code.toLowerCase());
    }
    /** Snippet split on `\n` or `\r\n`. */
    get lines() {
        return (this._lines ??= this.code.split(/\r?\n/));
    }
    /** Snippet with leading whitespace removed. */
    get trimmedStart() {
        return (this._trimmedStart ??= this.code.trimStart());
    }
}
622
- const BASH_COMMANDS = new Set([
623
- 'sudo',
624
- 'chmod',
625
- 'mkdir',
626
- 'cd',
627
- 'ls',
628
- 'cat',
629
- 'echo',
630
- ]);
631
- const BASH_PACKAGE_MANAGERS = [
632
- 'npm',
633
- 'yarn',
634
- 'pnpm',
635
- 'npx',
636
- 'brew',
637
- 'apt',
638
- 'pip',
639
- 'cargo',
640
- 'go',
641
- ];
642
- const BASH_VERBS = new Set(['install', 'add', 'run', 'build', 'start']);
643
- const TYPESCRIPT_HINTS = [
644
- ': string',
645
- ':string',
646
- ': number',
647
- ':number',
648
- ': boolean',
649
- ':boolean',
650
- ': void',
651
- ':void',
652
- ': any',
653
- ':any',
654
- ': unknown',
655
- ':unknown',
656
- ': never',
657
- ':never',
658
- ];
659
- const HTML_TAGS = [
660
- '<!doctype',
661
- '<html',
662
- '<head',
663
- '<body',
664
- '<div',
665
- '<span',
666
- '<p',
667
- '<a',
668
- '<script',
669
- '<style',
670
- ];
671
- const RUST_REGEX = /\b(?:fn|impl|struct|enum)\b/;
672
- const JS_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
673
- const PYTHON_UNIQUE_REGEX = /\b(?:def |elif |except |finally:|yield |lambda |raise |pass$)/m;
674
- const JS_SIGNAL_REGEX = /\b(?:const |let |var |function |require\(|=>|===|!==|console\.)/;
675
- const CSS_REGEX = /@media|@import|@keyframes/;
676
- const CSS_PROPERTY_REGEX = /^\s*[a-z][\w-]*\s*:/;
677
/**
 * Reports whether `code` contains a `<` immediately followed by an ASCII
 * upper-case letter (the shape of a JSX component tag such as `<Foo`).
 *
 * @param {string} code - Snippet to scan (original case).
 * @returns {boolean}
 */
function containsJsxTag(code) {
    const componentTagOpen = /<[A-Z]/;
    return componentTagOpen.test(code);
}
688
/**
 * Heuristically decides whether a single line looks like shell input.
 *
 * A line qualifies when, after leading whitespace is dropped, it starts with
 * `#!`, `$ ` or `# `, when its first space-delimited word is in
 * BASH_COMMANDS, or when that word is a known package manager and the second
 * word is in BASH_VERBS.
 *
 * @param {string} line - One line of the snippet.
 * @returns {boolean}
 */
function isBashLine(line) {
    const text = line.trimStart();
    if (text.length === 0) {
        return false;
    }
    const shellPrefixes = ['#!', '$ ', '# '];
    if (shellPrefixes.some((prefix) => text.startsWith(prefix))) {
        return true;
    }
    // Words are split on single spaces, so consecutive spaces yield an empty word.
    const [command, argument] = text.split(' ');
    if (BASH_COMMANDS.has(command)) {
        return true;
    }
    return (BASH_PACKAGE_MANAGERS.includes(command) &&
        argument !== undefined &&
        BASH_VERBS.has(argument));
}
713
/**
 * Reports whether at least one line looks like shell input.
 *
 * @param {string[]} lines - Lines of the snippet.
 * @returns {boolean}
 */
function detectBashIndicators(lines) {
    for (const line of lines) {
        if (isBashLine(line)) {
            return true;
        }
    }
    return false;
}
716
/**
 * Reports whether any line looks like CSS: either a class/id selector opening
 * a block (starts with `.` or `#` and contains `{`), or a `property: value;`
 * declaration that contains no `(`.
 *
 * @param {string[]} lines - Lines of the snippet.
 * @returns {boolean}
 */
function detectCssStructure(lines) {
    return lines.some((line) => {
        const text = line.trimStart();
        if (text.length === 0) {
            return false;
        }
        const startsLikeSelector = text.startsWith('.') || text.startsWith('#');
        if (startsLikeSelector && text.includes('{')) {
            return true;
        }
        return (text.includes(';') &&
            CSS_PROPERTY_REGEX.test(text) &&
            !text.includes('('));
    });
}
733
/**
 * Reports whether any trimmed line has its first colon somewhere after the
 * first character and immediately followed by a space or a tab
 * (the `key: value` shape).
 *
 * @param {string[]} lines - Lines of the snippet.
 * @returns {boolean}
 */
function detectYamlStructure(lines) {
    return lines.some((line) => {
        const text = line.trim();
        if (text.length === 0) {
            return false;
        }
        const colonAt = text.indexOf(':');
        if (colonAt <= 0) {
            return false;
        }
        const following = text[colonAt + 1];
        return following === ' ' || following === '\t';
    });
}
747
/**
 * Rust heuristic: `let mut`, a keyword matched by RUST_REGEX, or a `use `
 * statement together with a `::` path separator.
 *
 * @param {{lower: string}} ctx - Detection context.
 * @returns {boolean}
 */
function matchRust(ctx) {
    const source = ctx.lower;
    return (source.includes('let mut') ||
        RUST_REGEX.test(source) ||
        (source.includes('use ') && source.includes('::')));
}
754
/**
 * Go heuristic: an `import "` statement or the whole word `package`/`func`.
 *
 * @param {{lower: string}} ctx - Detection context.
 * @returns {boolean}
 */
function matchGo(ctx) {
    const source = ctx.lower;
    const goKeyword = /\b(?:package|func)\b/;
    return source.includes('import "') || goKeyword.test(source);
}
759
/**
 * JSX heuristic: a `className=` attribute, a `jsx:` marker, an import from
 * 'react' (checked on the lower-cased text), or a component-style tag in the
 * original text.
 *
 * @param {{lower: string, code: string}} ctx - Detection context.
 * @returns {boolean}
 */
function matchJsx(ctx) {
    const source = ctx.lower;
    const markers = ['classname=', 'jsx:', "from 'react'", 'from "react"'];
    if (markers.some((marker) => source.includes(marker))) {
        return true;
    }
    return containsJsxTag(ctx.code);
}
769
/**
 * TypeScript heuristic: the whole word `interface`/`type`, or any of the
 * primitive type annotations listed in TYPESCRIPT_HINTS.
 *
 * @param {{lower: string}} ctx - Detection context.
 * @returns {boolean}
 */
function matchTypeScript(ctx) {
    const source = ctx.lower;
    const declarationKeyword = /\b(?:interface|type)\b/;
    return (declarationKeyword.test(source) ||
        TYPESCRIPT_HINTS.some((hint) => source.includes(hint)));
}
779
/**
 * SQL heuristic: the lower-cased text contains a whole-word SQL statement
 * keyword (select, insert, update, delete, create, alter, drop).
 *
 * @param {{lower: string}} ctx - Detection context.
 * @returns {boolean}
 */
function matchSql(ctx) {
    const statementKeyword = /\b(?:select|insert|update|delete|create|alter|drop)\b/;
    return statementKeyword.test(ctx.lower);
}
782
/**
 * Reports whether lower-cased code shows JavaScript-specific signals: a match
 * of JS_SIGNAL_REGEX, an opening brace, or a `from '` import clause.
 *
 * @param {string} lowerCode - Lower-cased snippet.
 * @returns {boolean}
 */
function hasJsSignals(lowerCode) {
    if (JS_SIGNAL_REGEX.test(lowerCode)) {
        return true;
    }
    const literalSignals = ['{', "from '"];
    return literalSignals.some((signal) => lowerCode.includes(signal));
}
787
/**
 * Python heuristic. Matches on, in order:
 *   1. lower-cased markers `print(`, `__name__`, `self.`, `elif `;
 *   2. the capitalized literals `None`, `True`, `False` in the original text;
 *   3. PYTHON_UNIQUE_REGEX;
 *   4. the shared keywords import/from/class, but only when the text shows no
 *      JavaScript signals.
 *
 * @param {{lower: string, code: string}} ctx - Detection context.
 * @returns {boolean}
 */
function matchPython(ctx) {
    const source = ctx.lower;
    const lowerMarkers = ['print(', '__name__', 'self.', 'elif '];
    if (lowerMarkers.some((marker) => source.includes(marker))) {
        return true;
    }
    // These literals are capitalized in Python, so check the original case.
    const capitalizedLiterals = ['None', 'True', 'False'];
    if (capitalizedLiterals.some((literal) => ctx.code.includes(literal))) {
        return true;
    }
    if (PYTHON_UNIQUE_REGEX.test(source)) {
        return true;
    }
    const sharedKeyword = /\b(?:import|from|class)\b/;
    return sharedKeyword.test(source) && !hasJsSignals(source);
}
807
/**
 * HTML heuristic: the lower-cased text contains any of the opening-tag
 * prefixes listed in HTML_TAGS.
 *
 * @param {{lower: string}} ctx - Detection context.
 * @returns {boolean}
 */
function matchHtml(ctx) {
    const source = ctx.lower;
    return HTML_TAGS.some((tagPrefix) => source.includes(tagPrefix));
}
815
- // Pre-sorted by weight descending — first match wins in detectLanguageFromCode
816
- const LANGUAGES = [
817
- { lang: 'rust', weight: 25, match: matchRust },
818
- { lang: 'go', weight: 22, match: matchGo },
819
- { lang: 'jsx', weight: 22, match: matchJsx },
820
- { lang: 'typescript', weight: 20, match: matchTypeScript },
821
- { lang: 'sql', weight: 20, match: matchSql },
822
- { lang: 'python', weight: 18, match: matchPython },
823
- {
824
- lang: 'css',
825
- weight: 18,
826
- match: (ctx) => CSS_REGEX.test(ctx.lower) || detectCssStructure(ctx.lines),
827
- },
828
- { lang: 'bash', weight: 15, match: (ctx) => detectBashIndicators(ctx.lines) },
829
- { lang: 'yaml', weight: 15, match: (ctx) => detectYamlStructure(ctx.lines) },
830
- { lang: 'javascript', weight: 15, match: (ctx) => JS_REGEX.test(ctx.lower) },
831
- { lang: 'html', weight: 12, match: matchHtml },
832
- {
833
- lang: 'json',
834
- weight: 10,
835
- match: (ctx) => ctx.trimmedStart.startsWith('{') || ctx.trimmedStart.startsWith('['),
836
- },
837
- ];
838
/**
 * Extracts a code language from an element's `class` attribute value.
 *
 * Tokens are scanned in order; the first one that starts (case-insensitively)
 * with `language-`, `lang-` or `highlight-` AND has a non-empty remainder
 * yields that remainder in its original case. A bare prefix such as
 * `language-` is ignored instead of producing an empty string, so callers
 * using `??` still fall through to their next source. When no prefixed token
 * qualifies and the class list contains `hljs`, the first token that is
 * neither `hljs`, an `hljs-*` token, nor a prefixed token is returned.
 *
 * @param {string|null|undefined} className - Raw `class` attribute value.
 * @returns {string|undefined} The language name, or undefined when none is found.
 */
export function extractLanguageFromClassName(className) {
    if (!className) {
        return undefined;
    }
    const tokens = className.match(/\S+/g);
    if (!tokens) {
        return undefined;
    }
    const languagePrefixes = ['language-', 'lang-', 'highlight-'];
    const findPrefix = (token) => {
        const lower = token.toLowerCase();
        return languagePrefixes.find((prefix) => lower.startsWith(prefix));
    };
    // Fast path: explicit language prefixes.
    for (const token of tokens) {
        const prefix = findPrefix(token);
        if (prefix === undefined) {
            continue;
        }
        const language = token.slice(prefix.length);
        if (language) {
            return language;
        }
    }
    // Fallback: highlight.js puts the bare language name next to `hljs`.
    if (!tokens.includes('hljs')) {
        return undefined;
    }
    return tokens.find((token) => {
        const lower = token.toLowerCase();
        return (lower !== 'hljs' &&
            !lower.startsWith('hljs-') &&
            findPrefix(token) === undefined);
    });
}
864
/**
 * Validates a language name taken from a data attribute.
 *
 * The trimmed value is accepted only when it consists solely of ASCII
 * letters, digits and underscores. A missing attribute (null/undefined or any
 * non-string) yields undefined instead of throwing.
 *
 * @param {string|null|undefined} dataLang - Raw attribute value.
 * @returns {string|undefined} The trimmed language name, or undefined.
 */
function resolveLanguageFromDataAttribute(dataLang) {
    if (typeof dataLang !== 'string') {
        return undefined;
    }
    const trimmed = dataLang.trim();
    // Without the `u`/`i` flags, \w is exactly [A-Za-z0-9_].
    return /^\w+$/.test(trimmed) ? trimmed : undefined;
}
881
/**
 * Resolves a code language from element attributes: the class name is tried
 * first, and the data attribute is consulted only when the class name yields
 * null or undefined.
 *
 * @param {string} className - Raw `class` attribute value.
 * @param {string} dataLang - Raw language data-attribute value.
 * @returns {string|undefined}
 */
export function resolveLanguageFromAttributes(className, dataLang) {
    const fromClassName = extractLanguageFromClassName(className);
    if (fromClassName !== undefined && fromClassName !== null) {
        return fromClassName;
    }
    return resolveLanguageFromDataAttribute(dataLang);
}
885
/**
 * Guesses the language of a code snippet.
 *
 * Returns undefined for empty input and for input made only of characters
 * whose char code is at or below the ASCII space. Otherwise the LANGUAGES
 * table is scanned in order and the first matching entry's language wins.
 *
 * @param {string} code - Snippet to classify.
 * @returns {string|undefined} Language identifier, or undefined when nothing matches.
 */
export function detectLanguageFromCode(code) {
    if (!code) {
        return undefined;
    }
    // Anything above U+0020 counts as real content.
    const hasContent = /[^\u0000-\u0020]/.test(code);
    if (!hasContent) {
        return undefined;
    }
    const ctx = new DetectionContext(code);
    // LANGUAGES is ordered by descending weight, so the first hit is the answer.
    const winner = LANGUAGES.find((definition) => definition.match(ctx));
    return winner === undefined ? undefined : winner.lang;
}
906
- // endregion
907
- // region Markdown Cleanup
908
- const MAX_LINE_LENGTH = 80; // Lines longer than this are rejected as heading candidates.
909
- const REGEX = { // Shared patterns used by the markdown cleanup helpers in this region.
910
- HEADING_MARKER: /^#{1,6}\s/m, // Line start: 1-6 '#' followed by one whitespace character.
911
- HEADING_STRICT: /^#{1,6}\s+/m, // Same as HEADING_MARKER but consumes all whitespace after the '#' run.
912
- EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/, // Only '#' marks plus optional spaces, tabs or NBSP.
913
- ANCHOR_ONLY_HEADING: /^#{1,6}\s+\[[^\]]+\]\(#[^)]+\)\s*$/, // Heading whose whole text is one [label](#fragment) link.
914
- FENCE_START: /^\s*(`{3,}|~{3,})/, // Optional leading whitespace, then a captured run of 3+ backticks or tildes.
915
- LIST_MARKER: /^(?:[-*+])\s/m, // Line start: '-', '*' or '+' followed by whitespace.
916
- TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/, // A "- [label](#fragment)" bullet and nothing else.
917
- TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents|on this page)\s*$/i, // TOC title, with or without a heading marker.
918
- HTML_DOC_START: /^(<!doctype|<html)/i, // Text that begins with <!doctype or <html.
919
- COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??|\[Back to top\]\(#[^)]*\)|\[\s*\]\(https?:\/\/[^)]*\))\s*$/gim, // Whole boilerplate lines: skip links, feedback prompt, back-to-top, empty-text http(s) links.
920
- ZERO_WIDTH_ANCHOR: /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g, // Fragment links whose text is only whitespace / zero-width spaces, plus trailing spaces or tabs.
921
- CONCATENATED_PROPS: /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g, // `key: "value"` glued directly to the next `key:` (groups: key, quoted value, next key).
922
- DOUBLE_NEWLINE_REDUCER: /\n{3,}/g, // Runs of three or more newlines.
923
- SOURCE_KEY: /^source:\s/im, // A line starting with "source:" (any case) and whitespace.
924
- HEADING_SPACING: /(^#{1,6}\s[^\n]*)\n([^\n])/gm, // Heading line followed immediately by a non-empty line.
925
- HEADING_CODE_BLOCK: /(^#{1,6}\s+\w+)```/gm, // Heading whose first word runs straight into a ``` fence.
926
- SPACING_LINK_FIX: /\]\(([^)]+)\)\[/g, // A link target immediately followed by the '[' of another link.
927
- SPACING_ADJ_COMBINED: /(?:\]\([^)]+\)|`[^`]+`)(?=[A-Za-z0-9])/g, // Link target or inline code span directly followed by an alphanumeric.
928
- SPACING_CODE_DASH: /(`[^`]+`)\s*\\-\s*/g, // Inline code span followed by an escaped dash ("\-").
929
- SPACING_ESCAPES: /\\([[\].])/g, // Backslash-escaped '[', ']' or '.'.
930
- SPACING_LIST_NUM_COMBINED: /^((?![-*+] |\d+\. |[ \t]).+)\n((?:[-*+]|\d+\.) )/gm, // Non-list, non-indented line directly followed by a list item.
931
- PUNCT_ONLY_LIST_ARTIFACT: /^(?:[-*+]|\d+\.)\s*(?:\\[-*+|/]|[-*+|/])(?:\s+(?:\\[-*+|/]|[-*+|/]))*\s*$/gm, // List items containing only (optionally escaped) - * + | / characters.
932
- NESTED_LIST_INDENT: /^( +)((?:[-*+])|\d+\.)\s/gm, // Space-indented list marker; captures the indent and the marker.
933
- TYPEDOC_COMMENT: /(`+)(?:(?!\1)[\s\S])*?\1|\s?\/\\?\*[\s\S]*?\\?\*\//g, // Either a backtick-delimited code span or a /* ... */ comment (asterisks may be backslash-escaped).
934
- }; // None of these patterns is applied with a stateful .test/.exec on a /g regex in this chunk.
935
- const HEADING_KEYWORDS = new Set(config.markdownCleanup.headingKeywords.map((value) => value.toLocaleLowerCase(config.i18n.locale))); // Configured keywords, lower-cased with the configured locale.
936
- const SPECIAL_PREFIXES = /^(?:example|note|tip|warning|important|caution):\s+\S/i; // Label such as "Note:" followed by whitespace and some text.
937
- const TOC_SCAN_LIMIT = 20; // Bounds how far past a TOC heading getTocBlockStats looks.
938
- const TOC_MAX_NON_EMPTY = 12; // getTocBlockStats stops after counting this many non-empty lines.
939
- const TOC_LINK_RATIO_THRESHOLD = 0.8; // shouldSkipAsToc requires a link ratio strictly above this value.
940
- const TYPEDOC_PREFIXES = [ // Line prefixes inspected by isTypeDocArtifactLine.
941
- 'Defined in:',
942
- 'Returns:',
943
- 'Since:',
944
- 'See also:',
945
- ];
946
/**
 * Creates a callback that checks for cancellation at a named stage.
 *
 * @param {{ signal?: unknown, url?: string } | undefined} options - Caller
 *   options; `signal` is forwarded untouched (presumably an AbortSignal —
 *   confirm against throwIfAborted), `url` falls back to ''.
 * @returns {(stage: string) => void} Function that forwards
 *   (signal, url, stage) to throwIfAborted.
 */
function createAbortChecker(options) {
    const abortSignal = options?.signal;
    const sourceUrl = options?.url ?? '';
    const checkStage = (stage) => {
        throwIfAborted(abortSignal, sourceUrl, stage);
    };
    return checkStage;
}
953
/**
 * Picks the line terminator to use when inserting text into `content`.
 *
 * @param {string} content - Text to inspect.
 * @returns {'\r\n' | '\n'} CRLF when the text contains at least one CRLF pair,
 *   LF otherwise.
 */
function getLineEnding(content) {
    if (content.includes('\r\n')) {
        return '\r\n';
    }
    return '\n';
}
956
/**
 * Tells whether a line slot is missing or whitespace-only.
 *
 * @param {string | undefined} line - Line text, or undefined for a missing slot.
 * @returns {boolean} True for undefined or a string that trims to empty.
 */
function isBlank(line) {
    if (line === undefined) {
        return true;
    }
    return line.trim() === '';
}
959
/**
 * Checks whether any non-blank line follows `startIndex` within a bounded
 * lookahead window.
 *
 * @param {Array<string | undefined>} lines - All lines of the buffer.
 * @param {number} startIndex - Index of the line whose followers are inspected.
 * @returns {boolean} True when a non-blank line exists at an index after
 *   `startIndex` and below `startIndex + HAS_FOLLOWING_LOOKAHEAD`.
 */
function hasFollowingContent(lines, startIndex) {
    // The window is bounded so huge files do not trigger long scans.
    const stop = Math.min(lines.length, startIndex + HAS_FOLLOWING_LOOKAHEAD);
    let cursor = startIndex + 1;
    while (cursor < stop) {
        if (!isBlank(lines[cursor])) {
            return true;
        }
        cursor += 1;
    }
    return false;
}
967
/**
 * Converts a heading that consists solely of a fragment link into a plain
 * heading carrying the link label.
 *
 * @param {string} line - Heading line such as `## [Intro](#intro)`.
 * @returns {string} `## Intro` for a matching line; the input unchanged otherwise.
 */
function stripAnchorOnlyHeading(line) {
    const anchorHeading = /^(#{1,6})\s+\[([^\]]+)\]\(#[^)]+\)\s*$/;
    return line.replace(anchorHeading, (_whole, hashes, label) => `${hashes} ${label}`);
}
970
/**
 * Decides whether a trimmed line looks like a heading: either a single
 * capitalised configured keyword, or a short Title Case phrase.
 *
 * @param {string} trimmed - Line text with surrounding whitespace removed.
 * @returns {boolean} True when the line qualifies as a heading candidate.
 */
function isTitleCaseOrKeyword(trimmed) {
    // Long lines are rejected before any regex work.
    if (trimmed.length > MAX_LINE_LENGTH) {
        return false;
    }
    const isSingleWord = !trimmed.includes(' ');
    if (isSingleWord) {
        // A lone word must start with A-Z and be one of the configured keywords.
        return (/^[A-Z]/.test(trimmed) &&
            HEADING_KEYWORDS.has(trimmed.toLocaleLowerCase(config.i18n.locale)));
    }
    const tokens = trimmed.split(/\s+/);
    if (tokens.length < TITLE_MIN_WORDS || tokens.length > TITLE_MAX_WORDS) {
        return false;
    }
    const capitalizedWord = /^[A-Z][a-z]*$/;
    const connectorWord = /^(?:and|or|the|of|in|for|to|a)$/i;
    let capitalized = 0;
    for (const token of tokens) {
        if (!token) {
            continue;
        }
        if (capitalizedWord.test(token)) {
            capitalized += 1;
            continue;
        }
        // Anything that is neither Capitalised nor a small connector disqualifies.
        if (!connectorWord.test(token)) {
            return false;
        }
    }
    return capitalized >= TITLE_MIN_CAPITALIZED;
}
998
/**
 * Chooses the markdown heading prefix for an orphan line, if it deserves one.
 *
 * @param {string} trimmed - Line text with surrounding whitespace removed.
 * @returns {'## ' | '### ' | null} '### ' for "Example:" lines, '## ' for other
 *   special-prefix lines and for title-case/keyword lines, null otherwise.
 */
function getHeadingPrefix(trimmed) {
    if (trimmed.length > MAX_LINE_LENGTH) {
        return null;
    }
    // Only run the markup regexes when the first character could start markup.
    const leadCode = trimmed.charCodeAt(0);
    const startsWithDigit = leadCode >= ASCII_DIGIT_0 && leadCode <= ASCII_DIGIT_9;
    const mayBeMarkup = leadCode === ASCII_HASH ||
        leadCode === ASCII_DASH ||
        leadCode === ASCII_ASTERISK ||
        leadCode === ASCII_PLUS ||
        leadCode === ASCII_BRACKET_OPEN ||
        startsWithDigit;
    if (mayBeMarkup) {
        const markupPatterns = [
            REGEX.HEADING_MARKER,
            REGEX.LIST_MARKER,
            /^\d+\.\s/,
            /^\[.*\]\(.*\)$/,
        ];
        // Existing headings, list items and link-only lines are never promoted.
        if (markupPatterns.some((pattern) => pattern.test(trimmed))) {
            return null;
        }
    }
    if (SPECIAL_PREFIXES.test(trimmed)) {
        if (/^example:\s/i.test(trimmed)) {
            return '### ';
        }
        return '## ';
    }
    // Lines ending in sentence punctuation are treated as prose.
    const tailCode = trimmed.charCodeAt(trimmed.length - 1);
    const endsLikeSentence = tailCode === ASCII_PERIOD ||
        tailCode === ASCII_EXCLAMATION ||
        tailCode === ASCII_QUESTION;
    if (endsLikeSentence) {
        return null;
    }
    if (!isTitleCaseOrKeyword(trimmed)) {
        return null;
    }
    return '## ';
}
1026
/**
 * Counts link and non-link lines in the block that follows a TOC heading.
 *
 * Scanning stops at the next heading, after TOC_MAX_NON_EMPTY non-empty lines,
 * or once the index reaches `headingIndex + TOC_SCAN_LIMIT`.
 *
 * @param {Array<string | undefined>} lines - All lines of the buffer.
 * @param {number} headingIndex - Index of the TOC heading line.
 * @returns {{ total: number, linkCount: number, nonLinkCount: number }}
 *   Number of non-empty lines inspected, split into TOC links and others.
 */
function getTocBlockStats(lines, headingIndex) {
    const stop = Math.min(lines.length, headingIndex + TOC_SCAN_LIMIT);
    let linkCount = 0;
    let nonLinkCount = 0;
    for (let row = headingIndex + 1; row < stop; row += 1) {
        const text = lines[row]?.trim();
        if (!text) {
            continue;
        }
        if (REGEX.HEADING_MARKER.test(text)) {
            break;
        }
        if (REGEX.TOC_LINK.test(text)) {
            linkCount += 1;
        }
        else {
            nonLinkCount += 1;
        }
        if (linkCount + nonLinkCount >= TOC_MAX_NON_EMPTY) {
            break;
        }
    }
    return { total: linkCount + nonLinkCount, linkCount, nonLinkCount };
}
1050
/**
 * Finds the first line at or after `startIndex` that is neither blank nor a
 * TOC link bullet.
 *
 * @param {Array<string | undefined>} lines - All lines of the buffer.
 * @param {number} startIndex - Index at which to start scanning.
 * @returns {number} Index of that line, or `lines.length` when none exists.
 */
function skipTocLines(lines, startIndex) {
    let cursor = startIndex;
    while (cursor < lines.length) {
        const raw = lines[cursor];
        const text = raw === undefined ? '' : raw.trim();
        if (text && !REGEX.TOC_LINK.test(text)) {
            return cursor;
        }
        cursor += 1;
    }
    return lines.length;
}
1063
/**
 * Detects lines of the form `<prefix> **`...`**` where the prefix is one of
 * TYPEDOC_PREFIXES.
 *
 * @param {string} line - Raw line text.
 * @returns {boolean} True when the trimmed line starts with a known prefix and
 *   the remainder starts with "**`" and contains "`**".
 */
function isTypeDocArtifactLine(line) {
    const text = line.trim();
    // The first prefix that matches decides the outcome.
    const matchedPrefix = TYPEDOC_PREFIXES.find((prefix) => text.startsWith(prefix));
    if (matchedPrefix === undefined) {
        return false;
    }
    const remainder = text.slice(matchedPrefix.length).trimStart();
    return remainder.startsWith('**`') && remainder.includes('`**');
}
1075
/**
 * Promotes a stand-alone line to a markdown heading when it qualifies.
 *
 * A line is an orphan when it is the first line or the previous line is
 * missing or blank.
 *
 * @param {Array<string | undefined>} lines - All lines of the buffer.
 * @param {number} i - Index of the candidate line.
 * @param {string} trimmed - The candidate line, trimmed.
 * @returns {string | null} Heading prefix plus `trimmed`, or null when the line
 *   must stay as it is.
 */
function tryPromoteOrphan(lines, i, trimmed) {
    if (i > 0) {
        const previous = lines[i - 1];
        if (previous && previous.trim().length > 0) {
            return null;
        }
    }
    const prefix = getHeadingPrefix(trimmed);
    if (!prefix) {
        return null;
    }
    // Special-prefix lines are promoted even without content after them.
    if (!SPECIAL_PREFIXES.test(trimmed) && !hasFollowingContent(lines, i)) {
        return null;
    }
    return `${prefix}${trimmed}`;
}
1088
/**
 * Decides whether the line at `i` starts a table-of-contents block to drop.
 *
 * @param {Array<string | undefined>} lines - All lines of the buffer.
 * @param {number} i - Index of the candidate TOC heading.
 * @param {string} trimmed - The candidate line, trimmed.
 * @param {boolean} removeToc - Whether TOC removal is enabled.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Abort options.
 * @returns {number | null} Index at which processing should resume, or null
 *   when the line is not a removable TOC heading.
 */
function shouldSkipAsToc(lines, i, trimmed, removeToc, options) {
    if (!removeToc) {
        return null;
    }
    if (!REGEX.TOC_HEADING.test(trimmed)) {
        return null;
    }
    const stats = getTocBlockStats(lines, i);
    // The block must be non-empty and contain nothing but TOC links.
    if (stats.total === 0 || stats.nonLinkCount > 0) {
        return null;
    }
    if (stats.linkCount / stats.total <= TOC_LINK_RATIO_THRESHOLD) {
        return null;
    }
    throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:toc');
    return skipTocLines(lines, i + 1);
}
1100
/**
 * Runs the line-level cleanup pass over a buffer of non-code lines: drops
 * empty headings and dangling anchor-only headings, removes TOC blocks and
 * promotes orphan lines to headings, depending on configuration.
 *
 * @param {Array<string | undefined>} lines - Lines outside fenced code blocks.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Abort options.
 * @returns {string} The surviving lines joined with '\n'.
 */
function preprocessLines(lines, options) {
    const kept = [];
    const total = lines.length;
    const promote = config.markdownCleanup.promoteOrphanHeadings;
    const removeToc = config.markdownCleanup.removeTocBlocks;
    const checkAbort = createAbortChecker(options);
    // Lines below this index belong to a TOC block that is being skipped.
    let resumeAt = -1;
    for (let index = 0; index < total; index += 1) {
        if (index < resumeAt) {
            continue;
        }
        const original = lines[index];
        if (original === undefined) {
            continue;
        }
        const trimmed = original.trim();
        if (REGEX.EMPTY_HEADING_LINE.test(trimmed)) {
            continue;
        }
        let output = original;
        if (REGEX.ANCHOR_ONLY_HEADING.test(trimmed)) {
            // An anchor-only heading with nothing after it is dropped entirely.
            if (!hasFollowingContent(lines, index)) {
                continue;
            }
            output = stripAnchorOnlyHeading(trimmed);
        }
        const tocEnd = shouldSkipAsToc(lines, index, trimmed, removeToc, options);
        if (tocEnd !== null) {
            resumeAt = tocEnd;
            continue;
        }
        if (promote && trimmed.length > 0) {
            checkAbort('markdown:cleanup:promote');
            output = tryPromoteOrphan(lines, index, trimmed) || output;
        }
        kept.push(output);
    }
    return kept.join('\n');
}
1136
/**
 * Cleans one buffer of non-code lines: line-level pass first, then the
 * whole-text regex pass.
 *
 * @param {Array<string | undefined>} lines - Buffered lines outside code fences.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Abort options.
 * @returns {string} Cleaned text, or '' for an empty buffer.
 */
function processTextBuffer(lines, options) {
    return lines.length === 0
        ? ''
        : applyGlobalRegexes(preprocessLines(lines, options), options);
}
1142
/**
 * Removes TypeDoc artifact lines and block comments from markdown text while
 * leaving backtick-delimited code spans untouched.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text without artifact lines and comment blocks.
 */
function removeTypeDocArtifacts(text) {
    const kept = [];
    for (const line of text.split('\n')) {
        if (!isTypeDocArtifactLine(line)) {
            kept.push(line);
        }
    }
    // The pattern also matches code spans so that comment-like text inside
    // them is consumed and returned unchanged instead of being stripped.
    return kept
        .join('\n')
        .replace(REGEX.TYPEDOC_COMMENT, (match) => (match.startsWith('`') ? match : ''));
}
1149
/**
 * Strips empty fragment anchors and boilerplate navigation lines.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text with ZERO_WIDTH_ANCHOR and COMBINED_LINE_REMOVALS
 *   matches removed, in that order.
 */
function removeSkipLinks(text) {
    const withoutEmptyAnchors = text.replace(REGEX.ZERO_WIDTH_ANCHOR, '');
    return withoutEmptyAnchors.replace(REGEX.COMBINED_LINE_REMOVALS, '');
}
1154
/**
 * Trims padding inside inline code spans that hold a single token.
 *
 * A span is rewritten only when it has leading/trailing whitespace and its
 * trimmed content has no inner whitespace and consists solely of
 * `* A-Z a-z 0-9 _ . / : -` characters.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text with qualifying spans tightened.
 */
function normalizeInlineCodeTokens(text) {
    const tokenChars = /^[*A-Za-z0-9_./:-]+$/;
    return text.replace(/`([^`\n]+)`/g, (span, body) => {
        const core = body.trim();
        const hasPadding = core !== body;
        const isToken = !/\s/.test(core) && tokenChars.test(core);
        return hasPadding && isToken ? `\`${core}\`` : span;
    });
}
1164
/**
 * Applies the spacing fix-ups to markdown text: separates adjacent links,
 * inserts missing spaces after links/code spans, normalises escaped dashes and
 * escapes, separates lists from preceding text, drops punctuation-only list
 * items, collapses blank-line runs, tidies inline code and link text, and
 * finally normalises nested list indentation.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text with spacing normalised.
 */
function normalizeMarkdownSpacing(text) {
    // Ordered [pattern, replacement] pairs; each step sees the previous output.
    const steps = [
        [REGEX.SPACING_LINK_FIX, ']($1)\n\n['],
        [REGEX.SPACING_ADJ_COMBINED, '$& '],
        [REGEX.SPACING_CODE_DASH, '$1 - '],
        [REGEX.SPACING_ESCAPES, '$1'],
        [REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2'],
        [REGEX.PUNCT_ONLY_LIST_ARTIFACT, ''],
        [REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n'],
    ];
    let output = steps.reduce((acc, [pattern, replacement]) => acc.replace(pattern, replacement), text);
    // Trim whitespace around token-like inline code spans.
    output = normalizeInlineCodeTokens(output);
    // Unescape backticks inside markdown link text.
    output = output.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => {
        const label = linkText.replace(/\\`/g, '`');
        return `[${label}](${url})`;
    });
    // Escape angle brackets inside link text that contains '<'.
    output = output.replace(/\[([^\]]*<[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => {
        const label = linkText.replace(/</g, '\\<').replace(/>/g, '\\>');
        return `[${label}](${url})`;
    });
    return normalizeNestedListIndentation(output);
}
1180
/**
 * Separates `key: "value"` pairs that were glued to the next key by inserting
 * a blank line between them.
 *
 * The replacement is repeated until the text stops changing or
 * PROPERTY_FIX_MAX_PASSES passes have run.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text with concatenated properties split apart.
 */
function fixConcatenatedProperties(text) {
    let current = text;
    let pass = 0;
    while (pass < PROPERTY_FIX_MAX_PASSES) {
        const rewritten = current.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
        if (rewritten === current) {
            return current;
        }
        current = rewritten;
        pass += 1;
    }
    return current;
}
1190
/**
 * Runs the whole-text cleanup stages over a preprocessed buffer, checking for
 * cancellation before each stage.
 *
 * Stages: NBSP replacement, heading spacing, optional TypeDoc removal,
 * optional skip-link removal, spacing normalisation, concatenated-property fix.
 *
 * @param {string} text - Preprocessed markdown text.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Abort options.
 * @returns {string} Cleaned text.
 */
function applyGlobalRegexes(text, options) {
    const checkAbort = createAbortChecker(options);
    let output = text.replace(/\u00A0/g, ' ');
    checkAbort('markdown:cleanup:headings');
    output = output.replace(REGEX.HEADING_SPACING, '$1\n\n$2');
    output = output.replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```');
    if (config.markdownCleanup.removeTypeDocComments) {
        checkAbort('markdown:cleanup:typedoc');
        output = removeTypeDocArtifacts(output);
    }
    if (config.markdownCleanup.removeSkipLinks) {
        checkAbort('markdown:cleanup:skip-links');
        output = removeSkipLinks(output);
    }
    checkAbort('markdown:cleanup:spacing');
    output = normalizeMarkdownSpacing(output);
    checkAbort('markdown:cleanup:properties');
    return fixConcatenatedProperties(output);
}
1210
/**
 * Doubles the indentation of nested list items whose indent is an even number
 * of spaces (2 -> 4, 4 -> 8, ...). Odd indents are left untouched.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text with nested list indentation widened.
 */
function normalizeNestedListIndentation(text) {
    return text.replace(REGEX.NESTED_LIST_INDENT, (whole, indent, bullet) => {
        const width = indent.length;
        const isEvenIndent = width >= 2 && width % 2 === 0;
        if (!isEvenIndent) {
            return whole;
        }
        // Each two-space level becomes a four-space level.
        return `${' '.repeat(width * 2)}${bullet} `;
    });
}
1219
/**
 * Cleans markdown artifacts from converted content while leaving fenced code
 * blocks untouched.
 *
 * The content is split into lines; lines inside a fenced block (including the
 * fence lines themselves) are passed through verbatim, and every run of lines
 * between fences is cleaned as one buffer via processTextBuffer.
 *
 * @param {string} content - Markdown text to clean.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Abort
 *   options forwarded to the cleanup stages.
 * @returns {string} Cleaned markdown joined with '\n' and trimmed; '' for
 *   falsy input.
 */
export function cleanupMarkdownArtifacts(content, options) {
    if (!content)
        return '';
    const checkAbort = createAbortChecker(options);
    checkAbort('markdown:cleanup:begin');
    // A closing fence uses the same character as the opening fence, is at
    // least as long as it, and is followed by nothing but whitespace. Accepting
    // longer closers (valid per CommonMark) keeps the rest of the document
    // from being mistaken for code and skipped by the cleanup.
    const closesFence = (line, marker) => {
        const text = line.trim();
        if (text.length < marker.length)
            return false;
        const fenceChar = marker.charAt(0);
        for (let i = 0; i < text.length; i++) {
            if (text.charAt(i) !== fenceChar)
                return false;
        }
        return true;
    };
    const segments = [];
    let buffer = [];
    let fenceMarker = null;
    const flushBuffer = () => {
        if (buffer.length > 0) {
            segments.push(processTextBuffer(buffer, options));
            buffer = [];
        }
    };
    for (const line of content.split(/\r?\n/)) {
        if (fenceMarker) {
            // Inside a fenced block: keep the line exactly as it is.
            segments.push(line);
            if (closesFence(line, fenceMarker)) {
                fenceMarker = null;
            }
            continue;
        }
        const match = REGEX.FENCE_START.exec(line);
        const newMarker = match ? (match[1] ?? '```') : null;
        if (!newMarker) {
            buffer.push(line);
            continue;
        }
        // Opening fence: clean what was buffered so far, then start the block.
        flushBuffer();
        segments.push(line);
        fenceMarker = newMarker;
    }
    flushBuffer();
    return segments.join('\n').trim();
}
1259
/**
 * Parses a leading `---` front matter block into flat key/value entries.
 *
 * The block must open on the very first line with `---` followed by LF or
 * CRLF, and close with a `---` line using the same line ending. The closing
 * fence is only accepted at the start of a line, so a value that merely ends
 * in `---` does not terminate the block early.
 *
 * @param {string} content - Full document text.
 * @returns {{ range: { start: number, end: number, linesStart: number,
 *   linesEnd: number, lineEnding: string }, entries: Map<string, string> } | null}
 *   `range.linesStart`/`linesEnd` delimit the front matter body, `range.end`
 *   is the index just past the closing fence line; `entries` maps lower-cased
 *   keys to trimmed, unquoted, non-empty values. Null when no complete front
 *   matter block is present.
 */
function parseFrontmatter(content) {
    if (content.length < 4)
        return null;
    let lineEnding;
    if (content.startsWith('---\n')) {
        lineEnding = '\n';
    }
    else if (content.startsWith('---\r\n')) {
        lineEnding = '\r\n';
    }
    else {
        return null;
    }
    const fence = `---${lineEnding}`;
    const fenceLen = fence.length;
    // Skip candidates that sit in the middle of a line: a real closing fence
    // is always preceded by the '\n' that ends the previous line.
    let closeIndex = content.indexOf(fence, fenceLen);
    while (closeIndex !== -1 && content.charAt(closeIndex - 1) !== '\n') {
        closeIndex = content.indexOf(fence, closeIndex + 1);
    }
    if (closeIndex === -1)
        return null;
    const range = {
        start: 0,
        end: closeIndex + fenceLen,
        linesStart: fenceLen,
        linesEnd: closeIndex,
        lineEnding,
    };
    // Parse `key: value` lines; lines without a key before the colon are ignored.
    const entries = new Map();
    const fmBody = content.slice(range.linesStart, range.linesEnd);
    for (const rawLine of fmBody.split(lineEnding)) {
        const line = rawLine.trim();
        const colonIdx = line.indexOf(':');
        if (!line || colonIdx <= 0)
            continue;
        const key = line.slice(0, colonIdx).trim().toLowerCase();
        let value = line.slice(colonIdx + 1).trim();
        // Strip one pair of matching surrounding quotes.
        const first = value.charAt(0);
        const last = value.charAt(value.length - 1);
        if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
            value = value.slice(1, -1).trim();
        }
        if (value)
            entries.set(key, value);
    }
    return { range, entries };
}
1312
/**
 * Returns the heading text when the first non-blank line of `content` is a
 * markdown heading.
 *
 * Only lines starting within the first BODY_SCAN_LIMIT characters are
 * considered.
 *
 * @param {string} content - Markdown text.
 * @returns {string | undefined} Heading text without its marker, or undefined
 *   when the first non-blank line is not a heading or the heading is empty.
 */
function scanBodyForTitle(content) {
    const limit = Math.min(content.length, BODY_SCAN_LIMIT);
    let start = 0;
    while (start < limit) {
        const newline = content.indexOf('\n', start);
        const end = newline === -1 ? content.length : newline;
        // trim() also removes the '\r' of a CRLF line ending.
        const text = content.slice(start, end).trim();
        if (text) {
            // The first non-blank line decides the outcome either way.
            if (!REGEX.HEADING_STRICT.test(text)) {
                return undefined;
            }
            return text.replace(REGEX.HEADING_MARKER, '').trim() || undefined;
        }
        start = end + 1;
    }
    return undefined;
}
1334
/**
 * Extracts a document title from raw markdown.
 *
 * Front matter `title` (or `name`) wins. Otherwise the title is taken from the
 * first heading of the body; when front matter is present the body scan starts
 * after it, because scanning from the opening `---` line can never find a
 * heading.
 *
 * @param {string} content - Raw markdown text.
 * @returns {string | undefined} The title, or undefined when none is found.
 */
export function extractTitleFromRawMarkdown(content) {
    const fm = parseFrontmatter(content);
    if (!fm) {
        return scanBodyForTitle(content);
    }
    const title = fm.entries.get('title') ?? fm.entries.get('name');
    if (title) {
        return title;
    }
    // range.end is the index just past the closing front matter fence.
    return scanBodyForTitle(content.slice(fm.range.end));
}
1343
/**
 * Records the source URL in a markdown document.
 *
 * - Existing front matter: a `source: "<url>"` line is appended to it unless a
 *   `source:` key is already there.
 * - No front matter, metadata format 'markdown': a `Source: <url>` line is
 *   inserted after the first heading line (or prepended when there is no
 *   heading), unless the document already has a `source:` line.
 * - No front matter, any other format: a new front matter block is prepended.
 *
 * @param {string} content - Markdown text.
 * @param {string} url - Source URL to record.
 * @returns {string} Markdown with the source recorded, or the input unchanged
 *   when a source entry already exists.
 */
export function addSourceToMarkdown(content, url) {
    const fm = parseFrontmatter(content);
    const useMarkdownFormat = config.transform.metadataFormat === 'markdown';
    if (fm) {
        const { linesStart, linesEnd, lineEnding: fmEol } = fm.range;
        if (REGEX.SOURCE_KEY.test(content.slice(linesStart, linesEnd))) {
            return content;
        }
        const escapedUrl = url.replace(/"/g, '\\"');
        const sourceLine = `source: "${escapedUrl}"${fmEol}`;
        // Insert right before the closing fence.
        return content.slice(0, linesEnd) + sourceLine + content.slice(linesEnd);
    }
    if (!useMarkdownFormat) {
        const eol = getLineEnding(content);
        const escapedUrl = url.replace(/"/g, '\\"');
        return `---${eol}source: "${escapedUrl}"${eol}---${eol}${eol}${content}`;
    }
    if (REGEX.SOURCE_KEY.test(content)) {
        return content;
    }
    const eol = getLineEnding(content);
    const headingMatch = REGEX.HEADING_MARKER.exec(content);
    if (!headingMatch) {
        return `Source: ${url}${eol}${eol}${content}`;
    }
    // Place the source line directly after the first heading line.
    const headingEnd = content.indexOf(eol, headingMatch.index);
    const insertAt = headingEnd === -1 ? content.length : headingEnd + eol.length;
    const sourceLine = `${eol}Source: ${url}${eol}`;
    return content.slice(0, insertAt) + sourceLine + content.slice(insertAt);
}
1374
- // endregion
1375
- // region Content Detection & Metadata Footer
1376
/**
 * Counts occurrences of common HTML opening tags, stopping as soon as the
 * count exceeds `limit`.
 *
 * @param {string} content - Text to scan.
 * @param {number} limit - Threshold; a non-positive value disables the scan.
 * @returns {number} Number of tags seen, capped at `limit + 1`; 0 when
 *   `limit <= 0`.
 */
function countCommonTags(content, limit) {
    if (limit <= 0) {
        return 0;
    }
    const tagPattern = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
    let seen = 0;
    // matchAll yields matches lazily, so breaking out stops the scan early.
    for (const _hit of content.matchAll(tagPattern)) {
        seen += 1;
        if (seen > limit) {
            break;
        }
    }
    return seen;
}
1388
/**
 * Heuristically decides whether fetched content is raw text/markdown rather
 * than HTML.
 *
 * @param {string} content - Fetched document text.
 * @returns {boolean} False when the trimmed text starts like an HTML document
 *   or contains more than HTML_TAG_DENSITY_LIMIT common tags; true when it has
 *   front matter, a heading marker, a list marker or a ``` fence.
 */
export function isRawTextContent(content) {
    const trimmed = content.trim();
    if (REGEX.HTML_DOC_START.test(trimmed)) {
        return false;
    }
    // Front matter is taken as conclusive evidence of markdown.
    if (parseFrontmatter(trimmed) !== null) {
        return true;
    }
    if (countCommonTags(content, HTML_TAG_DENSITY_LIMIT) > HTML_TAG_DENSITY_LIMIT) {
        return false;
    }
    if (REGEX.HEADING_MARKER.test(content)) {
        return true;
    }
    if (REGEX.LIST_MARKER.test(content)) {
        return true;
    }
    return content.includes('```');
}
1401
/**
 * Formats a fetch timestamp as a numeric date in the configured locale.
 *
 * @param {string} value - Timestamp accepted by the Date constructor.
 * @returns {string} Date with 2-digit day, 2-digit month and numeric year, or
 *   `value` unchanged when it does not parse to a valid date.
 */
function formatFetchedAt(value) {
    const parsed = new Date(value);
    if (Number.isNaN(parsed.getTime())) {
        return value;
    }
    const dateOptions = {
        day: '2-digit',
        month: '2-digit',
        year: 'numeric',
    };
    return new Intl.DateTimeFormat(config.i18n.locale, dateOptions).format(parsed);
}
1412
/**
 * Builds the markdown footer that summarises document metadata.
 *
 * The footer starts with a `---` rule and a blank line, followed by one line
 * joining the available title, author, source link and fetch date with ' | ',
 * and an optional `<sub>` description line.
 *
 * @param {{ title?: string, author?: string, url?: string, fetchedAt?: string,
 *   description?: string } | null | undefined} metadata - Document metadata.
 * @param {string | undefined} fallbackUrl - URL used when `metadata.url` is falsy.
 * @returns {string} Footer text, or '' when `metadata` is falsy.
 */
export function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata) {
        return '';
    }
    const sourceUrl = metadata.url || fallbackUrl;
    const parts = [
        metadata.title ? `_${metadata.title}_` : null,
        metadata.author ? `_${metadata.author}_` : null,
        sourceUrl ? `[_Original Source_](${sourceUrl})` : null,
        metadata.fetchedAt ? `_${formatFetchedAt(metadata.fetchedAt)}_` : null,
    ].filter((part) => part !== null);
    const footerLines = ['---', ''];
    if (parts.length > 0) {
        footerLines.push(` ${parts.join(' | ')}`);
    }
    if (metadata.description) {
        footerLines.push(` <sub>${metadata.description}</sub>`);
    }
    return footerLines.join('\n');
}
1433
- // endregion