@j0hanz/fetch-url-mcp 1.9.2 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,606 @@
1
+ import { parseHTML } from 'linkedom';
2
+ import { config, logDebug } from './core.js';
3
/** Number of characters sampled from the start and from the end of large HTML inputs by `mayContainNoise`. */
const NOISE_SCAN_LIMIT = 50000;
/** Serialized HTML must be longer than this (after trimming) to be returned by `serializeDocumentForMarkdown`. */
const MIN_BODY_CONTENT_LENGTH = 100;
/** A dialog whose text content is longer than this is preserved by `shouldPreserve`. */
const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
/** A nav/footer whose trimmed text is at least this long is preserved by `shouldPreserve`. */
const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
/** `stripNoise` consults the abort signal whenever the candidate index is a multiple of this value. */
const ABORT_CHECK_INTERVAL = 500;
/** Numeric value of the DOM `NodeFilter.SHOW_TEXT` flag, passed to `createTreeWalker`. */
const NODE_FILTER_SHOW_TEXT = 4;
/** Matches markup that opens a document-level tag (doctype, html, head, body). */
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
/** Matches markup that opens a common block-level container tag. */
const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
11
+ const NOISE_PATTERNS = [
12
+ /<\s*(?:script|style|noscript|iframe|nav|footer|header|form|button|input|select|textarea|svg|canvas)\b/i,
13
+ /[\s"']role\s*=\s*['"]?(?:navigation|banner|complementary|contentinfo|tree|menubar|menu)['"]?/i,
14
+ /[\s"'](?:aria-hidden\s*=\s*['"]?true['"]?|hidden)/i,
15
+ /[\s"'](?:banner|promo|announcement|cta|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast)\b/i,
16
+ /[\s"'](?:fixed|sticky|z-50|z-4|isolate|breadcrumbs?|pagination)\b/i,
17
+ ];
18
/** Class/id words that mark a `<header>` as site chrome; tested against "class id" in `calculateNavFooterScore`. */
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
/** Class names indicating fixed/sticky positioning or a high z-index utility (case-sensitive). */
const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50)|isolate)\b/;
/** URL prefixes that `processUrlElement` leaves untouched instead of resolving against the base URL. */
const SKIP_URL_PREFIXES = ['#', 'javascript:', 'mailto:', 'tel:', 'data:', 'blob:'];
28
/** Tags scored as structural noise; `getContext` adds `svg`/`canvas` unless they are preserved by config. */
const BASE_STRUCTURAL_TAGS = new Set([
    'script', 'style', 'noscript', 'iframe', 'template',
    'form', 'button', 'input', 'select', 'textarea',
]);
/** Tags that always receive a structural score in `calculateNavFooterScore`. */
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer']);
/** ARIA roles that receive a structural score in `calculateNavFooterScore`. */
const NAVIGATION_ROLES = new Set([
    'navigation', 'banner', 'complementary', 'contentinfo', 'tree',
    'menubar', 'menu', 'dialog', 'alertdialog', 'search',
]);
/** ARIA roles that make `isInteractive` return true, exempting the element from structural/hidden scoring. */
const INTERACTIVE_CONTENT_ROLES = new Set([
    'tabpanel', 'tab', 'tablist', 'dialog', 'alertdialog', 'menu',
    'menuitem', 'option', 'listbox', 'combobox', 'tooltip', 'alert',
]);
67
/** Tokens that `getPromoMatchers` always places in the base promo matcher. */
const PROMO_TOKENS_ALWAYS = [
    'banner', 'promo', 'announcement', 'cta', 'advert', 'ads', 'sponsor', 'recommend',
    'breadcrumb', 'breadcrumbs', 'pagination', 'pager', 'taglist',
    'twitter-tweet', 'fb-post', 'instagram-media', 'social-embed',
];
/** Tokens used for the aggressive matcher, only when `aggressiveMode` is on. */
const PROMO_TOKENS_AGGRESSIVE = ['ad', 'related', 'comment'];
/** Extra base tokens contributed by each optional noise category, keyed by category name. */
const PROMO_TOKENS_BY_CATEGORY = {
    'cookie-banners': ['cookie', 'consent', 'popup', 'modal', 'overlay', 'toast'],
    newsletters: ['newsletter', 'subscribe'],
    'social-share': ['share', 'social'],
};
92
// Noise selector configurations: CSS selector lists joined into `baseSelector` by `getContext`.
const BASE_NOISE_SELECTORS = {
    navFooter: [
        'nav',
        'footer',
        'header[class*="site"]',
        'header[class*="nav"]',
        'header[class*="menu"]',
        '[role="banner"]',
        '[role="navigation"]',
        '[class*="breadcrumb"]',
    ].join(','),
    cookieBanners: '[role="dialog"]',
    hidden: [
        '[style*="display: none"]',
        '[style*="display:none"]',
        '[style*="visibility: hidden"]',
        '[style*="visibility:hidden"]',
        '[hidden]',
        '[aria-hidden="true"]',
    ].join(','),
};
/** Regex that can never match; returned by `buildTokenRegex` for an empty token set. */
const NO_MATCH_REGEX = /a^/i;
/** Last context built by `getContext`, reused while the config key is unchanged. */
let cachedContext;
/** JSON key of the configuration `cachedContext` was built from. */
let lastContextKey;
101
/**
 * Escape every regex metacharacter in `value` so it can be embedded in a
 * `RegExp` source and match literally.
 *
 * @param {string} value Text to escape.
 * @returns {string} `value` with each of `. * + ? ^ $ { } ( ) | [ ] \` preceded by a backslash.
 */
function escapeRegexLiteral(value) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    return value.replace(metaCharacters, (ch) => '\\' + ch);
}
104
/**
 * Build a case-insensitive regex that matches any of `tokens` when the token
 * is delimited on both sides by the string edge or a non-alphanumeric character.
 *
 * @param {Set<string>} tokens Literal tokens; each is regex-escaped before use.
 * @returns {RegExp} The combined matcher, or `NO_MATCH_REGEX` when `tokens` is empty.
 */
function buildTokenRegex(tokens) {
    if (tokens.size === 0) {
        return NO_MATCH_REGEX;
    }
    const alternatives = Array.from(tokens, escapeRegexLiteral).join('|');
    const source = `(?:^|[^a-z0-9])(?:${alternatives})(?:$|[^a-z0-9])`;
    return new RegExp(source, 'i');
}
109
+ function addTokens(target, tokens) {
110
+ for (const token of tokens)
111
+ target.add(token);
112
+ }
113
/**
 * Build the two promo matchers used by `calculatePromoScore`.
 *
 * The base matcher always contains `PROMO_TOKENS_ALWAYS`, plus the tokens of
 * each enabled category and the lower-cased, trimmed, non-empty
 * `currentConfig.extraTokens`. The aggressive matcher contains
 * `PROMO_TOKENS_AGGRESSIVE` only when `currentConfig.aggressiveMode` is set.
 *
 * @param currentConfig Noise-removal config (reads `aggressiveMode`, `extraTokens`).
 * @param flags Category flags (reads `cookieBanners`, `newsletters`, `socialShare`).
 * @returns {{ base: RegExp, aggressive: RegExp }}
 */
function getPromoMatchers(currentConfig, flags) {
    const base = new Set(PROMO_TOKENS_ALWAYS);
    const categorySwitches = [
        ['cookie-banners', flags.cookieBanners],
        ['newsletters', flags.newsletters],
        ['social-share', flags.socialShare],
    ];
    for (const [category, isOn] of categorySwitches) {
        if (isOn) {
            addTokens(base, PROMO_TOKENS_BY_CATEGORY[category]);
        }
    }
    for (const rawToken of currentConfig.extraTokens) {
        const normalized = rawToken.toLowerCase().trim();
        if (normalized) {
            base.add(normalized);
        }
    }
    const aggressive = new Set();
    if (currentConfig.aggressiveMode) {
        addTokens(aggressive, PROMO_TOKENS_AGGRESSIVE);
    }
    return {
        base: buildTokenRegex(base),
        aggressive: buildTokenRegex(aggressive),
    };
}
138
/**
 * Return the noise-removal context derived from the current `config`.
 *
 * The context is memoized: a JSON key is computed from the locale and the
 * noise-removal settings, and the previously built context is returned while
 * that key is unchanged.
 *
 * @returns The context object: category `flags`, `structuralTags`, `weights`,
 *   `promoMatchers`, `promoEnabled`, trimmed `extraSelectors`, and the
 *   pre-joined `baseSelector` / `candidateSelector` strings.
 */
function getContext() {
    const noiseConfig = config.noiseRemoval;
    const { locale } = config.i18n;
    const contextKey = JSON.stringify({
        locale,
        enabledCategories: noiseConfig.enabledCategories,
        extraTokens: noiseConfig.extraTokens,
        extraSelectors: noiseConfig.extraSelectors,
        aggressiveMode: noiseConfig.aggressiveMode,
        preserveSvgCanvas: noiseConfig.preserveSvgCanvas,
        weights: noiseConfig.weights,
    });
    if (cachedContext !== undefined && lastContextKey === contextKey) {
        return cachedContext;
    }
    // Category names are compared lower-cased and trimmed (locale-aware when a locale is set).
    const normalizeCategory = (rawCategory) => {
        const lowered = rawCategory.toLowerCase().trim();
        return locale ? lowered.toLocaleLowerCase(locale) : lowered;
    };
    const enabled = new Set(noiseConfig.enabledCategories.map(normalizeCategory).filter(Boolean));
    const flags = {
        navFooter: enabled.has('nav-footer'),
        cookieBanners: enabled.has('cookie-banners'),
        newsletters: enabled.has('newsletters'),
        socialShare: enabled.has('social-share'),
    };
    const structuralTags = new Set(noiseConfig.preserveSvgCanvas
        ? BASE_STRUCTURAL_TAGS
        : [...BASE_STRUCTURAL_TAGS, 'svg', 'canvas']);
    const promoMatchers = getPromoMatchers(noiseConfig, flags);
    const extraSelectors = noiseConfig.extraSelectors
        .map((selector) => selector.trim())
        .filter((selector) => selector.length > 0);
    // Selector for unconditional removal: hidden elements, plus enabled category selectors.
    const baseSelector = [
        BASE_NOISE_SELECTORS.hidden,
        ...(flags.navFooter ? [BASE_NOISE_SELECTORS.navFooter] : []),
        ...(flags.cookieBanners ? [BASE_NOISE_SELECTORS.cookieBanners] : []),
    ].join(',');
    // Selector for elements that are scored individually by `isNoiseElement`.
    const candidateSelector = [...structuralTags, ...ALWAYS_NOISE_TAGS]
        .concat(['aside', 'header', '[class]', '[id]', '[role]', '[style]'])
        .join(',');
    cachedContext = {
        flags,
        structuralTags,
        weights: noiseConfig.weights,
        promoMatchers,
        promoEnabled: flags.cookieBanners || flags.newsletters || flags.socialShare,
        extraSelectors,
        baseSelector,
        candidateSelector,
    };
    lastContextKey = contextKey;
    return cachedContext;
}
204
/**
 * Decide whether an element looks like part of an interactive widget
 * (tabs, accordions, menus, ...), in which case callers skip the structural
 * and hidden noise scores for it.
 *
 * @param element Element to inspect.
 * @param {string | null} role The element's `role` attribute value, if any.
 * @returns {boolean} True when the role is in `INTERACTIVE_CONTENT_ROLES`, when a
 *   non-structural tag carries `data-state="inactive"|"closed"`, when
 *   `data-orientation` is horizontal/vertical, or when one of the
 *   accordion/collection item data attributes is present.
 */
function isInteractive(element, role) {
    if (role && INTERACTIVE_CONTENT_ROLES.has(role)) {
        return true;
    }
    const tagName = element.tagName.toLowerCase();
    const dataState = element.getAttribute('data-state');
    const isCollapsed = dataState === 'inactive' || dataState === 'closed';
    if (isCollapsed && !BASE_STRUCTURAL_TAGS.has(tagName)) {
        return true;
    }
    const orientation = element.getAttribute('data-orientation');
    if (orientation === 'horizontal' || orientation === 'vertical') {
        return true;
    }
    const itemMarkers = ['data-accordion-item', 'data-radix-collection-item'];
    return itemMarkers.some((attributeName) => element.hasAttribute(attributeName));
}
217
+ function isWithinPrimaryContent(element) {
218
+ let current = element;
219
+ while (current) {
220
+ const tagName = current.tagName.toLowerCase();
221
+ if (tagName === 'article' || tagName === 'main')
222
+ return true;
223
+ if (current.getAttribute('role') === 'main')
224
+ return true;
225
+ current = current.parentElement;
226
+ }
227
+ return false;
228
+ }
229
+ const ASIDE_NAV_LINK_DENSITY_THRESHOLD = 0.5;
230
+ const ASIDE_NAV_MIN_LINKS = 10;
231
+ function isNavigationAside(element) {
232
+ if (element.querySelector('nav'))
233
+ return true;
234
+ const links = element.querySelectorAll('a[href]');
235
+ if (links.length < ASIDE_NAV_MIN_LINKS)
236
+ return false;
237
+ const textLen = (element.textContent || '').trim().length;
238
+ if (textLen === 0)
239
+ return true;
240
+ return links.length / (textLen / 100) >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
241
+ }
242
/**
 * Decide whether an element that would otherwise be removed as noise should
 * be kept because it appears to carry real content.
 *
 * - Dialogs (`role="dialog"|"alertdialog"`): kept when inside primary content,
 *   when their text is longer than `DIALOG_MIN_CHARS_FOR_PRESERVATION`, or when
 *   they contain a heading.
 * - `<nav>` / `<footer>`: kept when they wrap an article/main/section/`role="main"`
 *   element, or when their trimmed text reaches
 *   `NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION`.
 * - `<aside>`: kept only inside primary content and when it is not a navigation aside.
 *
 * NOTE(review): the dialog length check uses the untrimmed text length while
 * the nav/footer check trims first — confirm whether that difference is intended.
 *
 * @param element Element being considered for removal.
 * @param {string} tagName Lower-cased tag name of `element`.
 * @returns {boolean} True when the element must not be removed.
 */
function shouldPreserve(element, tagName) {
    const role = element.getAttribute('role');
    if (role === 'dialog' || role === 'alertdialog') {
        return (isWithinPrimaryContent(element) ||
            (element.textContent || '').length > DIALOG_MIN_CHARS_FOR_PRESERVATION ||
            element.querySelector('h1,h2,h3,h4,h5,h6') !== null);
    }
    switch (tagName) {
        case 'nav':
        case 'footer': {
            if (element.querySelector('article,main,section,[role="main"]')) {
                return true;
            }
            const trimmedLength = (element.textContent || '').trim().length;
            return trimmedLength >= NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION;
        }
        case 'aside':
            // Preserve only asides that look like article content, not navigation.
            return isWithinPrimaryContent(element) && !isNavigationAside(element);
        default:
            return false;
    }
}
268
/**
 * Remove every node in `nodes` that is still attached and not protected by
 * `shouldPreserve`. The list is walked from the last entry to the first.
 *
 * @param nodes Array-like list of elements (e.g. a `querySelectorAll` result).
 */
function removeNodes(nodes) {
    let index = nodes.length;
    while (index-- > 0) {
        const candidate = nodes[index];
        if (!candidate?.parentNode) {
            continue;
        }
        if (shouldPreserve(candidate, candidate.tagName.toLowerCase())) {
            continue;
        }
        candidate.remove();
    }
}
276
/** Matches an inline style that hides the element (`display: none` or `visibility: hidden`); used by `isNoiseElement`. */
const HIDDEN_STYLE_REGEX = /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i;
277
/**
 * Compute the navigation/footer contribution to an element's noise score.
 * Each rule that applies adds `weights.structural` once:
 *
 * 1. the tag is in `ALWAYS_NOISE_TAGS`;
 * 2. the tag is `header` and it has a navigation role or its class/id matches
 *    `HEADER_NOISE_PATTERN`;
 * 3. the tag is `aside`;
 * 4. the role is in `NAVIGATION_ROLES`, except for `<aside role="complementary">`.
 *
 * @param {string} tagName Lower-cased tag name.
 * @param {string} className Value of the `class` attribute (may be empty).
 * @param {string} id Value of the `id` attribute (may be empty).
 * @param {string | null} role Value of the `role` attribute.
 * @param weights Scoring weights (reads `structural`).
 * @returns {number} The accumulated score.
 */
function calculateNavFooterScore(tagName, className, id, role, weights) {
    let score = 0;
    const addStructural = () => {
        score += weights.structural;
    };
    const hasNavigationRole = () => Boolean(role && NAVIGATION_ROLES.has(role));
    if (ALWAYS_NOISE_TAGS.has(tagName)) {
        addStructural();
    }
    if (tagName === 'header' &&
        (hasNavigationRole() || HEADER_NOISE_PATTERN.test(`${className} ${id}`))) {
        addStructural();
    }
    if (tagName === 'aside') {
        addStructural();
    }
    const isComplementaryAside = tagName === 'aside' && role === 'complementary';
    if (hasNavigationRole() && !isComplementaryAside) {
        addStructural();
    }
    return score;
}
297
/**
 * Compute the promo contribution to an element's noise score.
 *
 * When the class or id matches the aggressive matcher, the element scores only
 * if it is outside primary content, and the base matcher is not consulted.
 * Otherwise it scores when the class or id matches the base matcher.
 *
 * @param element Element being scored.
 * @param {string} className Value of the `class` attribute.
 * @param {string} id Value of the `id` attribute.
 * @param context Noise context (reads `promoEnabled`, `promoMatchers`, `weights.promo`).
 * @returns {number} `context.weights.promo` on a match, otherwise 0.
 */
function calculatePromoScore(element, className, id, context) {
    if (!context.promoEnabled) {
        return 0;
    }
    const { base, aggressive } = context.promoMatchers;
    const matchesClassOrId = (matcher) => matcher.test(className) || matcher.test(id);
    if (matchesClassOrId(aggressive)) {
        return isWithinPrimaryContent(element) ? 0 : context.weights.promo;
    }
    return matchesClassOrId(base) ? context.weights.promo : 0;
}
308
/**
 * Score a candidate element and report whether it reaches the noise threshold.
 *
 * Contributions, summed in this order: structural tag (skipped for interactive
 * elements), nav/footer score (when that category is enabled), hidden
 * (skipped for interactive elements), sticky/fixed class, and promo score.
 *
 * @param element Element to score.
 * @param context Noise context from `getContext`.
 * @returns {boolean} True when the total score is at least `weights.threshold`.
 */
function isNoiseElement(element, context) {
    const { weights, flags, structuralTags } = context;
    const tagName = element.tagName.toLowerCase();
    const className = element.getAttribute('class') ?? '';
    const id = element.getAttribute('id') ?? '';
    const role = element.getAttribute('role');
    const inlineStyle = element.getAttribute('style');
    const interactive = isInteractive(element, role);
    const hidden = element.hasAttribute('hidden') ||
        element.getAttribute('aria-hidden') === 'true' ||
        (inlineStyle !== null && HIDDEN_STYLE_REGEX.test(inlineStyle));
    const contributions = [
        structuralTags.has(tagName) && !interactive ? weights.structural : 0,
        flags.navFooter
            ? calculateNavFooterScore(tagName, className, id, role, weights)
            : 0,
        hidden && !interactive ? weights.hidden : 0,
        FIXED_OR_HIGH_Z_PATTERN.test(className) ? weights.stickyFixed : 0,
        calculatePromoScore(element, className, id, context),
    ];
    const total = contributions.reduce((sum, part) => sum + part, 0);
    return total >= weights.threshold;
}
340
/**
 * Tidy every attached heading (`h1`..`h6`) in the document, in place:
 *
 * 1. remove descendant `<div>`s whose class contains "absolute", whose style
 *    contains "position", or that have `tabindex="-1"`;
 * 2. remove descendant anchors that point at a `#` fragment and have no text
 *    once whitespace and zero-width spaces are ignored;
 * 3. strip zero-width spaces (U+200B) from the heading's text nodes.
 *
 * @param document Document to mutate.
 */
function cleanHeadings(document) {
    const isPositionedWrapper = (div) => {
        const classes = div.getAttribute('class') ?? '';
        const inlineStyle = div.getAttribute('style') ?? '';
        return (classes.includes('absolute') ||
            inlineStyle.includes('position') ||
            div.getAttribute('tabindex') === '-1');
    };
    const isEmptyHashLink = (anchor) => {
        const href = anchor.getAttribute('href') ?? '';
        const visibleText = (anchor.textContent || '').replace(/[\u200B\s]/g, '');
        return href.startsWith('#') && visibleText.length === 0;
    };
    // Remove matching nodes, visiting the list from last to first.
    const removeMatching = (list, shouldRemove) => {
        for (let index = list.length - 1; index >= 0; index--) {
            const item = list[index];
            if (item?.parentNode && shouldRemove(item)) {
                item.remove();
            }
        }
    };
    for (const heading of document.querySelectorAll('h1,h2,h3,h4,h5,h6')) {
        if (!heading.parentNode) {
            continue;
        }
        removeMatching(heading.querySelectorAll('div'), isPositionedWrapper);
        removeMatching(heading.querySelectorAll('a'), isEmptyHashLink);
        const textWalker = document.createTreeWalker(heading, NODE_FILTER_SHOW_TEXT);
        for (let textNode = textWalker.nextNode(); textNode; textNode = textWalker.nextNode()) {
            if (textNode.textContent?.includes('\u200B')) {
                textNode.textContent = textNode.textContent.replace(/\u200B/g, '');
            }
        }
    }
}
381
/**
 * Remove noise from the document in place.
 *
 * After cleaning headings, elements matching the base selector and any extra
 * selectors are removed (subject to `shouldPreserve`). The remaining candidates
 * are then visited from last to first and removed when they are not preserved
 * and `isNoiseElement` flags them.
 *
 * @param document Document to mutate.
 * @param context Noise context from `getContext`.
 * @param signal Optional abort signal, checked whenever the candidate index is
 *   a multiple of `ABORT_CHECK_INTERVAL`.
 * @throws {Error} 'Noise removal aborted' when the signal is aborted at a checkpoint.
 */
function stripNoise(document, context, signal) {
    cleanHeadings(document);
    // Unconditional (selector-driven) removal.
    removeNodes(document.querySelectorAll(context.baseSelector));
    if (context.extraSelectors.length > 0) {
        removeNodes(document.querySelectorAll(context.extraSelectors.join(',')));
    }
    // Score-driven removal of the remaining candidates.
    const candidates = document.querySelectorAll(context.candidateSelector);
    let index = candidates.length;
    while (index-- > 0) {
        const atCheckpoint = index % ABORT_CHECK_INTERVAL === 0;
        if (atCheckpoint && signal?.aborted) {
            throw new Error('Noise removal aborted');
        }
        const node = candidates[index];
        if (!node || !node.parentNode) {
            continue;
        }
        const keep = shouldPreserve(node, node.tagName.toLowerCase());
        if (!keep && isNoiseElement(node, context)) {
            node.remove();
        }
    }
}
407
/**
 * Rewrite one URL-bearing attribute of `el` to an absolute URL, in place.
 * Detached elements and empty/missing attributes are left alone; URLs that
 * fail to parse are kept as they are.
 *
 * @param el Element carrying the attribute.
 * @param {string} attr Attribute name (`href`, `src`, `srcset`, ...).
 * @param {URL} base Base URL used to resolve relative references.
 * @param {boolean} isSrcset When true the value is treated as a comma-separated
 *   `srcset` list and only the first whitespace-separated part of each entry is
 *   resolved; otherwise values starting with a `SKIP_URL_PREFIXES` prefix are skipped.
 */
function processUrlElement(el, attr, base, isSrcset) {
    if (!el.parentNode) {
        return;
    }
    const rawValue = el.getAttribute(attr);
    if (!rawValue) {
        return;
    }
    if (isSrcset) {
        const resolvedEntries = [];
        for (const entry of rawValue.split(',')) {
            const [candidateUrl, ...descriptors] = entry.trim().split(/\s+/);
            if (!candidateUrl) {
                // Blank entry: keep it verbatim.
                resolvedEntries.push(entry);
                continue;
            }
            let resolvedUrl = candidateUrl;
            try {
                resolvedUrl = new URL(candidateUrl, base).href;
            }
            catch {
                /* ignore */
            }
            resolvedEntries.push([resolvedUrl, ...descriptors].join(' '));
        }
        el.setAttribute(attr, resolvedEntries.join(', '));
        return;
    }
    const probe = rawValue.trim().toLowerCase();
    if (SKIP_URL_PREFIXES.some((prefix) => probe.startsWith(prefix))) {
        return;
    }
    try {
        el.setAttribute(attr, new URL(rawValue, base).href);
    }
    catch {
        /* ignore */
    }
}
443
/**
 * Resolve relative URLs in `a[href]`, `img[src]` and `source[srcset]` against
 * `baseUrlStr`, mutating the document. Does nothing when `baseUrlStr` is not a
 * valid absolute URL.
 *
 * @param document Document to mutate.
 * @param {string} baseUrlStr Base URL of the page.
 */
function resolveUrls(document, baseUrlStr) {
    let base;
    try {
        base = new URL(baseUrlStr);
    }
    catch {
        return;
    }
    // tag name -> [attribute to rewrite, whether it is a srcset list]
    const urlAttributeByTag = new Map([
        ['a', ['href', false]],
        ['img', ['src', false]],
        ['source', ['srcset', true]],
    ]);
    for (const el of document.querySelectorAll('a[href],img[src],source[srcset]')) {
        const spec = urlAttributeByTag.get(el.tagName.toLowerCase());
        if (spec) {
            processUrlElement(el, spec[0], base, spec[1]);
        }
    }
}
462
/**
 * Serialize a prepared document to HTML for the markdown converter.
 *
 * @param document Document to serialize.
 * @param {string} fallback Value returned when neither serialization is long enough.
 * @returns {string} The body's inner HTML when its trimmed length exceeds
 *   `MIN_BODY_CONTENT_LENGTH`; otherwise the root element's outer HTML under
 *   the same condition; otherwise `fallback`.
 */
export function serializeDocumentForMarkdown(document, fallback) {
    const isSubstantial = (markup) => markup.trim().length > MIN_BODY_CONTENT_LENGTH;
    const bodyHtml = document.body.innerHTML;
    if (isSubstantial(bodyHtml)) {
        return bodyHtml;
    }
    const outerHtml = document.documentElement.outerHTML;
    return isSubstantial(outerHtml) ? outerHtml : fallback;
}
471
/**
 * Report whether `html` contains a document-level tag (doctype, html, head or body).
 *
 * @param {string} html Raw HTML.
 * @returns {boolean}
 */
function isFullDocumentHtml(html) {
    const hasDocumentMarker = HTML_DOCUMENT_MARKERS.test(html);
    return hasDocumentMarker;
}
474
/**
 * Cheap pre-check: does the raw HTML match any of `NOISE_PATTERNS`?
 * Inputs longer than `NOISE_SCAN_LIMIT` are sampled — only the first and the
 * last `NOISE_SCAN_LIMIT` characters (joined by a newline) are scanned.
 *
 * @param {string} html Raw HTML.
 * @returns {boolean}
 */
function mayContainNoise(html) {
    let sample = html;
    if (html.length > NOISE_SCAN_LIMIT) {
        const head = html.substring(0, NOISE_SCAN_LIMIT);
        const tail = html.substring(html.length - NOISE_SCAN_LIMIT);
        sample = `${head}\n${tail}`;
    }
    for (const pattern of NOISE_PATTERNS) {
        if (pattern.test(sample)) {
            return true;
        }
    }
    return false;
}
480
+ function surfaceHiddenTabPanels(document) {
481
+ const panels = document.querySelectorAll('[data-slot="tabContent"], [role="tabpanel"]');
482
+ for (const panel of panels) {
483
+ const style = panel.getAttribute('style') ?? '';
484
+ if (/display\s*:\s*none/i.test(style)) {
485
+ panel.setAttribute('style', style.replace(/display\s*:\s*none\s*;?/gi, '').trim());
486
+ }
487
+ panel.removeAttribute('hidden');
488
+ }
489
+ }
490
/**
 * Surface hidden tab panels, then remove every `button[role="tab"]` trigger
 * from the document (visited from last to first).
 *
 * @param document Document to mutate.
 */
function stripTabTriggers(document) {
    surfaceHiddenTabPanels(document);
    const triggers = document.querySelectorAll('button[role="tab"]');
    let index = triggers.length;
    while (index-- > 0) {
        triggers[index]?.remove();
    }
}
497
/**
 * Prefix every `|` in the text nodes of table cells (`td`, `th`) with a
 * backslash, in place.
 *
 * @param document Document to mutate.
 */
function escapeTableCellPipes(document) {
    for (const cell of document.querySelectorAll('td, th')) {
        const textWalker = document.createTreeWalker(cell, NODE_FILTER_SHOW_TEXT);
        for (let textNode = textWalker.nextNode(); textNode; textNode = textWalker.nextNode()) {
            if (textNode.textContent?.includes('|')) {
                textNode.textContent = textNode.textContent.split('|').join('\\|');
            }
        }
    }
}
509
+ function separateAdjacentInlineElements(document) {
510
+ const badges = document.querySelectorAll('span.chakra-badge, [data-scope="badge"], [class*="badge"]');
511
+ for (const badge of badges) {
512
+ const next = badge.nextSibling;
513
+ if (next?.nodeType === 1) {
514
+ badge.after(document.createTextNode(' '));
515
+ }
516
+ }
517
+ }
518
/**
 * Prepare a parsed document for HTML-to-markdown conversion, mutating it in
 * place: surface hidden tab panels, strip noise, drop tab triggers, separate
 * adjacent badges, flatten and escape table cells, normalize table structure,
 * and (when `baseUrl` is given) resolve relative URLs.
 *
 * @param document Document to mutate.
 * @param {string} [baseUrl] Page URL used to resolve relative links; skipped when falsy.
 * @param {AbortSignal} [signal] Forwarded to `stripNoise` for cancellation.
 * @throws {Error} Propagates 'Noise removal aborted' from `stripNoise`.
 */
export function prepareDocumentForMarkdown(document, baseUrl, signal) {
    const context = getContext();
    if (config.noiseRemoval.debug) {
        const { flags } = context;
        // Report every enabled category, not just nav-footer, so the audit log
        // reflects the configuration that is actually applied.
        logDebug('Noise removal audit enabled', {
            categories: [
                ...(flags.navFooter ? ['nav-footer'] : []),
                ...(flags.cookieBanners ? ['cookie-banners'] : []),
                ...(flags.newsletters ? ['newsletters'] : []),
                ...(flags.socialShare ? ['social-share'] : []),
            ],
        });
    }
    // Un-hide tab panels BEFORE noise removal: stripNoise unconditionally removes
    // `[hidden]` / `display:none` elements, which would otherwise delete hidden
    // panels before stripTabTriggers gets the chance to surface them.
    // surfaceHiddenTabPanels is idempotent, so the second call made inside
    // stripTabTriggers is harmless.
    surfaceHiddenTabPanels(document);
    stripNoise(document, context, signal);
    stripTabTriggers(document);
    separateAdjacentInlineElements(document);
    flattenTableCellBreaks(document);
    escapeTableCellPipes(document);
    normalizeTableStructure(document);
    if (baseUrl) {
        resolveUrls(document, baseUrl);
    }
}
534
/**
 * Repair table markup that breaks markdown table conversion, in place.
 *
 * For every table: each `td` found under a `thead` is replaced by a `th` with
 * the same inner HTML and attributes; then any `tbody`/`thead`/`tfoot` nested
 * inside a cell (some sites do this) is moved to the end of the table.
 *
 * @param document Document to mutate.
 */
function normalizeTableStructure(document) {
    const sectionTags = ['tbody', 'thead', 'tfoot'];
    for (const table of document.querySelectorAll('table')) {
        // Promote header-row data cells to real header cells.
        for (const dataCell of table.querySelectorAll('thead td')) {
            const headerCell = document.createElement('th');
            headerCell.innerHTML = dataCell.innerHTML;
            for (const { name, value } of Array.from(dataCell.attributes)) {
                headerCell.setAttribute(name, value);
            }
            dataCell.replaceWith(headerCell);
        }
        // Hoist table sections out of cells, one at a time, until none remain.
        for (const cell of table.querySelectorAll('th, td')) {
            for (const sectionTag of sectionTags) {
                for (let stray = cell.querySelector(sectionTag); stray; stray = cell.querySelector(sectionTag)) {
                    table.appendChild(stray);
                }
            }
        }
    }
}
557
/**
 * Flatten the content of every table cell (`td`, `th`) onto a single line so
 * it can be emitted as one markdown table row. For each cell, in place:
 *
 * 1. every `<br>` is replaced by a space;
 * 2. block-level descendants (`div`, `p`, `ul`, `ol`, `li`, `h1`..`h6`) are
 *    replaced by `<span>`s that keep the original children and attributes and
 *    are padded with a space on each side;
 * 3. newlines in all descendant text nodes are replaced by spaces.
 *
 * `ol` is flattened alongside `ul`: previously only unordered lists were
 * converted, so an ordered list in a cell stayed a block element.
 *
 * @param document Document to mutate.
 */
function flattenTableCellBreaks(document) {
    const TEXT_NODE = 3;
    const blockSelector = 'div, p, ul, ol, li, h1, h2, h3, h4, h5, h6';
    // Recursively replace CR/LF in text nodes with spaces.
    const collapseNewlines = (node) => {
        if (node.nodeType === TEXT_NODE && node.nodeValue) {
            node.nodeValue = node.nodeValue.replace(/\r?\n/g, ' ');
            return;
        }
        for (const child of Array.from(node.childNodes)) {
            collapseNewlines(child);
        }
    };
    for (const cell of document.querySelectorAll('td, th')) {
        for (const lineBreak of cell.querySelectorAll('br')) {
            lineBreak.replaceWith(' ');
        }
        // Snapshot the matches first: the loop replaces them while iterating.
        for (const block of Array.from(cell.querySelectorAll(blockSelector))) {
            if (!block.parentNode) {
                continue;
            }
            const inline = document.createElement('span');
            inline.appendChild(document.createTextNode(' '));
            while (block.firstChild) {
                inline.appendChild(block.firstChild);
            }
            inline.appendChild(document.createTextNode(' '));
            for (const { name, value } of Array.from(block.attributes)) {
                inline.setAttribute(name, value);
            }
            block.replaceWith(inline);
        }
        collapseNewlines(cell);
    }
}
592
/**
 * Remove noise from an HTML string and return HTML ready for markdown conversion.
 *
 * The input is returned unchanged when it contains no document-level tag, no
 * noise pattern and no block-level fragment tag. Otherwise the document
 * (`document` if supplied, else one parsed from `html`) is prepared and
 * serialized. Any error raised along the way results in the original `html`
 * being returned.
 *
 * NOTE(review): the catch below also swallows the 'Noise removal aborted'
 * error thrown by `stripNoise`, so an aborted signal yields the unprocessed
 * HTML rather than an exception — confirm that is the intended contract.
 *
 * @param {string} html Raw HTML.
 * @param document Optional pre-parsed document for `html`.
 * @param {string} [baseUrl] Page URL used to resolve relative links.
 * @param {AbortSignal} [signal] Cancellation signal forwarded to the noise pass.
 * @returns {string} Cleaned HTML, or `html` itself when nothing was done or on failure.
 */
export function removeNoiseFromHtml(html, document, baseUrl, signal) {
    if (!isFullDocumentHtml(html) &&
        !mayContainNoise(html) &&
        !HTML_FRAGMENT_MARKERS.test(html)) {
        return html;
    }
    let cleaned = html;
    try {
        const doc = document ?? parseHTML(html).document;
        prepareDocumentForMarkdown(doc, baseUrl, signal);
        cleaned = serializeDocumentForMarkdown(doc, html);
    }
    catch {
        // Best effort: fall back to the untouched input.
        cleaned = html;
    }
    return cleaned;
}
@@ -0,0 +1,13 @@
1
/**
 * Options accepted by `cleanupMarkdownArtifacts`.
 *
 * NOTE(review): the implementation is not part of this declaration file; the
 * field descriptions below are inferred from names and types only — confirm
 * against `src/lib/md-cleanup.ts`.
 */
interface CleanupOptions {
    /** Presumably allows the caller to cancel the cleanup — TODO confirm. */
    signal?: AbortSignal;
    /** Presumably the URL of the page the markdown was produced from — TODO confirm. */
    url?: string;
}
/**
 * Iterate over markdown content, splitting it into fenced (code) and
 * non-fenced segments. Fenced lines pass through unchanged; non-fenced
 * segments are joined and handed to `processTextSegment` for transformation.
 *
 * @param content Markdown text to process.
 * @param processTextSegment Transformation applied to each non-fenced segment.
 * @returns The markdown with non-fenced segments transformed.
 */
export declare function processFencedContent(content: string, processTextSegment: (text: string) => string): string;
/**
 * Clean up artifacts in markdown content.
 *
 * @param content Markdown text to clean.
 * @param options Optional settings; see `CleanupOptions`.
 * @returns The cleaned markdown.
 */
export declare function cleanupMarkdownArtifacts(content: string, options?: CleanupOptions): string;
12
+ export {};
13
+ //# sourceMappingURL=md-cleanup.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"md-cleanup.d.ts","sourceRoot":"","sources":["../../src/lib/md-cleanup.ts"],"names":[],"mappings":"AAmEA,UAAU,cAAc;IACtB,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AA6VD;;;;GAIG;AACH,wBAAgB,oBAAoB,CAClC,OAAO,EAAE,MAAM,EACf,kBAAkB,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,GAC3C,MAAM,CAuCR;AAWD,wBAAgB,wBAAwB,CACtC,OAAO,EAAE,MAAM,EACf,OAAO,CAAC,EAAE,cAAc,GACvB,MAAM,CAUR"}