@j0hanz/fetch-url-mcp 1.9.0 → 1.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/dist/http/auth.d.ts +0 -1
  2. package/dist/http/auth.d.ts.map +1 -1
  3. package/dist/http/auth.js +1 -13
  4. package/dist/http/native.d.ts.map +1 -1
  5. package/dist/http/native.js +2 -5
  6. package/dist/lib/content.d.ts.map +1 -1
  7. package/dist/lib/content.js +378 -346
  8. package/dist/lib/core.d.ts +78 -71
  9. package/dist/lib/core.d.ts.map +1 -1
  10. package/dist/lib/core.js +308 -372
  11. package/dist/lib/fetch-pipeline.d.ts +2 -6
  12. package/dist/lib/fetch-pipeline.d.ts.map +1 -1
  13. package/dist/lib/fetch-pipeline.js +51 -137
  14. package/dist/lib/http.d.ts.map +1 -1
  15. package/dist/lib/http.js +188 -130
  16. package/dist/lib/mcp-tools.d.ts +3 -5
  17. package/dist/lib/mcp-tools.d.ts.map +1 -1
  18. package/dist/lib/mcp-tools.js +22 -58
  19. package/dist/lib/task-handlers.js +4 -4
  20. package/dist/lib/utils.d.ts +6 -0
  21. package/dist/lib/utils.d.ts.map +1 -1
  22. package/dist/lib/utils.js +23 -0
  23. package/dist/resources/index.js +1 -1
  24. package/dist/schemas.d.ts +0 -1
  25. package/dist/schemas.d.ts.map +1 -1
  26. package/dist/schemas.js +4 -6
  27. package/dist/server.js +1 -1
  28. package/dist/tasks/owner.d.ts +1 -1
  29. package/dist/tasks/owner.d.ts.map +1 -1
  30. package/dist/tasks/tool-registry.d.ts +1 -1
  31. package/dist/tasks/tool-registry.d.ts.map +1 -1
  32. package/dist/tools/fetch-url.d.ts +2 -3
  33. package/dist/tools/fetch-url.d.ts.map +1 -1
  34. package/dist/tools/fetch-url.js +89 -152
  35. package/dist/transform/html-translators.d.ts.map +1 -1
  36. package/dist/transform/html-translators.js +1 -23
  37. package/dist/transform/metadata.d.ts +1 -0
  38. package/dist/transform/metadata.d.ts.map +1 -1
  39. package/dist/transform/metadata.js +25 -0
  40. package/dist/transform/transform.d.ts +8 -0
  41. package/dist/transform/transform.d.ts.map +1 -1
  42. package/dist/transform/transform.js +190 -109
  43. package/dist/transform/worker-pool.d.ts +3 -6
  44. package/dist/transform/worker-pool.d.ts.map +1 -1
  45. package/dist/transform/worker-pool.js +148 -118
  46. package/package.json +3 -2
@@ -2,11 +2,38 @@ import { parseHTML } from 'linkedom';
2
2
  import {} from '../transform/types.js';
3
3
  import { config, logDebug } from './core.js';
4
4
  import { throwIfAborted } from './utils.js';
5
+ // ASCII char codes used in hot-path charCodeAt comparisons
6
+ const ASCII_SPACE = 32;
7
+ const ASCII_TAB = 9;
8
+ const ASCII_EXCLAMATION = 33;
9
+ const ASCII_HASH = 35;
10
+ const ASCII_ASTERISK = 42;
11
+ const ASCII_PLUS = 43;
12
+ const ASCII_DASH = 45;
13
+ const ASCII_PERIOD = 46;
14
+ const ASCII_DIGIT_0 = 48;
15
+ const ASCII_DIGIT_9 = 57;
16
+ const ASCII_LT = 60;
17
+ const ASCII_QUESTION = 63;
18
+ const ASCII_UPPER_A = 65;
19
+ const ASCII_UPPER_Z = 90;
20
+ const ASCII_BRACKET_OPEN = 91;
21
+ const ASCII_LOWER_A = 97;
22
+ const ASCII_LOWER_Z = 122;
23
+ const ASCII_UNDERSCORE = 95;
24
+ const HTML_TAG_DENSITY_LIMIT = 5;
25
+ const TITLE_MIN_WORDS = 2;
26
+ const TITLE_MAX_WORDS = 6;
27
+ const TITLE_MIN_CAPITALIZED = 2;
28
+ const PROPERTY_FIX_MAX_PASSES = 3;
29
+ const BODY_SCAN_LIMIT = 5000;
30
+ const HAS_FOLLOWING_LOOKAHEAD = 50;
5
31
  const NOISE_SCAN_LIMIT = 50_000;
6
32
  const MIN_BODY_CONTENT_LENGTH = 100;
7
33
  const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
8
34
  const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
9
35
  const ABORT_CHECK_INTERVAL = 500;
36
+ const NODE_FILTER_SHOW_TEXT = 4;
10
37
  const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
11
38
  const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
12
39
  const NOISE_PATTERNS = [
@@ -89,6 +116,7 @@ const PROMO_TOKENS_BY_CATEGORY = {
89
116
  newsletters: ['newsletter', 'subscribe'],
90
117
  'social-share': ['share', 'social'],
91
118
  };
119
+ // Noise selector configurations
92
120
  const BASE_NOISE_SELECTORS = {
93
121
  navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
94
122
  cookieBanners: '[role="dialog"]',
@@ -96,7 +124,7 @@ const BASE_NOISE_SELECTORS = {
96
124
  };
97
125
  const NO_MATCH_REGEX = /a^/i;
98
126
  let cachedContext;
99
- let lastConfigRef;
127
+ let lastContextKey;
100
128
  function escapeRegexLiteral(value) {
101
129
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
102
130
  }
@@ -136,7 +164,16 @@ function getPromoMatchers(currentConfig, flags) {
136
164
  }
137
165
  function getContext() {
138
166
  const currentConfig = config.noiseRemoval;
139
- if (cachedContext !== undefined && lastConfigRef === currentConfig)
167
+ const contextKey = JSON.stringify({
168
+ locale: config.i18n.locale,
169
+ enabledCategories: currentConfig.enabledCategories,
170
+ extraTokens: currentConfig.extraTokens,
171
+ extraSelectors: currentConfig.extraSelectors,
172
+ aggressiveMode: currentConfig.aggressiveMode,
173
+ preserveSvgCanvas: currentConfig.preserveSvgCanvas,
174
+ weights: currentConfig.weights,
175
+ });
176
+ if (cachedContext !== undefined && lastContextKey === contextKey)
140
177
  return cachedContext;
141
178
  const enabled = new Set(currentConfig.enabledCategories
142
179
  .map((c) => {
@@ -188,14 +225,15 @@ function getContext() {
188
225
  baseSelector,
189
226
  candidateSelector,
190
227
  };
191
- lastConfigRef = currentConfig;
228
+ lastContextKey = contextKey;
192
229
  return cachedContext;
193
230
  }
194
231
  function isInteractive(element, role) {
195
232
  if (role && INTERACTIVE_CONTENT_ROLES.has(role))
196
233
  return true;
234
+ const tag = element.tagName.toLowerCase();
197
235
  const ds = element.getAttribute('data-state');
198
- if (ds === 'inactive' || ds === 'closed')
236
+ if ((ds === 'inactive' || ds === 'closed') && !BASE_STRUCTURAL_TAGS.has(tag))
199
237
  return true;
200
238
  const dataOrientation = element.getAttribute('data-orientation');
201
239
  if (dataOrientation === 'horizontal' || dataOrientation === 'vertical')
@@ -215,6 +253,19 @@ function isWithinPrimaryContent(element) {
215
253
  }
216
254
  return false;
217
255
  }
256
+ const ASIDE_NAV_LINK_DENSITY_THRESHOLD = 0.5;
257
+ const ASIDE_NAV_MIN_LINKS = 10;
258
+ function isNavigationAside(element) {
259
+ if (element.querySelector('nav'))
260
+ return true;
261
+ const links = element.querySelectorAll('a[href]');
262
+ if (links.length < ASIDE_NAV_MIN_LINKS)
263
+ return false;
264
+ const textLen = (element.textContent || '').trim().length;
265
+ if (textLen === 0)
266
+ return true;
267
+ return links.length / (textLen / 100) >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
268
+ }
218
269
  function shouldPreserve(element, tagName) {
219
270
  // Check Dialog
220
271
  const role = element.getAttribute('role');
@@ -233,6 +284,12 @@ function shouldPreserve(element, tagName) {
233
284
  return ((element.textContent || '').trim().length >=
234
285
  NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION);
235
286
  }
287
+ // Check Aside — preserve only if it looks like article content, not navigation
288
+ if (tagName === 'aside') {
289
+ if (!isWithinPrimaryContent(element))
290
+ return false;
291
+ return !isNavigationAside(element);
292
+ }
236
293
  return false;
237
294
  }
238
295
  function removeNodes(nodes) {
@@ -243,142 +300,120 @@ function removeNodes(nodes) {
243
300
  }
244
301
  }
245
302
  }
246
- function scoreNavFooter(meta, weights) {
303
+ const HIDDEN_STYLE_REGEX = /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i;
304
+ function calculateNavFooterScore(tagName, className, id, role, weights) {
247
305
  let score = 0;
248
- if (ALWAYS_NOISE_TAGS.has(meta.tagName))
306
+ if (ALWAYS_NOISE_TAGS.has(tagName))
249
307
  score += weights.structural;
250
- // Header Boilerplate
251
- if (meta.tagName === 'header') {
252
- if ((meta.role && NAVIGATION_ROLES.has(meta.role)) ||
253
- HEADER_NOISE_PATTERN.test(`${meta.className} ${meta.id}`)) {
308
+ if (tagName === 'header') {
309
+ if ((role && NAVIGATION_ROLES.has(role)) ||
310
+ HEADER_NOISE_PATTERN.test(`${className} ${id}`)) {
254
311
  score += weights.structural;
255
312
  }
256
313
  }
257
- // Role Noise
258
- if (meta.role && NAVIGATION_ROLES.has(meta.role)) {
259
- if (meta.tagName !== 'aside' || meta.role !== 'complementary') {
314
+ if (tagName === 'aside') {
315
+ score += weights.structural;
316
+ }
317
+ if (role && NAVIGATION_ROLES.has(role)) {
318
+ if (tagName !== 'aside' || role !== 'complementary') {
260
319
  score += weights.structural;
261
320
  }
262
321
  }
263
322
  return score;
264
323
  }
265
- function extractElementMetadata(element) {
324
+ function calculatePromoScore(element, className, id, context) {
325
+ if (!context.promoEnabled)
326
+ return 0;
327
+ const aggTest = context.promoMatchers.aggressive.test(className) ||
328
+ context.promoMatchers.aggressive.test(id);
329
+ const isAggressiveMatch = aggTest && !isWithinPrimaryContent(element);
330
+ const isBaseMatch = !aggTest &&
331
+ (context.promoMatchers.base.test(className) ||
332
+ context.promoMatchers.base.test(id));
333
+ return isAggressiveMatch || isBaseMatch ? context.weights.promo : 0;
334
+ }
335
+ function isNoiseElement(element, context) {
266
336
  const tagName = element.tagName.toLowerCase();
267
337
  const className = element.getAttribute('class') ?? '';
268
338
  const id = element.getAttribute('id') ?? '';
269
339
  const role = element.getAttribute('role');
270
340
  const style = element.getAttribute('style');
271
- const _isInteractive = isInteractive(element, role);
272
- const isHidden = element.hasAttribute('hidden') ||
341
+ const elIsInteractive = isInteractive(element, role);
342
+ const elIsHidden = element.hasAttribute('hidden') ||
273
343
  element.getAttribute('aria-hidden') === 'true' ||
274
- (style !== null &&
275
- /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i.test(style));
276
- return {
277
- tagName,
278
- className,
279
- id,
280
- role,
281
- style,
282
- isInteractive: _isInteractive,
283
- isHidden,
284
- };
285
- }
286
- function isNoiseElement(element, context) {
287
- const meta = extractElementMetadata(element);
344
+ (style !== null && HIDDEN_STYLE_REGEX.test(style));
288
345
  let score = 0;
289
346
  const { weights } = context;
290
347
  // Structural
291
- if (context.structuralTags.has(meta.tagName) && !meta.isInteractive) {
348
+ if (context.structuralTags.has(tagName) && !elIsInteractive) {
292
349
  score += weights.structural;
293
350
  }
294
351
  // Nav/Footer Scoring
295
352
  if (context.flags.navFooter) {
296
- score += scoreNavFooter(meta, weights);
353
+ score += calculateNavFooterScore(tagName, className, id, role, weights);
297
354
  }
298
355
  // Hidden
299
- if (meta.isHidden && !meta.isInteractive) {
356
+ if (elIsHidden && !elIsInteractive) {
300
357
  score += weights.hidden;
301
358
  }
302
359
  // Sticky/Fixed
303
- if (FIXED_OR_HIGH_Z_PATTERN.test(meta.className)) {
360
+ if (FIXED_OR_HIGH_Z_PATTERN.test(className)) {
304
361
  score += weights.stickyFixed;
305
362
  }
306
363
  // Promo
307
- if (context.promoEnabled) {
308
- const aggTest = context.promoMatchers.aggressive.test(meta.className) ||
309
- context.promoMatchers.aggressive.test(meta.id);
310
- const isAggressiveMatch = aggTest && !isWithinPrimaryContent(element);
311
- const isBaseMatch = !aggTest &&
312
- (context.promoMatchers.base.test(meta.className) ||
313
- context.promoMatchers.base.test(meta.id));
314
- if (isAggressiveMatch || isBaseMatch) {
315
- score += weights.promo;
316
- }
317
- }
364
+ score += calculatePromoScore(element, className, id, context);
318
365
  return score >= weights.threshold;
319
366
  }
320
- function cleanHeadingWrapperDivs(h) {
321
- const divs = h.querySelectorAll('div');
322
- for (let j = divs.length - 1; j >= 0; j--) {
323
- const d = divs[j];
324
- if (!d?.parentNode)
325
- continue;
326
- const cls = d.getAttribute('class') ?? '';
327
- const stl = d.getAttribute('style') ?? '';
328
- if (cls.includes('absolute') ||
329
- stl.includes('position') ||
330
- d.getAttribute('tabindex') === '-1') {
331
- d.remove();
332
- }
333
- }
334
- }
335
- function cleanHeadingAnchors(h) {
336
- const anchors = h.querySelectorAll('a');
337
- for (let j = anchors.length - 1; j >= 0; j--) {
338
- const a = anchors[j];
339
- if (!a?.parentNode)
340
- continue;
341
- const href = a.getAttribute('href') ?? '';
342
- const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
343
- if (href.startsWith('#') && txt.length === 0) {
344
- a.remove();
345
- }
346
- }
347
- }
348
- function cleanHeadingZeroWidth(h, document) {
349
- const walker = document.createTreeWalker(h, 4); // SHOW_TEXT
350
- let node;
351
- while ((node = walker.nextNode())) {
352
- if (node.textContent?.includes('\u200B')) {
353
- node.textContent = node.textContent.replace(/\u200B/g, '');
354
- }
355
- }
356
- }
357
367
  function cleanHeadings(document) {
358
- // Clean Heading Anchors
359
368
  const headings = document.querySelectorAll('h1,h2,h3,h4,h5,h6');
360
369
  for (const h of headings) {
361
370
  if (!h.parentNode)
362
371
  continue;
363
- cleanHeadingWrapperDivs(h);
364
- cleanHeadingAnchors(h);
365
- cleanHeadingZeroWidth(h, document);
372
+ // Remove absolute/positioned wrapper divs
373
+ const divs = h.querySelectorAll('div');
374
+ for (let j = divs.length - 1; j >= 0; j--) {
375
+ const d = divs[j];
376
+ if (!d?.parentNode)
377
+ continue;
378
+ const cls = d.getAttribute('class') ?? '';
379
+ const stl = d.getAttribute('style') ?? '';
380
+ if (cls.includes('absolute') ||
381
+ stl.includes('position') ||
382
+ d.getAttribute('tabindex') === '-1') {
383
+ d.remove();
384
+ }
385
+ }
386
+ // Remove empty hash-link anchors
387
+ const anchors = h.querySelectorAll('a');
388
+ for (let j = anchors.length - 1; j >= 0; j--) {
389
+ const a = anchors[j];
390
+ if (!a?.parentNode)
391
+ continue;
392
+ const href = a.getAttribute('href') ?? '';
393
+ const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
394
+ if (href.startsWith('#') && txt.length === 0) {
395
+ a.remove();
396
+ }
397
+ }
398
+ // Strip zero-width spaces from text nodes
399
+ const walker = document.createTreeWalker(h, NODE_FILTER_SHOW_TEXT);
400
+ let node;
401
+ while ((node = walker.nextNode())) {
402
+ if (node.textContent?.includes('\u200B')) {
403
+ node.textContent = node.textContent.replace(/\u200B/g, '');
404
+ }
405
+ }
366
406
  }
367
407
  }
368
408
  function stripNoise(document, context, signal) {
369
409
  cleanHeadings(document);
370
- // Remove Base & Extra
410
+ // Structural Removal
371
411
  const { baseSelector, extraSelectors } = context;
372
- // Base
373
- const baseNodes = document.querySelectorAll(baseSelector);
374
- removeNodes(baseNodes);
375
- // Extra
412
+ removeNodes(document.querySelectorAll(baseSelector));
376
413
  if (extraSelectors.length > 0) {
377
- const combinedExtra = extraSelectors.join(',');
378
- const extraNodes = document.querySelectorAll(combinedExtra);
379
- removeNodes(extraNodes);
414
+ removeNodes(document.querySelectorAll(extraSelectors.join(',')));
380
415
  }
381
- // Candidates
416
+ // Candidates (conditional removal)
382
417
  const candidates = document.querySelectorAll(context.candidateSelector);
383
418
  for (let i = candidates.length - 1; i >= 0; i--) {
384
419
  if (i % ABORT_CHECK_INTERVAL === 0 && signal?.aborted) {
@@ -469,6 +504,29 @@ function mayContainNoise(html) {
469
504
  : `${html.substring(0, NOISE_SCAN_LIMIT)}\n${html.substring(html.length - NOISE_SCAN_LIMIT)}`;
470
505
  return NOISE_PATTERNS.some((re) => re.test(sample));
471
506
  }
507
+ function stripTabTriggers(document) {
508
+ const tabs = document.querySelectorAll('button[role="tab"]');
509
+ for (let i = tabs.length - 1; i >= 0; i--) {
510
+ tabs[i]?.remove();
511
+ }
512
+ }
513
+ function escapeTableCellPipes(document) {
514
+ const codes = document.querySelectorAll('td code, th code');
515
+ for (const code of codes) {
516
+ if (code.textContent.includes('|')) {
517
+ code.textContent = code.textContent.replace(/\|/g, '\\|');
518
+ }
519
+ }
520
+ }
521
+ function separateAdjacentInlineElements(document) {
522
+ const badges = document.querySelectorAll('span.chakra-badge, [data-scope="badge"], [class*="badge"]');
523
+ for (const badge of badges) {
524
+ const next = badge.nextSibling;
525
+ if (next?.nodeType === 1) {
526
+ badge.after(document.createTextNode(' '));
527
+ }
528
+ }
529
+ }
472
530
  export function prepareDocumentForMarkdown(document, baseUrl, signal) {
473
531
  const context = getContext();
474
532
  if (config.noiseRemoval.debug) {
@@ -477,9 +535,37 @@ export function prepareDocumentForMarkdown(document, baseUrl, signal) {
477
535
  });
478
536
  }
479
537
  stripNoise(document, context, signal);
538
+ stripTabTriggers(document);
539
+ separateAdjacentInlineElements(document);
540
+ flattenTableCellBreaks(document);
541
+ escapeTableCellPipes(document);
542
+ normalizeTableStructure(document);
480
543
  if (baseUrl)
481
544
  resolveUrls(document, baseUrl);
482
545
  }
546
+ // Some sites put tbody/thead/tfoot inside td/th, which breaks markdown tables.
547
+ function normalizeTableStructure(document) {
548
+ for (const table of document.querySelectorAll('table')) {
549
+ for (const cell of table.querySelectorAll('th, td')) {
550
+ for (const tag of ['tbody', 'thead', 'tfoot']) {
551
+ let nested = cell.querySelector(tag);
552
+ while (nested) {
553
+ table.appendChild(nested);
554
+ nested = cell.querySelector(tag);
555
+ }
556
+ }
557
+ }
558
+ }
559
+ }
560
+ function flattenTableCellBreaks(document) {
561
+ const cells = document.querySelectorAll('td, th');
562
+ for (const cell of cells) {
563
+ const brs = cell.querySelectorAll('br');
564
+ for (const br of brs) {
565
+ br.replaceWith(' ');
566
+ }
567
+ }
568
+ }
483
569
  export function removeNoiseFromHtml(html, document, baseUrl, signal) {
484
570
  const shouldParse = isFullDocumentHtml(html) ||
485
571
  mayContainNoise(html) ||
@@ -495,6 +581,8 @@ export function removeNoiseFromHtml(html, document, baseUrl, signal) {
495
581
  return html;
496
582
  }
497
583
  }
584
+ // endregion
585
+ // region Language Detection
498
586
  class DetectionContext {
499
587
  code;
500
588
  _lower;
@@ -574,10 +662,10 @@ const CSS_PROPERTY_REGEX = /^\s*[a-z][\w-]*\s*:/;
574
662
  function containsJsxTag(code) {
575
663
  const len = code.length;
576
664
  for (let i = 0; i < len - 1; i++) {
577
- if (code.charCodeAt(i) === 60 /* < */) {
665
+ if (code.charCodeAt(i) === ASCII_LT) {
578
666
  const next = code.charCodeAt(i + 1);
579
- if (next >= 65 && next <= 90)
580
- return true; // A-Z
667
+ if (next >= ASCII_UPPER_A && next <= ASCII_UPPER_Z)
668
+ return true;
581
669
  }
582
670
  }
583
671
  return false;
@@ -636,140 +724,100 @@ function detectYamlStructure(lines) {
636
724
  if (colonIdx <= 0)
637
725
  continue;
638
726
  const after = trimmed.charCodeAt(colonIdx + 1);
639
- // space (32) or tab (9)
640
- if (after === 32 || after === 9)
727
+ if (after === ASCII_SPACE || after === ASCII_TAB)
728
+ return true;
729
+ }
730
+ return false;
731
+ }
732
+ function matchRust(ctx) {
733
+ if (ctx.lower.includes('let mut'))
734
+ return true;
735
+ if (RUST_REGEX.test(ctx.lower))
736
+ return true;
737
+ return ctx.lower.includes('use ') && ctx.lower.includes('::');
738
+ }
739
+ function matchGo(ctx) {
740
+ if (ctx.lower.includes('import "'))
741
+ return true;
742
+ return /\b(?:package|func)\b/.test(ctx.lower);
743
+ }
744
+ function matchJsx(ctx) {
745
+ const l = ctx.lower;
746
+ if (l.includes('classname=') ||
747
+ l.includes('jsx:') ||
748
+ l.includes("from 'react'") ||
749
+ l.includes('from "react"')) {
750
+ return true;
751
+ }
752
+ return containsJsxTag(ctx.code);
753
+ }
754
+ function matchTypeScript(ctx) {
755
+ if (/\b(?:interface|type)\b/.test(ctx.lower))
756
+ return true;
757
+ const l = ctx.lower;
758
+ for (const hint of TYPESCRIPT_HINTS) {
759
+ if (l.includes(hint))
760
+ return true;
761
+ }
762
+ return false;
763
+ }
764
+ function matchSql(ctx) {
765
+ return /\b(?:select|insert|update|delete|create|alter|drop)\b/.test(ctx.lower);
766
+ }
767
+ function hasJsSignals(lowerCode) {
768
+ return (JS_SIGNAL_REGEX.test(lowerCode) ||
769
+ lowerCode.includes('{') ||
770
+ lowerCode.includes("from '"));
771
+ }
772
+ function matchPython(ctx) {
773
+ const l = ctx.lower;
774
+ if (l.includes('print(') || l.includes('__name__'))
775
+ return true;
776
+ if (l.includes('self.') || l.includes('elif '))
777
+ return true;
778
+ // Check for Python's None/True/False using original case (they are capitalized in Python)
779
+ if (ctx.code.includes('None') ||
780
+ ctx.code.includes('True') ||
781
+ ctx.code.includes('False')) {
782
+ return true;
783
+ }
784
+ if (PYTHON_UNIQUE_REGEX.test(l))
785
+ return true;
786
+ // Shared keywords (import, from, class) — only match if no JS signals present
787
+ if (/\b(?:import|from|class)\b/.test(l) && !hasJsSignals(l)) {
788
+ return true;
789
+ }
790
+ return false;
791
+ }
792
+ function matchHtml(ctx) {
793
+ const l = ctx.lower;
794
+ for (const tag of HTML_TAGS) {
795
+ if (l.includes(tag))
641
796
  return true;
642
797
  }
643
798
  return false;
644
799
  }
800
+ // Pre-sorted by weight descending — first match wins in detectLanguageFromCode
645
801
  const LANGUAGES = [
646
- {
647
- lang: 'rust',
648
- weight: 25,
649
- match: (ctx) => {
650
- if (ctx.lower.includes('let mut'))
651
- return true;
652
- if (RUST_REGEX.test(ctx.lower))
653
- return true;
654
- return ctx.lower.includes('use ') && ctx.lower.includes('::');
655
- },
656
- },
657
- {
658
- lang: 'go',
659
- weight: 22,
660
- match: (ctx) => {
661
- if (ctx.lower.includes('import "'))
662
- return true;
663
- return /\b(?:package|func)\b/.test(ctx.lower);
664
- },
665
- },
666
- {
667
- lang: 'jsx',
668
- weight: 22,
669
- match: (ctx) => {
670
- const l = ctx.lower;
671
- if (l.includes('classname=') ||
672
- l.includes('jsx:') ||
673
- l.includes("from 'react'") ||
674
- l.includes('from "react"')) {
675
- return true;
676
- }
677
- return containsJsxTag(ctx.code);
678
- },
679
- },
680
- {
681
- lang: 'typescript',
682
- weight: 20,
683
- match: (ctx) => {
684
- if (/\b(?:interface|type)\b/.test(ctx.lower))
685
- return true;
686
- const l = ctx.lower;
687
- for (const hint of TYPESCRIPT_HINTS) {
688
- if (l.includes(hint))
689
- return true;
690
- }
691
- return false;
692
- },
693
- },
694
- {
695
- lang: 'sql',
696
- weight: 20,
697
- match: (ctx) => {
698
- const l = ctx.lower;
699
- return /\b(?:select|insert|update|delete|create|alter|drop)\b/.test(l);
700
- },
701
- },
702
- {
703
- lang: 'python',
704
- weight: 18,
705
- match: (ctx) => {
706
- const l = ctx.lower;
707
- if (l.includes('print(') || l.includes('__name__'))
708
- return true;
709
- if (l.includes('self.') || l.includes('elif '))
710
- return true;
711
- // Check for Python's None/True/False using original case (they are capitalized in Python)
712
- if (ctx.code.includes('None') ||
713
- ctx.code.includes('True') ||
714
- ctx.code.includes('False')) {
715
- return true;
716
- }
717
- // Python-unique keywords that JS doesn't have
718
- if (PYTHON_UNIQUE_REGEX.test(l))
719
- return true;
720
- // Shared keywords (import, from, class) — only match if no JS signals present
721
- if (/\b(?:import|from|class)\b/.test(l) &&
722
- !JS_SIGNAL_REGEX.test(l) &&
723
- !l.includes('{') &&
724
- !l.includes("from '")) {
725
- return true;
726
- }
727
- return false;
728
- },
729
- },
802
+ { lang: 'rust', weight: 25, match: matchRust },
803
+ { lang: 'go', weight: 22, match: matchGo },
804
+ { lang: 'jsx', weight: 22, match: matchJsx },
805
+ { lang: 'typescript', weight: 20, match: matchTypeScript },
806
+ { lang: 'sql', weight: 20, match: matchSql },
807
+ { lang: 'python', weight: 18, match: matchPython },
730
808
  {
731
809
  lang: 'css',
732
810
  weight: 18,
733
- match: (ctx) => {
734
- if (CSS_REGEX.test(ctx.lower))
735
- return true;
736
- return detectCssStructure(ctx.lines);
737
- },
738
- },
739
- {
740
- lang: 'bash',
741
- weight: 15,
742
- match: (ctx) => detectBashIndicators(ctx.lines),
743
- },
744
- {
745
- lang: 'yaml',
746
- weight: 15,
747
- match: (ctx) => detectYamlStructure(ctx.lines),
748
- },
749
- {
750
- lang: 'javascript',
751
- weight: 15,
752
- match: (ctx) => JS_REGEX.test(ctx.lower),
753
- },
754
- {
755
- lang: 'html',
756
- weight: 12,
757
- match: (ctx) => {
758
- const l = ctx.lower;
759
- for (const tag of HTML_TAGS) {
760
- if (l.includes(tag))
761
- return true;
762
- }
763
- return false;
764
- },
811
+ match: (ctx) => CSS_REGEX.test(ctx.lower) || detectCssStructure(ctx.lines),
765
812
  },
813
+ { lang: 'bash', weight: 15, match: (ctx) => detectBashIndicators(ctx.lines) },
814
+ { lang: 'yaml', weight: 15, match: (ctx) => detectYamlStructure(ctx.lines) },
815
+ { lang: 'javascript', weight: 15, match: (ctx) => JS_REGEX.test(ctx.lower) },
816
+ { lang: 'html', weight: 12, match: matchHtml },
766
817
  {
767
818
  lang: 'json',
768
819
  weight: 10,
769
- match: (ctx) => {
770
- const s = ctx.trimmedStart;
771
- return s.startsWith('{') || s.startsWith('[');
772
- },
820
+ match: (ctx) => ctx.trimmedStart.startsWith('{') || ctx.trimmedStart.startsWith('['),
773
821
  },
774
822
  ];
775
823
  function extractLanguageFromClassName(className) {
@@ -805,11 +853,10 @@ function resolveLanguageFromDataAttribute(dataLang) {
805
853
  // Check if \w+
806
854
  for (let i = 0; i < trimmed.length; i++) {
807
855
  const c = trimmed.charCodeAt(i);
808
- // valid: A-Z, a-z, 0-9, _
809
- const isUpper = c >= 65 && c <= 90;
810
- const isLower = c >= 97 && c <= 122;
811
- const isDigit = c >= 48 && c <= 57;
812
- const isUnder = c === 95;
856
+ const isUpper = c >= ASCII_UPPER_A && c <= ASCII_UPPER_Z;
857
+ const isLower = c >= ASCII_LOWER_A && c <= ASCII_LOWER_Z;
858
+ const isDigit = c >= ASCII_DIGIT_0 && c <= ASCII_DIGIT_9;
859
+ const isUnder = c === ASCII_UNDERSCORE;
813
860
  if (!isUpper && !isLower && !isDigit && !isUnder) {
814
861
  return undefined;
815
862
  }
@@ -826,7 +873,7 @@ export function detectLanguageFromCode(code) {
826
873
  // Fast path for empty/whitespace only
827
874
  let empty = true;
828
875
  for (let i = 0; i < code.length; i++) {
829
- if (code.charCodeAt(i) > 32) {
876
+ if (code.charCodeAt(i) > ASCII_SPACE) {
830
877
  empty = false;
831
878
  break;
832
879
  }
@@ -834,20 +881,15 @@ export function detectLanguageFromCode(code) {
834
881
  if (empty)
835
882
  return undefined;
836
883
  const ctx = new DetectionContext(code);
837
- let bestLang;
838
- let bestScore = -1;
884
+ // LANGUAGES is pre-sorted by weight descending — first match is highest confidence
839
885
  for (const def of LANGUAGES) {
840
- if (def.match(ctx)) {
841
- if (def.weight > bestScore) {
842
- bestScore = def.weight;
843
- bestLang = def.lang;
844
- if (bestScore >= 25)
845
- break;
846
- }
847
- }
886
+ if (def.match(ctx))
887
+ return def.lang;
848
888
  }
849
- return bestLang;
889
+ return undefined;
850
890
  }
891
+ // endregion
892
+ // region Markdown Cleanup
851
893
  const MAX_LINE_LENGTH = 80;
852
894
  const REGEX = {
853
895
  HEADING_MARKER: /^#{1,6}\s/m,
@@ -900,7 +942,7 @@ function isBlank(line) {
900
942
  }
901
943
  function hasFollowingContent(lines, startIndex) {
902
944
  // Optimization: Bound lookahead to avoid checking too many lines in huge files
903
- for (let i = startIndex + 1; i < Math.min(lines.length, startIndex + 50); i++) {
945
+ for (let i = startIndex + 1; i < Math.min(lines.length, startIndex + HAS_FOLLOWING_LOOKAHEAD); i++) {
904
946
  if (!isBlank(lines[i]))
905
947
  return true;
906
948
  }
@@ -919,7 +961,7 @@ function isTitleCaseOrKeyword(trimmed) {
919
961
  // Split limited number of words
920
962
  const words = trimmed.split(/\s+/);
921
963
  const len = words.length;
922
- if (len < 2 || len > 6)
964
+ if (len < TITLE_MIN_WORDS || len > TITLE_MAX_WORDS)
923
965
  return false;
924
966
  let capitalizedCount = 0;
925
967
  for (let i = 0; i < len; i++) {
@@ -932,20 +974,19 @@ function isTitleCaseOrKeyword(trimmed) {
932
974
  else if (!/^(?:and|or|the|of|in|for|to|a)$/i.test(w))
933
975
  return false;
934
976
  }
935
- return capitalizedCount >= 2;
977
+ return capitalizedCount >= TITLE_MIN_CAPITALIZED;
936
978
  }
937
979
  function getHeadingPrefix(trimmed) {
938
980
  if (trimmed.length > MAX_LINE_LENGTH)
939
981
  return null;
940
982
  // Fast path: Check common markdown markers first
941
983
  const firstChar = trimmed.charCodeAt(0);
942
- // # (35), - (45), * (42), + (43), digit (48-57), [ (91)
943
- if (firstChar === 35 ||
944
- firstChar === 45 ||
945
- firstChar === 42 ||
946
- firstChar === 43 ||
947
- firstChar === 91 ||
948
- (firstChar >= 48 && firstChar <= 57)) {
984
+ if (firstChar === ASCII_HASH ||
985
+ firstChar === ASCII_DASH ||
986
+ firstChar === ASCII_ASTERISK ||
987
+ firstChar === ASCII_PLUS ||
988
+ firstChar === ASCII_BRACKET_OPEN ||
989
+ (firstChar >= ASCII_DIGIT_0 && firstChar <= ASCII_DIGIT_9)) {
949
990
  if (REGEX.HEADING_MARKER.test(trimmed) ||
950
991
  REGEX.LIST_MARKER.test(trimmed) ||
951
992
  /^\d+\.\s/.test(trimmed) ||
@@ -957,8 +998,9 @@ function getHeadingPrefix(trimmed) {
957
998
  return /^example:\s/i.test(trimmed) ? '### ' : '## ';
958
999
  }
959
1000
  const lastChar = trimmed.charCodeAt(trimmed.length - 1);
960
- // . (46), ! (33), ? (63)
961
- if (lastChar === 46 || lastChar === 33 || lastChar === 63)
1001
+ if (lastChar === ASCII_PERIOD ||
1002
+ lastChar === ASCII_EXCLAMATION ||
1003
+ lastChar === ASCII_QUESTION)
962
1004
  return null;
963
1005
  return isTitleCaseOrKeyword(trimmed) ? '## ' : null;
964
1006
  }
@@ -1073,48 +1115,63 @@ function processTextBuffer(lines, options) {
1073
1115
  const text = preprocessLines(lines, options);
1074
1116
  return applyGlobalRegexes(text, options);
1075
1117
  }
1076
- function applyGlobalRegexes(text, options) {
1118
+ function removeTypeDocArtifacts(text) {
1119
+ const filtered = text
1120
+ .split('\n')
1121
+ .filter((line) => !isTypeDocArtifactLine(line))
1122
+ .join('\n');
1123
+ return filtered.replace(REGEX.TYPEDOC_COMMENT, (match) => match.startsWith('`') ? match : '');
1124
+ }
1125
+ function removeSkipLinks(text) {
1126
+ return text
1127
+ .replace(REGEX.ZERO_WIDTH_ANCHOR, '')
1128
+ .replace(REGEX.COMBINED_LINE_REMOVALS, '');
1129
+ }
1130
+ function normalizeMarkdownSpacing(text) {
1131
+ let result = text
1132
+ .replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
1133
+ .replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
1134
+ .replace(REGEX.SPACING_CODE_DASH, '$1 - ')
1135
+ .replace(REGEX.SPACING_ESCAPES, '$1')
1136
+ .replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
1137
+ .replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
1138
+ .replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
1139
+ // Trim leading whitespace inside inline code spans
1140
+ result = result.replace(/(?<=\s|^)`\s+([^`]+)`/gm, '`$1`');
1141
+ // Unescape backticks inside markdown link text
1142
+ result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
1143
+ result = result.replace(/\[([^\]]*<[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/</g, '\\<').replace(/>/g, '\\>')}](${url})`);
1144
+ return normalizeNestedListIndentation(result);
1145
+ }
1146
+ function fixConcatenatedProperties(text) {
1077
1147
  let result = text;
1148
+ for (let k = 0; k < PROPERTY_FIX_MAX_PASSES; k++) {
1149
+ const next = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
1150
+ if (next === result)
1151
+ break;
1152
+ result = next;
1153
+ }
1154
+ return result;
1155
+ }
1156
+ function applyGlobalRegexes(text, options) {
1078
1157
  const checkAbort = createAbortChecker(options);
1158
+ let result = text.replace(/\u00A0/g, ' ');
1079
1159
  checkAbort('markdown:cleanup:headings');
1080
- // fixAndSpaceHeadings
1081
1160
  result = result
1082
1161
  .replace(REGEX.HEADING_SPACING, '$1\n\n$2')
1083
1162
  .replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```');
1084
1163
  if (config.markdownCleanup.removeTypeDocComments) {
1085
1164
  checkAbort('markdown:cleanup:typedoc');
1086
- result = result
1087
- .split('\n')
1088
- .filter((line) => !isTypeDocArtifactLine(line))
1089
- .join('\n');
1090
- result = result.replace(REGEX.TYPEDOC_COMMENT, (match) => match.startsWith('`') ? match : '');
1165
+ result = removeTypeDocArtifacts(result);
1091
1166
  }
1092
1167
  if (config.markdownCleanup.removeSkipLinks) {
1093
1168
  checkAbort('markdown:cleanup:skip-links');
1094
- result = result
1095
- .replace(REGEX.ZERO_WIDTH_ANCHOR, '')
1096
- .replace(REGEX.COMBINED_LINE_REMOVALS, '');
1169
+ result = removeSkipLinks(result);
1097
1170
  }
1098
1171
  checkAbort('markdown:cleanup:spacing');
1099
- // normalizeSpacing
1100
- result = result
1101
- .replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
1102
- .replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
1103
- .replace(REGEX.SPACING_CODE_DASH, '$1 - ')
1104
- .replace(REGEX.SPACING_ESCAPES, '$1')
1105
- .replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
1106
- .replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
1107
- .replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
1108
- result = normalizeNestedListIndentation(result);
1172
+ result = normalizeMarkdownSpacing(result);
1109
1173
  checkAbort('markdown:cleanup:properties');
1110
- // fixProperties
1111
- for (let k = 0; k < 3; k++) {
1112
- const next = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
1113
- if (next === result)
1114
- break;
1115
- result = next;
1116
- }
1117
- return result;
1174
+ return fixConcatenatedProperties(result);
1118
1175
  }
1119
1176
  function normalizeNestedListIndentation(text) {
1120
1177
  return text.replace(REGEX.NESTED_LIST_INDENT, (match, spaces, marker) => {
@@ -1130,27 +1187,17 @@ export function cleanupMarkdownArtifacts(content, options) {
1130
1187
  return '';
1131
1188
  const checkAbort = createAbortChecker(options);
1132
1189
  checkAbort('markdown:cleanup:begin');
1133
- const len = content.length;
1134
- let lastIndex = 0;
1190
+ const lines = content.split(/\r?\n/);
1135
1191
  let fenceMarker = null;
1136
1192
  const segments = [];
1137
1193
  let buffer = [];
1138
- while (lastIndex < len) {
1139
- let nextIndex = content.indexOf('\n', lastIndex);
1140
- let line;
1141
- if (nextIndex === -1) {
1142
- line = content.slice(lastIndex);
1143
- nextIndex = len;
1144
- }
1145
- else {
1146
- if (nextIndex > lastIndex && content.charCodeAt(nextIndex - 1) === 13) {
1147
- line = content.slice(lastIndex, nextIndex - 1);
1148
- }
1149
- else {
1150
- line = content.slice(lastIndex, nextIndex);
1151
- }
1152
- nextIndex++; // Skip \n
1194
+ const flushBuffer = () => {
1195
+ if (buffer.length > 0) {
1196
+ segments.push(processTextBuffer(buffer, options));
1197
+ buffer = [];
1153
1198
  }
1199
+ };
1200
+ for (const line of lines) {
1154
1201
  const trimmed = line.trimStart();
1155
1202
  if (fenceMarker) {
1156
1203
  segments.push(line);
@@ -1166,22 +1213,16 @@ export function cleanupMarkdownArtifacts(content, options) {
1166
1213
  buffer.push(line);
1167
1214
  }
1168
1215
  else {
1169
- if (buffer.length > 0) {
1170
- segments.push(processTextBuffer(buffer, options));
1171
- buffer = [];
1172
- }
1216
+ flushBuffer();
1173
1217
  segments.push(line);
1174
1218
  fenceMarker = newMarker;
1175
1219
  }
1176
1220
  }
1177
- lastIndex = nextIndex;
1178
- }
1179
- if (buffer.length > 0) {
1180
- segments.push(processTextBuffer(buffer, options));
1181
1221
  }
1222
+ flushBuffer();
1182
1223
  return segments.join('\n').trim();
1183
1224
  }
1184
- function detectFrontmatter(content) {
1225
+ function parseFrontmatter(content) {
1185
1226
  const len = content.length;
1186
1227
  if (len < 4)
1187
1228
  return null;
@@ -1201,57 +1242,43 @@ function detectFrontmatter(content) {
1201
1242
  const closeIndex = content.indexOf(fence, fenceLen);
1202
1243
  if (closeIndex === -1)
1203
1244
  return null;
1204
- return {
1245
+ const range = {
1205
1246
  start: 0,
1206
1247
  end: closeIndex + fenceLen,
1207
1248
  linesStart: fenceLen,
1208
1249
  linesEnd: closeIndex,
1209
1250
  lineEnding,
1210
1251
  };
1211
- }
1212
- function parseFrontmatterEntry(line) {
1213
- const trimmed = line.trim();
1214
- const idx = trimmed.indexOf(':');
1215
- if (!trimmed || idx <= 0)
1216
- return null;
1217
- return {
1218
- key: trimmed.slice(0, idx).trim().toLowerCase(),
1219
- value: trimmed.slice(idx + 1).trim(),
1220
- };
1221
- }
1222
- function stripFrontmatterQuotes(val) {
1223
- const first = val.charAt(0);
1224
- const last = val.charAt(val.length - 1);
1225
- if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
1226
- return val.slice(1, -1).trim();
1227
- }
1228
- return val;
1229
- }
1230
- function scanFrontmatterForTitle(content, fm) {
1231
- const fmBody = content.slice(fm.linesStart, fm.linesEnd);
1252
+ // Parse key-value entries in one pass
1253
+ const entries = new Map();
1254
+ const fmBody = content.slice(range.linesStart, range.linesEnd);
1232
1255
  let lastIdx = 0;
1233
1256
  while (lastIdx < fmBody.length) {
1234
- let nextIdx = fmBody.indexOf(fm.lineEnding, lastIdx);
1257
+ let nextIdx = fmBody.indexOf(lineEnding, lastIdx);
1235
1258
  if (nextIdx === -1)
1236
1259
  nextIdx = fmBody.length;
1237
- const line = fmBody.slice(lastIdx, nextIdx);
1238
- const entry = parseFrontmatterEntry(line);
1239
- if (entry) {
1240
- if (entry.key === 'title' || entry.key === 'name') {
1241
- const cleaned = stripFrontmatterQuotes(entry.value);
1242
- if (cleaned)
1243
- return cleaned;
1260
+ const line = fmBody.slice(lastIdx, nextIdx).trim();
1261
+ const colonIdx = line.indexOf(':');
1262
+ if (line && colonIdx > 0) {
1263
+ const key = line.slice(0, colonIdx).trim().toLowerCase();
1264
+ let value = line.slice(colonIdx + 1).trim();
1265
+ // Strip surrounding quotes
1266
+ const first = value.charAt(0);
1267
+ const last = value.charAt(value.length - 1);
1268
+ if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
1269
+ value = value.slice(1, -1).trim();
1244
1270
  }
1271
+ if (value)
1272
+ entries.set(key, value);
1245
1273
  }
1246
- lastIdx = nextIdx + fm.lineEnding.length;
1274
+ lastIdx = nextIdx + lineEnding.length;
1247
1275
  }
1248
- return undefined;
1276
+ return { range, entries };
1249
1277
  }
1250
1278
  function scanBodyForTitle(content) {
1251
1279
  const len = content.length;
1252
1280
  let scanIndex = 0;
1253
- const LIMIT = 5000;
1254
- const maxScan = Math.min(len, LIMIT);
1281
+ const maxScan = Math.min(len, BODY_SCAN_LIMIT);
1255
1282
  while (scanIndex < maxScan) {
1256
1283
  let nextIndex = content.indexOf('\n', scanIndex);
1257
1284
  if (nextIndex === -1)
@@ -1271,16 +1298,16 @@ function scanBodyForTitle(content) {
1271
1298
  return undefined;
1272
1299
  }
1273
1300
  export function extractTitleFromRawMarkdown(content) {
1274
- const fm = detectFrontmatter(content);
1301
+ const fm = parseFrontmatter(content);
1275
1302
  if (fm) {
1276
- const title = scanFrontmatterForTitle(content, fm);
1303
+ const title = fm.entries.get('title') ?? fm.entries.get('name');
1277
1304
  if (title)
1278
1305
  return title;
1279
1306
  }
1280
1307
  return scanBodyForTitle(content);
1281
1308
  }
1282
1309
  export function addSourceToMarkdown(content, url) {
1283
- const fm = detectFrontmatter(content);
1310
+ const fm = parseFrontmatter(content);
1284
1311
  const useMarkdownFormat = config.transform.metadataFormat === 'markdown';
1285
1312
  if (useMarkdownFormat && !fm) {
1286
1313
  if (REGEX.SOURCE_KEY.test(content))
@@ -1301,13 +1328,17 @@ export function addSourceToMarkdown(content, url) {
1301
1328
  const escapedUrl = url.replace(/"/g, '\\"');
1302
1329
  return `---${lineEnding}source: "${escapedUrl}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
1303
1330
  }
1304
- const fmBody = content.slice(fm.linesStart, fm.linesEnd);
1331
+ const fmBody = content.slice(fm.range.linesStart, fm.range.linesEnd);
1305
1332
  if (REGEX.SOURCE_KEY.test(fmBody))
1306
1333
  return content;
1307
1334
  const escapedUrl = url.replace(/"/g, '\\"');
1308
- const injection = `source: "${escapedUrl}"${fm.lineEnding}`;
1309
- return content.slice(0, fm.linesEnd) + injection + content.slice(fm.linesEnd);
1335
+ const injection = `source: "${escapedUrl}"${fm.range.lineEnding}`;
1336
+ return (content.slice(0, fm.range.linesEnd) +
1337
+ injection +
1338
+ content.slice(fm.range.linesEnd));
1310
1339
  }
1340
+ // endregion
1341
+ // region Content Detection & Metadata Footer
1311
1342
  function countCommonTags(content, limit) {
1312
1343
  if (limit <= 0)
1313
1344
  return 0;
@@ -1324,10 +1355,10 @@ export function isRawTextContent(content) {
1324
1355
  const trimmed = content.trim();
1325
1356
  if (REGEX.HTML_DOC_START.test(trimmed))
1326
1357
  return false;
1327
- if (detectFrontmatter(trimmed) !== null)
1358
+ if (parseFrontmatter(trimmed) !== null)
1328
1359
  return true;
1329
- const tagCount = countCommonTags(content, 5);
1330
- if (tagCount > 5)
1360
+ const tagCount = countCommonTags(content, HTML_TAG_DENSITY_LIMIT);
1361
+ if (tagCount > HTML_TAG_DENSITY_LIMIT)
1331
1362
  return false;
1332
1363
  return (REGEX.HEADING_MARKER.test(content) ||
1333
1364
  REGEX.LIST_MARKER.test(content) ||
@@ -1365,3 +1396,4 @@ export function buildMetadataFooter(metadata, fallbackUrl) {
1365
1396
  lines.push(` <sub>${metadata.description}</sub>`);
1366
1397
  return lines.join('\n');
1367
1398
  }
1399
+ // endregion