@j0hanz/fetch-url-mcp 1.9.1 → 1.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/dist/http/auth.d.ts +0 -1
  2. package/dist/http/auth.d.ts.map +1 -1
  3. package/dist/http/auth.js +1 -13
  4. package/dist/http/native.d.ts.map +1 -1
  5. package/dist/http/native.js +2 -5
  6. package/dist/lib/content.d.ts.map +1 -1
  7. package/dist/lib/content.js +301 -350
  8. package/dist/lib/core.d.ts +78 -71
  9. package/dist/lib/core.d.ts.map +1 -1
  10. package/dist/lib/core.js +308 -372
  11. package/dist/lib/fetch-pipeline.d.ts +2 -6
  12. package/dist/lib/fetch-pipeline.d.ts.map +1 -1
  13. package/dist/lib/fetch-pipeline.js +51 -137
  14. package/dist/lib/http.d.ts.map +1 -1
  15. package/dist/lib/http.js +188 -130
  16. package/dist/lib/mcp-tools.d.ts +3 -5
  17. package/dist/lib/mcp-tools.d.ts.map +1 -1
  18. package/dist/lib/mcp-tools.js +22 -58
  19. package/dist/lib/task-handlers.js +4 -4
  20. package/dist/lib/utils.d.ts +6 -0
  21. package/dist/lib/utils.d.ts.map +1 -1
  22. package/dist/lib/utils.js +23 -0
  23. package/dist/resources/index.js +1 -1
  24. package/dist/schemas.d.ts +0 -1
  25. package/dist/schemas.d.ts.map +1 -1
  26. package/dist/schemas.js +4 -6
  27. package/dist/server.js +1 -1
  28. package/dist/tasks/owner.d.ts +1 -1
  29. package/dist/tasks/owner.d.ts.map +1 -1
  30. package/dist/tasks/tool-registry.d.ts +1 -1
  31. package/dist/tasks/tool-registry.d.ts.map +1 -1
  32. package/dist/tools/fetch-url.d.ts +2 -3
  33. package/dist/tools/fetch-url.d.ts.map +1 -1
  34. package/dist/tools/fetch-url.js +89 -152
  35. package/dist/transform/transform.d.ts +8 -0
  36. package/dist/transform/transform.d.ts.map +1 -1
  37. package/dist/transform/transform.js +109 -108
  38. package/dist/transform/worker-pool.d.ts +3 -6
  39. package/dist/transform/worker-pool.d.ts.map +1 -1
  40. package/dist/transform/worker-pool.js +148 -118
  41. package/package.json +2 -1
@@ -2,11 +2,38 @@ import { parseHTML } from 'linkedom';
2
2
  import {} from '../transform/types.js';
3
3
  import { config, logDebug } from './core.js';
4
4
  import { throwIfAborted } from './utils.js';
5
+ // ASCII char codes used in hot-path charCodeAt comparisons
6
+ const ASCII_SPACE = 32;
7
+ const ASCII_TAB = 9;
8
+ const ASCII_EXCLAMATION = 33;
9
+ const ASCII_HASH = 35;
10
+ const ASCII_ASTERISK = 42;
11
+ const ASCII_PLUS = 43;
12
+ const ASCII_DASH = 45;
13
+ const ASCII_PERIOD = 46;
14
+ const ASCII_DIGIT_0 = 48;
15
+ const ASCII_DIGIT_9 = 57;
16
+ const ASCII_LT = 60;
17
+ const ASCII_QUESTION = 63;
18
+ const ASCII_UPPER_A = 65;
19
+ const ASCII_UPPER_Z = 90;
20
+ const ASCII_BRACKET_OPEN = 91;
21
+ const ASCII_LOWER_A = 97;
22
+ const ASCII_LOWER_Z = 122;
23
+ const ASCII_UNDERSCORE = 95;
24
+ const HTML_TAG_DENSITY_LIMIT = 5;
25
+ const TITLE_MIN_WORDS = 2;
26
+ const TITLE_MAX_WORDS = 6;
27
+ const TITLE_MIN_CAPITALIZED = 2;
28
+ const PROPERTY_FIX_MAX_PASSES = 3;
29
+ const BODY_SCAN_LIMIT = 5000;
30
+ const HAS_FOLLOWING_LOOKAHEAD = 50;
5
31
  const NOISE_SCAN_LIMIT = 50_000;
6
32
  const MIN_BODY_CONTENT_LENGTH = 100;
7
33
  const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
8
34
  const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
9
35
  const ABORT_CHECK_INTERVAL = 500;
36
+ const NODE_FILTER_SHOW_TEXT = 4;
10
37
  const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
11
38
  const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
12
39
  const NOISE_PATTERNS = [
@@ -89,6 +116,7 @@ const PROMO_TOKENS_BY_CATEGORY = {
89
116
  newsletters: ['newsletter', 'subscribe'],
90
117
  'social-share': ['share', 'social'],
91
118
  };
119
+ // Noise selector configurations
92
120
  const BASE_NOISE_SELECTORS = {
93
121
  navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
94
122
  cookieBanners: '[role="dialog"]',
@@ -96,7 +124,7 @@ const BASE_NOISE_SELECTORS = {
96
124
  };
97
125
  const NO_MATCH_REGEX = /a^/i;
98
126
  let cachedContext;
99
- let lastConfigRef;
127
+ let lastContextKey;
100
128
  function escapeRegexLiteral(value) {
101
129
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
102
130
  }
@@ -136,7 +164,16 @@ function getPromoMatchers(currentConfig, flags) {
136
164
  }
137
165
  function getContext() {
138
166
  const currentConfig = config.noiseRemoval;
139
- if (cachedContext !== undefined && lastConfigRef === currentConfig)
167
+ const contextKey = JSON.stringify({
168
+ locale: config.i18n.locale,
169
+ enabledCategories: currentConfig.enabledCategories,
170
+ extraTokens: currentConfig.extraTokens,
171
+ extraSelectors: currentConfig.extraSelectors,
172
+ aggressiveMode: currentConfig.aggressiveMode,
173
+ preserveSvgCanvas: currentConfig.preserveSvgCanvas,
174
+ weights: currentConfig.weights,
175
+ });
176
+ if (cachedContext !== undefined && lastContextKey === contextKey)
140
177
  return cachedContext;
141
178
  const enabled = new Set(currentConfig.enabledCategories
142
179
  .map((c) => {
@@ -188,7 +225,7 @@ function getContext() {
188
225
  baseSelector,
189
226
  candidateSelector,
190
227
  };
191
- lastConfigRef = currentConfig;
228
+ lastContextKey = contextKey;
192
229
  return cachedContext;
193
230
  }
194
231
  function isInteractive(element, role) {
@@ -263,146 +300,120 @@ function removeNodes(nodes) {
263
300
  }
264
301
  }
265
302
  }
266
- function scoreNavFooter(meta, weights) {
303
+ const HIDDEN_STYLE_REGEX = /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i;
304
+ function calculateNavFooterScore(tagName, className, id, role, weights) {
267
305
  let score = 0;
268
- if (ALWAYS_NOISE_TAGS.has(meta.tagName))
306
+ if (ALWAYS_NOISE_TAGS.has(tagName))
269
307
  score += weights.structural;
270
- // Header Boilerplate
271
- if (meta.tagName === 'header') {
272
- if ((meta.role && NAVIGATION_ROLES.has(meta.role)) ||
273
- HEADER_NOISE_PATTERN.test(`${meta.className} ${meta.id}`)) {
308
+ if (tagName === 'header') {
309
+ if ((role && NAVIGATION_ROLES.has(role)) ||
310
+ HEADER_NOISE_PATTERN.test(`${className} ${id}`)) {
274
311
  score += weights.structural;
275
312
  }
276
313
  }
277
- // Aside (sidebar/complementary) noise unless inside primary content
278
- if (meta.tagName === 'aside') {
314
+ if (tagName === 'aside') {
279
315
  score += weights.structural;
280
316
  }
281
- // Role Noise
282
- if (meta.role && NAVIGATION_ROLES.has(meta.role)) {
283
- if (meta.tagName !== 'aside' || meta.role !== 'complementary') {
317
+ if (role && NAVIGATION_ROLES.has(role)) {
318
+ if (tagName !== 'aside' || role !== 'complementary') {
284
319
  score += weights.structural;
285
320
  }
286
321
  }
287
322
  return score;
288
323
  }
289
- function extractElementMetadata(element) {
324
+ function calculatePromoScore(element, className, id, context) {
325
+ if (!context.promoEnabled)
326
+ return 0;
327
+ const aggTest = context.promoMatchers.aggressive.test(className) ||
328
+ context.promoMatchers.aggressive.test(id);
329
+ const isAggressiveMatch = aggTest && !isWithinPrimaryContent(element);
330
+ const isBaseMatch = !aggTest &&
331
+ (context.promoMatchers.base.test(className) ||
332
+ context.promoMatchers.base.test(id));
333
+ return isAggressiveMatch || isBaseMatch ? context.weights.promo : 0;
334
+ }
335
+ function isNoiseElement(element, context) {
290
336
  const tagName = element.tagName.toLowerCase();
291
337
  const className = element.getAttribute('class') ?? '';
292
338
  const id = element.getAttribute('id') ?? '';
293
339
  const role = element.getAttribute('role');
294
340
  const style = element.getAttribute('style');
295
- const _isInteractive = isInteractive(element, role);
296
- const isHidden = element.hasAttribute('hidden') ||
341
+ const elIsInteractive = isInteractive(element, role);
342
+ const elIsHidden = element.hasAttribute('hidden') ||
297
343
  element.getAttribute('aria-hidden') === 'true' ||
298
- (style !== null &&
299
- /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i.test(style));
300
- return {
301
- tagName,
302
- className,
303
- id,
304
- role,
305
- style,
306
- isInteractive: _isInteractive,
307
- isHidden,
308
- };
309
- }
310
- function isNoiseElement(element, context) {
311
- const meta = extractElementMetadata(element);
344
+ (style !== null && HIDDEN_STYLE_REGEX.test(style));
312
345
  let score = 0;
313
346
  const { weights } = context;
314
347
  // Structural
315
- if (context.structuralTags.has(meta.tagName) && !meta.isInteractive) {
348
+ if (context.structuralTags.has(tagName) && !elIsInteractive) {
316
349
  score += weights.structural;
317
350
  }
318
351
  // Nav/Footer Scoring
319
352
  if (context.flags.navFooter) {
320
- score += scoreNavFooter(meta, weights);
353
+ score += calculateNavFooterScore(tagName, className, id, role, weights);
321
354
  }
322
355
  // Hidden
323
- if (meta.isHidden && !meta.isInteractive) {
356
+ if (elIsHidden && !elIsInteractive) {
324
357
  score += weights.hidden;
325
358
  }
326
359
  // Sticky/Fixed
327
- if (FIXED_OR_HIGH_Z_PATTERN.test(meta.className)) {
360
+ if (FIXED_OR_HIGH_Z_PATTERN.test(className)) {
328
361
  score += weights.stickyFixed;
329
362
  }
330
363
  // Promo
331
- if (context.promoEnabled) {
332
- const aggTest = context.promoMatchers.aggressive.test(meta.className) ||
333
- context.promoMatchers.aggressive.test(meta.id);
334
- const isAggressiveMatch = aggTest && !isWithinPrimaryContent(element);
335
- const isBaseMatch = !aggTest &&
336
- (context.promoMatchers.base.test(meta.className) ||
337
- context.promoMatchers.base.test(meta.id));
338
- if (isAggressiveMatch || isBaseMatch) {
339
- score += weights.promo;
340
- }
341
- }
364
+ score += calculatePromoScore(element, className, id, context);
342
365
  return score >= weights.threshold;
343
366
  }
344
- function cleanHeadingWrapperDivs(h) {
345
- const divs = h.querySelectorAll('div');
346
- for (let j = divs.length - 1; j >= 0; j--) {
347
- const d = divs[j];
348
- if (!d?.parentNode)
349
- continue;
350
- const cls = d.getAttribute('class') ?? '';
351
- const stl = d.getAttribute('style') ?? '';
352
- if (cls.includes('absolute') ||
353
- stl.includes('position') ||
354
- d.getAttribute('tabindex') === '-1') {
355
- d.remove();
356
- }
357
- }
358
- }
359
- function cleanHeadingAnchors(h) {
360
- const anchors = h.querySelectorAll('a');
361
- for (let j = anchors.length - 1; j >= 0; j--) {
362
- const a = anchors[j];
363
- if (!a?.parentNode)
364
- continue;
365
- const href = a.getAttribute('href') ?? '';
366
- const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
367
- if (href.startsWith('#') && txt.length === 0) {
368
- a.remove();
369
- }
370
- }
371
- }
372
- function cleanHeadingZeroWidth(h, document) {
373
- const walker = document.createTreeWalker(h, 4); // SHOW_TEXT
374
- let node;
375
- while ((node = walker.nextNode())) {
376
- if (node.textContent?.includes('\u200B')) {
377
- node.textContent = node.textContent.replace(/\u200B/g, '');
378
- }
379
- }
380
- }
381
367
  function cleanHeadings(document) {
382
- // Clean Heading Anchors
383
368
  const headings = document.querySelectorAll('h1,h2,h3,h4,h5,h6');
384
369
  for (const h of headings) {
385
370
  if (!h.parentNode)
386
371
  continue;
387
- cleanHeadingWrapperDivs(h);
388
- cleanHeadingAnchors(h);
389
- cleanHeadingZeroWidth(h, document);
372
+ // Remove absolute/positioned wrapper divs
373
+ const divs = h.querySelectorAll('div');
374
+ for (let j = divs.length - 1; j >= 0; j--) {
375
+ const d = divs[j];
376
+ if (!d?.parentNode)
377
+ continue;
378
+ const cls = d.getAttribute('class') ?? '';
379
+ const stl = d.getAttribute('style') ?? '';
380
+ if (cls.includes('absolute') ||
381
+ stl.includes('position') ||
382
+ d.getAttribute('tabindex') === '-1') {
383
+ d.remove();
384
+ }
385
+ }
386
+ // Remove empty hash-link anchors
387
+ const anchors = h.querySelectorAll('a');
388
+ for (let j = anchors.length - 1; j >= 0; j--) {
389
+ const a = anchors[j];
390
+ if (!a?.parentNode)
391
+ continue;
392
+ const href = a.getAttribute('href') ?? '';
393
+ const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
394
+ if (href.startsWith('#') && txt.length === 0) {
395
+ a.remove();
396
+ }
397
+ }
398
+ // Strip zero-width spaces from text nodes
399
+ const walker = document.createTreeWalker(h, NODE_FILTER_SHOW_TEXT);
400
+ let node;
401
+ while ((node = walker.nextNode())) {
402
+ if (node.textContent?.includes('\u200B')) {
403
+ node.textContent = node.textContent.replace(/\u200B/g, '');
404
+ }
405
+ }
390
406
  }
391
407
  }
392
408
  function stripNoise(document, context, signal) {
393
409
  cleanHeadings(document);
394
- // Remove Base & Extra
410
+ // Structural Removal
395
411
  const { baseSelector, extraSelectors } = context;
396
- // Base
397
- const baseNodes = document.querySelectorAll(baseSelector);
398
- removeNodes(baseNodes);
399
- // Extra
412
+ removeNodes(document.querySelectorAll(baseSelector));
400
413
  if (extraSelectors.length > 0) {
401
- const combinedExtra = extraSelectors.join(',');
402
- const extraNodes = document.querySelectorAll(combinedExtra);
403
- removeNodes(extraNodes);
414
+ removeNodes(document.querySelectorAll(extraSelectors.join(',')));
404
415
  }
405
- // Candidates
416
+ // Candidates (conditional removal)
406
417
  const candidates = document.querySelectorAll(context.candidateSelector);
407
418
  for (let i = candidates.length - 1; i >= 0; i--) {
408
419
  if (i % ABORT_CHECK_INTERVAL === 0 && signal?.aborted) {
@@ -570,6 +581,8 @@ export function removeNoiseFromHtml(html, document, baseUrl, signal) {
570
581
  return html;
571
582
  }
572
583
  }
584
+ // endregion
585
+ // region Language Detection
573
586
  class DetectionContext {
574
587
  code;
575
588
  _lower;
@@ -649,10 +662,10 @@ const CSS_PROPERTY_REGEX = /^\s*[a-z][\w-]*\s*:/;
649
662
  function containsJsxTag(code) {
650
663
  const len = code.length;
651
664
  for (let i = 0; i < len - 1; i++) {
652
- if (code.charCodeAt(i) === 60 /* < */) {
665
+ if (code.charCodeAt(i) === ASCII_LT) {
653
666
  const next = code.charCodeAt(i + 1);
654
- if (next >= 65 && next <= 90)
655
- return true; // A-Z
667
+ if (next >= ASCII_UPPER_A && next <= ASCII_UPPER_Z)
668
+ return true;
656
669
  }
657
670
  }
658
671
  return false;
@@ -711,140 +724,100 @@ function detectYamlStructure(lines) {
711
724
  if (colonIdx <= 0)
712
725
  continue;
713
726
  const after = trimmed.charCodeAt(colonIdx + 1);
714
- // space (32) or tab (9)
715
- if (after === 32 || after === 9)
727
+ if (after === ASCII_SPACE || after === ASCII_TAB)
728
+ return true;
729
+ }
730
+ return false;
731
+ }
732
+ function matchRust(ctx) {
733
+ if (ctx.lower.includes('let mut'))
734
+ return true;
735
+ if (RUST_REGEX.test(ctx.lower))
736
+ return true;
737
+ return ctx.lower.includes('use ') && ctx.lower.includes('::');
738
+ }
739
+ function matchGo(ctx) {
740
+ if (ctx.lower.includes('import "'))
741
+ return true;
742
+ return /\b(?:package|func)\b/.test(ctx.lower);
743
+ }
744
+ function matchJsx(ctx) {
745
+ const l = ctx.lower;
746
+ if (l.includes('classname=') ||
747
+ l.includes('jsx:') ||
748
+ l.includes("from 'react'") ||
749
+ l.includes('from "react"')) {
750
+ return true;
751
+ }
752
+ return containsJsxTag(ctx.code);
753
+ }
754
+ function matchTypeScript(ctx) {
755
+ if (/\b(?:interface|type)\b/.test(ctx.lower))
756
+ return true;
757
+ const l = ctx.lower;
758
+ for (const hint of TYPESCRIPT_HINTS) {
759
+ if (l.includes(hint))
760
+ return true;
761
+ }
762
+ return false;
763
+ }
764
+ function matchSql(ctx) {
765
+ return /\b(?:select|insert|update|delete|create|alter|drop)\b/.test(ctx.lower);
766
+ }
767
+ function hasJsSignals(lowerCode) {
768
+ return (JS_SIGNAL_REGEX.test(lowerCode) ||
769
+ lowerCode.includes('{') ||
770
+ lowerCode.includes("from '"));
771
+ }
772
+ function matchPython(ctx) {
773
+ const l = ctx.lower;
774
+ if (l.includes('print(') || l.includes('__name__'))
775
+ return true;
776
+ if (l.includes('self.') || l.includes('elif '))
777
+ return true;
778
+ // Check for Python's None/True/False using original case (they are capitalized in Python)
779
+ if (ctx.code.includes('None') ||
780
+ ctx.code.includes('True') ||
781
+ ctx.code.includes('False')) {
782
+ return true;
783
+ }
784
+ if (PYTHON_UNIQUE_REGEX.test(l))
785
+ return true;
786
+ // Shared keywords (import, from, class) — only match if no JS signals present
787
+ if (/\b(?:import|from|class)\b/.test(l) && !hasJsSignals(l)) {
788
+ return true;
789
+ }
790
+ return false;
791
+ }
792
+ function matchHtml(ctx) {
793
+ const l = ctx.lower;
794
+ for (const tag of HTML_TAGS) {
795
+ if (l.includes(tag))
716
796
  return true;
717
797
  }
718
798
  return false;
719
799
  }
800
+ // Pre-sorted by weight descending — first match wins in detectLanguageFromCode
720
801
  const LANGUAGES = [
721
- {
722
- lang: 'rust',
723
- weight: 25,
724
- match: (ctx) => {
725
- if (ctx.lower.includes('let mut'))
726
- return true;
727
- if (RUST_REGEX.test(ctx.lower))
728
- return true;
729
- return ctx.lower.includes('use ') && ctx.lower.includes('::');
730
- },
731
- },
732
- {
733
- lang: 'go',
734
- weight: 22,
735
- match: (ctx) => {
736
- if (ctx.lower.includes('import "'))
737
- return true;
738
- return /\b(?:package|func)\b/.test(ctx.lower);
739
- },
740
- },
741
- {
742
- lang: 'jsx',
743
- weight: 22,
744
- match: (ctx) => {
745
- const l = ctx.lower;
746
- if (l.includes('classname=') ||
747
- l.includes('jsx:') ||
748
- l.includes("from 'react'") ||
749
- l.includes('from "react"')) {
750
- return true;
751
- }
752
- return containsJsxTag(ctx.code);
753
- },
754
- },
755
- {
756
- lang: 'typescript',
757
- weight: 20,
758
- match: (ctx) => {
759
- if (/\b(?:interface|type)\b/.test(ctx.lower))
760
- return true;
761
- const l = ctx.lower;
762
- for (const hint of TYPESCRIPT_HINTS) {
763
- if (l.includes(hint))
764
- return true;
765
- }
766
- return false;
767
- },
768
- },
769
- {
770
- lang: 'sql',
771
- weight: 20,
772
- match: (ctx) => {
773
- const l = ctx.lower;
774
- return /\b(?:select|insert|update|delete|create|alter|drop)\b/.test(l);
775
- },
776
- },
777
- {
778
- lang: 'python',
779
- weight: 18,
780
- match: (ctx) => {
781
- const l = ctx.lower;
782
- if (l.includes('print(') || l.includes('__name__'))
783
- return true;
784
- if (l.includes('self.') || l.includes('elif '))
785
- return true;
786
- // Check for Python's None/True/False using original case (they are capitalized in Python)
787
- if (ctx.code.includes('None') ||
788
- ctx.code.includes('True') ||
789
- ctx.code.includes('False')) {
790
- return true;
791
- }
792
- // Python-unique keywords that JS doesn't have
793
- if (PYTHON_UNIQUE_REGEX.test(l))
794
- return true;
795
- // Shared keywords (import, from, class) — only match if no JS signals present
796
- if (/\b(?:import|from|class)\b/.test(l) &&
797
- !JS_SIGNAL_REGEX.test(l) &&
798
- !l.includes('{') &&
799
- !l.includes("from '")) {
800
- return true;
801
- }
802
- return false;
803
- },
804
- },
802
+ { lang: 'rust', weight: 25, match: matchRust },
803
+ { lang: 'go', weight: 22, match: matchGo },
804
+ { lang: 'jsx', weight: 22, match: matchJsx },
805
+ { lang: 'typescript', weight: 20, match: matchTypeScript },
806
+ { lang: 'sql', weight: 20, match: matchSql },
807
+ { lang: 'python', weight: 18, match: matchPython },
805
808
  {
806
809
  lang: 'css',
807
810
  weight: 18,
808
- match: (ctx) => {
809
- if (CSS_REGEX.test(ctx.lower))
810
- return true;
811
- return detectCssStructure(ctx.lines);
812
- },
813
- },
814
- {
815
- lang: 'bash',
816
- weight: 15,
817
- match: (ctx) => detectBashIndicators(ctx.lines),
818
- },
819
- {
820
- lang: 'yaml',
821
- weight: 15,
822
- match: (ctx) => detectYamlStructure(ctx.lines),
823
- },
824
- {
825
- lang: 'javascript',
826
- weight: 15,
827
- match: (ctx) => JS_REGEX.test(ctx.lower),
828
- },
829
- {
830
- lang: 'html',
831
- weight: 12,
832
- match: (ctx) => {
833
- const l = ctx.lower;
834
- for (const tag of HTML_TAGS) {
835
- if (l.includes(tag))
836
- return true;
837
- }
838
- return false;
839
- },
811
+ match: (ctx) => CSS_REGEX.test(ctx.lower) || detectCssStructure(ctx.lines),
840
812
  },
813
+ { lang: 'bash', weight: 15, match: (ctx) => detectBashIndicators(ctx.lines) },
814
+ { lang: 'yaml', weight: 15, match: (ctx) => detectYamlStructure(ctx.lines) },
815
+ { lang: 'javascript', weight: 15, match: (ctx) => JS_REGEX.test(ctx.lower) },
816
+ { lang: 'html', weight: 12, match: matchHtml },
841
817
  {
842
818
  lang: 'json',
843
819
  weight: 10,
844
- match: (ctx) => {
845
- const s = ctx.trimmedStart;
846
- return s.startsWith('{') || s.startsWith('[');
847
- },
820
+ match: (ctx) => ctx.trimmedStart.startsWith('{') || ctx.trimmedStart.startsWith('['),
848
821
  },
849
822
  ];
850
823
  function extractLanguageFromClassName(className) {
@@ -880,11 +853,10 @@ function resolveLanguageFromDataAttribute(dataLang) {
880
853
  // Check if \w+
881
854
  for (let i = 0; i < trimmed.length; i++) {
882
855
  const c = trimmed.charCodeAt(i);
883
- // valid: A-Z, a-z, 0-9, _
884
- const isUpper = c >= 65 && c <= 90;
885
- const isLower = c >= 97 && c <= 122;
886
- const isDigit = c >= 48 && c <= 57;
887
- const isUnder = c === 95;
856
+ const isUpper = c >= ASCII_UPPER_A && c <= ASCII_UPPER_Z;
857
+ const isLower = c >= ASCII_LOWER_A && c <= ASCII_LOWER_Z;
858
+ const isDigit = c >= ASCII_DIGIT_0 && c <= ASCII_DIGIT_9;
859
+ const isUnder = c === ASCII_UNDERSCORE;
888
860
  if (!isUpper && !isLower && !isDigit && !isUnder) {
889
861
  return undefined;
890
862
  }
@@ -901,7 +873,7 @@ export function detectLanguageFromCode(code) {
901
873
  // Fast path for empty/whitespace only
902
874
  let empty = true;
903
875
  for (let i = 0; i < code.length; i++) {
904
- if (code.charCodeAt(i) > 32) {
876
+ if (code.charCodeAt(i) > ASCII_SPACE) {
905
877
  empty = false;
906
878
  break;
907
879
  }
@@ -909,20 +881,15 @@ export function detectLanguageFromCode(code) {
909
881
  if (empty)
910
882
  return undefined;
911
883
  const ctx = new DetectionContext(code);
912
- let bestLang;
913
- let bestScore = -1;
884
+ // LANGUAGES is pre-sorted by weight descending — first match is highest confidence
914
885
  for (const def of LANGUAGES) {
915
- if (def.match(ctx)) {
916
- if (def.weight > bestScore) {
917
- bestScore = def.weight;
918
- bestLang = def.lang;
919
- if (bestScore >= 25)
920
- break;
921
- }
922
- }
886
+ if (def.match(ctx))
887
+ return def.lang;
923
888
  }
924
- return bestLang;
889
+ return undefined;
925
890
  }
891
+ // endregion
892
+ // region Markdown Cleanup
926
893
  const MAX_LINE_LENGTH = 80;
927
894
  const REGEX = {
928
895
  HEADING_MARKER: /^#{1,6}\s/m,
@@ -975,7 +942,7 @@ function isBlank(line) {
975
942
  }
976
943
  function hasFollowingContent(lines, startIndex) {
977
944
  // Optimization: Bound lookahead to avoid checking too many lines in huge files
978
- for (let i = startIndex + 1; i < Math.min(lines.length, startIndex + 50); i++) {
945
+ for (let i = startIndex + 1; i < Math.min(lines.length, startIndex + HAS_FOLLOWING_LOOKAHEAD); i++) {
979
946
  if (!isBlank(lines[i]))
980
947
  return true;
981
948
  }
@@ -994,7 +961,7 @@ function isTitleCaseOrKeyword(trimmed) {
994
961
  // Split limited number of words
995
962
  const words = trimmed.split(/\s+/);
996
963
  const len = words.length;
997
- if (len < 2 || len > 6)
964
+ if (len < TITLE_MIN_WORDS || len > TITLE_MAX_WORDS)
998
965
  return false;
999
966
  let capitalizedCount = 0;
1000
967
  for (let i = 0; i < len; i++) {
@@ -1007,20 +974,19 @@ function isTitleCaseOrKeyword(trimmed) {
1007
974
  else if (!/^(?:and|or|the|of|in|for|to|a)$/i.test(w))
1008
975
  return false;
1009
976
  }
1010
- return capitalizedCount >= 2;
977
+ return capitalizedCount >= TITLE_MIN_CAPITALIZED;
1011
978
  }
1012
979
  function getHeadingPrefix(trimmed) {
1013
980
  if (trimmed.length > MAX_LINE_LENGTH)
1014
981
  return null;
1015
982
  // Fast path: Check common markdown markers first
1016
983
  const firstChar = trimmed.charCodeAt(0);
1017
- // # (35), - (45), * (42), + (43), digit (48-57), [ (91)
1018
- if (firstChar === 35 ||
1019
- firstChar === 45 ||
1020
- firstChar === 42 ||
1021
- firstChar === 43 ||
1022
- firstChar === 91 ||
1023
- (firstChar >= 48 && firstChar <= 57)) {
984
+ if (firstChar === ASCII_HASH ||
985
+ firstChar === ASCII_DASH ||
986
+ firstChar === ASCII_ASTERISK ||
987
+ firstChar === ASCII_PLUS ||
988
+ firstChar === ASCII_BRACKET_OPEN ||
989
+ (firstChar >= ASCII_DIGIT_0 && firstChar <= ASCII_DIGIT_9)) {
1024
990
  if (REGEX.HEADING_MARKER.test(trimmed) ||
1025
991
  REGEX.LIST_MARKER.test(trimmed) ||
1026
992
  /^\d+\.\s/.test(trimmed) ||
@@ -1032,8 +998,9 @@ function getHeadingPrefix(trimmed) {
1032
998
  return /^example:\s/i.test(trimmed) ? '### ' : '## ';
1033
999
  }
1034
1000
  const lastChar = trimmed.charCodeAt(trimmed.length - 1);
1035
- // . (46), ! (33), ? (63)
1036
- if (lastChar === 46 || lastChar === 33 || lastChar === 63)
1001
+ if (lastChar === ASCII_PERIOD ||
1002
+ lastChar === ASCII_EXCLAMATION ||
1003
+ lastChar === ASCII_QUESTION)
1037
1004
  return null;
1038
1005
  return isTitleCaseOrKeyword(trimmed) ? '## ' : null;
1039
1006
  }
@@ -1148,33 +1115,20 @@ function processTextBuffer(lines, options) {
1148
1115
  const text = preprocessLines(lines, options);
1149
1116
  return applyGlobalRegexes(text, options);
1150
1117
  }
1151
- function applyGlobalRegexes(text, options) {
1152
- let result = text;
1153
- const checkAbort = createAbortChecker(options);
1154
- // Normalize non-breaking spaces to regular spaces
1155
- result = result.replace(/\u00A0/g, ' ');
1156
- checkAbort('markdown:cleanup:headings');
1157
- // fixAndSpaceHeadings
1158
- result = result
1159
- .replace(REGEX.HEADING_SPACING, '$1\n\n$2')
1160
- .replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```');
1161
- if (config.markdownCleanup.removeTypeDocComments) {
1162
- checkAbort('markdown:cleanup:typedoc');
1163
- result = result
1164
- .split('\n')
1165
- .filter((line) => !isTypeDocArtifactLine(line))
1166
- .join('\n');
1167
- result = result.replace(REGEX.TYPEDOC_COMMENT, (match) => match.startsWith('`') ? match : '');
1168
- }
1169
- if (config.markdownCleanup.removeSkipLinks) {
1170
- checkAbort('markdown:cleanup:skip-links');
1171
- result = result
1172
- .replace(REGEX.ZERO_WIDTH_ANCHOR, '')
1173
- .replace(REGEX.COMBINED_LINE_REMOVALS, '');
1174
- }
1175
- checkAbort('markdown:cleanup:spacing');
1176
- // normalizeSpacing
1177
- result = result
1118
+ function removeTypeDocArtifacts(text) {
1119
+ const filtered = text
1120
+ .split('\n')
1121
+ .filter((line) => !isTypeDocArtifactLine(line))
1122
+ .join('\n');
1123
+ return filtered.replace(REGEX.TYPEDOC_COMMENT, (match) => match.startsWith('`') ? match : '');
1124
+ }
1125
+ function removeSkipLinks(text) {
1126
+ return text
1127
+ .replace(REGEX.ZERO_WIDTH_ANCHOR, '')
1128
+ .replace(REGEX.COMBINED_LINE_REMOVALS, '');
1129
+ }
1130
+ function normalizeMarkdownSpacing(text) {
1131
+ let result = text
1178
1132
  .replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
1179
1133
  .replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
1180
1134
  .replace(REGEX.SPACING_CODE_DASH, '$1 - ')
@@ -1186,10 +1140,12 @@ function applyGlobalRegexes(text, options) {
1186
1140
  result = result.replace(/(?<=\s|^)`\s+([^`]+)`/gm, '`$1`');
1187
1141
  // Unescape backticks inside markdown link text
1188
1142
  result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
1189
- result = normalizeNestedListIndentation(result);
1190
- checkAbort('markdown:cleanup:properties');
1191
- // fixProperties
1192
- for (let k = 0; k < 3; k++) {
1143
+ result = result.replace(/\[([^\]]*<[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/</g, '\\<').replace(/>/g, '\\>')}](${url})`);
1144
+ return normalizeNestedListIndentation(result);
1145
+ }
1146
+ function fixConcatenatedProperties(text) {
1147
+ let result = text;
1148
+ for (let k = 0; k < PROPERTY_FIX_MAX_PASSES; k++) {
1193
1149
  const next = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
1194
1150
  if (next === result)
1195
1151
  break;
@@ -1197,6 +1153,26 @@ function applyGlobalRegexes(text, options) {
1197
1153
  }
1198
1154
  return result;
1199
1155
  }
1156
+ function applyGlobalRegexes(text, options) {
1157
+ const checkAbort = createAbortChecker(options);
1158
+ let result = text.replace(/\u00A0/g, ' ');
1159
+ checkAbort('markdown:cleanup:headings');
1160
+ result = result
1161
+ .replace(REGEX.HEADING_SPACING, '$1\n\n$2')
1162
+ .replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```');
1163
+ if (config.markdownCleanup.removeTypeDocComments) {
1164
+ checkAbort('markdown:cleanup:typedoc');
1165
+ result = removeTypeDocArtifacts(result);
1166
+ }
1167
+ if (config.markdownCleanup.removeSkipLinks) {
1168
+ checkAbort('markdown:cleanup:skip-links');
1169
+ result = removeSkipLinks(result);
1170
+ }
1171
+ checkAbort('markdown:cleanup:spacing');
1172
+ result = normalizeMarkdownSpacing(result);
1173
+ checkAbort('markdown:cleanup:properties');
1174
+ return fixConcatenatedProperties(result);
1175
+ }
1200
1176
  function normalizeNestedListIndentation(text) {
1201
1177
  return text.replace(REGEX.NESTED_LIST_INDENT, (match, spaces, marker) => {
1202
1178
  const count = spaces.length;
@@ -1211,27 +1187,17 @@ export function cleanupMarkdownArtifacts(content, options) {
1211
1187
  return '';
1212
1188
  const checkAbort = createAbortChecker(options);
1213
1189
  checkAbort('markdown:cleanup:begin');
1214
- const len = content.length;
1215
- let lastIndex = 0;
1190
+ const lines = content.split(/\r?\n/);
1216
1191
  let fenceMarker = null;
1217
1192
  const segments = [];
1218
1193
  let buffer = [];
1219
- while (lastIndex < len) {
1220
- let nextIndex = content.indexOf('\n', lastIndex);
1221
- let line;
1222
- if (nextIndex === -1) {
1223
- line = content.slice(lastIndex);
1224
- nextIndex = len;
1225
- }
1226
- else {
1227
- if (nextIndex > lastIndex && content.charCodeAt(nextIndex - 1) === 13) {
1228
- line = content.slice(lastIndex, nextIndex - 1);
1229
- }
1230
- else {
1231
- line = content.slice(lastIndex, nextIndex);
1232
- }
1233
- nextIndex++; // Skip \n
1194
+ const flushBuffer = () => {
1195
+ if (buffer.length > 0) {
1196
+ segments.push(processTextBuffer(buffer, options));
1197
+ buffer = [];
1234
1198
  }
1199
+ };
1200
+ for (const line of lines) {
1235
1201
  const trimmed = line.trimStart();
1236
1202
  if (fenceMarker) {
1237
1203
  segments.push(line);
@@ -1247,22 +1213,16 @@ export function cleanupMarkdownArtifacts(content, options) {
1247
1213
  buffer.push(line);
1248
1214
  }
1249
1215
  else {
1250
- if (buffer.length > 0) {
1251
- segments.push(processTextBuffer(buffer, options));
1252
- buffer = [];
1253
- }
1216
+ flushBuffer();
1254
1217
  segments.push(line);
1255
1218
  fenceMarker = newMarker;
1256
1219
  }
1257
1220
  }
1258
- lastIndex = nextIndex;
1259
- }
1260
- if (buffer.length > 0) {
1261
- segments.push(processTextBuffer(buffer, options));
1262
1221
  }
1222
+ flushBuffer();
1263
1223
  return segments.join('\n').trim();
1264
1224
  }
1265
- function detectFrontmatter(content) {
1225
+ function parseFrontmatter(content) {
1266
1226
  const len = content.length;
1267
1227
  if (len < 4)
1268
1228
  return null;
@@ -1282,57 +1242,43 @@ function detectFrontmatter(content) {
1282
1242
  const closeIndex = content.indexOf(fence, fenceLen);
1283
1243
  if (closeIndex === -1)
1284
1244
  return null;
1285
- return {
1245
+ const range = {
1286
1246
  start: 0,
1287
1247
  end: closeIndex + fenceLen,
1288
1248
  linesStart: fenceLen,
1289
1249
  linesEnd: closeIndex,
1290
1250
  lineEnding,
1291
1251
  };
1292
- }
1293
- function parseFrontmatterEntry(line) {
1294
- const trimmed = line.trim();
1295
- const idx = trimmed.indexOf(':');
1296
- if (!trimmed || idx <= 0)
1297
- return null;
1298
- return {
1299
- key: trimmed.slice(0, idx).trim().toLowerCase(),
1300
- value: trimmed.slice(idx + 1).trim(),
1301
- };
1302
- }
1303
- function stripFrontmatterQuotes(val) {
1304
- const first = val.charAt(0);
1305
- const last = val.charAt(val.length - 1);
1306
- if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
1307
- return val.slice(1, -1).trim();
1308
- }
1309
- return val;
1310
- }
1311
- function scanFrontmatterForTitle(content, fm) {
1312
- const fmBody = content.slice(fm.linesStart, fm.linesEnd);
1252
+ // Parse key-value entries in one pass
1253
+ const entries = new Map();
1254
+ const fmBody = content.slice(range.linesStart, range.linesEnd);
1313
1255
  let lastIdx = 0;
1314
1256
  while (lastIdx < fmBody.length) {
1315
- let nextIdx = fmBody.indexOf(fm.lineEnding, lastIdx);
1257
+ let nextIdx = fmBody.indexOf(lineEnding, lastIdx);
1316
1258
  if (nextIdx === -1)
1317
1259
  nextIdx = fmBody.length;
1318
- const line = fmBody.slice(lastIdx, nextIdx);
1319
- const entry = parseFrontmatterEntry(line);
1320
- if (entry) {
1321
- if (entry.key === 'title' || entry.key === 'name') {
1322
- const cleaned = stripFrontmatterQuotes(entry.value);
1323
- if (cleaned)
1324
- return cleaned;
1260
+ const line = fmBody.slice(lastIdx, nextIdx).trim();
1261
+ const colonIdx = line.indexOf(':');
1262
+ if (line && colonIdx > 0) {
1263
+ const key = line.slice(0, colonIdx).trim().toLowerCase();
1264
+ let value = line.slice(colonIdx + 1).trim();
1265
+ // Strip surrounding quotes
1266
+ const first = value.charAt(0);
1267
+ const last = value.charAt(value.length - 1);
1268
+ if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
1269
+ value = value.slice(1, -1).trim();
1325
1270
  }
1271
+ if (value)
1272
+ entries.set(key, value);
1326
1273
  }
1327
- lastIdx = nextIdx + fm.lineEnding.length;
1274
+ lastIdx = nextIdx + lineEnding.length;
1328
1275
  }
1329
- return undefined;
1276
+ return { range, entries };
1330
1277
  }
1331
1278
  function scanBodyForTitle(content) {
1332
1279
  const len = content.length;
1333
1280
  let scanIndex = 0;
1334
- const LIMIT = 5000;
1335
- const maxScan = Math.min(len, LIMIT);
1281
+ const maxScan = Math.min(len, BODY_SCAN_LIMIT);
1336
1282
  while (scanIndex < maxScan) {
1337
1283
  let nextIndex = content.indexOf('\n', scanIndex);
1338
1284
  if (nextIndex === -1)
@@ -1352,16 +1298,16 @@ function scanBodyForTitle(content) {
1352
1298
  return undefined;
1353
1299
  }
1354
1300
  export function extractTitleFromRawMarkdown(content) {
1355
- const fm = detectFrontmatter(content);
1301
+ const fm = parseFrontmatter(content);
1356
1302
  if (fm) {
1357
- const title = scanFrontmatterForTitle(content, fm);
1303
+ const title = fm.entries.get('title') ?? fm.entries.get('name');
1358
1304
  if (title)
1359
1305
  return title;
1360
1306
  }
1361
1307
  return scanBodyForTitle(content);
1362
1308
  }
1363
1309
  export function addSourceToMarkdown(content, url) {
1364
- const fm = detectFrontmatter(content);
1310
+ const fm = parseFrontmatter(content);
1365
1311
  const useMarkdownFormat = config.transform.metadataFormat === 'markdown';
1366
1312
  if (useMarkdownFormat && !fm) {
1367
1313
  if (REGEX.SOURCE_KEY.test(content))
@@ -1382,13 +1328,17 @@ export function addSourceToMarkdown(content, url) {
1382
1328
  const escapedUrl = url.replace(/"/g, '\\"');
1383
1329
  return `---${lineEnding}source: "${escapedUrl}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
1384
1330
  }
1385
- const fmBody = content.slice(fm.linesStart, fm.linesEnd);
1331
+ const fmBody = content.slice(fm.range.linesStart, fm.range.linesEnd);
1386
1332
  if (REGEX.SOURCE_KEY.test(fmBody))
1387
1333
  return content;
1388
1334
  const escapedUrl = url.replace(/"/g, '\\"');
1389
- const injection = `source: "${escapedUrl}"${fm.lineEnding}`;
1390
- return content.slice(0, fm.linesEnd) + injection + content.slice(fm.linesEnd);
1335
+ const injection = `source: "${escapedUrl}"${fm.range.lineEnding}`;
1336
+ return (content.slice(0, fm.range.linesEnd) +
1337
+ injection +
1338
+ content.slice(fm.range.linesEnd));
1391
1339
  }
1340
+ // endregion
1341
+ // region Content Detection & Metadata Footer
1392
1342
  function countCommonTags(content, limit) {
1393
1343
  if (limit <= 0)
1394
1344
  return 0;
@@ -1405,10 +1355,10 @@ export function isRawTextContent(content) {
1405
1355
  const trimmed = content.trim();
1406
1356
  if (REGEX.HTML_DOC_START.test(trimmed))
1407
1357
  return false;
1408
- if (detectFrontmatter(trimmed) !== null)
1358
+ if (parseFrontmatter(trimmed) !== null)
1409
1359
  return true;
1410
- const tagCount = countCommonTags(content, 5);
1411
- if (tagCount > 5)
1360
+ const tagCount = countCommonTags(content, HTML_TAG_DENSITY_LIMIT);
1361
+ if (tagCount > HTML_TAG_DENSITY_LIMIT)
1412
1362
  return false;
1413
1363
  return (REGEX.HEADING_MARKER.test(content) ||
1414
1364
  REGEX.LIST_MARKER.test(content) ||
@@ -1446,3 +1396,4 @@ export function buildMetadataFooter(metadata, fallbackUrl) {
1446
1396
  lines.push(` <sub>${metadata.description}</sub>`);
1447
1397
  return lines.join('\n');
1448
1398
  }
1399
+ // endregion