@j0hanz/fetch-url-mcp 1.9.1 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/http/auth.d.ts +0 -1
- package/dist/http/auth.d.ts.map +1 -1
- package/dist/http/auth.js +1 -13
- package/dist/http/native.d.ts.map +1 -1
- package/dist/http/native.js +2 -5
- package/dist/lib/content.d.ts.map +1 -1
- package/dist/lib/content.js +301 -350
- package/dist/lib/core.d.ts +78 -71
- package/dist/lib/core.d.ts.map +1 -1
- package/dist/lib/core.js +308 -372
- package/dist/lib/fetch-pipeline.d.ts +2 -6
- package/dist/lib/fetch-pipeline.d.ts.map +1 -1
- package/dist/lib/fetch-pipeline.js +51 -137
- package/dist/lib/http.d.ts.map +1 -1
- package/dist/lib/http.js +188 -130
- package/dist/lib/mcp-tools.d.ts +3 -5
- package/dist/lib/mcp-tools.d.ts.map +1 -1
- package/dist/lib/mcp-tools.js +22 -58
- package/dist/lib/task-handlers.js +4 -4
- package/dist/lib/utils.d.ts +6 -0
- package/dist/lib/utils.d.ts.map +1 -1
- package/dist/lib/utils.js +23 -0
- package/dist/resources/index.js +1 -1
- package/dist/schemas.d.ts +0 -1
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +4 -6
- package/dist/server.js +1 -1
- package/dist/tasks/owner.d.ts +1 -1
- package/dist/tasks/owner.d.ts.map +1 -1
- package/dist/tasks/tool-registry.d.ts +1 -1
- package/dist/tasks/tool-registry.d.ts.map +1 -1
- package/dist/tools/fetch-url.d.ts +2 -3
- package/dist/tools/fetch-url.d.ts.map +1 -1
- package/dist/tools/fetch-url.js +89 -152
- package/dist/transform/transform.d.ts +8 -0
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +109 -108
- package/dist/transform/worker-pool.d.ts +3 -6
- package/dist/transform/worker-pool.d.ts.map +1 -1
- package/dist/transform/worker-pool.js +148 -118
- package/package.json +2 -1
package/dist/lib/content.js
CHANGED
|
@@ -2,11 +2,38 @@ import { parseHTML } from 'linkedom';
|
|
|
2
2
|
import {} from '../transform/types.js';
|
|
3
3
|
import { config, logDebug } from './core.js';
|
|
4
4
|
import { throwIfAborted } from './utils.js';
|
|
5
|
+
// ASCII char codes used in hot-path charCodeAt comparisons
|
|
6
|
+
const ASCII_SPACE = 32;
|
|
7
|
+
const ASCII_TAB = 9;
|
|
8
|
+
const ASCII_EXCLAMATION = 33;
|
|
9
|
+
const ASCII_HASH = 35;
|
|
10
|
+
const ASCII_ASTERISK = 42;
|
|
11
|
+
const ASCII_PLUS = 43;
|
|
12
|
+
const ASCII_DASH = 45;
|
|
13
|
+
const ASCII_PERIOD = 46;
|
|
14
|
+
const ASCII_DIGIT_0 = 48;
|
|
15
|
+
const ASCII_DIGIT_9 = 57;
|
|
16
|
+
const ASCII_LT = 60;
|
|
17
|
+
const ASCII_QUESTION = 63;
|
|
18
|
+
const ASCII_UPPER_A = 65;
|
|
19
|
+
const ASCII_UPPER_Z = 90;
|
|
20
|
+
const ASCII_BRACKET_OPEN = 91;
|
|
21
|
+
const ASCII_LOWER_A = 97;
|
|
22
|
+
const ASCII_LOWER_Z = 122;
|
|
23
|
+
const ASCII_UNDERSCORE = 95;
|
|
24
|
+
const HTML_TAG_DENSITY_LIMIT = 5;
|
|
25
|
+
const TITLE_MIN_WORDS = 2;
|
|
26
|
+
const TITLE_MAX_WORDS = 6;
|
|
27
|
+
const TITLE_MIN_CAPITALIZED = 2;
|
|
28
|
+
const PROPERTY_FIX_MAX_PASSES = 3;
|
|
29
|
+
const BODY_SCAN_LIMIT = 5000;
|
|
30
|
+
const HAS_FOLLOWING_LOOKAHEAD = 50;
|
|
5
31
|
const NOISE_SCAN_LIMIT = 50_000;
|
|
6
32
|
const MIN_BODY_CONTENT_LENGTH = 100;
|
|
7
33
|
const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
8
34
|
const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
9
35
|
const ABORT_CHECK_INTERVAL = 500;
|
|
36
|
+
const NODE_FILTER_SHOW_TEXT = 4;
|
|
10
37
|
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
11
38
|
const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
|
|
12
39
|
const NOISE_PATTERNS = [
|
|
@@ -89,6 +116,7 @@ const PROMO_TOKENS_BY_CATEGORY = {
|
|
|
89
116
|
newsletters: ['newsletter', 'subscribe'],
|
|
90
117
|
'social-share': ['share', 'social'],
|
|
91
118
|
};
|
|
119
|
+
// Noise selector configurations
|
|
92
120
|
const BASE_NOISE_SELECTORS = {
|
|
93
121
|
navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
|
|
94
122
|
cookieBanners: '[role="dialog"]',
|
|
@@ -96,7 +124,7 @@ const BASE_NOISE_SELECTORS = {
|
|
|
96
124
|
};
|
|
97
125
|
const NO_MATCH_REGEX = /a^/i;
|
|
98
126
|
let cachedContext;
|
|
99
|
-
let
|
|
127
|
+
let lastContextKey;
|
|
100
128
|
function escapeRegexLiteral(value) {
|
|
101
129
|
return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
102
130
|
}
|
|
@@ -136,7 +164,16 @@ function getPromoMatchers(currentConfig, flags) {
|
|
|
136
164
|
}
|
|
137
165
|
function getContext() {
|
|
138
166
|
const currentConfig = config.noiseRemoval;
|
|
139
|
-
|
|
167
|
+
const contextKey = JSON.stringify({
|
|
168
|
+
locale: config.i18n.locale,
|
|
169
|
+
enabledCategories: currentConfig.enabledCategories,
|
|
170
|
+
extraTokens: currentConfig.extraTokens,
|
|
171
|
+
extraSelectors: currentConfig.extraSelectors,
|
|
172
|
+
aggressiveMode: currentConfig.aggressiveMode,
|
|
173
|
+
preserveSvgCanvas: currentConfig.preserveSvgCanvas,
|
|
174
|
+
weights: currentConfig.weights,
|
|
175
|
+
});
|
|
176
|
+
if (cachedContext !== undefined && lastContextKey === contextKey)
|
|
140
177
|
return cachedContext;
|
|
141
178
|
const enabled = new Set(currentConfig.enabledCategories
|
|
142
179
|
.map((c) => {
|
|
@@ -188,7 +225,7 @@ function getContext() {
|
|
|
188
225
|
baseSelector,
|
|
189
226
|
candidateSelector,
|
|
190
227
|
};
|
|
191
|
-
|
|
228
|
+
lastContextKey = contextKey;
|
|
192
229
|
return cachedContext;
|
|
193
230
|
}
|
|
194
231
|
function isInteractive(element, role) {
|
|
@@ -263,146 +300,120 @@ function removeNodes(nodes) {
|
|
|
263
300
|
}
|
|
264
301
|
}
|
|
265
302
|
}
|
|
266
|
-
|
|
303
|
+
const HIDDEN_STYLE_REGEX = /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i;
|
|
304
|
+
function calculateNavFooterScore(tagName, className, id, role, weights) {
|
|
267
305
|
let score = 0;
|
|
268
|
-
if (ALWAYS_NOISE_TAGS.has(
|
|
306
|
+
if (ALWAYS_NOISE_TAGS.has(tagName))
|
|
269
307
|
score += weights.structural;
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
HEADER_NOISE_PATTERN.test(`${meta.className} ${meta.id}`)) {
|
|
308
|
+
if (tagName === 'header') {
|
|
309
|
+
if ((role && NAVIGATION_ROLES.has(role)) ||
|
|
310
|
+
HEADER_NOISE_PATTERN.test(`${className} ${id}`)) {
|
|
274
311
|
score += weights.structural;
|
|
275
312
|
}
|
|
276
313
|
}
|
|
277
|
-
|
|
278
|
-
if (meta.tagName === 'aside') {
|
|
314
|
+
if (tagName === 'aside') {
|
|
279
315
|
score += weights.structural;
|
|
280
316
|
}
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
if (meta.tagName !== 'aside' || meta.role !== 'complementary') {
|
|
317
|
+
if (role && NAVIGATION_ROLES.has(role)) {
|
|
318
|
+
if (tagName !== 'aside' || role !== 'complementary') {
|
|
284
319
|
score += weights.structural;
|
|
285
320
|
}
|
|
286
321
|
}
|
|
287
322
|
return score;
|
|
288
323
|
}
|
|
289
|
-
function
|
|
324
|
+
function calculatePromoScore(element, className, id, context) {
|
|
325
|
+
if (!context.promoEnabled)
|
|
326
|
+
return 0;
|
|
327
|
+
const aggTest = context.promoMatchers.aggressive.test(className) ||
|
|
328
|
+
context.promoMatchers.aggressive.test(id);
|
|
329
|
+
const isAggressiveMatch = aggTest && !isWithinPrimaryContent(element);
|
|
330
|
+
const isBaseMatch = !aggTest &&
|
|
331
|
+
(context.promoMatchers.base.test(className) ||
|
|
332
|
+
context.promoMatchers.base.test(id));
|
|
333
|
+
return isAggressiveMatch || isBaseMatch ? context.weights.promo : 0;
|
|
334
|
+
}
|
|
335
|
+
function isNoiseElement(element, context) {
|
|
290
336
|
const tagName = element.tagName.toLowerCase();
|
|
291
337
|
const className = element.getAttribute('class') ?? '';
|
|
292
338
|
const id = element.getAttribute('id') ?? '';
|
|
293
339
|
const role = element.getAttribute('role');
|
|
294
340
|
const style = element.getAttribute('style');
|
|
295
|
-
const
|
|
296
|
-
const
|
|
341
|
+
const elIsInteractive = isInteractive(element, role);
|
|
342
|
+
const elIsHidden = element.hasAttribute('hidden') ||
|
|
297
343
|
element.getAttribute('aria-hidden') === 'true' ||
|
|
298
|
-
(style !== null &&
|
|
299
|
-
/\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i.test(style));
|
|
300
|
-
return {
|
|
301
|
-
tagName,
|
|
302
|
-
className,
|
|
303
|
-
id,
|
|
304
|
-
role,
|
|
305
|
-
style,
|
|
306
|
-
isInteractive: _isInteractive,
|
|
307
|
-
isHidden,
|
|
308
|
-
};
|
|
309
|
-
}
|
|
310
|
-
function isNoiseElement(element, context) {
|
|
311
|
-
const meta = extractElementMetadata(element);
|
|
344
|
+
(style !== null && HIDDEN_STYLE_REGEX.test(style));
|
|
312
345
|
let score = 0;
|
|
313
346
|
const { weights } = context;
|
|
314
347
|
// Structural
|
|
315
|
-
if (context.structuralTags.has(
|
|
348
|
+
if (context.structuralTags.has(tagName) && !elIsInteractive) {
|
|
316
349
|
score += weights.structural;
|
|
317
350
|
}
|
|
318
351
|
// Nav/Footer Scoring
|
|
319
352
|
if (context.flags.navFooter) {
|
|
320
|
-
score +=
|
|
353
|
+
score += calculateNavFooterScore(tagName, className, id, role, weights);
|
|
321
354
|
}
|
|
322
355
|
// Hidden
|
|
323
|
-
if (
|
|
356
|
+
if (elIsHidden && !elIsInteractive) {
|
|
324
357
|
score += weights.hidden;
|
|
325
358
|
}
|
|
326
359
|
// Sticky/Fixed
|
|
327
|
-
if (FIXED_OR_HIGH_Z_PATTERN.test(
|
|
360
|
+
if (FIXED_OR_HIGH_Z_PATTERN.test(className)) {
|
|
328
361
|
score += weights.stickyFixed;
|
|
329
362
|
}
|
|
330
363
|
// Promo
|
|
331
|
-
|
|
332
|
-
const aggTest = context.promoMatchers.aggressive.test(meta.className) ||
|
|
333
|
-
context.promoMatchers.aggressive.test(meta.id);
|
|
334
|
-
const isAggressiveMatch = aggTest && !isWithinPrimaryContent(element);
|
|
335
|
-
const isBaseMatch = !aggTest &&
|
|
336
|
-
(context.promoMatchers.base.test(meta.className) ||
|
|
337
|
-
context.promoMatchers.base.test(meta.id));
|
|
338
|
-
if (isAggressiveMatch || isBaseMatch) {
|
|
339
|
-
score += weights.promo;
|
|
340
|
-
}
|
|
341
|
-
}
|
|
364
|
+
score += calculatePromoScore(element, className, id, context);
|
|
342
365
|
return score >= weights.threshold;
|
|
343
366
|
}
|
|
344
|
-
function cleanHeadingWrapperDivs(h) {
|
|
345
|
-
const divs = h.querySelectorAll('div');
|
|
346
|
-
for (let j = divs.length - 1; j >= 0; j--) {
|
|
347
|
-
const d = divs[j];
|
|
348
|
-
if (!d?.parentNode)
|
|
349
|
-
continue;
|
|
350
|
-
const cls = d.getAttribute('class') ?? '';
|
|
351
|
-
const stl = d.getAttribute('style') ?? '';
|
|
352
|
-
if (cls.includes('absolute') ||
|
|
353
|
-
stl.includes('position') ||
|
|
354
|
-
d.getAttribute('tabindex') === '-1') {
|
|
355
|
-
d.remove();
|
|
356
|
-
}
|
|
357
|
-
}
|
|
358
|
-
}
|
|
359
|
-
function cleanHeadingAnchors(h) {
|
|
360
|
-
const anchors = h.querySelectorAll('a');
|
|
361
|
-
for (let j = anchors.length - 1; j >= 0; j--) {
|
|
362
|
-
const a = anchors[j];
|
|
363
|
-
if (!a?.parentNode)
|
|
364
|
-
continue;
|
|
365
|
-
const href = a.getAttribute('href') ?? '';
|
|
366
|
-
const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
|
|
367
|
-
if (href.startsWith('#') && txt.length === 0) {
|
|
368
|
-
a.remove();
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
function cleanHeadingZeroWidth(h, document) {
|
|
373
|
-
const walker = document.createTreeWalker(h, 4); // SHOW_TEXT
|
|
374
|
-
let node;
|
|
375
|
-
while ((node = walker.nextNode())) {
|
|
376
|
-
if (node.textContent?.includes('\u200B')) {
|
|
377
|
-
node.textContent = node.textContent.replace(/\u200B/g, '');
|
|
378
|
-
}
|
|
379
|
-
}
|
|
380
|
-
}
|
|
381
367
|
function cleanHeadings(document) {
|
|
382
|
-
// Clean Heading Anchors
|
|
383
368
|
const headings = document.querySelectorAll('h1,h2,h3,h4,h5,h6');
|
|
384
369
|
for (const h of headings) {
|
|
385
370
|
if (!h.parentNode)
|
|
386
371
|
continue;
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
372
|
+
// Remove absolute/positioned wrapper divs
|
|
373
|
+
const divs = h.querySelectorAll('div');
|
|
374
|
+
for (let j = divs.length - 1; j >= 0; j--) {
|
|
375
|
+
const d = divs[j];
|
|
376
|
+
if (!d?.parentNode)
|
|
377
|
+
continue;
|
|
378
|
+
const cls = d.getAttribute('class') ?? '';
|
|
379
|
+
const stl = d.getAttribute('style') ?? '';
|
|
380
|
+
if (cls.includes('absolute') ||
|
|
381
|
+
stl.includes('position') ||
|
|
382
|
+
d.getAttribute('tabindex') === '-1') {
|
|
383
|
+
d.remove();
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
// Remove empty hash-link anchors
|
|
387
|
+
const anchors = h.querySelectorAll('a');
|
|
388
|
+
for (let j = anchors.length - 1; j >= 0; j--) {
|
|
389
|
+
const a = anchors[j];
|
|
390
|
+
if (!a?.parentNode)
|
|
391
|
+
continue;
|
|
392
|
+
const href = a.getAttribute('href') ?? '';
|
|
393
|
+
const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
|
|
394
|
+
if (href.startsWith('#') && txt.length === 0) {
|
|
395
|
+
a.remove();
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
// Strip zero-width spaces from text nodes
|
|
399
|
+
const walker = document.createTreeWalker(h, NODE_FILTER_SHOW_TEXT);
|
|
400
|
+
let node;
|
|
401
|
+
while ((node = walker.nextNode())) {
|
|
402
|
+
if (node.textContent?.includes('\u200B')) {
|
|
403
|
+
node.textContent = node.textContent.replace(/\u200B/g, '');
|
|
404
|
+
}
|
|
405
|
+
}
|
|
390
406
|
}
|
|
391
407
|
}
|
|
392
408
|
function stripNoise(document, context, signal) {
|
|
393
409
|
cleanHeadings(document);
|
|
394
|
-
//
|
|
410
|
+
// Structural Removal
|
|
395
411
|
const { baseSelector, extraSelectors } = context;
|
|
396
|
-
|
|
397
|
-
const baseNodes = document.querySelectorAll(baseSelector);
|
|
398
|
-
removeNodes(baseNodes);
|
|
399
|
-
// Extra
|
|
412
|
+
removeNodes(document.querySelectorAll(baseSelector));
|
|
400
413
|
if (extraSelectors.length > 0) {
|
|
401
|
-
|
|
402
|
-
const extraNodes = document.querySelectorAll(combinedExtra);
|
|
403
|
-
removeNodes(extraNodes);
|
|
414
|
+
removeNodes(document.querySelectorAll(extraSelectors.join(',')));
|
|
404
415
|
}
|
|
405
|
-
// Candidates
|
|
416
|
+
// Candidates (conditional removal)
|
|
406
417
|
const candidates = document.querySelectorAll(context.candidateSelector);
|
|
407
418
|
for (let i = candidates.length - 1; i >= 0; i--) {
|
|
408
419
|
if (i % ABORT_CHECK_INTERVAL === 0 && signal?.aborted) {
|
|
@@ -570,6 +581,8 @@ export function removeNoiseFromHtml(html, document, baseUrl, signal) {
|
|
|
570
581
|
return html;
|
|
571
582
|
}
|
|
572
583
|
}
|
|
584
|
+
// endregion
|
|
585
|
+
// region Language Detection
|
|
573
586
|
class DetectionContext {
|
|
574
587
|
code;
|
|
575
588
|
_lower;
|
|
@@ -649,10 +662,10 @@ const CSS_PROPERTY_REGEX = /^\s*[a-z][\w-]*\s*:/;
|
|
|
649
662
|
function containsJsxTag(code) {
|
|
650
663
|
const len = code.length;
|
|
651
664
|
for (let i = 0; i < len - 1; i++) {
|
|
652
|
-
if (code.charCodeAt(i) ===
|
|
665
|
+
if (code.charCodeAt(i) === ASCII_LT) {
|
|
653
666
|
const next = code.charCodeAt(i + 1);
|
|
654
|
-
if (next >=
|
|
655
|
-
return true;
|
|
667
|
+
if (next >= ASCII_UPPER_A && next <= ASCII_UPPER_Z)
|
|
668
|
+
return true;
|
|
656
669
|
}
|
|
657
670
|
}
|
|
658
671
|
return false;
|
|
@@ -711,140 +724,100 @@ function detectYamlStructure(lines) {
|
|
|
711
724
|
if (colonIdx <= 0)
|
|
712
725
|
continue;
|
|
713
726
|
const after = trimmed.charCodeAt(colonIdx + 1);
|
|
714
|
-
|
|
715
|
-
|
|
727
|
+
if (after === ASCII_SPACE || after === ASCII_TAB)
|
|
728
|
+
return true;
|
|
729
|
+
}
|
|
730
|
+
return false;
|
|
731
|
+
}
|
|
732
|
+
function matchRust(ctx) {
|
|
733
|
+
if (ctx.lower.includes('let mut'))
|
|
734
|
+
return true;
|
|
735
|
+
if (RUST_REGEX.test(ctx.lower))
|
|
736
|
+
return true;
|
|
737
|
+
return ctx.lower.includes('use ') && ctx.lower.includes('::');
|
|
738
|
+
}
|
|
739
|
+
function matchGo(ctx) {
|
|
740
|
+
if (ctx.lower.includes('import "'))
|
|
741
|
+
return true;
|
|
742
|
+
return /\b(?:package|func)\b/.test(ctx.lower);
|
|
743
|
+
}
|
|
744
|
+
function matchJsx(ctx) {
|
|
745
|
+
const l = ctx.lower;
|
|
746
|
+
if (l.includes('classname=') ||
|
|
747
|
+
l.includes('jsx:') ||
|
|
748
|
+
l.includes("from 'react'") ||
|
|
749
|
+
l.includes('from "react"')) {
|
|
750
|
+
return true;
|
|
751
|
+
}
|
|
752
|
+
return containsJsxTag(ctx.code);
|
|
753
|
+
}
|
|
754
|
+
function matchTypeScript(ctx) {
|
|
755
|
+
if (/\b(?:interface|type)\b/.test(ctx.lower))
|
|
756
|
+
return true;
|
|
757
|
+
const l = ctx.lower;
|
|
758
|
+
for (const hint of TYPESCRIPT_HINTS) {
|
|
759
|
+
if (l.includes(hint))
|
|
760
|
+
return true;
|
|
761
|
+
}
|
|
762
|
+
return false;
|
|
763
|
+
}
|
|
764
|
+
function matchSql(ctx) {
|
|
765
|
+
return /\b(?:select|insert|update|delete|create|alter|drop)\b/.test(ctx.lower);
|
|
766
|
+
}
|
|
767
|
+
function hasJsSignals(lowerCode) {
|
|
768
|
+
return (JS_SIGNAL_REGEX.test(lowerCode) ||
|
|
769
|
+
lowerCode.includes('{') ||
|
|
770
|
+
lowerCode.includes("from '"));
|
|
771
|
+
}
|
|
772
|
+
function matchPython(ctx) {
|
|
773
|
+
const l = ctx.lower;
|
|
774
|
+
if (l.includes('print(') || l.includes('__name__'))
|
|
775
|
+
return true;
|
|
776
|
+
if (l.includes('self.') || l.includes('elif '))
|
|
777
|
+
return true;
|
|
778
|
+
// Check for Python's None/True/False using original case (they are capitalized in Python)
|
|
779
|
+
if (ctx.code.includes('None') ||
|
|
780
|
+
ctx.code.includes('True') ||
|
|
781
|
+
ctx.code.includes('False')) {
|
|
782
|
+
return true;
|
|
783
|
+
}
|
|
784
|
+
if (PYTHON_UNIQUE_REGEX.test(l))
|
|
785
|
+
return true;
|
|
786
|
+
// Shared keywords (import, from, class) — only match if no JS signals present
|
|
787
|
+
if (/\b(?:import|from|class)\b/.test(l) && !hasJsSignals(l)) {
|
|
788
|
+
return true;
|
|
789
|
+
}
|
|
790
|
+
return false;
|
|
791
|
+
}
|
|
792
|
+
function matchHtml(ctx) {
|
|
793
|
+
const l = ctx.lower;
|
|
794
|
+
for (const tag of HTML_TAGS) {
|
|
795
|
+
if (l.includes(tag))
|
|
716
796
|
return true;
|
|
717
797
|
}
|
|
718
798
|
return false;
|
|
719
799
|
}
|
|
800
|
+
// Pre-sorted by weight descending — first match wins in detectLanguageFromCode
|
|
720
801
|
const LANGUAGES = [
|
|
721
|
-
{
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
if (RUST_REGEX.test(ctx.lower))
|
|
728
|
-
return true;
|
|
729
|
-
return ctx.lower.includes('use ') && ctx.lower.includes('::');
|
|
730
|
-
},
|
|
731
|
-
},
|
|
732
|
-
{
|
|
733
|
-
lang: 'go',
|
|
734
|
-
weight: 22,
|
|
735
|
-
match: (ctx) => {
|
|
736
|
-
if (ctx.lower.includes('import "'))
|
|
737
|
-
return true;
|
|
738
|
-
return /\b(?:package|func)\b/.test(ctx.lower);
|
|
739
|
-
},
|
|
740
|
-
},
|
|
741
|
-
{
|
|
742
|
-
lang: 'jsx',
|
|
743
|
-
weight: 22,
|
|
744
|
-
match: (ctx) => {
|
|
745
|
-
const l = ctx.lower;
|
|
746
|
-
if (l.includes('classname=') ||
|
|
747
|
-
l.includes('jsx:') ||
|
|
748
|
-
l.includes("from 'react'") ||
|
|
749
|
-
l.includes('from "react"')) {
|
|
750
|
-
return true;
|
|
751
|
-
}
|
|
752
|
-
return containsJsxTag(ctx.code);
|
|
753
|
-
},
|
|
754
|
-
},
|
|
755
|
-
{
|
|
756
|
-
lang: 'typescript',
|
|
757
|
-
weight: 20,
|
|
758
|
-
match: (ctx) => {
|
|
759
|
-
if (/\b(?:interface|type)\b/.test(ctx.lower))
|
|
760
|
-
return true;
|
|
761
|
-
const l = ctx.lower;
|
|
762
|
-
for (const hint of TYPESCRIPT_HINTS) {
|
|
763
|
-
if (l.includes(hint))
|
|
764
|
-
return true;
|
|
765
|
-
}
|
|
766
|
-
return false;
|
|
767
|
-
},
|
|
768
|
-
},
|
|
769
|
-
{
|
|
770
|
-
lang: 'sql',
|
|
771
|
-
weight: 20,
|
|
772
|
-
match: (ctx) => {
|
|
773
|
-
const l = ctx.lower;
|
|
774
|
-
return /\b(?:select|insert|update|delete|create|alter|drop)\b/.test(l);
|
|
775
|
-
},
|
|
776
|
-
},
|
|
777
|
-
{
|
|
778
|
-
lang: 'python',
|
|
779
|
-
weight: 18,
|
|
780
|
-
match: (ctx) => {
|
|
781
|
-
const l = ctx.lower;
|
|
782
|
-
if (l.includes('print(') || l.includes('__name__'))
|
|
783
|
-
return true;
|
|
784
|
-
if (l.includes('self.') || l.includes('elif '))
|
|
785
|
-
return true;
|
|
786
|
-
// Check for Python's None/True/False using original case (they are capitalized in Python)
|
|
787
|
-
if (ctx.code.includes('None') ||
|
|
788
|
-
ctx.code.includes('True') ||
|
|
789
|
-
ctx.code.includes('False')) {
|
|
790
|
-
return true;
|
|
791
|
-
}
|
|
792
|
-
// Python-unique keywords that JS doesn't have
|
|
793
|
-
if (PYTHON_UNIQUE_REGEX.test(l))
|
|
794
|
-
return true;
|
|
795
|
-
// Shared keywords (import, from, class) — only match if no JS signals present
|
|
796
|
-
if (/\b(?:import|from|class)\b/.test(l) &&
|
|
797
|
-
!JS_SIGNAL_REGEX.test(l) &&
|
|
798
|
-
!l.includes('{') &&
|
|
799
|
-
!l.includes("from '")) {
|
|
800
|
-
return true;
|
|
801
|
-
}
|
|
802
|
-
return false;
|
|
803
|
-
},
|
|
804
|
-
},
|
|
802
|
+
{ lang: 'rust', weight: 25, match: matchRust },
|
|
803
|
+
{ lang: 'go', weight: 22, match: matchGo },
|
|
804
|
+
{ lang: 'jsx', weight: 22, match: matchJsx },
|
|
805
|
+
{ lang: 'typescript', weight: 20, match: matchTypeScript },
|
|
806
|
+
{ lang: 'sql', weight: 20, match: matchSql },
|
|
807
|
+
{ lang: 'python', weight: 18, match: matchPython },
|
|
805
808
|
{
|
|
806
809
|
lang: 'css',
|
|
807
810
|
weight: 18,
|
|
808
|
-
match: (ctx) =>
|
|
809
|
-
if (CSS_REGEX.test(ctx.lower))
|
|
810
|
-
return true;
|
|
811
|
-
return detectCssStructure(ctx.lines);
|
|
812
|
-
},
|
|
813
|
-
},
|
|
814
|
-
{
|
|
815
|
-
lang: 'bash',
|
|
816
|
-
weight: 15,
|
|
817
|
-
match: (ctx) => detectBashIndicators(ctx.lines),
|
|
818
|
-
},
|
|
819
|
-
{
|
|
820
|
-
lang: 'yaml',
|
|
821
|
-
weight: 15,
|
|
822
|
-
match: (ctx) => detectYamlStructure(ctx.lines),
|
|
823
|
-
},
|
|
824
|
-
{
|
|
825
|
-
lang: 'javascript',
|
|
826
|
-
weight: 15,
|
|
827
|
-
match: (ctx) => JS_REGEX.test(ctx.lower),
|
|
828
|
-
},
|
|
829
|
-
{
|
|
830
|
-
lang: 'html',
|
|
831
|
-
weight: 12,
|
|
832
|
-
match: (ctx) => {
|
|
833
|
-
const l = ctx.lower;
|
|
834
|
-
for (const tag of HTML_TAGS) {
|
|
835
|
-
if (l.includes(tag))
|
|
836
|
-
return true;
|
|
837
|
-
}
|
|
838
|
-
return false;
|
|
839
|
-
},
|
|
811
|
+
match: (ctx) => CSS_REGEX.test(ctx.lower) || detectCssStructure(ctx.lines),
|
|
840
812
|
},
|
|
813
|
+
{ lang: 'bash', weight: 15, match: (ctx) => detectBashIndicators(ctx.lines) },
|
|
814
|
+
{ lang: 'yaml', weight: 15, match: (ctx) => detectYamlStructure(ctx.lines) },
|
|
815
|
+
{ lang: 'javascript', weight: 15, match: (ctx) => JS_REGEX.test(ctx.lower) },
|
|
816
|
+
{ lang: 'html', weight: 12, match: matchHtml },
|
|
841
817
|
{
|
|
842
818
|
lang: 'json',
|
|
843
819
|
weight: 10,
|
|
844
|
-
match: (ctx) => {
|
|
845
|
-
const s = ctx.trimmedStart;
|
|
846
|
-
return s.startsWith('{') || s.startsWith('[');
|
|
847
|
-
},
|
|
820
|
+
match: (ctx) => ctx.trimmedStart.startsWith('{') || ctx.trimmedStart.startsWith('['),
|
|
848
821
|
},
|
|
849
822
|
];
|
|
850
823
|
function extractLanguageFromClassName(className) {
|
|
@@ -880,11 +853,10 @@ function resolveLanguageFromDataAttribute(dataLang) {
|
|
|
880
853
|
// Check if \w+
|
|
881
854
|
for (let i = 0; i < trimmed.length; i++) {
|
|
882
855
|
const c = trimmed.charCodeAt(i);
|
|
883
|
-
|
|
884
|
-
const
|
|
885
|
-
const
|
|
886
|
-
const
|
|
887
|
-
const isUnder = c === 95;
|
|
856
|
+
const isUpper = c >= ASCII_UPPER_A && c <= ASCII_UPPER_Z;
|
|
857
|
+
const isLower = c >= ASCII_LOWER_A && c <= ASCII_LOWER_Z;
|
|
858
|
+
const isDigit = c >= ASCII_DIGIT_0 && c <= ASCII_DIGIT_9;
|
|
859
|
+
const isUnder = c === ASCII_UNDERSCORE;
|
|
888
860
|
if (!isUpper && !isLower && !isDigit && !isUnder) {
|
|
889
861
|
return undefined;
|
|
890
862
|
}
|
|
@@ -901,7 +873,7 @@ export function detectLanguageFromCode(code) {
|
|
|
901
873
|
// Fast path for empty/whitespace only
|
|
902
874
|
let empty = true;
|
|
903
875
|
for (let i = 0; i < code.length; i++) {
|
|
904
|
-
if (code.charCodeAt(i) >
|
|
876
|
+
if (code.charCodeAt(i) > ASCII_SPACE) {
|
|
905
877
|
empty = false;
|
|
906
878
|
break;
|
|
907
879
|
}
|
|
@@ -909,20 +881,15 @@ export function detectLanguageFromCode(code) {
|
|
|
909
881
|
if (empty)
|
|
910
882
|
return undefined;
|
|
911
883
|
const ctx = new DetectionContext(code);
|
|
912
|
-
|
|
913
|
-
let bestScore = -1;
|
|
884
|
+
// LANGUAGES is pre-sorted by weight descending — first match is highest confidence
|
|
914
885
|
for (const def of LANGUAGES) {
|
|
915
|
-
if (def.match(ctx))
|
|
916
|
-
|
|
917
|
-
bestScore = def.weight;
|
|
918
|
-
bestLang = def.lang;
|
|
919
|
-
if (bestScore >= 25)
|
|
920
|
-
break;
|
|
921
|
-
}
|
|
922
|
-
}
|
|
886
|
+
if (def.match(ctx))
|
|
887
|
+
return def.lang;
|
|
923
888
|
}
|
|
924
|
-
return
|
|
889
|
+
return undefined;
|
|
925
890
|
}
|
|
891
|
+
// endregion
|
|
892
|
+
// region Markdown Cleanup
|
|
926
893
|
const MAX_LINE_LENGTH = 80;
|
|
927
894
|
const REGEX = {
|
|
928
895
|
HEADING_MARKER: /^#{1,6}\s/m,
|
|
@@ -975,7 +942,7 @@ function isBlank(line) {
|
|
|
975
942
|
}
|
|
976
943
|
function hasFollowingContent(lines, startIndex) {
|
|
977
944
|
// Optimization: Bound lookahead to avoid checking too many lines in huge files
|
|
978
|
-
for (let i = startIndex + 1; i < Math.min(lines.length, startIndex +
|
|
945
|
+
for (let i = startIndex + 1; i < Math.min(lines.length, startIndex + HAS_FOLLOWING_LOOKAHEAD); i++) {
|
|
979
946
|
if (!isBlank(lines[i]))
|
|
980
947
|
return true;
|
|
981
948
|
}
|
|
@@ -994,7 +961,7 @@ function isTitleCaseOrKeyword(trimmed) {
|
|
|
994
961
|
// Split limited number of words
|
|
995
962
|
const words = trimmed.split(/\s+/);
|
|
996
963
|
const len = words.length;
|
|
997
|
-
if (len <
|
|
964
|
+
if (len < TITLE_MIN_WORDS || len > TITLE_MAX_WORDS)
|
|
998
965
|
return false;
|
|
999
966
|
let capitalizedCount = 0;
|
|
1000
967
|
for (let i = 0; i < len; i++) {
|
|
@@ -1007,20 +974,19 @@ function isTitleCaseOrKeyword(trimmed) {
|
|
|
1007
974
|
else if (!/^(?:and|or|the|of|in|for|to|a)$/i.test(w))
|
|
1008
975
|
return false;
|
|
1009
976
|
}
|
|
1010
|
-
return capitalizedCount >=
|
|
977
|
+
return capitalizedCount >= TITLE_MIN_CAPITALIZED;
|
|
1011
978
|
}
|
|
1012
979
|
function getHeadingPrefix(trimmed) {
|
|
1013
980
|
if (trimmed.length > MAX_LINE_LENGTH)
|
|
1014
981
|
return null;
|
|
1015
982
|
// Fast path: Check common markdown markers first
|
|
1016
983
|
const firstChar = trimmed.charCodeAt(0);
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
firstChar ===
|
|
1020
|
-
firstChar ===
|
|
1021
|
-
firstChar ===
|
|
1022
|
-
firstChar
|
|
1023
|
-
(firstChar >= 48 && firstChar <= 57)) {
|
|
984
|
+
if (firstChar === ASCII_HASH ||
|
|
985
|
+
firstChar === ASCII_DASH ||
|
|
986
|
+
firstChar === ASCII_ASTERISK ||
|
|
987
|
+
firstChar === ASCII_PLUS ||
|
|
988
|
+
firstChar === ASCII_BRACKET_OPEN ||
|
|
989
|
+
(firstChar >= ASCII_DIGIT_0 && firstChar <= ASCII_DIGIT_9)) {
|
|
1024
990
|
if (REGEX.HEADING_MARKER.test(trimmed) ||
|
|
1025
991
|
REGEX.LIST_MARKER.test(trimmed) ||
|
|
1026
992
|
/^\d+\.\s/.test(trimmed) ||
|
|
@@ -1032,8 +998,9 @@ function getHeadingPrefix(trimmed) {
|
|
|
1032
998
|
return /^example:\s/i.test(trimmed) ? '### ' : '## ';
|
|
1033
999
|
}
|
|
1034
1000
|
const lastChar = trimmed.charCodeAt(trimmed.length - 1);
|
|
1035
|
-
|
|
1036
|
-
|
|
1001
|
+
if (lastChar === ASCII_PERIOD ||
|
|
1002
|
+
lastChar === ASCII_EXCLAMATION ||
|
|
1003
|
+
lastChar === ASCII_QUESTION)
|
|
1037
1004
|
return null;
|
|
1038
1005
|
return isTitleCaseOrKeyword(trimmed) ? '## ' : null;
|
|
1039
1006
|
}
|
|
@@ -1148,33 +1115,20 @@ function processTextBuffer(lines, options) {
|
|
|
1148
1115
|
const text = preprocessLines(lines, options);
|
|
1149
1116
|
return applyGlobalRegexes(text, options);
|
|
1150
1117
|
}
|
|
1151
|
-
function
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
.replace(REGEX.
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
.filter((line) => !isTypeDocArtifactLine(line))
|
|
1166
|
-
.join('\n');
|
|
1167
|
-
result = result.replace(REGEX.TYPEDOC_COMMENT, (match) => match.startsWith('`') ? match : '');
|
|
1168
|
-
}
|
|
1169
|
-
if (config.markdownCleanup.removeSkipLinks) {
|
|
1170
|
-
checkAbort('markdown:cleanup:skip-links');
|
|
1171
|
-
result = result
|
|
1172
|
-
.replace(REGEX.ZERO_WIDTH_ANCHOR, '')
|
|
1173
|
-
.replace(REGEX.COMBINED_LINE_REMOVALS, '');
|
|
1174
|
-
}
|
|
1175
|
-
checkAbort('markdown:cleanup:spacing');
|
|
1176
|
-
// normalizeSpacing
|
|
1177
|
-
result = result
|
|
1118
|
+
function removeTypeDocArtifacts(text) {
|
|
1119
|
+
const filtered = text
|
|
1120
|
+
.split('\n')
|
|
1121
|
+
.filter((line) => !isTypeDocArtifactLine(line))
|
|
1122
|
+
.join('\n');
|
|
1123
|
+
return filtered.replace(REGEX.TYPEDOC_COMMENT, (match) => match.startsWith('`') ? match : '');
|
|
1124
|
+
}
|
|
1125
|
+
function removeSkipLinks(text) {
|
|
1126
|
+
return text
|
|
1127
|
+
.replace(REGEX.ZERO_WIDTH_ANCHOR, '')
|
|
1128
|
+
.replace(REGEX.COMBINED_LINE_REMOVALS, '');
|
|
1129
|
+
}
|
|
1130
|
+
function normalizeMarkdownSpacing(text) {
|
|
1131
|
+
let result = text
|
|
1178
1132
|
.replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
|
|
1179
1133
|
.replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
|
|
1180
1134
|
.replace(REGEX.SPACING_CODE_DASH, '$1 - ')
|
|
@@ -1186,10 +1140,12 @@ function applyGlobalRegexes(text, options) {
|
|
|
1186
1140
|
result = result.replace(/(?<=\s|^)`\s+([^`]+)`/gm, '`$1`');
|
|
1187
1141
|
// Unescape backticks inside markdown link text
|
|
1188
1142
|
result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
|
|
1189
|
-
result =
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1143
|
+
result = result.replace(/\[([^\]]*<[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/</g, '\\<').replace(/>/g, '\\>')}](${url})`);
|
|
1144
|
+
return normalizeNestedListIndentation(result);
|
|
1145
|
+
}
|
|
1146
|
+
function fixConcatenatedProperties(text) {
|
|
1147
|
+
let result = text;
|
|
1148
|
+
for (let k = 0; k < PROPERTY_FIX_MAX_PASSES; k++) {
|
|
1193
1149
|
const next = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
|
|
1194
1150
|
if (next === result)
|
|
1195
1151
|
break;
|
|
@@ -1197,6 +1153,26 @@ function applyGlobalRegexes(text, options) {
|
|
|
1197
1153
|
}
|
|
1198
1154
|
return result;
|
|
1199
1155
|
}
|
|
1156
|
+
function applyGlobalRegexes(text, options) {
|
|
1157
|
+
const checkAbort = createAbortChecker(options);
|
|
1158
|
+
let result = text.replace(/\u00A0/g, ' ');
|
|
1159
|
+
checkAbort('markdown:cleanup:headings');
|
|
1160
|
+
result = result
|
|
1161
|
+
.replace(REGEX.HEADING_SPACING, '$1\n\n$2')
|
|
1162
|
+
.replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```');
|
|
1163
|
+
if (config.markdownCleanup.removeTypeDocComments) {
|
|
1164
|
+
checkAbort('markdown:cleanup:typedoc');
|
|
1165
|
+
result = removeTypeDocArtifacts(result);
|
|
1166
|
+
}
|
|
1167
|
+
if (config.markdownCleanup.removeSkipLinks) {
|
|
1168
|
+
checkAbort('markdown:cleanup:skip-links');
|
|
1169
|
+
result = removeSkipLinks(result);
|
|
1170
|
+
}
|
|
1171
|
+
checkAbort('markdown:cleanup:spacing');
|
|
1172
|
+
result = normalizeMarkdownSpacing(result);
|
|
1173
|
+
checkAbort('markdown:cleanup:properties');
|
|
1174
|
+
return fixConcatenatedProperties(result);
|
|
1175
|
+
}
|
|
1200
1176
|
function normalizeNestedListIndentation(text) {
|
|
1201
1177
|
return text.replace(REGEX.NESTED_LIST_INDENT, (match, spaces, marker) => {
|
|
1202
1178
|
const count = spaces.length;
|
|
@@ -1211,27 +1187,17 @@ export function cleanupMarkdownArtifacts(content, options) {
|
|
|
1211
1187
|
return '';
|
|
1212
1188
|
const checkAbort = createAbortChecker(options);
|
|
1213
1189
|
checkAbort('markdown:cleanup:begin');
|
|
1214
|
-
const
|
|
1215
|
-
let lastIndex = 0;
|
|
1190
|
+
const lines = content.split(/\r?\n/);
|
|
1216
1191
|
let fenceMarker = null;
|
|
1217
1192
|
const segments = [];
|
|
1218
1193
|
let buffer = [];
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
line = content.slice(lastIndex);
|
|
1224
|
-
nextIndex = len;
|
|
1225
|
-
}
|
|
1226
|
-
else {
|
|
1227
|
-
if (nextIndex > lastIndex && content.charCodeAt(nextIndex - 1) === 13) {
|
|
1228
|
-
line = content.slice(lastIndex, nextIndex - 1);
|
|
1229
|
-
}
|
|
1230
|
-
else {
|
|
1231
|
-
line = content.slice(lastIndex, nextIndex);
|
|
1232
|
-
}
|
|
1233
|
-
nextIndex++; // Skip \n
|
|
1194
|
+
const flushBuffer = () => {
|
|
1195
|
+
if (buffer.length > 0) {
|
|
1196
|
+
segments.push(processTextBuffer(buffer, options));
|
|
1197
|
+
buffer = [];
|
|
1234
1198
|
}
|
|
1199
|
+
};
|
|
1200
|
+
for (const line of lines) {
|
|
1235
1201
|
const trimmed = line.trimStart();
|
|
1236
1202
|
if (fenceMarker) {
|
|
1237
1203
|
segments.push(line);
|
|
@@ -1247,22 +1213,16 @@ export function cleanupMarkdownArtifacts(content, options) {
|
|
|
1247
1213
|
buffer.push(line);
|
|
1248
1214
|
}
|
|
1249
1215
|
else {
|
|
1250
|
-
|
|
1251
|
-
segments.push(processTextBuffer(buffer, options));
|
|
1252
|
-
buffer = [];
|
|
1253
|
-
}
|
|
1216
|
+
flushBuffer();
|
|
1254
1217
|
segments.push(line);
|
|
1255
1218
|
fenceMarker = newMarker;
|
|
1256
1219
|
}
|
|
1257
1220
|
}
|
|
1258
|
-
lastIndex = nextIndex;
|
|
1259
|
-
}
|
|
1260
|
-
if (buffer.length > 0) {
|
|
1261
|
-
segments.push(processTextBuffer(buffer, options));
|
|
1262
1221
|
}
|
|
1222
|
+
flushBuffer();
|
|
1263
1223
|
return segments.join('\n').trim();
|
|
1264
1224
|
}
|
|
1265
|
-
function
|
|
1225
|
+
function parseFrontmatter(content) {
|
|
1266
1226
|
const len = content.length;
|
|
1267
1227
|
if (len < 4)
|
|
1268
1228
|
return null;
|
|
@@ -1282,57 +1242,43 @@ function detectFrontmatter(content) {
|
|
|
1282
1242
|
const closeIndex = content.indexOf(fence, fenceLen);
|
|
1283
1243
|
if (closeIndex === -1)
|
|
1284
1244
|
return null;
|
|
1285
|
-
|
|
1245
|
+
const range = {
|
|
1286
1246
|
start: 0,
|
|
1287
1247
|
end: closeIndex + fenceLen,
|
|
1288
1248
|
linesStart: fenceLen,
|
|
1289
1249
|
linesEnd: closeIndex,
|
|
1290
1250
|
lineEnding,
|
|
1291
1251
|
};
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
const
|
|
1295
|
-
const idx = trimmed.indexOf(':');
|
|
1296
|
-
if (!trimmed || idx <= 0)
|
|
1297
|
-
return null;
|
|
1298
|
-
return {
|
|
1299
|
-
key: trimmed.slice(0, idx).trim().toLowerCase(),
|
|
1300
|
-
value: trimmed.slice(idx + 1).trim(),
|
|
1301
|
-
};
|
|
1302
|
-
}
|
|
1303
|
-
function stripFrontmatterQuotes(val) {
|
|
1304
|
-
const first = val.charAt(0);
|
|
1305
|
-
const last = val.charAt(val.length - 1);
|
|
1306
|
-
if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
|
|
1307
|
-
return val.slice(1, -1).trim();
|
|
1308
|
-
}
|
|
1309
|
-
return val;
|
|
1310
|
-
}
|
|
1311
|
-
function scanFrontmatterForTitle(content, fm) {
|
|
1312
|
-
const fmBody = content.slice(fm.linesStart, fm.linesEnd);
|
|
1252
|
+
// Parse key-value entries in one pass
|
|
1253
|
+
const entries = new Map();
|
|
1254
|
+
const fmBody = content.slice(range.linesStart, range.linesEnd);
|
|
1313
1255
|
let lastIdx = 0;
|
|
1314
1256
|
while (lastIdx < fmBody.length) {
|
|
1315
|
-
let nextIdx = fmBody.indexOf(
|
|
1257
|
+
let nextIdx = fmBody.indexOf(lineEnding, lastIdx);
|
|
1316
1258
|
if (nextIdx === -1)
|
|
1317
1259
|
nextIdx = fmBody.length;
|
|
1318
|
-
const line = fmBody.slice(lastIdx, nextIdx);
|
|
1319
|
-
const
|
|
1320
|
-
if (
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1260
|
+
const line = fmBody.slice(lastIdx, nextIdx).trim();
|
|
1261
|
+
const colonIdx = line.indexOf(':');
|
|
1262
|
+
if (line && colonIdx > 0) {
|
|
1263
|
+
const key = line.slice(0, colonIdx).trim().toLowerCase();
|
|
1264
|
+
let value = line.slice(colonIdx + 1).trim();
|
|
1265
|
+
// Strip surrounding quotes
|
|
1266
|
+
const first = value.charAt(0);
|
|
1267
|
+
const last = value.charAt(value.length - 1);
|
|
1268
|
+
if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
|
|
1269
|
+
value = value.slice(1, -1).trim();
|
|
1325
1270
|
}
|
|
1271
|
+
if (value)
|
|
1272
|
+
entries.set(key, value);
|
|
1326
1273
|
}
|
|
1327
|
-
lastIdx = nextIdx +
|
|
1274
|
+
lastIdx = nextIdx + lineEnding.length;
|
|
1328
1275
|
}
|
|
1329
|
-
return
|
|
1276
|
+
return { range, entries };
|
|
1330
1277
|
}
|
|
1331
1278
|
function scanBodyForTitle(content) {
|
|
1332
1279
|
const len = content.length;
|
|
1333
1280
|
let scanIndex = 0;
|
|
1334
|
-
const
|
|
1335
|
-
const maxScan = Math.min(len, LIMIT);
|
|
1281
|
+
const maxScan = Math.min(len, BODY_SCAN_LIMIT);
|
|
1336
1282
|
while (scanIndex < maxScan) {
|
|
1337
1283
|
let nextIndex = content.indexOf('\n', scanIndex);
|
|
1338
1284
|
if (nextIndex === -1)
|
|
@@ -1352,16 +1298,16 @@ function scanBodyForTitle(content) {
|
|
|
1352
1298
|
return undefined;
|
|
1353
1299
|
}
|
|
1354
1300
|
/**
 * Extracts a title from raw markdown.
 *
 * When `parseFrontmatter` finds a frontmatter block, its `title` entry is
 * preferred and `name` is used when `title` is absent. If neither gives a
 * usable value, or there is no frontmatter, the result of
 * `scanBodyForTitle` is returned instead.
 *
 * @param {string} content - Raw markdown text.
 * @returns {string | undefined} The frontmatter title, or whatever
 *   `scanBodyForTitle` returns for the content.
 */
export function extractTitleFromRawMarkdown(content) {
    const frontmatter = parseFrontmatter(content);
    if (!frontmatter) {
        return scanBodyForTitle(content);
    }

    const { entries } = frontmatter;
    let candidate = entries.get('title');
    if (candidate === null || candidate === undefined) {
        candidate = entries.get('name');
    }

    // An empty or missing value falls back to scanning the body.
    return candidate ? candidate : scanBodyForTitle(content);
}
|
|
1363
1309
|
export function addSourceToMarkdown(content, url) {
|
|
1364
|
-
const fm =
|
|
1310
|
+
const fm = parseFrontmatter(content);
|
|
1365
1311
|
const useMarkdownFormat = config.transform.metadataFormat === 'markdown';
|
|
1366
1312
|
if (useMarkdownFormat && !fm) {
|
|
1367
1313
|
if (REGEX.SOURCE_KEY.test(content))
|
|
@@ -1382,13 +1328,17 @@ export function addSourceToMarkdown(content, url) {
|
|
|
1382
1328
|
const escapedUrl = url.replace(/"/g, '\\"');
|
|
1383
1329
|
return `---${lineEnding}source: "${escapedUrl}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
|
|
1384
1330
|
}
|
|
1385
|
-
const fmBody = content.slice(fm.linesStart, fm.linesEnd);
|
|
1331
|
+
const fmBody = content.slice(fm.range.linesStart, fm.range.linesEnd);
|
|
1386
1332
|
if (REGEX.SOURCE_KEY.test(fmBody))
|
|
1387
1333
|
return content;
|
|
1388
1334
|
const escapedUrl = url.replace(/"/g, '\\"');
|
|
1389
|
-
const injection = `source: "${escapedUrl}"${fm.lineEnding}`;
|
|
1390
|
-
return content.slice(0, fm.linesEnd) +
|
|
1335
|
+
const injection = `source: "${escapedUrl}"${fm.range.lineEnding}`;
|
|
1336
|
+
return (content.slice(0, fm.range.linesEnd) +
|
|
1337
|
+
injection +
|
|
1338
|
+
content.slice(fm.range.linesEnd));
|
|
1391
1339
|
}
|
|
1340
|
+
// endregion
|
|
1341
|
+
// region Content Detection & Metadata Footer
|
|
1392
1342
|
function countCommonTags(content, limit) {
|
|
1393
1343
|
if (limit <= 0)
|
|
1394
1344
|
return 0;
|
|
@@ -1405,10 +1355,10 @@ export function isRawTextContent(content) {
|
|
|
1405
1355
|
const trimmed = content.trim();
|
|
1406
1356
|
if (REGEX.HTML_DOC_START.test(trimmed))
|
|
1407
1357
|
return false;
|
|
1408
|
-
if (
|
|
1358
|
+
if (parseFrontmatter(trimmed) !== null)
|
|
1409
1359
|
return true;
|
|
1410
|
-
const tagCount = countCommonTags(content,
|
|
1411
|
-
if (tagCount >
|
|
1360
|
+
const tagCount = countCommonTags(content, HTML_TAG_DENSITY_LIMIT);
|
|
1361
|
+
if (tagCount > HTML_TAG_DENSITY_LIMIT)
|
|
1412
1362
|
return false;
|
|
1413
1363
|
return (REGEX.HEADING_MARKER.test(content) ||
|
|
1414
1364
|
REGEX.LIST_MARKER.test(content) ||
|
|
@@ -1446,3 +1396,4 @@ export function buildMetadataFooter(metadata, fallbackUrl) {
|
|
|
1446
1396
|
lines.push(` <sub>${metadata.description}</sub>`);
|
|
1447
1397
|
return lines.join('\n');
|
|
1448
1398
|
}
|
|
1399
|
+
// endregion
|