@j0hanz/fetch-url-mcp 1.9.0 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/http/auth.d.ts +0 -1
- package/dist/http/auth.d.ts.map +1 -1
- package/dist/http/auth.js +1 -13
- package/dist/http/native.d.ts.map +1 -1
- package/dist/http/native.js +2 -5
- package/dist/lib/content.d.ts.map +1 -1
- package/dist/lib/content.js +378 -346
- package/dist/lib/core.d.ts +78 -71
- package/dist/lib/core.d.ts.map +1 -1
- package/dist/lib/core.js +308 -372
- package/dist/lib/fetch-pipeline.d.ts +2 -6
- package/dist/lib/fetch-pipeline.d.ts.map +1 -1
- package/dist/lib/fetch-pipeline.js +51 -137
- package/dist/lib/http.d.ts.map +1 -1
- package/dist/lib/http.js +188 -130
- package/dist/lib/mcp-tools.d.ts +3 -5
- package/dist/lib/mcp-tools.d.ts.map +1 -1
- package/dist/lib/mcp-tools.js +22 -58
- package/dist/lib/task-handlers.js +4 -4
- package/dist/lib/utils.d.ts +6 -0
- package/dist/lib/utils.d.ts.map +1 -1
- package/dist/lib/utils.js +23 -0
- package/dist/resources/index.js +1 -1
- package/dist/schemas.d.ts +0 -1
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +4 -6
- package/dist/server.js +1 -1
- package/dist/tasks/owner.d.ts +1 -1
- package/dist/tasks/owner.d.ts.map +1 -1
- package/dist/tasks/tool-registry.d.ts +1 -1
- package/dist/tasks/tool-registry.d.ts.map +1 -1
- package/dist/tools/fetch-url.d.ts +2 -3
- package/dist/tools/fetch-url.d.ts.map +1 -1
- package/dist/tools/fetch-url.js +89 -152
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +1 -23
- package/dist/transform/metadata.d.ts +1 -0
- package/dist/transform/metadata.d.ts.map +1 -1
- package/dist/transform/metadata.js +25 -0
- package/dist/transform/transform.d.ts +8 -0
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +190 -109
- package/dist/transform/worker-pool.d.ts +3 -6
- package/dist/transform/worker-pool.d.ts.map +1 -1
- package/dist/transform/worker-pool.js +148 -118
- package/package.json +3 -2
package/dist/lib/content.js
CHANGED
|
@@ -2,11 +2,38 @@ import { parseHTML } from 'linkedom';
|
|
|
2
2
|
import {} from '../transform/types.js';
|
|
3
3
|
import { config, logDebug } from './core.js';
|
|
4
4
|
import { throwIfAborted } from './utils.js';
|
|
5
|
+
// ASCII char codes used in hot-path charCodeAt comparisons
|
|
6
|
+
const ASCII_SPACE = 32;
|
|
7
|
+
const ASCII_TAB = 9;
|
|
8
|
+
const ASCII_EXCLAMATION = 33;
|
|
9
|
+
const ASCII_HASH = 35;
|
|
10
|
+
const ASCII_ASTERISK = 42;
|
|
11
|
+
const ASCII_PLUS = 43;
|
|
12
|
+
const ASCII_DASH = 45;
|
|
13
|
+
const ASCII_PERIOD = 46;
|
|
14
|
+
const ASCII_DIGIT_0 = 48;
|
|
15
|
+
const ASCII_DIGIT_9 = 57;
|
|
16
|
+
const ASCII_LT = 60;
|
|
17
|
+
const ASCII_QUESTION = 63;
|
|
18
|
+
const ASCII_UPPER_A = 65;
|
|
19
|
+
const ASCII_UPPER_Z = 90;
|
|
20
|
+
const ASCII_BRACKET_OPEN = 91;
|
|
21
|
+
const ASCII_LOWER_A = 97;
|
|
22
|
+
const ASCII_LOWER_Z = 122;
|
|
23
|
+
const ASCII_UNDERSCORE = 95;
|
|
24
|
+
const HTML_TAG_DENSITY_LIMIT = 5;
|
|
25
|
+
const TITLE_MIN_WORDS = 2;
|
|
26
|
+
const TITLE_MAX_WORDS = 6;
|
|
27
|
+
const TITLE_MIN_CAPITALIZED = 2;
|
|
28
|
+
const PROPERTY_FIX_MAX_PASSES = 3;
|
|
29
|
+
const BODY_SCAN_LIMIT = 5000;
|
|
30
|
+
const HAS_FOLLOWING_LOOKAHEAD = 50;
|
|
5
31
|
const NOISE_SCAN_LIMIT = 50_000;
|
|
6
32
|
const MIN_BODY_CONTENT_LENGTH = 100;
|
|
7
33
|
const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
8
34
|
const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
9
35
|
const ABORT_CHECK_INTERVAL = 500;
|
|
36
|
+
const NODE_FILTER_SHOW_TEXT = 4;
|
|
10
37
|
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
11
38
|
const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
|
|
12
39
|
const NOISE_PATTERNS = [
|
|
@@ -89,6 +116,7 @@ const PROMO_TOKENS_BY_CATEGORY = {
|
|
|
89
116
|
newsletters: ['newsletter', 'subscribe'],
|
|
90
117
|
'social-share': ['share', 'social'],
|
|
91
118
|
};
|
|
119
|
+
// Noise selector configurations
|
|
92
120
|
const BASE_NOISE_SELECTORS = {
|
|
93
121
|
navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
|
|
94
122
|
cookieBanners: '[role="dialog"]',
|
|
@@ -96,7 +124,7 @@ const BASE_NOISE_SELECTORS = {
|
|
|
96
124
|
};
|
|
97
125
|
const NO_MATCH_REGEX = /a^/i;
|
|
98
126
|
let cachedContext;
|
|
99
|
-
let
|
|
127
|
+
let lastContextKey;
|
|
100
128
|
function escapeRegexLiteral(value) {
|
|
101
129
|
return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
102
130
|
}
|
|
@@ -136,7 +164,16 @@ function getPromoMatchers(currentConfig, flags) {
|
|
|
136
164
|
}
|
|
137
165
|
function getContext() {
|
|
138
166
|
const currentConfig = config.noiseRemoval;
|
|
139
|
-
|
|
167
|
+
const contextKey = JSON.stringify({
|
|
168
|
+
locale: config.i18n.locale,
|
|
169
|
+
enabledCategories: currentConfig.enabledCategories,
|
|
170
|
+
extraTokens: currentConfig.extraTokens,
|
|
171
|
+
extraSelectors: currentConfig.extraSelectors,
|
|
172
|
+
aggressiveMode: currentConfig.aggressiveMode,
|
|
173
|
+
preserveSvgCanvas: currentConfig.preserveSvgCanvas,
|
|
174
|
+
weights: currentConfig.weights,
|
|
175
|
+
});
|
|
176
|
+
if (cachedContext !== undefined && lastContextKey === contextKey)
|
|
140
177
|
return cachedContext;
|
|
141
178
|
const enabled = new Set(currentConfig.enabledCategories
|
|
142
179
|
.map((c) => {
|
|
@@ -188,14 +225,15 @@ function getContext() {
|
|
|
188
225
|
baseSelector,
|
|
189
226
|
candidateSelector,
|
|
190
227
|
};
|
|
191
|
-
|
|
228
|
+
lastContextKey = contextKey;
|
|
192
229
|
return cachedContext;
|
|
193
230
|
}
|
|
194
231
|
function isInteractive(element, role) {
|
|
195
232
|
if (role && INTERACTIVE_CONTENT_ROLES.has(role))
|
|
196
233
|
return true;
|
|
234
|
+
const tag = element.tagName.toLowerCase();
|
|
197
235
|
const ds = element.getAttribute('data-state');
|
|
198
|
-
if (ds === 'inactive' || ds === 'closed')
|
|
236
|
+
if ((ds === 'inactive' || ds === 'closed') && !BASE_STRUCTURAL_TAGS.has(tag))
|
|
199
237
|
return true;
|
|
200
238
|
const dataOrientation = element.getAttribute('data-orientation');
|
|
201
239
|
if (dataOrientation === 'horizontal' || dataOrientation === 'vertical')
|
|
@@ -215,6 +253,19 @@ function isWithinPrimaryContent(element) {
|
|
|
215
253
|
}
|
|
216
254
|
return false;
|
|
217
255
|
}
|
|
256
|
+
const ASIDE_NAV_LINK_DENSITY_THRESHOLD = 0.5;
|
|
257
|
+
const ASIDE_NAV_MIN_LINKS = 10;
|
|
258
|
+
function isNavigationAside(element) {
|
|
259
|
+
if (element.querySelector('nav'))
|
|
260
|
+
return true;
|
|
261
|
+
const links = element.querySelectorAll('a[href]');
|
|
262
|
+
if (links.length < ASIDE_NAV_MIN_LINKS)
|
|
263
|
+
return false;
|
|
264
|
+
const textLen = (element.textContent || '').trim().length;
|
|
265
|
+
if (textLen === 0)
|
|
266
|
+
return true;
|
|
267
|
+
return links.length / (textLen / 100) >= ASIDE_NAV_LINK_DENSITY_THRESHOLD;
|
|
268
|
+
}
|
|
218
269
|
function shouldPreserve(element, tagName) {
|
|
219
270
|
// Check Dialog
|
|
220
271
|
const role = element.getAttribute('role');
|
|
@@ -233,6 +284,12 @@ function shouldPreserve(element, tagName) {
|
|
|
233
284
|
return ((element.textContent || '').trim().length >=
|
|
234
285
|
NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION);
|
|
235
286
|
}
|
|
287
|
+
// Check Aside — preserve only if it looks like article content, not navigation
|
|
288
|
+
if (tagName === 'aside') {
|
|
289
|
+
if (!isWithinPrimaryContent(element))
|
|
290
|
+
return false;
|
|
291
|
+
return !isNavigationAside(element);
|
|
292
|
+
}
|
|
236
293
|
return false;
|
|
237
294
|
}
|
|
238
295
|
function removeNodes(nodes) {
|
|
@@ -243,142 +300,120 @@ function removeNodes(nodes) {
|
|
|
243
300
|
}
|
|
244
301
|
}
|
|
245
302
|
}
|
|
246
|
-
|
|
303
|
+
const HIDDEN_STYLE_REGEX = /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i;
|
|
304
|
+
function calculateNavFooterScore(tagName, className, id, role, weights) {
|
|
247
305
|
let score = 0;
|
|
248
|
-
if (ALWAYS_NOISE_TAGS.has(
|
|
306
|
+
if (ALWAYS_NOISE_TAGS.has(tagName))
|
|
249
307
|
score += weights.structural;
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
HEADER_NOISE_PATTERN.test(`${meta.className} ${meta.id}`)) {
|
|
308
|
+
if (tagName === 'header') {
|
|
309
|
+
if ((role && NAVIGATION_ROLES.has(role)) ||
|
|
310
|
+
HEADER_NOISE_PATTERN.test(`${className} ${id}`)) {
|
|
254
311
|
score += weights.structural;
|
|
255
312
|
}
|
|
256
313
|
}
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
314
|
+
if (tagName === 'aside') {
|
|
315
|
+
score += weights.structural;
|
|
316
|
+
}
|
|
317
|
+
if (role && NAVIGATION_ROLES.has(role)) {
|
|
318
|
+
if (tagName !== 'aside' || role !== 'complementary') {
|
|
260
319
|
score += weights.structural;
|
|
261
320
|
}
|
|
262
321
|
}
|
|
263
322
|
return score;
|
|
264
323
|
}
|
|
265
|
-
function
|
|
324
|
+
function calculatePromoScore(element, className, id, context) {
|
|
325
|
+
if (!context.promoEnabled)
|
|
326
|
+
return 0;
|
|
327
|
+
const aggTest = context.promoMatchers.aggressive.test(className) ||
|
|
328
|
+
context.promoMatchers.aggressive.test(id);
|
|
329
|
+
const isAggressiveMatch = aggTest && !isWithinPrimaryContent(element);
|
|
330
|
+
const isBaseMatch = !aggTest &&
|
|
331
|
+
(context.promoMatchers.base.test(className) ||
|
|
332
|
+
context.promoMatchers.base.test(id));
|
|
333
|
+
return isAggressiveMatch || isBaseMatch ? context.weights.promo : 0;
|
|
334
|
+
}
|
|
335
|
+
function isNoiseElement(element, context) {
|
|
266
336
|
const tagName = element.tagName.toLowerCase();
|
|
267
337
|
const className = element.getAttribute('class') ?? '';
|
|
268
338
|
const id = element.getAttribute('id') ?? '';
|
|
269
339
|
const role = element.getAttribute('role');
|
|
270
340
|
const style = element.getAttribute('style');
|
|
271
|
-
const
|
|
272
|
-
const
|
|
341
|
+
const elIsInteractive = isInteractive(element, role);
|
|
342
|
+
const elIsHidden = element.hasAttribute('hidden') ||
|
|
273
343
|
element.getAttribute('aria-hidden') === 'true' ||
|
|
274
|
-
(style !== null &&
|
|
275
|
-
/\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i.test(style));
|
|
276
|
-
return {
|
|
277
|
-
tagName,
|
|
278
|
-
className,
|
|
279
|
-
id,
|
|
280
|
-
role,
|
|
281
|
-
style,
|
|
282
|
-
isInteractive: _isInteractive,
|
|
283
|
-
isHidden,
|
|
284
|
-
};
|
|
285
|
-
}
|
|
286
|
-
function isNoiseElement(element, context) {
|
|
287
|
-
const meta = extractElementMetadata(element);
|
|
344
|
+
(style !== null && HIDDEN_STYLE_REGEX.test(style));
|
|
288
345
|
let score = 0;
|
|
289
346
|
const { weights } = context;
|
|
290
347
|
// Structural
|
|
291
|
-
if (context.structuralTags.has(
|
|
348
|
+
if (context.structuralTags.has(tagName) && !elIsInteractive) {
|
|
292
349
|
score += weights.structural;
|
|
293
350
|
}
|
|
294
351
|
// Nav/Footer Scoring
|
|
295
352
|
if (context.flags.navFooter) {
|
|
296
|
-
score +=
|
|
353
|
+
score += calculateNavFooterScore(tagName, className, id, role, weights);
|
|
297
354
|
}
|
|
298
355
|
// Hidden
|
|
299
|
-
if (
|
|
356
|
+
if (elIsHidden && !elIsInteractive) {
|
|
300
357
|
score += weights.hidden;
|
|
301
358
|
}
|
|
302
359
|
// Sticky/Fixed
|
|
303
|
-
if (FIXED_OR_HIGH_Z_PATTERN.test(
|
|
360
|
+
if (FIXED_OR_HIGH_Z_PATTERN.test(className)) {
|
|
304
361
|
score += weights.stickyFixed;
|
|
305
362
|
}
|
|
306
363
|
// Promo
|
|
307
|
-
|
|
308
|
-
const aggTest = context.promoMatchers.aggressive.test(meta.className) ||
|
|
309
|
-
context.promoMatchers.aggressive.test(meta.id);
|
|
310
|
-
const isAggressiveMatch = aggTest && !isWithinPrimaryContent(element);
|
|
311
|
-
const isBaseMatch = !aggTest &&
|
|
312
|
-
(context.promoMatchers.base.test(meta.className) ||
|
|
313
|
-
context.promoMatchers.base.test(meta.id));
|
|
314
|
-
if (isAggressiveMatch || isBaseMatch) {
|
|
315
|
-
score += weights.promo;
|
|
316
|
-
}
|
|
317
|
-
}
|
|
364
|
+
score += calculatePromoScore(element, className, id, context);
|
|
318
365
|
return score >= weights.threshold;
|
|
319
366
|
}
|
|
320
|
-
function cleanHeadingWrapperDivs(h) {
|
|
321
|
-
const divs = h.querySelectorAll('div');
|
|
322
|
-
for (let j = divs.length - 1; j >= 0; j--) {
|
|
323
|
-
const d = divs[j];
|
|
324
|
-
if (!d?.parentNode)
|
|
325
|
-
continue;
|
|
326
|
-
const cls = d.getAttribute('class') ?? '';
|
|
327
|
-
const stl = d.getAttribute('style') ?? '';
|
|
328
|
-
if (cls.includes('absolute') ||
|
|
329
|
-
stl.includes('position') ||
|
|
330
|
-
d.getAttribute('tabindex') === '-1') {
|
|
331
|
-
d.remove();
|
|
332
|
-
}
|
|
333
|
-
}
|
|
334
|
-
}
|
|
335
|
-
function cleanHeadingAnchors(h) {
|
|
336
|
-
const anchors = h.querySelectorAll('a');
|
|
337
|
-
for (let j = anchors.length - 1; j >= 0; j--) {
|
|
338
|
-
const a = anchors[j];
|
|
339
|
-
if (!a?.parentNode)
|
|
340
|
-
continue;
|
|
341
|
-
const href = a.getAttribute('href') ?? '';
|
|
342
|
-
const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
|
|
343
|
-
if (href.startsWith('#') && txt.length === 0) {
|
|
344
|
-
a.remove();
|
|
345
|
-
}
|
|
346
|
-
}
|
|
347
|
-
}
|
|
348
|
-
function cleanHeadingZeroWidth(h, document) {
|
|
349
|
-
const walker = document.createTreeWalker(h, 4); // SHOW_TEXT
|
|
350
|
-
let node;
|
|
351
|
-
while ((node = walker.nextNode())) {
|
|
352
|
-
if (node.textContent?.includes('\u200B')) {
|
|
353
|
-
node.textContent = node.textContent.replace(/\u200B/g, '');
|
|
354
|
-
}
|
|
355
|
-
}
|
|
356
|
-
}
|
|
357
367
|
function cleanHeadings(document) {
|
|
358
|
-
// Clean Heading Anchors
|
|
359
368
|
const headings = document.querySelectorAll('h1,h2,h3,h4,h5,h6');
|
|
360
369
|
for (const h of headings) {
|
|
361
370
|
if (!h.parentNode)
|
|
362
371
|
continue;
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
372
|
+
// Remove absolute/positioned wrapper divs
|
|
373
|
+
const divs = h.querySelectorAll('div');
|
|
374
|
+
for (let j = divs.length - 1; j >= 0; j--) {
|
|
375
|
+
const d = divs[j];
|
|
376
|
+
if (!d?.parentNode)
|
|
377
|
+
continue;
|
|
378
|
+
const cls = d.getAttribute('class') ?? '';
|
|
379
|
+
const stl = d.getAttribute('style') ?? '';
|
|
380
|
+
if (cls.includes('absolute') ||
|
|
381
|
+
stl.includes('position') ||
|
|
382
|
+
d.getAttribute('tabindex') === '-1') {
|
|
383
|
+
d.remove();
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
// Remove empty hash-link anchors
|
|
387
|
+
const anchors = h.querySelectorAll('a');
|
|
388
|
+
for (let j = anchors.length - 1; j >= 0; j--) {
|
|
389
|
+
const a = anchors[j];
|
|
390
|
+
if (!a?.parentNode)
|
|
391
|
+
continue;
|
|
392
|
+
const href = a.getAttribute('href') ?? '';
|
|
393
|
+
const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
|
|
394
|
+
if (href.startsWith('#') && txt.length === 0) {
|
|
395
|
+
a.remove();
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
// Strip zero-width spaces from text nodes
|
|
399
|
+
const walker = document.createTreeWalker(h, NODE_FILTER_SHOW_TEXT);
|
|
400
|
+
let node;
|
|
401
|
+
while ((node = walker.nextNode())) {
|
|
402
|
+
if (node.textContent?.includes('\u200B')) {
|
|
403
|
+
node.textContent = node.textContent.replace(/\u200B/g, '');
|
|
404
|
+
}
|
|
405
|
+
}
|
|
366
406
|
}
|
|
367
407
|
}
|
|
368
408
|
function stripNoise(document, context, signal) {
|
|
369
409
|
cleanHeadings(document);
|
|
370
|
-
//
|
|
410
|
+
// Structural Removal
|
|
371
411
|
const { baseSelector, extraSelectors } = context;
|
|
372
|
-
|
|
373
|
-
const baseNodes = document.querySelectorAll(baseSelector);
|
|
374
|
-
removeNodes(baseNodes);
|
|
375
|
-
// Extra
|
|
412
|
+
removeNodes(document.querySelectorAll(baseSelector));
|
|
376
413
|
if (extraSelectors.length > 0) {
|
|
377
|
-
|
|
378
|
-
const extraNodes = document.querySelectorAll(combinedExtra);
|
|
379
|
-
removeNodes(extraNodes);
|
|
414
|
+
removeNodes(document.querySelectorAll(extraSelectors.join(',')));
|
|
380
415
|
}
|
|
381
|
-
// Candidates
|
|
416
|
+
// Candidates (conditional removal)
|
|
382
417
|
const candidates = document.querySelectorAll(context.candidateSelector);
|
|
383
418
|
for (let i = candidates.length - 1; i >= 0; i--) {
|
|
384
419
|
if (i % ABORT_CHECK_INTERVAL === 0 && signal?.aborted) {
|
|
@@ -469,6 +504,29 @@ function mayContainNoise(html) {
|
|
|
469
504
|
: `${html.substring(0, NOISE_SCAN_LIMIT)}\n${html.substring(html.length - NOISE_SCAN_LIMIT)}`;
|
|
470
505
|
return NOISE_PATTERNS.some((re) => re.test(sample));
|
|
471
506
|
}
|
|
507
|
+
function stripTabTriggers(document) {
|
|
508
|
+
const tabs = document.querySelectorAll('button[role="tab"]');
|
|
509
|
+
for (let i = tabs.length - 1; i >= 0; i--) {
|
|
510
|
+
tabs[i]?.remove();
|
|
511
|
+
}
|
|
512
|
+
}
|
|
513
|
+
function escapeTableCellPipes(document) {
|
|
514
|
+
const codes = document.querySelectorAll('td code, th code');
|
|
515
|
+
for (const code of codes) {
|
|
516
|
+
if (code.textContent.includes('|')) {
|
|
517
|
+
code.textContent = code.textContent.replace(/\|/g, '\\|');
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
function separateAdjacentInlineElements(document) {
|
|
522
|
+
const badges = document.querySelectorAll('span.chakra-badge, [data-scope="badge"], [class*="badge"]');
|
|
523
|
+
for (const badge of badges) {
|
|
524
|
+
const next = badge.nextSibling;
|
|
525
|
+
if (next?.nodeType === 1) {
|
|
526
|
+
badge.after(document.createTextNode(' '));
|
|
527
|
+
}
|
|
528
|
+
}
|
|
529
|
+
}
|
|
472
530
|
export function prepareDocumentForMarkdown(document, baseUrl, signal) {
|
|
473
531
|
const context = getContext();
|
|
474
532
|
if (config.noiseRemoval.debug) {
|
|
@@ -477,9 +535,37 @@ export function prepareDocumentForMarkdown(document, baseUrl, signal) {
|
|
|
477
535
|
});
|
|
478
536
|
}
|
|
479
537
|
stripNoise(document, context, signal);
|
|
538
|
+
stripTabTriggers(document);
|
|
539
|
+
separateAdjacentInlineElements(document);
|
|
540
|
+
flattenTableCellBreaks(document);
|
|
541
|
+
escapeTableCellPipes(document);
|
|
542
|
+
normalizeTableStructure(document);
|
|
480
543
|
if (baseUrl)
|
|
481
544
|
resolveUrls(document, baseUrl);
|
|
482
545
|
}
|
|
546
|
+
// Some sites put tbody/thead/tfoot inside td/th, which breaks markdown tables.
|
|
547
|
+
function normalizeTableStructure(document) {
|
|
548
|
+
for (const table of document.querySelectorAll('table')) {
|
|
549
|
+
for (const cell of table.querySelectorAll('th, td')) {
|
|
550
|
+
for (const tag of ['tbody', 'thead', 'tfoot']) {
|
|
551
|
+
let nested = cell.querySelector(tag);
|
|
552
|
+
while (nested) {
|
|
553
|
+
table.appendChild(nested);
|
|
554
|
+
nested = cell.querySelector(tag);
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
}
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
function flattenTableCellBreaks(document) {
|
|
561
|
+
const cells = document.querySelectorAll('td, th');
|
|
562
|
+
for (const cell of cells) {
|
|
563
|
+
const brs = cell.querySelectorAll('br');
|
|
564
|
+
for (const br of brs) {
|
|
565
|
+
br.replaceWith(' ');
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
}
|
|
483
569
|
export function removeNoiseFromHtml(html, document, baseUrl, signal) {
|
|
484
570
|
const shouldParse = isFullDocumentHtml(html) ||
|
|
485
571
|
mayContainNoise(html) ||
|
|
@@ -495,6 +581,8 @@ export function removeNoiseFromHtml(html, document, baseUrl, signal) {
|
|
|
495
581
|
return html;
|
|
496
582
|
}
|
|
497
583
|
}
|
|
584
|
+
// endregion
|
|
585
|
+
// region Language Detection
|
|
498
586
|
class DetectionContext {
|
|
499
587
|
code;
|
|
500
588
|
_lower;
|
|
@@ -574,10 +662,10 @@ const CSS_PROPERTY_REGEX = /^\s*[a-z][\w-]*\s*:/;
|
|
|
574
662
|
function containsJsxTag(code) {
|
|
575
663
|
const len = code.length;
|
|
576
664
|
for (let i = 0; i < len - 1; i++) {
|
|
577
|
-
if (code.charCodeAt(i) ===
|
|
665
|
+
if (code.charCodeAt(i) === ASCII_LT) {
|
|
578
666
|
const next = code.charCodeAt(i + 1);
|
|
579
|
-
if (next >=
|
|
580
|
-
return true;
|
|
667
|
+
if (next >= ASCII_UPPER_A && next <= ASCII_UPPER_Z)
|
|
668
|
+
return true;
|
|
581
669
|
}
|
|
582
670
|
}
|
|
583
671
|
return false;
|
|
@@ -636,140 +724,100 @@ function detectYamlStructure(lines) {
|
|
|
636
724
|
if (colonIdx <= 0)
|
|
637
725
|
continue;
|
|
638
726
|
const after = trimmed.charCodeAt(colonIdx + 1);
|
|
639
|
-
|
|
640
|
-
|
|
727
|
+
if (after === ASCII_SPACE || after === ASCII_TAB)
|
|
728
|
+
return true;
|
|
729
|
+
}
|
|
730
|
+
return false;
|
|
731
|
+
}
|
|
732
|
+
function matchRust(ctx) {
|
|
733
|
+
if (ctx.lower.includes('let mut'))
|
|
734
|
+
return true;
|
|
735
|
+
if (RUST_REGEX.test(ctx.lower))
|
|
736
|
+
return true;
|
|
737
|
+
return ctx.lower.includes('use ') && ctx.lower.includes('::');
|
|
738
|
+
}
|
|
739
|
+
function matchGo(ctx) {
|
|
740
|
+
if (ctx.lower.includes('import "'))
|
|
741
|
+
return true;
|
|
742
|
+
return /\b(?:package|func)\b/.test(ctx.lower);
|
|
743
|
+
}
|
|
744
|
+
function matchJsx(ctx) {
|
|
745
|
+
const l = ctx.lower;
|
|
746
|
+
if (l.includes('classname=') ||
|
|
747
|
+
l.includes('jsx:') ||
|
|
748
|
+
l.includes("from 'react'") ||
|
|
749
|
+
l.includes('from "react"')) {
|
|
750
|
+
return true;
|
|
751
|
+
}
|
|
752
|
+
return containsJsxTag(ctx.code);
|
|
753
|
+
}
|
|
754
|
+
function matchTypeScript(ctx) {
|
|
755
|
+
if (/\b(?:interface|type)\b/.test(ctx.lower))
|
|
756
|
+
return true;
|
|
757
|
+
const l = ctx.lower;
|
|
758
|
+
for (const hint of TYPESCRIPT_HINTS) {
|
|
759
|
+
if (l.includes(hint))
|
|
760
|
+
return true;
|
|
761
|
+
}
|
|
762
|
+
return false;
|
|
763
|
+
}
|
|
764
|
+
function matchSql(ctx) {
|
|
765
|
+
return /\b(?:select|insert|update|delete|create|alter|drop)\b/.test(ctx.lower);
|
|
766
|
+
}
|
|
767
|
+
function hasJsSignals(lowerCode) {
|
|
768
|
+
return (JS_SIGNAL_REGEX.test(lowerCode) ||
|
|
769
|
+
lowerCode.includes('{') ||
|
|
770
|
+
lowerCode.includes("from '"));
|
|
771
|
+
}
|
|
772
|
+
function matchPython(ctx) {
|
|
773
|
+
const l = ctx.lower;
|
|
774
|
+
if (l.includes('print(') || l.includes('__name__'))
|
|
775
|
+
return true;
|
|
776
|
+
if (l.includes('self.') || l.includes('elif '))
|
|
777
|
+
return true;
|
|
778
|
+
// Check for Python's None/True/False using original case (they are capitalized in Python)
|
|
779
|
+
if (ctx.code.includes('None') ||
|
|
780
|
+
ctx.code.includes('True') ||
|
|
781
|
+
ctx.code.includes('False')) {
|
|
782
|
+
return true;
|
|
783
|
+
}
|
|
784
|
+
if (PYTHON_UNIQUE_REGEX.test(l))
|
|
785
|
+
return true;
|
|
786
|
+
// Shared keywords (import, from, class) — only match if no JS signals present
|
|
787
|
+
if (/\b(?:import|from|class)\b/.test(l) && !hasJsSignals(l)) {
|
|
788
|
+
return true;
|
|
789
|
+
}
|
|
790
|
+
return false;
|
|
791
|
+
}
|
|
792
|
+
function matchHtml(ctx) {
|
|
793
|
+
const l = ctx.lower;
|
|
794
|
+
for (const tag of HTML_TAGS) {
|
|
795
|
+
if (l.includes(tag))
|
|
641
796
|
return true;
|
|
642
797
|
}
|
|
643
798
|
return false;
|
|
644
799
|
}
|
|
800
|
+
// Pre-sorted by weight descending — first match wins in detectLanguageFromCode
|
|
645
801
|
const LANGUAGES = [
|
|
646
|
-
{
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
if (RUST_REGEX.test(ctx.lower))
|
|
653
|
-
return true;
|
|
654
|
-
return ctx.lower.includes('use ') && ctx.lower.includes('::');
|
|
655
|
-
},
|
|
656
|
-
},
|
|
657
|
-
{
|
|
658
|
-
lang: 'go',
|
|
659
|
-
weight: 22,
|
|
660
|
-
match: (ctx) => {
|
|
661
|
-
if (ctx.lower.includes('import "'))
|
|
662
|
-
return true;
|
|
663
|
-
return /\b(?:package|func)\b/.test(ctx.lower);
|
|
664
|
-
},
|
|
665
|
-
},
|
|
666
|
-
{
|
|
667
|
-
lang: 'jsx',
|
|
668
|
-
weight: 22,
|
|
669
|
-
match: (ctx) => {
|
|
670
|
-
const l = ctx.lower;
|
|
671
|
-
if (l.includes('classname=') ||
|
|
672
|
-
l.includes('jsx:') ||
|
|
673
|
-
l.includes("from 'react'") ||
|
|
674
|
-
l.includes('from "react"')) {
|
|
675
|
-
return true;
|
|
676
|
-
}
|
|
677
|
-
return containsJsxTag(ctx.code);
|
|
678
|
-
},
|
|
679
|
-
},
|
|
680
|
-
{
|
|
681
|
-
lang: 'typescript',
|
|
682
|
-
weight: 20,
|
|
683
|
-
match: (ctx) => {
|
|
684
|
-
if (/\b(?:interface|type)\b/.test(ctx.lower))
|
|
685
|
-
return true;
|
|
686
|
-
const l = ctx.lower;
|
|
687
|
-
for (const hint of TYPESCRIPT_HINTS) {
|
|
688
|
-
if (l.includes(hint))
|
|
689
|
-
return true;
|
|
690
|
-
}
|
|
691
|
-
return false;
|
|
692
|
-
},
|
|
693
|
-
},
|
|
694
|
-
{
|
|
695
|
-
lang: 'sql',
|
|
696
|
-
weight: 20,
|
|
697
|
-
match: (ctx) => {
|
|
698
|
-
const l = ctx.lower;
|
|
699
|
-
return /\b(?:select|insert|update|delete|create|alter|drop)\b/.test(l);
|
|
700
|
-
},
|
|
701
|
-
},
|
|
702
|
-
{
|
|
703
|
-
lang: 'python',
|
|
704
|
-
weight: 18,
|
|
705
|
-
match: (ctx) => {
|
|
706
|
-
const l = ctx.lower;
|
|
707
|
-
if (l.includes('print(') || l.includes('__name__'))
|
|
708
|
-
return true;
|
|
709
|
-
if (l.includes('self.') || l.includes('elif '))
|
|
710
|
-
return true;
|
|
711
|
-
// Check for Python's None/True/False using original case (they are capitalized in Python)
|
|
712
|
-
if (ctx.code.includes('None') ||
|
|
713
|
-
ctx.code.includes('True') ||
|
|
714
|
-
ctx.code.includes('False')) {
|
|
715
|
-
return true;
|
|
716
|
-
}
|
|
717
|
-
// Python-unique keywords that JS doesn't have
|
|
718
|
-
if (PYTHON_UNIQUE_REGEX.test(l))
|
|
719
|
-
return true;
|
|
720
|
-
// Shared keywords (import, from, class) — only match if no JS signals present
|
|
721
|
-
if (/\b(?:import|from|class)\b/.test(l) &&
|
|
722
|
-
!JS_SIGNAL_REGEX.test(l) &&
|
|
723
|
-
!l.includes('{') &&
|
|
724
|
-
!l.includes("from '")) {
|
|
725
|
-
return true;
|
|
726
|
-
}
|
|
727
|
-
return false;
|
|
728
|
-
},
|
|
729
|
-
},
|
|
802
|
+
{ lang: 'rust', weight: 25, match: matchRust },
|
|
803
|
+
{ lang: 'go', weight: 22, match: matchGo },
|
|
804
|
+
{ lang: 'jsx', weight: 22, match: matchJsx },
|
|
805
|
+
{ lang: 'typescript', weight: 20, match: matchTypeScript },
|
|
806
|
+
{ lang: 'sql', weight: 20, match: matchSql },
|
|
807
|
+
{ lang: 'python', weight: 18, match: matchPython },
|
|
730
808
|
{
|
|
731
809
|
lang: 'css',
|
|
732
810
|
weight: 18,
|
|
733
|
-
match: (ctx) =>
|
|
734
|
-
if (CSS_REGEX.test(ctx.lower))
|
|
735
|
-
return true;
|
|
736
|
-
return detectCssStructure(ctx.lines);
|
|
737
|
-
},
|
|
738
|
-
},
|
|
739
|
-
{
|
|
740
|
-
lang: 'bash',
|
|
741
|
-
weight: 15,
|
|
742
|
-
match: (ctx) => detectBashIndicators(ctx.lines),
|
|
743
|
-
},
|
|
744
|
-
{
|
|
745
|
-
lang: 'yaml',
|
|
746
|
-
weight: 15,
|
|
747
|
-
match: (ctx) => detectYamlStructure(ctx.lines),
|
|
748
|
-
},
|
|
749
|
-
{
|
|
750
|
-
lang: 'javascript',
|
|
751
|
-
weight: 15,
|
|
752
|
-
match: (ctx) => JS_REGEX.test(ctx.lower),
|
|
753
|
-
},
|
|
754
|
-
{
|
|
755
|
-
lang: 'html',
|
|
756
|
-
weight: 12,
|
|
757
|
-
match: (ctx) => {
|
|
758
|
-
const l = ctx.lower;
|
|
759
|
-
for (const tag of HTML_TAGS) {
|
|
760
|
-
if (l.includes(tag))
|
|
761
|
-
return true;
|
|
762
|
-
}
|
|
763
|
-
return false;
|
|
764
|
-
},
|
|
811
|
+
match: (ctx) => CSS_REGEX.test(ctx.lower) || detectCssStructure(ctx.lines),
|
|
765
812
|
},
|
|
813
|
+
{ lang: 'bash', weight: 15, match: (ctx) => detectBashIndicators(ctx.lines) },
|
|
814
|
+
{ lang: 'yaml', weight: 15, match: (ctx) => detectYamlStructure(ctx.lines) },
|
|
815
|
+
{ lang: 'javascript', weight: 15, match: (ctx) => JS_REGEX.test(ctx.lower) },
|
|
816
|
+
{ lang: 'html', weight: 12, match: matchHtml },
|
|
766
817
|
{
|
|
767
818
|
lang: 'json',
|
|
768
819
|
weight: 10,
|
|
769
|
-
match: (ctx) => {
|
|
770
|
-
const s = ctx.trimmedStart;
|
|
771
|
-
return s.startsWith('{') || s.startsWith('[');
|
|
772
|
-
},
|
|
820
|
+
match: (ctx) => ctx.trimmedStart.startsWith('{') || ctx.trimmedStart.startsWith('['),
|
|
773
821
|
},
|
|
774
822
|
];
|
|
775
823
|
function extractLanguageFromClassName(className) {
|
|
@@ -805,11 +853,10 @@ function resolveLanguageFromDataAttribute(dataLang) {
|
|
|
805
853
|
// Check if \w+
|
|
806
854
|
for (let i = 0; i < trimmed.length; i++) {
|
|
807
855
|
const c = trimmed.charCodeAt(i);
|
|
808
|
-
|
|
809
|
-
const
|
|
810
|
-
const
|
|
811
|
-
const
|
|
812
|
-
const isUnder = c === 95;
|
|
856
|
+
const isUpper = c >= ASCII_UPPER_A && c <= ASCII_UPPER_Z;
|
|
857
|
+
const isLower = c >= ASCII_LOWER_A && c <= ASCII_LOWER_Z;
|
|
858
|
+
const isDigit = c >= ASCII_DIGIT_0 && c <= ASCII_DIGIT_9;
|
|
859
|
+
const isUnder = c === ASCII_UNDERSCORE;
|
|
813
860
|
if (!isUpper && !isLower && !isDigit && !isUnder) {
|
|
814
861
|
return undefined;
|
|
815
862
|
}
|
|
@@ -826,7 +873,7 @@ export function detectLanguageFromCode(code) {
|
|
|
826
873
|
// Fast path for empty/whitespace only
|
|
827
874
|
let empty = true;
|
|
828
875
|
for (let i = 0; i < code.length; i++) {
|
|
829
|
-
if (code.charCodeAt(i) >
|
|
876
|
+
if (code.charCodeAt(i) > ASCII_SPACE) {
|
|
830
877
|
empty = false;
|
|
831
878
|
break;
|
|
832
879
|
}
|
|
@@ -834,20 +881,15 @@ export function detectLanguageFromCode(code) {
|
|
|
834
881
|
if (empty)
|
|
835
882
|
return undefined;
|
|
836
883
|
const ctx = new DetectionContext(code);
|
|
837
|
-
|
|
838
|
-
let bestScore = -1;
|
|
884
|
+
// LANGUAGES is pre-sorted by weight descending — first match is highest confidence
|
|
839
885
|
for (const def of LANGUAGES) {
|
|
840
|
-
if (def.match(ctx))
|
|
841
|
-
|
|
842
|
-
bestScore = def.weight;
|
|
843
|
-
bestLang = def.lang;
|
|
844
|
-
if (bestScore >= 25)
|
|
845
|
-
break;
|
|
846
|
-
}
|
|
847
|
-
}
|
|
886
|
+
if (def.match(ctx))
|
|
887
|
+
return def.lang;
|
|
848
888
|
}
|
|
849
|
-
return
|
|
889
|
+
return undefined;
|
|
850
890
|
}
|
|
891
|
+
// endregion
|
|
892
|
+
// region Markdown Cleanup
|
|
851
893
|
const MAX_LINE_LENGTH = 80;
|
|
852
894
|
const REGEX = {
|
|
853
895
|
HEADING_MARKER: /^#{1,6}\s/m,
|
|
@@ -900,7 +942,7 @@ function isBlank(line) {
|
|
|
900
942
|
}
|
|
901
943
|
function hasFollowingContent(lines, startIndex) {
|
|
902
944
|
// Optimization: Bound lookahead to avoid checking too many lines in huge files
|
|
903
|
-
for (let i = startIndex + 1; i < Math.min(lines.length, startIndex +
|
|
945
|
+
for (let i = startIndex + 1; i < Math.min(lines.length, startIndex + HAS_FOLLOWING_LOOKAHEAD); i++) {
|
|
904
946
|
if (!isBlank(lines[i]))
|
|
905
947
|
return true;
|
|
906
948
|
}
|
|
@@ -919,7 +961,7 @@ function isTitleCaseOrKeyword(trimmed) {
|
|
|
919
961
|
// Split limited number of words
|
|
920
962
|
const words = trimmed.split(/\s+/);
|
|
921
963
|
const len = words.length;
|
|
922
|
-
if (len <
|
|
964
|
+
if (len < TITLE_MIN_WORDS || len > TITLE_MAX_WORDS)
|
|
923
965
|
return false;
|
|
924
966
|
let capitalizedCount = 0;
|
|
925
967
|
for (let i = 0; i < len; i++) {
|
|
@@ -932,20 +974,19 @@ function isTitleCaseOrKeyword(trimmed) {
|
|
|
932
974
|
else if (!/^(?:and|or|the|of|in|for|to|a)$/i.test(w))
|
|
933
975
|
return false;
|
|
934
976
|
}
|
|
935
|
-
return capitalizedCount >=
|
|
977
|
+
return capitalizedCount >= TITLE_MIN_CAPITALIZED;
|
|
936
978
|
}
|
|
937
979
|
function getHeadingPrefix(trimmed) {
|
|
938
980
|
if (trimmed.length > MAX_LINE_LENGTH)
|
|
939
981
|
return null;
|
|
940
982
|
// Fast path: Check common markdown markers first
|
|
941
983
|
const firstChar = trimmed.charCodeAt(0);
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
firstChar ===
|
|
945
|
-
firstChar ===
|
|
946
|
-
firstChar ===
|
|
947
|
-
firstChar
|
|
948
|
-
(firstChar >= 48 && firstChar <= 57)) {
|
|
984
|
+
if (firstChar === ASCII_HASH ||
|
|
985
|
+
firstChar === ASCII_DASH ||
|
|
986
|
+
firstChar === ASCII_ASTERISK ||
|
|
987
|
+
firstChar === ASCII_PLUS ||
|
|
988
|
+
firstChar === ASCII_BRACKET_OPEN ||
|
|
989
|
+
(firstChar >= ASCII_DIGIT_0 && firstChar <= ASCII_DIGIT_9)) {
|
|
949
990
|
if (REGEX.HEADING_MARKER.test(trimmed) ||
|
|
950
991
|
REGEX.LIST_MARKER.test(trimmed) ||
|
|
951
992
|
/^\d+\.\s/.test(trimmed) ||
|
|
@@ -957,8 +998,9 @@ function getHeadingPrefix(trimmed) {
|
|
|
957
998
|
return /^example:\s/i.test(trimmed) ? '### ' : '## ';
|
|
958
999
|
}
|
|
959
1000
|
const lastChar = trimmed.charCodeAt(trimmed.length - 1);
|
|
960
|
-
|
|
961
|
-
|
|
1001
|
+
if (lastChar === ASCII_PERIOD ||
|
|
1002
|
+
lastChar === ASCII_EXCLAMATION ||
|
|
1003
|
+
lastChar === ASCII_QUESTION)
|
|
962
1004
|
return null;
|
|
963
1005
|
return isTitleCaseOrKeyword(trimmed) ? '## ' : null;
|
|
964
1006
|
}
|
|
@@ -1073,48 +1115,63 @@ function processTextBuffer(lines, options) {
|
|
|
1073
1115
|
const text = preprocessLines(lines, options);
|
|
1074
1116
|
return applyGlobalRegexes(text, options);
|
|
1075
1117
|
}
|
|
1076
|
-
function
|
|
1118
|
+
function removeTypeDocArtifacts(text) {
|
|
1119
|
+
const filtered = text
|
|
1120
|
+
.split('\n')
|
|
1121
|
+
.filter((line) => !isTypeDocArtifactLine(line))
|
|
1122
|
+
.join('\n');
|
|
1123
|
+
return filtered.replace(REGEX.TYPEDOC_COMMENT, (match) => match.startsWith('`') ? match : '');
|
|
1124
|
+
}
|
|
1125
|
+
function removeSkipLinks(text) {
|
|
1126
|
+
return text
|
|
1127
|
+
.replace(REGEX.ZERO_WIDTH_ANCHOR, '')
|
|
1128
|
+
.replace(REGEX.COMBINED_LINE_REMOVALS, '');
|
|
1129
|
+
}
|
|
1130
|
+
function normalizeMarkdownSpacing(text) {
|
|
1131
|
+
let result = text
|
|
1132
|
+
.replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
|
|
1133
|
+
.replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
|
|
1134
|
+
.replace(REGEX.SPACING_CODE_DASH, '$1 - ')
|
|
1135
|
+
.replace(REGEX.SPACING_ESCAPES, '$1')
|
|
1136
|
+
.replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
|
|
1137
|
+
.replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
|
|
1138
|
+
.replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
|
|
1139
|
+
// Trim leading whitespace inside inline code spans
|
|
1140
|
+
result = result.replace(/(?<=\s|^)`\s+([^`]+)`/gm, '`$1`');
|
|
1141
|
+
// Unescape backticks inside markdown link text
|
|
1142
|
+
result = result.replace(/\[([^\]]*\\`[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/\\`/g, '`')}](${url})`);
|
|
1143
|
+
result = result.replace(/\[([^\]]*<[^\]]*)\]\(([^)]+)\)/g, (_match, linkText, url) => `[${linkText.replace(/</g, '\\<').replace(/>/g, '\\>')}](${url})`);
|
|
1144
|
+
return normalizeNestedListIndentation(result);
|
|
1145
|
+
}
|
|
1146
|
+
function fixConcatenatedProperties(text) {
|
|
1077
1147
|
let result = text;
|
|
1148
|
+
for (let k = 0; k < PROPERTY_FIX_MAX_PASSES; k++) {
|
|
1149
|
+
const next = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
|
|
1150
|
+
if (next === result)
|
|
1151
|
+
break;
|
|
1152
|
+
result = next;
|
|
1153
|
+
}
|
|
1154
|
+
return result;
|
|
1155
|
+
}
|
|
1156
|
+
function applyGlobalRegexes(text, options) {
|
|
1078
1157
|
const checkAbort = createAbortChecker(options);
|
|
1158
|
+
let result = text.replace(/\u00A0/g, ' ');
|
|
1079
1159
|
checkAbort('markdown:cleanup:headings');
|
|
1080
|
-
// fixAndSpaceHeadings
|
|
1081
1160
|
result = result
|
|
1082
1161
|
.replace(REGEX.HEADING_SPACING, '$1\n\n$2')
|
|
1083
1162
|
.replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```');
|
|
1084
1163
|
if (config.markdownCleanup.removeTypeDocComments) {
|
|
1085
1164
|
checkAbort('markdown:cleanup:typedoc');
|
|
1086
|
-
result = result
|
|
1087
|
-
.split('\n')
|
|
1088
|
-
.filter((line) => !isTypeDocArtifactLine(line))
|
|
1089
|
-
.join('\n');
|
|
1090
|
-
result = result.replace(REGEX.TYPEDOC_COMMENT, (match) => match.startsWith('`') ? match : '');
|
|
1165
|
+
result = removeTypeDocArtifacts(result);
|
|
1091
1166
|
}
|
|
1092
1167
|
if (config.markdownCleanup.removeSkipLinks) {
|
|
1093
1168
|
checkAbort('markdown:cleanup:skip-links');
|
|
1094
|
-
result = result
|
|
1095
|
-
.replace(REGEX.ZERO_WIDTH_ANCHOR, '')
|
|
1096
|
-
.replace(REGEX.COMBINED_LINE_REMOVALS, '');
|
|
1169
|
+
result = removeSkipLinks(result);
|
|
1097
1170
|
}
|
|
1098
1171
|
checkAbort('markdown:cleanup:spacing');
|
|
1099
|
-
|
|
1100
|
-
result = result
|
|
1101
|
-
.replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
|
|
1102
|
-
.replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
|
|
1103
|
-
.replace(REGEX.SPACING_CODE_DASH, '$1 - ')
|
|
1104
|
-
.replace(REGEX.SPACING_ESCAPES, '$1')
|
|
1105
|
-
.replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
|
|
1106
|
-
.replace(REGEX.PUNCT_ONLY_LIST_ARTIFACT, '')
|
|
1107
|
-
.replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
|
|
1108
|
-
result = normalizeNestedListIndentation(result);
|
|
1172
|
+
result = normalizeMarkdownSpacing(result);
|
|
1109
1173
|
checkAbort('markdown:cleanup:properties');
|
|
1110
|
-
|
|
1111
|
-
for (let k = 0; k < 3; k++) {
|
|
1112
|
-
const next = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
|
|
1113
|
-
if (next === result)
|
|
1114
|
-
break;
|
|
1115
|
-
result = next;
|
|
1116
|
-
}
|
|
1117
|
-
return result;
|
|
1174
|
+
return fixConcatenatedProperties(result);
|
|
1118
1175
|
}
|
|
1119
1176
|
function normalizeNestedListIndentation(text) {
|
|
1120
1177
|
return text.replace(REGEX.NESTED_LIST_INDENT, (match, spaces, marker) => {
|
|
@@ -1130,27 +1187,17 @@ export function cleanupMarkdownArtifacts(content, options) {
|
|
|
1130
1187
|
return '';
|
|
1131
1188
|
const checkAbort = createAbortChecker(options);
|
|
1132
1189
|
checkAbort('markdown:cleanup:begin');
|
|
1133
|
-
const
|
|
1134
|
-
let lastIndex = 0;
|
|
1190
|
+
const lines = content.split(/\r?\n/);
|
|
1135
1191
|
let fenceMarker = null;
|
|
1136
1192
|
const segments = [];
|
|
1137
1193
|
let buffer = [];
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
line = content.slice(lastIndex);
|
|
1143
|
-
nextIndex = len;
|
|
1144
|
-
}
|
|
1145
|
-
else {
|
|
1146
|
-
if (nextIndex > lastIndex && content.charCodeAt(nextIndex - 1) === 13) {
|
|
1147
|
-
line = content.slice(lastIndex, nextIndex - 1);
|
|
1148
|
-
}
|
|
1149
|
-
else {
|
|
1150
|
-
line = content.slice(lastIndex, nextIndex);
|
|
1151
|
-
}
|
|
1152
|
-
nextIndex++; // Skip \n
|
|
1194
|
+
const flushBuffer = () => {
|
|
1195
|
+
if (buffer.length > 0) {
|
|
1196
|
+
segments.push(processTextBuffer(buffer, options));
|
|
1197
|
+
buffer = [];
|
|
1153
1198
|
}
|
|
1199
|
+
};
|
|
1200
|
+
for (const line of lines) {
|
|
1154
1201
|
const trimmed = line.trimStart();
|
|
1155
1202
|
if (fenceMarker) {
|
|
1156
1203
|
segments.push(line);
|
|
@@ -1166,22 +1213,16 @@ export function cleanupMarkdownArtifacts(content, options) {
|
|
|
1166
1213
|
buffer.push(line);
|
|
1167
1214
|
}
|
|
1168
1215
|
else {
|
|
1169
|
-
|
|
1170
|
-
segments.push(processTextBuffer(buffer, options));
|
|
1171
|
-
buffer = [];
|
|
1172
|
-
}
|
|
1216
|
+
flushBuffer();
|
|
1173
1217
|
segments.push(line);
|
|
1174
1218
|
fenceMarker = newMarker;
|
|
1175
1219
|
}
|
|
1176
1220
|
}
|
|
1177
|
-
lastIndex = nextIndex;
|
|
1178
|
-
}
|
|
1179
|
-
if (buffer.length > 0) {
|
|
1180
|
-
segments.push(processTextBuffer(buffer, options));
|
|
1181
1221
|
}
|
|
1222
|
+
flushBuffer();
|
|
1182
1223
|
return segments.join('\n').trim();
|
|
1183
1224
|
}
|
|
1184
|
-
function
|
|
1225
|
+
function parseFrontmatter(content) {
|
|
1185
1226
|
const len = content.length;
|
|
1186
1227
|
if (len < 4)
|
|
1187
1228
|
return null;
|
|
@@ -1201,57 +1242,43 @@ function detectFrontmatter(content) {
|
|
|
1201
1242
|
const closeIndex = content.indexOf(fence, fenceLen);
|
|
1202
1243
|
if (closeIndex === -1)
|
|
1203
1244
|
return null;
|
|
1204
|
-
|
|
1245
|
+
const range = {
|
|
1205
1246
|
start: 0,
|
|
1206
1247
|
end: closeIndex + fenceLen,
|
|
1207
1248
|
linesStart: fenceLen,
|
|
1208
1249
|
linesEnd: closeIndex,
|
|
1209
1250
|
lineEnding,
|
|
1210
1251
|
};
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
const
|
|
1214
|
-
const idx = trimmed.indexOf(':');
|
|
1215
|
-
if (!trimmed || idx <= 0)
|
|
1216
|
-
return null;
|
|
1217
|
-
return {
|
|
1218
|
-
key: trimmed.slice(0, idx).trim().toLowerCase(),
|
|
1219
|
-
value: trimmed.slice(idx + 1).trim(),
|
|
1220
|
-
};
|
|
1221
|
-
}
|
|
1222
|
-
function stripFrontmatterQuotes(val) {
|
|
1223
|
-
const first = val.charAt(0);
|
|
1224
|
-
const last = val.charAt(val.length - 1);
|
|
1225
|
-
if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
|
|
1226
|
-
return val.slice(1, -1).trim();
|
|
1227
|
-
}
|
|
1228
|
-
return val;
|
|
1229
|
-
}
|
|
1230
|
-
function scanFrontmatterForTitle(content, fm) {
|
|
1231
|
-
const fmBody = content.slice(fm.linesStart, fm.linesEnd);
|
|
1252
|
+
// Parse key-value entries in one pass
|
|
1253
|
+
const entries = new Map();
|
|
1254
|
+
const fmBody = content.slice(range.linesStart, range.linesEnd);
|
|
1232
1255
|
let lastIdx = 0;
|
|
1233
1256
|
while (lastIdx < fmBody.length) {
|
|
1234
|
-
let nextIdx = fmBody.indexOf(
|
|
1257
|
+
let nextIdx = fmBody.indexOf(lineEnding, lastIdx);
|
|
1235
1258
|
if (nextIdx === -1)
|
|
1236
1259
|
nextIdx = fmBody.length;
|
|
1237
|
-
const line = fmBody.slice(lastIdx, nextIdx);
|
|
1238
|
-
const
|
|
1239
|
-
if (
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1260
|
+
const line = fmBody.slice(lastIdx, nextIdx).trim();
|
|
1261
|
+
const colonIdx = line.indexOf(':');
|
|
1262
|
+
if (line && colonIdx > 0) {
|
|
1263
|
+
const key = line.slice(0, colonIdx).trim().toLowerCase();
|
|
1264
|
+
let value = line.slice(colonIdx + 1).trim();
|
|
1265
|
+
// Strip surrounding quotes
|
|
1266
|
+
const first = value.charAt(0);
|
|
1267
|
+
const last = value.charAt(value.length - 1);
|
|
1268
|
+
if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
|
|
1269
|
+
value = value.slice(1, -1).trim();
|
|
1244
1270
|
}
|
|
1271
|
+
if (value)
|
|
1272
|
+
entries.set(key, value);
|
|
1245
1273
|
}
|
|
1246
|
-
lastIdx = nextIdx +
|
|
1274
|
+
lastIdx = nextIdx + lineEnding.length;
|
|
1247
1275
|
}
|
|
1248
|
-
return
|
|
1276
|
+
return { range, entries };
|
|
1249
1277
|
}
|
|
1250
1278
|
function scanBodyForTitle(content) {
|
|
1251
1279
|
const len = content.length;
|
|
1252
1280
|
let scanIndex = 0;
|
|
1253
|
-
const
|
|
1254
|
-
const maxScan = Math.min(len, LIMIT);
|
|
1281
|
+
const maxScan = Math.min(len, BODY_SCAN_LIMIT);
|
|
1255
1282
|
while (scanIndex < maxScan) {
|
|
1256
1283
|
let nextIndex = content.indexOf('\n', scanIndex);
|
|
1257
1284
|
if (nextIndex === -1)
|
|
@@ -1271,16 +1298,16 @@ function scanBodyForTitle(content) {
|
|
|
1271
1298
|
return undefined;
|
|
1272
1299
|
}
|
|
1273
1300
|
export function extractTitleFromRawMarkdown(content) {
|
|
1274
|
-
const fm =
|
|
1301
|
+
const fm = parseFrontmatter(content);
|
|
1275
1302
|
if (fm) {
|
|
1276
|
-
const title =
|
|
1303
|
+
const title = fm.entries.get('title') ?? fm.entries.get('name');
|
|
1277
1304
|
if (title)
|
|
1278
1305
|
return title;
|
|
1279
1306
|
}
|
|
1280
1307
|
return scanBodyForTitle(content);
|
|
1281
1308
|
}
|
|
1282
1309
|
export function addSourceToMarkdown(content, url) {
|
|
1283
|
-
const fm =
|
|
1310
|
+
const fm = parseFrontmatter(content);
|
|
1284
1311
|
const useMarkdownFormat = config.transform.metadataFormat === 'markdown';
|
|
1285
1312
|
if (useMarkdownFormat && !fm) {
|
|
1286
1313
|
if (REGEX.SOURCE_KEY.test(content))
|
|
@@ -1301,13 +1328,17 @@ export function addSourceToMarkdown(content, url) {
|
|
|
1301
1328
|
const escapedUrl = url.replace(/"/g, '\\"');
|
|
1302
1329
|
return `---${lineEnding}source: "${escapedUrl}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
|
|
1303
1330
|
}
|
|
1304
|
-
const fmBody = content.slice(fm.linesStart, fm.linesEnd);
|
|
1331
|
+
const fmBody = content.slice(fm.range.linesStart, fm.range.linesEnd);
|
|
1305
1332
|
if (REGEX.SOURCE_KEY.test(fmBody))
|
|
1306
1333
|
return content;
|
|
1307
1334
|
const escapedUrl = url.replace(/"/g, '\\"');
|
|
1308
|
-
const injection = `source: "${escapedUrl}"${fm.lineEnding}`;
|
|
1309
|
-
return content.slice(0, fm.linesEnd) +
|
|
1335
|
+
const injection = `source: "${escapedUrl}"${fm.range.lineEnding}`;
|
|
1336
|
+
return (content.slice(0, fm.range.linesEnd) +
|
|
1337
|
+
injection +
|
|
1338
|
+
content.slice(fm.range.linesEnd));
|
|
1310
1339
|
}
|
|
1340
|
+
// endregion
|
|
1341
|
+
// region Content Detection & Metadata Footer
|
|
1311
1342
|
function countCommonTags(content, limit) {
|
|
1312
1343
|
if (limit <= 0)
|
|
1313
1344
|
return 0;
|
|
@@ -1324,10 +1355,10 @@ export function isRawTextContent(content) {
|
|
|
1324
1355
|
const trimmed = content.trim();
|
|
1325
1356
|
if (REGEX.HTML_DOC_START.test(trimmed))
|
|
1326
1357
|
return false;
|
|
1327
|
-
if (
|
|
1358
|
+
if (parseFrontmatter(trimmed) !== null)
|
|
1328
1359
|
return true;
|
|
1329
|
-
const tagCount = countCommonTags(content,
|
|
1330
|
-
if (tagCount >
|
|
1360
|
+
const tagCount = countCommonTags(content, HTML_TAG_DENSITY_LIMIT);
|
|
1361
|
+
if (tagCount > HTML_TAG_DENSITY_LIMIT)
|
|
1331
1362
|
return false;
|
|
1332
1363
|
return (REGEX.HEADING_MARKER.test(content) ||
|
|
1333
1364
|
REGEX.LIST_MARKER.test(content) ||
|
|
@@ -1365,3 +1396,4 @@ export function buildMetadataFooter(metadata, fallbackUrl) {
|
|
|
1365
1396
|
lines.push(` <sub>${metadata.description}</sub>`);
|
|
1366
1397
|
return lines.join('\n');
|
|
1367
1398
|
}
|
|
1399
|
+
// endregion
|