@j0hanz/superfetch 2.5.2 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +356 -223
- package/dist/assets/logo.svg +24837 -24835
- package/dist/cache.d.ts +28 -20
- package/dist/cache.js +292 -514
- package/dist/config.d.ts +41 -7
- package/dist/config.js +298 -148
- package/dist/crypto.js +25 -12
- package/dist/dom-noise-removal.js +379 -421
- package/dist/errors.d.ts +2 -2
- package/dist/errors.js +25 -8
- package/dist/fetch.d.ts +18 -16
- package/dist/fetch.js +1132 -526
- package/dist/host-normalization.js +40 -10
- package/dist/http-native.js +628 -287
- package/dist/index.js +67 -7
- package/dist/instructions.md +44 -30
- package/dist/ip-blocklist.d.ts +8 -0
- package/dist/ip-blocklist.js +65 -0
- package/dist/json.js +14 -9
- package/dist/language-detection.d.ts +2 -11
- package/dist/language-detection.js +289 -280
- package/dist/markdown-cleanup.d.ts +0 -1
- package/dist/markdown-cleanup.js +391 -429
- package/dist/mcp-validator.js +4 -2
- package/dist/mcp.js +184 -135
- package/dist/observability.js +89 -21
- package/dist/resources.js +16 -6
- package/dist/server-tuning.d.ts +2 -0
- package/dist/server-tuning.js +25 -23
- package/dist/session.d.ts +1 -0
- package/dist/session.js +41 -33
- package/dist/tasks.d.ts +2 -0
- package/dist/tasks.js +91 -9
- package/dist/timer-utils.d.ts +5 -0
- package/dist/timer-utils.js +20 -0
- package/dist/tools.d.ts +28 -5
- package/dist/tools.js +317 -183
- package/dist/transform-types.d.ts +5 -1
- package/dist/transform.d.ts +3 -2
- package/dist/transform.js +1138 -421
- package/dist/type-guards.d.ts +1 -0
- package/dist/type-guards.js +7 -0
- package/dist/workers/transform-child.d.ts +1 -0
- package/dist/workers/transform-child.js +118 -0
- package/dist/workers/transform-worker.js +87 -78
- package/package.json +21 -13
package/dist/markdown-cleanup.js
CHANGED
|
@@ -1,483 +1,448 @@
|
|
|
1
1
|
import { config } from './config.js';
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
2
|
+
// --- Constants & Regex ---
|
|
3
|
+
const MAX_LINE_LENGTH = 80;
|
|
4
|
+
const REGEX = {
|
|
5
|
+
HEADING_MARKER: /^#{1,6}\s/m,
|
|
6
|
+
HEADING_STRICT: /^#{1,6}\s+/m,
|
|
7
|
+
EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/,
|
|
8
|
+
FENCE_START: /^\s*(`{3,}|~{3,})/,
|
|
9
|
+
LIST_MARKER: /^(?:[-*+])\s/m,
|
|
10
|
+
TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/,
|
|
11
|
+
TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents)\s*$/i,
|
|
12
|
+
HTML_DOC_START: /^(<!doctype|<html)/i,
|
|
13
|
+
COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??)\s*$/gim,
|
|
14
|
+
ZERO_WIDTH_ANCHOR: /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g,
|
|
15
|
+
CONCATENATED_PROPS: /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g,
|
|
16
|
+
DOUBLE_NEWLINE_REDUCER: /\n{3,}/g,
|
|
17
|
+
SOURCE_KEY: /^source:\s/im,
|
|
18
|
+
HEADING_SPACING: /(^#{1,6}\s[^\n]*)\n([^\n])/gm,
|
|
19
|
+
HEADING_CODE_BLOCK: /(^#{1,6}\s+\w+)```/gm,
|
|
20
|
+
HEADING_CAMEL_CASE: /(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm,
|
|
21
|
+
SPACING_LINK_FIX: /\]\(([^)]+)\)\[/g,
|
|
22
|
+
SPACING_ADJ_COMBINED: /(?:\]\([^)]+\)|`[^`]+`)(?=[A-Za-z0-9])/g,
|
|
23
|
+
SPACING_CODE_DASH: /(`[^`]+`)\s*\\-\s*/g,
|
|
24
|
+
SPACING_ESCAPES: /\\([[\].])/g,
|
|
25
|
+
SPACING_URL_ENC: /\]\([^)]*%5[Ff][^)]*\)/g,
|
|
26
|
+
SPACING_LIST_NUM_COMBINED: /^((?![-*+] |\d+\. |[ \t]).+)\n((?:[-*+]|\d+\.) )/gm,
|
|
27
|
+
TYPEDOC: /(`+)(?:(?!\1)[\s\S])*?\1|\s?\/\\?\*[\s\S]*?\\?\*\//g,
|
|
28
|
+
};
|
|
29
|
+
const HEADING_KEYWORDS = new Set(config.markdownCleanup.headingKeywords.map((value) => value.toLocaleLowerCase(config.i18n.locale)));
|
|
30
|
+
const SPECIAL_PREFIXES = /^(?:example|note|tip|warning|important|caution):\s+\S/i;
|
|
31
|
+
// --- Helper Functions ---
|
|
32
|
+
function getLineEnding(content) {
|
|
33
|
+
return content.includes('\r\n') ? '\r\n' : '\n';
|
|
31
34
|
}
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
const
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
for (const line of lines) {
|
|
40
|
-
// Transition into fence: flush outside segment first.
|
|
41
|
-
if (!state.inFence && isFenceStart(line)) {
|
|
42
|
-
if (current.length > 0) {
|
|
43
|
-
segments.push({
|
|
44
|
-
content: current.join('\n'),
|
|
45
|
-
inFence: currentIsFence,
|
|
46
|
-
});
|
|
47
|
-
current = [];
|
|
48
|
-
}
|
|
49
|
-
currentIsFence = true;
|
|
50
|
-
current.push(line);
|
|
51
|
-
advanceFenceState(line, state);
|
|
52
|
-
continue;
|
|
53
|
-
}
|
|
54
|
-
current.push(line);
|
|
55
|
-
const wasInFence = state.inFence;
|
|
56
|
-
advanceFenceState(line, state);
|
|
57
|
-
// Transition out of fence: flush fence segment.
|
|
58
|
-
if (wasInFence && !state.inFence) {
|
|
59
|
-
segments.push({ content: current.join('\n'), inFence: true });
|
|
60
|
-
current = [];
|
|
61
|
-
currentIsFence = false;
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
if (current.length > 0) {
|
|
65
|
-
segments.push({ content: current.join('\n'), inFence: currentIsFence });
|
|
66
|
-
}
|
|
67
|
-
return segments;
|
|
35
|
+
function hasFollowingContent(lines, startIndex) {
|
|
36
|
+
// Optimization: Bound lookahead to avoid checking too many lines in huge files
|
|
37
|
+
const max = Math.min(lines.length, startIndex + 50);
|
|
38
|
+
for (let i = startIndex + 1; i < max; i++) {
|
|
39
|
+
const line = lines[i];
|
|
40
|
+
if (line && line.trim().length > 0)
|
|
41
|
+
return true;
|
|
68
42
|
}
|
|
43
|
+
return false;
|
|
69
44
|
}
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
'
|
|
77
|
-
|
|
78
|
-
'conclusion',
|
|
79
|
-
'prerequisites',
|
|
80
|
-
'requirements',
|
|
81
|
-
'installation',
|
|
82
|
-
'configuration',
|
|
83
|
-
'usage',
|
|
84
|
-
'features',
|
|
85
|
-
'limitations',
|
|
86
|
-
'troubleshooting',
|
|
87
|
-
'faq',
|
|
88
|
-
'resources',
|
|
89
|
-
'references',
|
|
90
|
-
'changelog',
|
|
91
|
-
'license',
|
|
92
|
-
'acknowledgments',
|
|
93
|
-
'appendix',
|
|
94
|
-
]);
|
|
95
|
-
class OrphanHeadingPromoter {
|
|
96
|
-
shouldPromote(line, prevLine) {
|
|
97
|
-
const isPrecededByBlank = prevLine.trim() === '';
|
|
98
|
-
if (!isPrecededByBlank)
|
|
45
|
+
// Optimized Heuristics
|
|
46
|
+
function isTitleCaseOrKeyword(trimmed) {
|
|
47
|
+
// Quick check for length to avoid regex on long strings
|
|
48
|
+
if (trimmed.length > MAX_LINE_LENGTH)
|
|
49
|
+
return false;
|
|
50
|
+
// Single word optimization
|
|
51
|
+
if (!trimmed.includes(' ')) {
|
|
52
|
+
if (!/^[A-Z]/.test(trimmed))
|
|
99
53
|
return false;
|
|
100
|
-
return
|
|
54
|
+
return HEADING_KEYWORDS.has(trimmed.toLocaleLowerCase(config.i18n.locale));
|
|
101
55
|
}
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
return
|
|
56
|
+
// Split limited number of words
|
|
57
|
+
const words = trimmed.split(/\s+/);
|
|
58
|
+
const len = words.length;
|
|
59
|
+
if (len < 2 || len > 6)
|
|
60
|
+
return false;
|
|
61
|
+
let capitalizedCount = 0;
|
|
62
|
+
for (let i = 0; i < len; i++) {
|
|
63
|
+
const w = words[i];
|
|
64
|
+
if (!w)
|
|
65
|
+
continue;
|
|
66
|
+
const isCap = /^[A-Z][a-z]*$/.test(w);
|
|
67
|
+
if (isCap)
|
|
68
|
+
capitalizedCount++;
|
|
69
|
+
else if (!/^(?:and|or|the|of|in|for|to|a)$/i.test(w))
|
|
70
|
+
return false;
|
|
107
71
|
}
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
72
|
+
return capitalizedCount >= 2;
|
|
73
|
+
}
|
|
74
|
+
function getHeadingPrefix(trimmed) {
|
|
75
|
+
if (trimmed.length > MAX_LINE_LENGTH)
|
|
76
|
+
return null;
|
|
77
|
+
// Fast path: Check common markdown markers first
|
|
78
|
+
const firstChar = trimmed.charCodeAt(0);
|
|
79
|
+
// # (35), - (45), * (42), + (43), digit (48-57), [ (91)
|
|
80
|
+
if (firstChar === 35 ||
|
|
81
|
+
firstChar === 45 ||
|
|
82
|
+
firstChar === 42 ||
|
|
83
|
+
firstChar === 43 ||
|
|
84
|
+
firstChar === 91 ||
|
|
85
|
+
(firstChar >= 48 && firstChar <= 57)) {
|
|
86
|
+
if (REGEX.HEADING_MARKER.test(trimmed) ||
|
|
87
|
+
REGEX.LIST_MARKER.test(trimmed) ||
|
|
88
|
+
/^\d+\.\s/.test(trimmed) ||
|
|
89
|
+
/^\[.*\]\(.*\)$/.test(trimmed)) {
|
|
90
|
+
return null;
|
|
111
91
|
}
|
|
112
|
-
return line;
|
|
113
92
|
}
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
93
|
+
if (SPECIAL_PREFIXES.test(trimmed)) {
|
|
94
|
+
return /^example:\s/i.test(trimmed) ? '### ' : '## ';
|
|
95
|
+
}
|
|
96
|
+
const lastChar = trimmed.charCodeAt(trimmed.length - 1);
|
|
97
|
+
// . (46), ! (33), ? (63)
|
|
98
|
+
if (lastChar === 46 || lastChar === 33 || lastChar === 63)
|
|
99
|
+
return null;
|
|
100
|
+
return isTitleCaseOrKeyword(trimmed) ? '## ' : null;
|
|
101
|
+
}
|
|
102
|
+
// Optimized TOC detection
|
|
103
|
+
function hasTocBlock(lines, headingIndex) {
|
|
104
|
+
const lookaheadMax = Math.min(lines.length, headingIndex + 8);
|
|
105
|
+
for (let i = headingIndex + 1; i < lookaheadMax; i++) {
|
|
106
|
+
const line = lines[i];
|
|
107
|
+
if (!line || line.trim().length === 0)
|
|
108
|
+
continue;
|
|
109
|
+
if (REGEX.TOC_LINK.test(line))
|
|
127
110
|
return true;
|
|
128
|
-
}
|
|
129
|
-
const words = trimmed.split(/\s+/);
|
|
130
|
-
if (words.length >= 2 && words.length <= 6) {
|
|
131
|
-
const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
|
|
132
|
-
if (isTitleCase)
|
|
133
|
-
return true;
|
|
134
|
-
}
|
|
135
|
-
if (words.length === 1) {
|
|
136
|
-
const lower = trimmed.toLowerCase();
|
|
137
|
-
if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed))
|
|
138
|
-
return true;
|
|
139
|
-
}
|
|
140
|
-
return false;
|
|
141
111
|
}
|
|
112
|
+
return false;
|
|
142
113
|
}
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
return _match;
|
|
155
|
-
const trimmedPrefix = prefix.trim();
|
|
156
|
-
if (trimmedPrefix === '') {
|
|
157
|
-
return `${hashes} ${heading}\n\n`;
|
|
158
|
-
}
|
|
159
|
-
return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
|
|
160
|
-
});
|
|
161
|
-
}
|
|
162
|
-
function removeSkipLinksAndEmptyAnchors(text) {
|
|
163
|
-
const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g;
|
|
164
|
-
return text
|
|
165
|
-
.replace(zeroWidthAnchorLink, '')
|
|
166
|
-
.replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '')
|
|
167
|
-
.replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '')
|
|
168
|
-
.replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
|
|
114
|
+
function skipTocLines(lines, startIndex) {
|
|
115
|
+
for (let i = startIndex; i < lines.length; i++) {
|
|
116
|
+
const line = lines[i];
|
|
117
|
+
if (!line)
|
|
118
|
+
continue;
|
|
119
|
+
if (line.trim().length === 0)
|
|
120
|
+
continue;
|
|
121
|
+
if (!REGEX.TOC_LINK.test(line))
|
|
122
|
+
return i;
|
|
123
|
+
}
|
|
124
|
+
return lines.length;
|
|
169
125
|
}
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
126
|
+
// --- Main Processing Logic ---
|
|
127
|
+
function tryPromoteOrphan(lines, i, trimmed) {
|
|
128
|
+
const prevLine = lines[i - 1];
|
|
129
|
+
const isOrphan = i === 0 || !prevLine || prevLine.trim().length === 0;
|
|
130
|
+
if (!isOrphan)
|
|
131
|
+
return null;
|
|
132
|
+
const prefix = getHeadingPrefix(trimmed);
|
|
133
|
+
if (!prefix)
|
|
134
|
+
return null;
|
|
135
|
+
const isTitleCaseOnly = prefix === '## ' &&
|
|
136
|
+
!SPECIAL_PREFIXES.test(trimmed) &&
|
|
137
|
+
trimmed.includes(' ');
|
|
138
|
+
if (isTitleCaseOnly && !hasFollowingContent(lines, i))
|
|
139
|
+
return null;
|
|
140
|
+
return `${prefix}${trimmed}`;
|
|
177
141
|
}
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
const
|
|
186
|
-
const
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
if (
|
|
201
|
-
|
|
202
|
-
skipping = false;
|
|
203
|
-
}
|
|
142
|
+
function shouldSkipAsToc(lines, i, trimmed, removeToc) {
|
|
143
|
+
if (removeToc && REGEX.TOC_HEADING.test(trimmed) && hasTocBlock(lines, i)) {
|
|
144
|
+
return skipTocLines(lines, i + 1);
|
|
145
|
+
}
|
|
146
|
+
return null;
|
|
147
|
+
}
|
|
148
|
+
function preprocessLines(lines) {
|
|
149
|
+
const processedLines = [];
|
|
150
|
+
const len = lines.length;
|
|
151
|
+
const promote = config.markdownCleanup.promoteOrphanHeadings;
|
|
152
|
+
const removeToc = config.markdownCleanup.removeTocBlocks;
|
|
153
|
+
let skipUntil = -1;
|
|
154
|
+
for (let i = 0; i < len; i++) {
|
|
155
|
+
if (i < skipUntil)
|
|
156
|
+
continue;
|
|
157
|
+
let line = lines[i];
|
|
158
|
+
if (line === undefined)
|
|
159
|
+
continue;
|
|
160
|
+
const trimmed = line.trim();
|
|
161
|
+
if (REGEX.EMPTY_HEADING_LINE.test(trimmed))
|
|
162
|
+
continue;
|
|
163
|
+
const tocSkip = shouldSkipAsToc(lines, i, trimmed, removeToc);
|
|
164
|
+
if (tocSkip !== null) {
|
|
165
|
+
skipUntil = tocSkip;
|
|
204
166
|
continue;
|
|
205
167
|
}
|
|
206
|
-
|
|
168
|
+
if (promote && trimmed.length > 0) {
|
|
169
|
+
const promoted = tryPromoteOrphan(lines, i, trimmed);
|
|
170
|
+
if (promoted)
|
|
171
|
+
line = promoted;
|
|
172
|
+
}
|
|
173
|
+
processedLines.push(line);
|
|
207
174
|
}
|
|
208
|
-
return
|
|
209
|
-
}
|
|
210
|
-
function tidyLinksAndEscapes(text) {
|
|
211
|
-
return text
|
|
212
|
-
.replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[')
|
|
213
|
-
.replace(/^Was this page helpful\??\s*$/gim, '')
|
|
214
|
-
.replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ')
|
|
215
|
-
.replace(/\\([[]])/g, '$1');
|
|
175
|
+
return processedLines.join('\n');
|
|
216
176
|
}
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
return text
|
|
177
|
+
// Process a block of non-fence lines
|
|
178
|
+
function processTextBuffer(lines) {
|
|
179
|
+
if (lines.length === 0)
|
|
180
|
+
return '';
|
|
181
|
+
const text = preprocessLines(lines);
|
|
182
|
+
return applyGlobalRegexes(text);
|
|
223
183
|
}
|
|
224
|
-
function
|
|
225
|
-
const quotedValuePattern = /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g;
|
|
184
|
+
function applyGlobalRegexes(text) {
|
|
226
185
|
let result = text;
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
186
|
+
// fixAndSpaceHeadings
|
|
187
|
+
result = result
|
|
188
|
+
.replace(REGEX.HEADING_SPACING, '$1\n\n$2')
|
|
189
|
+
.replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```')
|
|
190
|
+
.replace(REGEX.HEADING_CAMEL_CASE, '$1\n\n$2');
|
|
191
|
+
// removeTypeDocComments
|
|
192
|
+
if (config.markdownCleanup.removeTypeDocComments) {
|
|
193
|
+
result = result.replace(REGEX.TYPEDOC, (match) => match.startsWith('`') ? match : '');
|
|
194
|
+
}
|
|
195
|
+
if (config.markdownCleanup.removeSkipLinks) {
|
|
196
|
+
result = result
|
|
197
|
+
.replace(REGEX.ZERO_WIDTH_ANCHOR, '')
|
|
198
|
+
.replace(REGEX.COMBINED_LINE_REMOVALS, '');
|
|
199
|
+
}
|
|
200
|
+
// normalizeSpacing
|
|
201
|
+
result = result
|
|
202
|
+
.replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
|
|
203
|
+
.replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
|
|
204
|
+
.replace(REGEX.SPACING_CODE_DASH, '$1 - ')
|
|
205
|
+
.replace(REGEX.SPACING_ESCAPES, '$1')
|
|
206
|
+
.replace(REGEX.SPACING_URL_ENC, (m) => m.replace(/%5[Ff]/g, '_'))
|
|
207
|
+
.replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
|
|
208
|
+
.replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
|
|
209
|
+
// fixProperties
|
|
210
|
+
for (let k = 0; k < 3; k++) {
|
|
211
|
+
const next = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
|
|
212
|
+
if (next === result)
|
|
233
213
|
break;
|
|
234
|
-
|
|
235
|
-
iterations++;
|
|
214
|
+
result = next;
|
|
236
215
|
}
|
|
237
216
|
return result;
|
|
238
217
|
}
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
tidyLinksAndEscapes,
|
|
246
|
-
normalizeListsAndSpacing,
|
|
247
|
-
fixConcatenatedProperties,
|
|
248
|
-
];
|
|
249
|
-
function getLastLine(text) {
|
|
250
|
-
const index = text.lastIndexOf('\n');
|
|
251
|
-
return index === -1 ? text : text.slice(index + 1);
|
|
252
|
-
}
|
|
253
|
-
class MarkdownCleanupPipeline {
|
|
254
|
-
cleanup(markdown) {
|
|
255
|
-
if (!markdown)
|
|
256
|
-
return '';
|
|
257
|
-
const segments = fencedSegmenter.split(markdown);
|
|
258
|
-
const cleaned = segments
|
|
259
|
-
.map((seg, index) => {
|
|
260
|
-
if (seg.inFence)
|
|
261
|
-
return seg.content;
|
|
262
|
-
const prevSeg = segments[index - 1];
|
|
263
|
-
const prevLineContext = prevSeg ? getLastLine(prevSeg.content) : '';
|
|
264
|
-
const lines = seg.content.split('\n');
|
|
265
|
-
const promotedLines = [];
|
|
266
|
-
for (let i = 0; i < lines.length; i += 1) {
|
|
267
|
-
const line = lines[i] ?? '';
|
|
268
|
-
const prevLine = i > 0 ? (lines[i - 1] ?? '') : prevLineContext;
|
|
269
|
-
promotedLines.push(orphanHeadingPromoter.processLine(line, prevLine));
|
|
270
|
-
}
|
|
271
|
-
const promoted = promotedLines.join('\n');
|
|
272
|
-
return CLEANUP_STEPS.reduce((text, step) => step(text), promoted);
|
|
273
|
-
})
|
|
274
|
-
.join('\n')
|
|
275
|
-
.trim();
|
|
276
|
-
return cleaned;
|
|
218
|
+
function findNextLine(content, lastIndex, len) {
|
|
219
|
+
let nextIndex = content.indexOf('\n', lastIndex);
|
|
220
|
+
let line;
|
|
221
|
+
if (nextIndex === -1) {
|
|
222
|
+
line = content.slice(lastIndex);
|
|
223
|
+
nextIndex = len;
|
|
277
224
|
}
|
|
225
|
+
else {
|
|
226
|
+
if (nextIndex > lastIndex && content.charCodeAt(nextIndex - 1) === 13) {
|
|
227
|
+
line = content.slice(lastIndex, nextIndex - 1);
|
|
228
|
+
}
|
|
229
|
+
else {
|
|
230
|
+
line = content.slice(lastIndex, nextIndex);
|
|
231
|
+
}
|
|
232
|
+
nextIndex++; // Skip \n
|
|
233
|
+
}
|
|
234
|
+
return { line, nextIndex };
|
|
278
235
|
}
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
return
|
|
282
|
-
}
|
|
283
|
-
/* -------------------------------------------------------------------------------------------------
|
|
284
|
-
* Raw markdown handling + metadata footer
|
|
285
|
-
* ------------------------------------------------------------------------------------------------- */
|
|
286
|
-
const HEADING_PATTERN = /^#{1,6}\s/m;
|
|
287
|
-
const LIST_PATTERN = /^(?:[-*+])\s/m;
|
|
288
|
-
const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
|
|
289
|
-
function containsMarkdownHeading(content) {
|
|
290
|
-
return HEADING_PATTERN.test(content);
|
|
291
|
-
}
|
|
292
|
-
function containsMarkdownList(content) {
|
|
293
|
-
return LIST_PATTERN.test(content);
|
|
236
|
+
function checkFenceStart(line) {
|
|
237
|
+
const match = REGEX.FENCE_START.exec(line);
|
|
238
|
+
return match ? (match[1] ?? '```') : null;
|
|
294
239
|
}
|
|
295
|
-
function
|
|
296
|
-
|
|
297
|
-
if (first === -1)
|
|
298
|
-
return false;
|
|
299
|
-
return content.includes('```', first + 3);
|
|
240
|
+
function isFenceClosure(trimmed, marker) {
|
|
241
|
+
return (trimmed.startsWith(marker) && trimmed.slice(marker.length).trim() === '');
|
|
300
242
|
}
|
|
301
|
-
function
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
containsFencedCodeBlock(content));
|
|
243
|
+
function handleFencedLine(line, trimmed, fenceMarker, segments) {
|
|
244
|
+
segments.push(line);
|
|
245
|
+
return isFenceClosure(trimmed, fenceMarker) ? null : fenceMarker;
|
|
305
246
|
}
|
|
306
|
-
function
|
|
307
|
-
|
|
247
|
+
function handleUnfencedLine(line, segments, buffer) {
|
|
248
|
+
const newMarker = checkFenceStart(line);
|
|
249
|
+
if (!newMarker) {
|
|
250
|
+
buffer.push(line);
|
|
251
|
+
return { fenceMarker: null, buffer };
|
|
252
|
+
}
|
|
253
|
+
if (buffer.length > 0) {
|
|
254
|
+
segments.push(processTextBuffer(buffer));
|
|
255
|
+
buffer = [];
|
|
256
|
+
}
|
|
257
|
+
segments.push(line);
|
|
258
|
+
return { fenceMarker: newMarker, buffer };
|
|
308
259
|
}
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
260
|
+
export function cleanupMarkdownArtifacts(content) {
|
|
261
|
+
if (!content)
|
|
262
|
+
return '';
|
|
263
|
+
const len = content.length;
|
|
264
|
+
let lastIndex = 0;
|
|
265
|
+
let fenceMarker = null;
|
|
266
|
+
const segments = [];
|
|
267
|
+
let buffer = [];
|
|
268
|
+
while (lastIndex < len) {
|
|
269
|
+
const { line, nextIndex } = findNextLine(content, lastIndex, len);
|
|
270
|
+
const trimmed = line.trimStart();
|
|
271
|
+
if (fenceMarker) {
|
|
272
|
+
fenceMarker = handleFencedLine(line, trimmed, fenceMarker, segments);
|
|
273
|
+
}
|
|
274
|
+
else {
|
|
275
|
+
({ fenceMarker, buffer } = handleUnfencedLine(line, segments, buffer));
|
|
276
|
+
}
|
|
277
|
+
lastIndex = nextIndex;
|
|
320
278
|
}
|
|
321
|
-
|
|
322
|
-
|
|
279
|
+
if (buffer.length > 0) {
|
|
280
|
+
segments.push(processTextBuffer(buffer));
|
|
323
281
|
}
|
|
282
|
+
return segments.join('\n').trim();
|
|
324
283
|
}
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
284
|
+
function detectFrontmatter(content) {
|
|
285
|
+
const len = content.length;
|
|
286
|
+
if (len < 4)
|
|
287
|
+
return null;
|
|
288
|
+
let lineEnding = null;
|
|
289
|
+
let fenceLen = 0;
|
|
290
|
+
if (content.startsWith('---\n')) {
|
|
291
|
+
lineEnding = '\n';
|
|
292
|
+
fenceLen = 4;
|
|
334
293
|
}
|
|
335
|
-
|
|
294
|
+
else if (content.startsWith('---\r\n')) {
|
|
295
|
+
lineEnding = '\r\n';
|
|
296
|
+
fenceLen = 5;
|
|
297
|
+
}
|
|
298
|
+
if (!lineEnding)
|
|
299
|
+
return null;
|
|
300
|
+
const fence = `---${lineEnding}`;
|
|
301
|
+
const closeIndex = content.indexOf(fence, fenceLen);
|
|
302
|
+
if (closeIndex === -1)
|
|
303
|
+
return null;
|
|
304
|
+
return {
|
|
305
|
+
start: 0,
|
|
306
|
+
end: closeIndex + fenceLen,
|
|
307
|
+
linesStart: fenceLen,
|
|
308
|
+
linesEnd: closeIndex,
|
|
309
|
+
lineEnding,
|
|
310
|
+
};
|
|
336
311
|
}
|
|
337
312
|
function parseFrontmatterEntry(line) {
|
|
338
313
|
const trimmed = line.trim();
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
const separatorIndex = trimmed.indexOf(':');
|
|
342
|
-
if (separatorIndex <= 0)
|
|
314
|
+
const idx = trimmed.indexOf(':');
|
|
315
|
+
if (!trimmed || idx <= 0)
|
|
343
316
|
return null;
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
317
|
+
return {
|
|
318
|
+
key: trimmed.slice(0, idx).trim().toLowerCase(),
|
|
319
|
+
value: trimmed.slice(idx + 1).trim(),
|
|
320
|
+
};
|
|
321
|
+
}
|
|
322
|
+
function stripFrontmatterQuotes(val) {
|
|
323
|
+
const first = val.charAt(0);
|
|
324
|
+
const last = val.charAt(val.length - 1);
|
|
325
|
+
if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
|
|
326
|
+
return val.slice(1, -1).trim();
|
|
327
|
+
}
|
|
328
|
+
return val;
|
|
329
|
+
}
|
|
330
|
+
function scanFrontmatterForTitle(content, fm) {
|
|
331
|
+
const fmBody = content.slice(fm.linesStart, fm.linesEnd);
|
|
332
|
+
let lastIdx = 0;
|
|
333
|
+
while (lastIdx < fmBody.length) {
|
|
334
|
+
let nextIdx = fmBody.indexOf(fm.lineEnding, lastIdx);
|
|
335
|
+
if (nextIdx === -1)
|
|
336
|
+
nextIdx = fmBody.length;
|
|
337
|
+
const line = fmBody.slice(lastIdx, nextIdx);
|
|
338
|
+
const entry = parseFrontmatterEntry(line);
|
|
339
|
+
if (entry) {
|
|
340
|
+
if (entry.key === 'title' || entry.key === 'name') {
|
|
341
|
+
const cleaned = stripFrontmatterQuotes(entry.value);
|
|
342
|
+
if (cleaned)
|
|
343
|
+
return cleaned;
|
|
344
|
+
}
|
|
361
345
|
}
|
|
362
|
-
|
|
363
|
-
return undefined;
|
|
364
|
-
const nextChar = trimmed[index];
|
|
365
|
-
if (nextChar !== ' ' && nextChar !== '\t')
|
|
366
|
-
return undefined;
|
|
367
|
-
const heading = trimmed.slice(index).trim();
|
|
368
|
-
return heading.length > 0 ? heading : undefined;
|
|
346
|
+
lastIdx = nextIdx + fm.lineEnding.length;
|
|
369
347
|
}
|
|
370
348
|
return undefined;
|
|
371
349
|
}
|
|
372
|
-
|
|
373
|
-
const
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
const limit = Math.min(lines.length, 50);
|
|
391
|
-
for (let index = 0; index < limit; index += 1) {
|
|
392
|
-
const line = lines[index];
|
|
393
|
-
if (!line)
|
|
394
|
-
continue;
|
|
395
|
-
if (line.trimStart().toLowerCase().startsWith('source:')) {
|
|
396
|
-
return true;
|
|
350
|
+
function scanBodyForTitle(content) {
|
|
351
|
+
const len = content.length;
|
|
352
|
+
let scanIndex = 0;
|
|
353
|
+
const LIMIT = 5000;
|
|
354
|
+
const maxScan = Math.min(len, LIMIT);
|
|
355
|
+
while (scanIndex < maxScan) {
|
|
356
|
+
let nextIndex = content.indexOf('\n', scanIndex);
|
|
357
|
+
if (nextIndex === -1)
|
|
358
|
+
nextIndex = len;
|
|
359
|
+
let line = content.slice(scanIndex, nextIndex);
|
|
360
|
+
if (line.endsWith('\r'))
|
|
361
|
+
line = line.slice(0, -1);
|
|
362
|
+
const trimmed = line.trim();
|
|
363
|
+
if (trimmed) {
|
|
364
|
+
if (REGEX.HEADING_STRICT.test(trimmed)) {
|
|
365
|
+
return trimmed.replace(REGEX.HEADING_MARKER, '').trim() || undefined;
|
|
366
|
+
}
|
|
367
|
+
return undefined;
|
|
397
368
|
}
|
|
369
|
+
scanIndex = nextIndex + 1;
|
|
398
370
|
}
|
|
399
|
-
return
|
|
371
|
+
return undefined;
|
|
400
372
|
}
|
|
401
|
-
function
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
if (firstNonEmptyIndex !== -1) {
|
|
408
|
-
const firstLine = lines[firstNonEmptyIndex];
|
|
409
|
-
if (firstLine && /^#{1,6}\s+/.test(firstLine.trim())) {
|
|
410
|
-
const insertAt = firstNonEmptyIndex + 1;
|
|
411
|
-
const updated = [
|
|
412
|
-
...lines.slice(0, insertAt),
|
|
413
|
-
'',
|
|
414
|
-
`Source: ${url}`,
|
|
415
|
-
'',
|
|
416
|
-
...lines.slice(insertAt),
|
|
417
|
-
];
|
|
418
|
-
return updated.join(lineEnding);
|
|
419
|
-
}
|
|
373
|
+
export function extractTitleFromRawMarkdown(content) {
|
|
374
|
+
const fm = detectFrontmatter(content);
|
|
375
|
+
if (fm) {
|
|
376
|
+
const title = scanFrontmatterForTitle(content, fm);
|
|
377
|
+
if (title)
|
|
378
|
+
return title;
|
|
420
379
|
}
|
|
421
|
-
return
|
|
380
|
+
return scanBodyForTitle(content);
|
|
422
381
|
}
|
|
423
382
|
export function addSourceToMarkdown(content, url) {
|
|
424
|
-
const fm =
|
|
425
|
-
|
|
426
|
-
|
|
383
|
+
const fm = detectFrontmatter(content);
|
|
384
|
+
const useMarkdownFormat = config.transform.metadataFormat === 'markdown';
|
|
385
|
+
if (useMarkdownFormat && !fm) {
|
|
386
|
+
if (REGEX.SOURCE_KEY.test(content))
|
|
387
|
+
return content;
|
|
388
|
+
const lineEnding = getLineEnding(content);
|
|
389
|
+
const firstH1Match = REGEX.HEADING_MARKER.exec(content);
|
|
390
|
+
if (firstH1Match) {
|
|
391
|
+
const h1Index = firstH1Match.index;
|
|
392
|
+
const lineEndIndex = content.indexOf(lineEnding, h1Index);
|
|
393
|
+
const insertPos = lineEndIndex === -1 ? content.length : lineEndIndex + lineEnding.length;
|
|
394
|
+
const injection = `${lineEnding}Source: ${url}${lineEnding}`;
|
|
395
|
+
return content.slice(0, insertPos) + injection + content.slice(insertPos);
|
|
396
|
+
}
|
|
397
|
+
return `Source: ${url}${lineEnding}${lineEnding}${content}`;
|
|
427
398
|
}
|
|
428
399
|
if (!fm) {
|
|
429
|
-
|
|
430
|
-
|
|
400
|
+
const lineEnding = getLineEnding(content);
|
|
401
|
+
const escapedUrl = url.replace(/"/g, '\\"');
|
|
402
|
+
return `---${lineEnding}source: "${escapedUrl}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
|
|
431
403
|
}
|
|
432
|
-
const
|
|
433
|
-
|
|
434
|
-
const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
|
|
435
|
-
if (hasSource)
|
|
404
|
+
const fmBody = content.slice(fm.linesStart, fm.linesEnd);
|
|
405
|
+
if (REGEX.SOURCE_KEY.test(fmBody))
|
|
436
406
|
return content;
|
|
437
|
-
const
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
return
|
|
407
|
+
const escapedUrl = url.replace(/"/g, '\\"');
|
|
408
|
+
const injection = `source: "${escapedUrl}"${fm.lineEnding}`;
|
|
409
|
+
return content.slice(0, fm.linesEnd) + injection + content.slice(fm.linesEnd);
|
|
410
|
+
}
|
|
411
|
+
function countCommonTags(content, limit) {
|
|
412
|
+
if (limit <= 0)
|
|
413
|
+
return 0;
|
|
414
|
+
const regex = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
|
|
415
|
+
let count = 0;
|
|
416
|
+
while (regex.exec(content)) {
|
|
417
|
+
count += 1;
|
|
418
|
+
if (count > limit)
|
|
419
|
+
break;
|
|
420
|
+
}
|
|
421
|
+
return count;
|
|
452
422
|
}
|
|
453
423
|
export function isRawTextContent(content) {
|
|
454
424
|
const trimmed = content.trim();
|
|
455
|
-
|
|
456
|
-
const hasMarkdownFrontmatter = frontmatter.hasFrontmatter(trimmed);
|
|
457
|
-
const hasTooManyHtmlTags = countCommonHtmlTags(content) > 2;
|
|
458
|
-
const isMarkdown = looksLikeMarkdown(content);
|
|
459
|
-
return (!isHtmlDocument &&
|
|
460
|
-
(hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
|
|
461
|
-
}
|
|
462
|
-
export function isLikelyHtmlContent(content) {
|
|
463
|
-
const trimmed = content.trim();
|
|
464
|
-
if (!trimmed)
|
|
425
|
+
if (REGEX.HTML_DOC_START.test(trimmed))
|
|
465
426
|
return false;
|
|
466
|
-
if (
|
|
427
|
+
if (detectFrontmatter(trimmed) !== null)
|
|
467
428
|
return true;
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
429
|
+
const tagCount = countCommonTags(content, 5);
|
|
430
|
+
if (tagCount > 5)
|
|
431
|
+
return false;
|
|
432
|
+
return (REGEX.HEADING_MARKER.test(content) ||
|
|
433
|
+
REGEX.LIST_MARKER.test(content) ||
|
|
434
|
+
content.includes('```'));
|
|
435
|
+
}
|
|
436
|
+
function formatFetchedAt(value) {
|
|
437
|
+
const date = new Date(value);
|
|
438
|
+
if (Number.isNaN(date.getTime()))
|
|
439
|
+
return value;
|
|
440
|
+
const formatter = new Intl.DateTimeFormat(config.i18n.locale, {
|
|
441
|
+
day: '2-digit',
|
|
442
|
+
month: '2-digit',
|
|
443
|
+
year: 'numeric',
|
|
444
|
+
});
|
|
445
|
+
return formatter.format(date);
|
|
481
446
|
}
|
|
482
447
|
export function buildMetadataFooter(metadata, fallbackUrl) {
|
|
483
448
|
if (!metadata)
|
|
@@ -492,14 +457,11 @@ export function buildMetadataFooter(metadata, fallbackUrl) {
|
|
|
492
457
|
if (url)
|
|
493
458
|
parts.push(`[_Original Source_](${url})`);
|
|
494
459
|
if (metadata.fetchedAt) {
|
|
495
|
-
|
|
496
|
-
parts.push(`_${formattedDate}_`);
|
|
460
|
+
parts.push(`_${formatFetchedAt(metadata.fetchedAt)}_`);
|
|
497
461
|
}
|
|
498
|
-
if (parts.length > 0)
|
|
462
|
+
if (parts.length > 0)
|
|
499
463
|
lines.push(` ${parts.join(' | ')}`);
|
|
500
|
-
|
|
501
|
-
if (metadata.description) {
|
|
464
|
+
if (metadata.description)
|
|
502
465
|
lines.push(` <sub>${metadata.description}</sub>`);
|
|
503
|
-
}
|
|
504
466
|
return lines.join('\n');
|
|
505
467
|
}
|