@j0hanz/superfetch 2.5.2 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46):
  1. package/README.md +356 -223
  2. package/dist/assets/logo.svg +24837 -24835
  3. package/dist/cache.d.ts +28 -20
  4. package/dist/cache.js +292 -514
  5. package/dist/config.d.ts +41 -7
  6. package/dist/config.js +298 -148
  7. package/dist/crypto.js +25 -12
  8. package/dist/dom-noise-removal.js +379 -421
  9. package/dist/errors.d.ts +2 -2
  10. package/dist/errors.js +25 -8
  11. package/dist/fetch.d.ts +18 -16
  12. package/dist/fetch.js +1132 -526
  13. package/dist/host-normalization.js +40 -10
  14. package/dist/http-native.js +628 -287
  15. package/dist/index.js +67 -7
  16. package/dist/instructions.md +44 -30
  17. package/dist/ip-blocklist.d.ts +8 -0
  18. package/dist/ip-blocklist.js +65 -0
  19. package/dist/json.js +14 -9
  20. package/dist/language-detection.d.ts +2 -11
  21. package/dist/language-detection.js +289 -280
  22. package/dist/markdown-cleanup.d.ts +0 -1
  23. package/dist/markdown-cleanup.js +391 -429
  24. package/dist/mcp-validator.js +4 -2
  25. package/dist/mcp.js +184 -135
  26. package/dist/observability.js +89 -21
  27. package/dist/resources.js +16 -6
  28. package/dist/server-tuning.d.ts +2 -0
  29. package/dist/server-tuning.js +25 -23
  30. package/dist/session.d.ts +1 -0
  31. package/dist/session.js +41 -33
  32. package/dist/tasks.d.ts +2 -0
  33. package/dist/tasks.js +91 -9
  34. package/dist/timer-utils.d.ts +5 -0
  35. package/dist/timer-utils.js +20 -0
  36. package/dist/tools.d.ts +28 -5
  37. package/dist/tools.js +317 -183
  38. package/dist/transform-types.d.ts +5 -1
  39. package/dist/transform.d.ts +3 -2
  40. package/dist/transform.js +1138 -421
  41. package/dist/type-guards.d.ts +1 -0
  42. package/dist/type-guards.js +7 -0
  43. package/dist/workers/transform-child.d.ts +1 -0
  44. package/dist/workers/transform-child.js +118 -0
  45. package/dist/workers/transform-worker.js +87 -78
  46. package/package.json +21 -13
@@ -1,483 +1,448 @@
1
1
  import { config } from './config.js';
2
- /* -------------------------------------------------------------------------------------------------
3
- * Fences
4
- * ------------------------------------------------------------------------------------------------- */
5
- function isFenceStart(line) {
6
- const trimmed = line.trimStart();
7
- return trimmed.startsWith('```') || trimmed.startsWith('~~~');
8
- }
9
- function extractFenceMarker(line) {
10
- const trimmed = line.trimStart();
11
- const match = /^(`{3,}|~{3,})/.exec(trimmed);
12
- return match?.[1] ?? '```';
13
- }
14
- function isFenceEnd(line, marker) {
15
- const trimmed = line.trimStart();
16
- return (trimmed.startsWith(marker) && trimmed.slice(marker.length).trim() === '');
17
- }
18
- function initialFenceState() {
19
- return { inFence: false, marker: '' };
20
- }
21
- function advanceFenceState(line, state) {
22
- if (!state.inFence && isFenceStart(line)) {
23
- state.inFence = true;
24
- state.marker = extractFenceMarker(line);
25
- return;
26
- }
27
- if (state.inFence && isFenceEnd(line, state.marker)) {
28
- state.inFence = false;
29
- state.marker = '';
30
- }
2
+ // --- Constants & Regex ---
3
+ const MAX_LINE_LENGTH = 80;
4
+ const REGEX = {
5
+ HEADING_MARKER: /^#{1,6}\s/m,
6
+ HEADING_STRICT: /^#{1,6}\s+/m,
7
+ EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/,
8
+ FENCE_START: /^\s*(`{3,}|~{3,})/,
9
+ LIST_MARKER: /^(?:[-*+])\s/m,
10
+ TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/,
11
+ TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents)\s*$/i,
12
+ HTML_DOC_START: /^(<!doctype|<html)/i,
13
+ COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??)\s*$/gim,
14
+ ZERO_WIDTH_ANCHOR: /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g,
15
+ CONCATENATED_PROPS: /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g,
16
+ DOUBLE_NEWLINE_REDUCER: /\n{3,}/g,
17
+ SOURCE_KEY: /^source:\s/im,
18
+ HEADING_SPACING: /(^#{1,6}\s[^\n]*)\n([^\n])/gm,
19
+ HEADING_CODE_BLOCK: /(^#{1,6}\s+\w+)```/gm,
20
+ HEADING_CAMEL_CASE: /(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm,
21
+ SPACING_LINK_FIX: /\]\(([^)]+)\)\[/g,
22
+ SPACING_ADJ_COMBINED: /(?:\]\([^)]+\)|`[^`]+`)(?=[A-Za-z0-9])/g,
23
+ SPACING_CODE_DASH: /(`[^`]+`)\s*\\-\s*/g,
24
+ SPACING_ESCAPES: /\\([[\].])/g,
25
+ SPACING_URL_ENC: /\]\([^)]*%5[Ff][^)]*\)/g,
26
+ SPACING_LIST_NUM_COMBINED: /^((?![-*+] |\d+\. |[ \t]).+)\n((?:[-*+]|\d+\.) )/gm,
27
+ TYPEDOC: /(`+)(?:(?!\1)[\s\S])*?\1|\s?\/\\?\*[\s\S]*?\\?\*\//g,
28
+ };
29
+ const HEADING_KEYWORDS = new Set(config.markdownCleanup.headingKeywords.map((value) => value.toLocaleLowerCase(config.i18n.locale)));
30
+ const SPECIAL_PREFIXES = /^(?:example|note|tip|warning|important|caution):\s+\S/i;
31
+ // --- Helper Functions ---
32
+ function getLineEnding(content) {
33
+ return content.includes('\r\n') ? '\r\n' : '\n';
31
34
  }
32
- class FencedSegmenter {
33
- split(content) {
34
- const lines = content.split('\n');
35
- const segments = [];
36
- const state = initialFenceState();
37
- let current = [];
38
- let currentIsFence = false;
39
- for (const line of lines) {
40
- // Transition into fence: flush outside segment first.
41
- if (!state.inFence && isFenceStart(line)) {
42
- if (current.length > 0) {
43
- segments.push({
44
- content: current.join('\n'),
45
- inFence: currentIsFence,
46
- });
47
- current = [];
48
- }
49
- currentIsFence = true;
50
- current.push(line);
51
- advanceFenceState(line, state);
52
- continue;
53
- }
54
- current.push(line);
55
- const wasInFence = state.inFence;
56
- advanceFenceState(line, state);
57
- // Transition out of fence: flush fence segment.
58
- if (wasInFence && !state.inFence) {
59
- segments.push({ content: current.join('\n'), inFence: true });
60
- current = [];
61
- currentIsFence = false;
62
- }
63
- }
64
- if (current.length > 0) {
65
- segments.push({ content: current.join('\n'), inFence: currentIsFence });
66
- }
67
- return segments;
35
+ function hasFollowingContent(lines, startIndex) {
36
+ // Optimization: Bound lookahead to avoid checking too many lines in huge files
37
+ const max = Math.min(lines.length, startIndex + 50);
38
+ for (let i = startIndex + 1; i < max; i++) {
39
+ const line = lines[i];
40
+ if (line && line.trim().length > 0)
41
+ return true;
68
42
  }
43
+ return false;
69
44
  }
70
- const fencedSegmenter = new FencedSegmenter();
71
- /* -------------------------------------------------------------------------------------------------
72
- * Orphan heading promotion
73
- * ------------------------------------------------------------------------------------------------- */
74
- const HEADING_KEYWORDS = new Set([
75
- 'overview',
76
- 'introduction',
77
- 'summary',
78
- 'conclusion',
79
- 'prerequisites',
80
- 'requirements',
81
- 'installation',
82
- 'configuration',
83
- 'usage',
84
- 'features',
85
- 'limitations',
86
- 'troubleshooting',
87
- 'faq',
88
- 'resources',
89
- 'references',
90
- 'changelog',
91
- 'license',
92
- 'acknowledgments',
93
- 'appendix',
94
- ]);
95
- class OrphanHeadingPromoter {
96
- shouldPromote(line, prevLine) {
97
- const isPrecededByBlank = prevLine.trim() === '';
98
- if (!isPrecededByBlank)
45
+ // Optimized Heuristics
46
+ function isTitleCaseOrKeyword(trimmed) {
47
+ // Quick check for length to avoid regex on long strings
48
+ if (trimmed.length > MAX_LINE_LENGTH)
49
+ return false;
50
+ // Single word optimization
51
+ if (!trimmed.includes(' ')) {
52
+ if (!/^[A-Z]/.test(trimmed))
99
53
  return false;
100
- return this.isLikelyHeadingLine(line);
54
+ return HEADING_KEYWORDS.has(trimmed.toLocaleLowerCase(config.i18n.locale));
101
55
  }
102
- format(line) {
103
- const trimmed = line.trim();
104
- const isExample = /^example:\s/i.test(trimmed);
105
- const prefix = isExample ? '### ' : '## ';
106
- return prefix + trimmed;
56
+ // Split limited number of words
57
+ const words = trimmed.split(/\s+/);
58
+ const len = words.length;
59
+ if (len < 2 || len > 6)
60
+ return false;
61
+ let capitalizedCount = 0;
62
+ for (let i = 0; i < len; i++) {
63
+ const w = words[i];
64
+ if (!w)
65
+ continue;
66
+ const isCap = /^[A-Z][a-z]*$/.test(w);
67
+ if (isCap)
68
+ capitalizedCount++;
69
+ else if (!/^(?:and|or|the|of|in|for|to|a)$/i.test(w))
70
+ return false;
107
71
  }
108
- processLine(line, prevLine) {
109
- if (this.shouldPromote(line, prevLine)) {
110
- return this.format(line);
72
+ return capitalizedCount >= 2;
73
+ }
74
+ function getHeadingPrefix(trimmed) {
75
+ if (trimmed.length > MAX_LINE_LENGTH)
76
+ return null;
77
+ // Fast path: Check common markdown markers first
78
+ const firstChar = trimmed.charCodeAt(0);
79
+ // # (35), - (45), * (42), + (43), digit (48-57), [ (91)
80
+ if (firstChar === 35 ||
81
+ firstChar === 45 ||
82
+ firstChar === 42 ||
83
+ firstChar === 43 ||
84
+ firstChar === 91 ||
85
+ (firstChar >= 48 && firstChar <= 57)) {
86
+ if (REGEX.HEADING_MARKER.test(trimmed) ||
87
+ REGEX.LIST_MARKER.test(trimmed) ||
88
+ /^\d+\.\s/.test(trimmed) ||
89
+ /^\[.*\]\(.*\)$/.test(trimmed)) {
90
+ return null;
111
91
  }
112
- return line;
113
92
  }
114
- isLikelyHeadingLine(line) {
115
- const trimmed = line.trim();
116
- if (!trimmed || trimmed.length > 80)
117
- return false;
118
- if (/^#{1,6}\s/.test(trimmed))
119
- return false;
120
- if (/^[-*+•]\s/.test(trimmed) || /^\d+\.\s/.test(trimmed))
121
- return false;
122
- if (/[.!?]$/.test(trimmed))
123
- return false;
124
- if (/^\[.*\]\(.*\)$/.test(trimmed))
125
- return false;
126
- if (/^(?:example|note|tip|warning|important|caution):\s+\S/i.test(trimmed)) {
93
+ if (SPECIAL_PREFIXES.test(trimmed)) {
94
+ return /^example:\s/i.test(trimmed) ? '### ' : '## ';
95
+ }
96
+ const lastChar = trimmed.charCodeAt(trimmed.length - 1);
97
+ // . (46), ! (33), ? (63)
98
+ if (lastChar === 46 || lastChar === 33 || lastChar === 63)
99
+ return null;
100
+ return isTitleCaseOrKeyword(trimmed) ? '## ' : null;
101
+ }
102
+ // Optimized TOC detection
103
+ function hasTocBlock(lines, headingIndex) {
104
+ const lookaheadMax = Math.min(lines.length, headingIndex + 8);
105
+ for (let i = headingIndex + 1; i < lookaheadMax; i++) {
106
+ const line = lines[i];
107
+ if (!line || line.trim().length === 0)
108
+ continue;
109
+ if (REGEX.TOC_LINK.test(line))
127
110
  return true;
128
- }
129
- const words = trimmed.split(/\s+/);
130
- if (words.length >= 2 && words.length <= 6) {
131
- const isTitleCase = words.every((w) => /^[A-Z][a-z]*$/.test(w) || /^(?:and|or|the|of|in|for|to|a)$/i.test(w));
132
- if (isTitleCase)
133
- return true;
134
- }
135
- if (words.length === 1) {
136
- const lower = trimmed.toLowerCase();
137
- if (HEADING_KEYWORDS.has(lower) && /^[A-Z]/.test(trimmed))
138
- return true;
139
- }
140
- return false;
141
111
  }
112
+ return false;
142
113
  }
143
- const orphanHeadingPromoter = new OrphanHeadingPromoter();
144
- /* -------------------------------------------------------------------------------------------------
145
- * Cleanup rules (OUTSIDE fences only)
146
- * ------------------------------------------------------------------------------------------------- */
147
- function removeEmptyHeadings(text) {
148
- return text.replace(/^#{1,6}[ \t\u00A0]*$\r?\n?/gm, '');
149
- }
150
- function fixOrphanHeadings(text) {
151
- // Pattern: hashes on their own line, blank line, then a "heading-like" line.
152
- return text.replace(/^(.*?)(#{1,6})\s*(?:\r?\n){2}([A-Z][^\r\n]+?)(?:\r?\n)/gm, (_match, prefix, hashes, heading) => {
153
- if (heading.length > 150)
154
- return _match;
155
- const trimmedPrefix = prefix.trim();
156
- if (trimmedPrefix === '') {
157
- return `${hashes} ${heading}\n\n`;
158
- }
159
- return `${trimmedPrefix}\n\n${hashes} ${heading}\n\n`;
160
- });
161
- }
162
- function removeSkipLinksAndEmptyAnchors(text) {
163
- const zeroWidthAnchorLink = /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g;
164
- return text
165
- .replace(zeroWidthAnchorLink, '')
166
- .replace(/^\[Skip to (?:main )?content\]\(#[^)]*\)\s*$/gim, '')
167
- .replace(/^\[Skip to (?:main )?navigation\]\(#[^)]*\)\s*$/gim, '')
168
- .replace(/^\[Skip link\]\(#[^)]*\)\s*$/gim, '');
114
+ function skipTocLines(lines, startIndex) {
115
+ for (let i = startIndex; i < lines.length; i++) {
116
+ const line = lines[i];
117
+ if (!line)
118
+ continue;
119
+ if (line.trim().length === 0)
120
+ continue;
121
+ if (!REGEX.TOC_LINK.test(line))
122
+ return i;
123
+ }
124
+ return lines.length;
169
125
  }
170
- function ensureBlankLineAfterHeadings(text) {
171
- // Heading followed immediately by a fence marker
172
- text = text.replace(/(^#{1,6}\s+\w+)```/gm, '$1\n\n```');
173
- // Heuristic: Some converters jam words together after a heading
174
- text = text.replace(/(^#{1,6}\s+\w*[A-Z])([A-Z][a-z])/gm, '$1\n\n$2');
175
- // Any heading line should be followed by a blank line before body
176
- return text.replace(/(^#{1,6}\s[^\n]*)\n([^\n])/gm, '$1\n\n$2');
126
+ // --- Main Processing Logic ---
127
+ function tryPromoteOrphan(lines, i, trimmed) {
128
+ const prevLine = lines[i - 1];
129
+ const isOrphan = i === 0 || !prevLine || prevLine.trim().length === 0;
130
+ if (!isOrphan)
131
+ return null;
132
+ const prefix = getHeadingPrefix(trimmed);
133
+ if (!prefix)
134
+ return null;
135
+ const isTitleCaseOnly = prefix === '## ' &&
136
+ !SPECIAL_PREFIXES.test(trimmed) &&
137
+ trimmed.includes(' ');
138
+ if (isTitleCaseOnly && !hasFollowingContent(lines, i))
139
+ return null;
140
+ return `${prefix}${trimmed}`;
177
141
  }
178
- /**
179
- * Remove markdown TOC blocks of the form:
180
- * - [Title](#anchor)
181
- * outside fenced code blocks.
182
- */
183
- function removeTocBlocks(text) {
184
- const tocLine = /^- \[[^\]]+\]\(#[^)]+\)\s*$/;
185
- const lines = text.split('\n');
186
- const out = [];
187
- let skipping = false;
188
- for (let i = 0; i < lines.length; i += 1) {
189
- const line = lines[i] ?? '';
190
- const prev = i > 0 ? (lines[i - 1] ?? '') : '';
191
- const next = i < lines.length - 1 ? (lines[i + 1] ?? '') : '';
192
- if (tocLine.test(line)) {
193
- const prevIsToc = tocLine.test(prev) || prev.trim() === '';
194
- const nextIsToc = tocLine.test(next) || next.trim() === '';
195
- if (prevIsToc || nextIsToc) {
196
- skipping = true;
197
- continue;
198
- }
199
- }
200
- if (skipping) {
201
- if (line.trim() === '') {
202
- skipping = false;
203
- }
142
+ function shouldSkipAsToc(lines, i, trimmed, removeToc) {
143
+ if (removeToc && REGEX.TOC_HEADING.test(trimmed) && hasTocBlock(lines, i)) {
144
+ return skipTocLines(lines, i + 1);
145
+ }
146
+ return null;
147
+ }
148
+ function preprocessLines(lines) {
149
+ const processedLines = [];
150
+ const len = lines.length;
151
+ const promote = config.markdownCleanup.promoteOrphanHeadings;
152
+ const removeToc = config.markdownCleanup.removeTocBlocks;
153
+ let skipUntil = -1;
154
+ for (let i = 0; i < len; i++) {
155
+ if (i < skipUntil)
156
+ continue;
157
+ let line = lines[i];
158
+ if (line === undefined)
159
+ continue;
160
+ const trimmed = line.trim();
161
+ if (REGEX.EMPTY_HEADING_LINE.test(trimmed))
162
+ continue;
163
+ const tocSkip = shouldSkipAsToc(lines, i, trimmed, removeToc);
164
+ if (tocSkip !== null) {
165
+ skipUntil = tocSkip;
204
166
  continue;
205
167
  }
206
- out.push(line);
168
+ if (promote && trimmed.length > 0) {
169
+ const promoted = tryPromoteOrphan(lines, i, trimmed);
170
+ if (promoted)
171
+ line = promoted;
172
+ }
173
+ processedLines.push(line);
207
174
  }
208
- return out.join('\n');
209
- }
210
- function tidyLinksAndEscapes(text) {
211
- return text
212
- .replace(/\]\(([^)]+)\)\[/g, ']($1)\n\n[')
213
- .replace(/^Was this page helpful\??\s*$/gim, '')
214
- .replace(/(`[^`]+`)\s*\\-\s*/g, '$1 - ')
215
- .replace(/\\([[]])/g, '$1');
175
+ return processedLines.join('\n');
216
176
  }
217
- function normalizeListsAndSpacing(text) {
218
- // Ensure blank line before list starts (bullet/ordered)
219
- text = text.replace(/([^\n])\n([-*+] )/g, '$1\n\n$2');
220
- text = text.replace(/(\S)\n(\d+\. )/g, '$1\n\n$2');
221
- // Collapse excessive blank lines
222
- return text.replace(/\n{3,}/g, '\n\n');
177
+ // Process a block of non-fence lines
178
+ function processTextBuffer(lines) {
179
+ if (lines.length === 0)
180
+ return '';
181
+ const text = preprocessLines(lines);
182
+ return applyGlobalRegexes(text);
223
183
  }
224
- function fixConcatenatedProperties(text) {
225
- const quotedValuePattern = /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g;
184
+ function applyGlobalRegexes(text) {
226
185
  let result = text;
227
- let iterations = 0;
228
- const maxIterations = 3;
229
- while (iterations < maxIterations) {
230
- const before = result;
231
- result = result.replace(quotedValuePattern, '$1$2\n\n$3');
232
- if (result === before) {
186
+ // fixAndSpaceHeadings
187
+ result = result
188
+ .replace(REGEX.HEADING_SPACING, '$1\n\n$2')
189
+ .replace(REGEX.HEADING_CODE_BLOCK, '$1\n\n```')
190
+ .replace(REGEX.HEADING_CAMEL_CASE, '$1\n\n$2');
191
+ // removeTypeDocComments
192
+ if (config.markdownCleanup.removeTypeDocComments) {
193
+ result = result.replace(REGEX.TYPEDOC, (match) => match.startsWith('`') ? match : '');
194
+ }
195
+ if (config.markdownCleanup.removeSkipLinks) {
196
+ result = result
197
+ .replace(REGEX.ZERO_WIDTH_ANCHOR, '')
198
+ .replace(REGEX.COMBINED_LINE_REMOVALS, '');
199
+ }
200
+ // normalizeSpacing
201
+ result = result
202
+ .replace(REGEX.SPACING_LINK_FIX, ']($1)\n\n[')
203
+ .replace(REGEX.SPACING_ADJ_COMBINED, '$& ')
204
+ .replace(REGEX.SPACING_CODE_DASH, '$1 - ')
205
+ .replace(REGEX.SPACING_ESCAPES, '$1')
206
+ .replace(REGEX.SPACING_URL_ENC, (m) => m.replace(/%5[Ff]/g, '_'))
207
+ .replace(REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2')
208
+ .replace(REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n');
209
+ // fixProperties
210
+ for (let k = 0; k < 3; k++) {
211
+ const next = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
212
+ if (next === result)
233
213
  break;
234
- }
235
- iterations++;
214
+ result = next;
236
215
  }
237
216
  return result;
238
217
  }
239
- const CLEANUP_STEPS = [
240
- fixOrphanHeadings,
241
- removeEmptyHeadings,
242
- removeSkipLinksAndEmptyAnchors,
243
- ensureBlankLineAfterHeadings,
244
- removeTocBlocks,
245
- tidyLinksAndEscapes,
246
- normalizeListsAndSpacing,
247
- fixConcatenatedProperties,
248
- ];
249
- function getLastLine(text) {
250
- const index = text.lastIndexOf('\n');
251
- return index === -1 ? text : text.slice(index + 1);
252
- }
253
- class MarkdownCleanupPipeline {
254
- cleanup(markdown) {
255
- if (!markdown)
256
- return '';
257
- const segments = fencedSegmenter.split(markdown);
258
- const cleaned = segments
259
- .map((seg, index) => {
260
- if (seg.inFence)
261
- return seg.content;
262
- const prevSeg = segments[index - 1];
263
- const prevLineContext = prevSeg ? getLastLine(prevSeg.content) : '';
264
- const lines = seg.content.split('\n');
265
- const promotedLines = [];
266
- for (let i = 0; i < lines.length; i += 1) {
267
- const line = lines[i] ?? '';
268
- const prevLine = i > 0 ? (lines[i - 1] ?? '') : prevLineContext;
269
- promotedLines.push(orphanHeadingPromoter.processLine(line, prevLine));
270
- }
271
- const promoted = promotedLines.join('\n');
272
- return CLEANUP_STEPS.reduce((text, step) => step(text), promoted);
273
- })
274
- .join('\n')
275
- .trim();
276
- return cleaned;
218
+ function findNextLine(content, lastIndex, len) {
219
+ let nextIndex = content.indexOf('\n', lastIndex);
220
+ let line;
221
+ if (nextIndex === -1) {
222
+ line = content.slice(lastIndex);
223
+ nextIndex = len;
277
224
  }
225
+ else {
226
+ if (nextIndex > lastIndex && content.charCodeAt(nextIndex - 1) === 13) {
227
+ line = content.slice(lastIndex, nextIndex - 1);
228
+ }
229
+ else {
230
+ line = content.slice(lastIndex, nextIndex);
231
+ }
232
+ nextIndex++; // Skip \n
233
+ }
234
+ return { line, nextIndex };
278
235
  }
279
- const markdownCleanupPipeline = new MarkdownCleanupPipeline();
280
- export function cleanupMarkdownArtifacts(content) {
281
- return markdownCleanupPipeline.cleanup(content);
282
- }
283
- /* -------------------------------------------------------------------------------------------------
284
- * Raw markdown handling + metadata footer
285
- * ------------------------------------------------------------------------------------------------- */
286
- const HEADING_PATTERN = /^#{1,6}\s/m;
287
- const LIST_PATTERN = /^(?:[-*+])\s/m;
288
- const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;
289
- function containsMarkdownHeading(content) {
290
- return HEADING_PATTERN.test(content);
291
- }
292
- function containsMarkdownList(content) {
293
- return LIST_PATTERN.test(content);
236
+ function checkFenceStart(line) {
237
+ const match = REGEX.FENCE_START.exec(line);
238
+ return match ? (match[1] ?? '```') : null;
294
239
  }
295
- function containsFencedCodeBlock(content) {
296
- const first = content.indexOf('```');
297
- if (first === -1)
298
- return false;
299
- return content.includes('```', first + 3);
240
+ function isFenceClosure(trimmed, marker) {
241
+ return (trimmed.startsWith(marker) && trimmed.slice(marker.length).trim() === '');
300
242
  }
301
- function looksLikeMarkdown(content) {
302
- return (containsMarkdownHeading(content) ||
303
- containsMarkdownList(content) ||
304
- containsFencedCodeBlock(content));
243
+ function handleFencedLine(line, trimmed, fenceMarker, segments) {
244
+ segments.push(line);
245
+ return isFenceClosure(trimmed, fenceMarker) ? null : fenceMarker;
305
246
  }
306
- function detectLineEnding(content) {
307
- return content.includes('\r\n') ? '\r\n' : '\n';
247
+ function handleUnfencedLine(line, segments, buffer) {
248
+ const newMarker = checkFenceStart(line);
249
+ if (!newMarker) {
250
+ buffer.push(line);
251
+ return { fenceMarker: null, buffer };
252
+ }
253
+ if (buffer.length > 0) {
254
+ segments.push(processTextBuffer(buffer));
255
+ buffer = [];
256
+ }
257
+ segments.push(line);
258
+ return { fenceMarker: newMarker, buffer };
308
259
  }
309
- const FRONTMATTER_DELIMITER = '---';
310
- class RawMarkdownFrontmatter {
311
- find(content) {
312
- const lineEnding = detectLineEnding(content);
313
- const lines = content.split(lineEnding);
314
- if (lines[0] !== FRONTMATTER_DELIMITER)
315
- return null;
316
- const endIndex = lines.indexOf(FRONTMATTER_DELIMITER, 1);
317
- if (endIndex === -1)
318
- return null;
319
- return { lineEnding, lines, endIndex };
260
+ export function cleanupMarkdownArtifacts(content) {
261
+ if (!content)
262
+ return '';
263
+ const len = content.length;
264
+ let lastIndex = 0;
265
+ let fenceMarker = null;
266
+ const segments = [];
267
+ let buffer = [];
268
+ while (lastIndex < len) {
269
+ const { line, nextIndex } = findNextLine(content, lastIndex, len);
270
+ const trimmed = line.trimStart();
271
+ if (fenceMarker) {
272
+ fenceMarker = handleFencedLine(line, trimmed, fenceMarker, segments);
273
+ }
274
+ else {
275
+ ({ fenceMarker, buffer } = handleUnfencedLine(line, segments, buffer));
276
+ }
277
+ lastIndex = nextIndex;
320
278
  }
321
- hasFrontmatter(trimmed) {
322
- return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
279
+ if (buffer.length > 0) {
280
+ segments.push(processTextBuffer(buffer));
323
281
  }
282
+ return segments.join('\n').trim();
324
283
  }
325
- const frontmatter = new RawMarkdownFrontmatter();
326
- function stripOptionalQuotes(value) {
327
- const trimmed = value.trim();
328
- if (trimmed.length < 2)
329
- return trimmed;
330
- const first = trimmed[0];
331
- const last = trimmed[trimmed.length - 1];
332
- if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
333
- return trimmed.slice(1, -1).trim();
284
+ function detectFrontmatter(content) {
285
+ const len = content.length;
286
+ if (len < 4)
287
+ return null;
288
+ let lineEnding = null;
289
+ let fenceLen = 0;
290
+ if (content.startsWith('---\n')) {
291
+ lineEnding = '\n';
292
+ fenceLen = 4;
334
293
  }
335
- return trimmed;
294
+ else if (content.startsWith('---\r\n')) {
295
+ lineEnding = '\r\n';
296
+ fenceLen = 5;
297
+ }
298
+ if (!lineEnding)
299
+ return null;
300
+ const fence = `---${lineEnding}`;
301
+ const closeIndex = content.indexOf(fence, fenceLen);
302
+ if (closeIndex === -1)
303
+ return null;
304
+ return {
305
+ start: 0,
306
+ end: closeIndex + fenceLen,
307
+ linesStart: fenceLen,
308
+ linesEnd: closeIndex,
309
+ lineEnding,
310
+ };
336
311
  }
337
312
  function parseFrontmatterEntry(line) {
338
313
  const trimmed = line.trim();
339
- if (!trimmed)
340
- return null;
341
- const separatorIndex = trimmed.indexOf(':');
342
- if (separatorIndex <= 0)
314
+ const idx = trimmed.indexOf(':');
315
+ if (!trimmed || idx <= 0)
343
316
  return null;
344
- const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
345
- const value = trimmed.slice(separatorIndex + 1);
346
- return { key, value };
347
- }
348
- function isTitleKey(key) {
349
- return key === 'title' || key === 'name';
350
- }
351
- function extractTitleFromHeading(content) {
352
- const lineEnding = detectLineEnding(content);
353
- const lines = content.split(lineEnding);
354
- for (const line of lines) {
355
- const trimmed = line.trim();
356
- if (!trimmed)
357
- continue;
358
- let index = 0;
359
- while (index < trimmed.length && trimmed[index] === '#') {
360
- index += 1;
317
+ return {
318
+ key: trimmed.slice(0, idx).trim().toLowerCase(),
319
+ value: trimmed.slice(idx + 1).trim(),
320
+ };
321
+ }
322
+ function stripFrontmatterQuotes(val) {
323
+ const first = val.charAt(0);
324
+ const last = val.charAt(val.length - 1);
325
+ if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
326
+ return val.slice(1, -1).trim();
327
+ }
328
+ return val;
329
+ }
330
+ function scanFrontmatterForTitle(content, fm) {
331
+ const fmBody = content.slice(fm.linesStart, fm.linesEnd);
332
+ let lastIdx = 0;
333
+ while (lastIdx < fmBody.length) {
334
+ let nextIdx = fmBody.indexOf(fm.lineEnding, lastIdx);
335
+ if (nextIdx === -1)
336
+ nextIdx = fmBody.length;
337
+ const line = fmBody.slice(lastIdx, nextIdx);
338
+ const entry = parseFrontmatterEntry(line);
339
+ if (entry) {
340
+ if (entry.key === 'title' || entry.key === 'name') {
341
+ const cleaned = stripFrontmatterQuotes(entry.value);
342
+ if (cleaned)
343
+ return cleaned;
344
+ }
361
345
  }
362
- if (index === 0 || index > 6)
363
- return undefined;
364
- const nextChar = trimmed[index];
365
- if (nextChar !== ' ' && nextChar !== '\t')
366
- return undefined;
367
- const heading = trimmed.slice(index).trim();
368
- return heading.length > 0 ? heading : undefined;
346
+ lastIdx = nextIdx + fm.lineEnding.length;
369
347
  }
370
348
  return undefined;
371
349
  }
372
- export function extractTitleFromRawMarkdown(content) {
373
- const fm = frontmatter.find(content);
374
- if (!fm) {
375
- return extractTitleFromHeading(content);
376
- }
377
- const { lines, endIndex } = fm;
378
- const entry = lines
379
- .slice(1, endIndex)
380
- .map((line) => parseFrontmatterEntry(line))
381
- .find((parsed) => parsed !== null && isTitleKey(parsed.key));
382
- if (!entry)
383
- return undefined;
384
- const value = stripOptionalQuotes(entry.value);
385
- return value || undefined;
386
- }
387
- function hasMarkdownSourceLine(content) {
388
- const lineEnding = detectLineEnding(content);
389
- const lines = content.split(lineEnding);
390
- const limit = Math.min(lines.length, 50);
391
- for (let index = 0; index < limit; index += 1) {
392
- const line = lines[index];
393
- if (!line)
394
- continue;
395
- if (line.trimStart().toLowerCase().startsWith('source:')) {
396
- return true;
350
+ function scanBodyForTitle(content) {
351
+ const len = content.length;
352
+ let scanIndex = 0;
353
+ const LIMIT = 5000;
354
+ const maxScan = Math.min(len, LIMIT);
355
+ while (scanIndex < maxScan) {
356
+ let nextIndex = content.indexOf('\n', scanIndex);
357
+ if (nextIndex === -1)
358
+ nextIndex = len;
359
+ let line = content.slice(scanIndex, nextIndex);
360
+ if (line.endsWith('\r'))
361
+ line = line.slice(0, -1);
362
+ const trimmed = line.trim();
363
+ if (trimmed) {
364
+ if (REGEX.HEADING_STRICT.test(trimmed)) {
365
+ return trimmed.replace(REGEX.HEADING_MARKER, '').trim() || undefined;
366
+ }
367
+ return undefined;
397
368
  }
369
+ scanIndex = nextIndex + 1;
398
370
  }
399
- return false;
371
+ return undefined;
400
372
  }
401
- function addSourceToMarkdownAsMarkdown(content, url) {
402
- if (hasMarkdownSourceLine(content))
403
- return content;
404
- const lineEnding = detectLineEnding(content);
405
- const lines = content.split(lineEnding);
406
- const firstNonEmptyIndex = lines.findIndex((line) => line.trim().length > 0);
407
- if (firstNonEmptyIndex !== -1) {
408
- const firstLine = lines[firstNonEmptyIndex];
409
- if (firstLine && /^#{1,6}\s+/.test(firstLine.trim())) {
410
- const insertAt = firstNonEmptyIndex + 1;
411
- const updated = [
412
- ...lines.slice(0, insertAt),
413
- '',
414
- `Source: ${url}`,
415
- '',
416
- ...lines.slice(insertAt),
417
- ];
418
- return updated.join(lineEnding);
419
- }
373
+ export function extractTitleFromRawMarkdown(content) {
374
+ const fm = detectFrontmatter(content);
375
+ if (fm) {
376
+ const title = scanFrontmatterForTitle(content, fm);
377
+ if (title)
378
+ return title;
420
379
  }
421
- return [`Source: ${url}`, '', content].join(lineEnding);
380
+ return scanBodyForTitle(content);
422
381
  }
423
382
  export function addSourceToMarkdown(content, url) {
424
- const fm = frontmatter.find(content);
425
- if (config.transform.metadataFormat === 'markdown' && !fm) {
426
- return addSourceToMarkdownAsMarkdown(content, url);
383
+ const fm = detectFrontmatter(content);
384
+ const useMarkdownFormat = config.transform.metadataFormat === 'markdown';
385
+ if (useMarkdownFormat && !fm) {
386
+ if (REGEX.SOURCE_KEY.test(content))
387
+ return content;
388
+ const lineEnding = getLineEnding(content);
389
+ const firstH1Match = REGEX.HEADING_MARKER.exec(content);
390
+ if (firstH1Match) {
391
+ const h1Index = firstH1Match.index;
392
+ const lineEndIndex = content.indexOf(lineEnding, h1Index);
393
+ const insertPos = lineEndIndex === -1 ? content.length : lineEndIndex + lineEnding.length;
394
+ const injection = `${lineEnding}Source: ${url}${lineEnding}`;
395
+ return content.slice(0, insertPos) + injection + content.slice(insertPos);
396
+ }
397
+ return `Source: ${url}${lineEnding}${lineEnding}${content}`;
427
398
  }
428
399
  if (!fm) {
429
- // Preserve existing behavior: always uses LF even if content uses CRLF.
430
- return `---\nsource: "${url}"\n---\n\n${content}`;
400
+ const lineEnding = getLineEnding(content);
401
+ const escapedUrl = url.replace(/"/g, '\\"');
402
+ return `---${lineEnding}source: "${escapedUrl}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
431
403
  }
432
- const { lineEnding, lines, endIndex } = fm;
433
- const bodyLines = lines.slice(1, endIndex);
434
- const hasSource = bodyLines.some((line) => line.trimStart().toLowerCase().startsWith('source:'));
435
- if (hasSource)
404
+ const fmBody = content.slice(fm.linesStart, fm.linesEnd);
405
+ if (REGEX.SOURCE_KEY.test(fmBody))
436
406
  return content;
437
- const updatedLines = [
438
- lines[0],
439
- ...bodyLines,
440
- `source: "${url}"`,
441
- ...lines.slice(endIndex),
442
- ];
443
- return updatedLines.join(lineEnding);
444
- }
445
- function looksLikeHtmlDocument(trimmed) {
446
- return HTML_DOCUMENT_PATTERN.test(trimmed);
447
- }
448
- function countCommonHtmlTags(content) {
449
- const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
450
- [];
451
- return matches.length;
407
+ const escapedUrl = url.replace(/"/g, '\\"');
408
+ const injection = `source: "${escapedUrl}"${fm.lineEnding}`;
409
+ return content.slice(0, fm.linesEnd) + injection + content.slice(fm.linesEnd);
410
+ }
411
+ function countCommonTags(content, limit) {
412
+ if (limit <= 0)
413
+ return 0;
414
+ const regex = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
415
+ let count = 0;
416
+ while (regex.exec(content)) {
417
+ count += 1;
418
+ if (count > limit)
419
+ break;
420
+ }
421
+ return count;
452
422
  }
453
423
  export function isRawTextContent(content) {
454
424
  const trimmed = content.trim();
455
- const isHtmlDocument = looksLikeHtmlDocument(trimmed);
456
- const hasMarkdownFrontmatter = frontmatter.hasFrontmatter(trimmed);
457
- const hasTooManyHtmlTags = countCommonHtmlTags(content) > 2;
458
- const isMarkdown = looksLikeMarkdown(content);
459
- return (!isHtmlDocument &&
460
- (hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
461
- }
462
- export function isLikelyHtmlContent(content) {
463
- const trimmed = content.trim();
464
- if (!trimmed)
425
+ if (REGEX.HTML_DOC_START.test(trimmed))
465
426
  return false;
466
- if (looksLikeHtmlDocument(trimmed))
427
+ if (detectFrontmatter(trimmed) !== null)
467
428
  return true;
468
- return countCommonHtmlTags(content) > 2;
469
- }
470
- function formatFetchedDate(isoString) {
471
- try {
472
- const date = new Date(isoString);
473
- const day = String(date.getDate()).padStart(2, '0');
474
- const month = String(date.getMonth() + 1).padStart(2, '0');
475
- const year = date.getFullYear();
476
- return `${day}-${month}-${year}`;
477
- }
478
- catch {
479
- return isoString;
480
- }
429
+ const tagCount = countCommonTags(content, 5);
430
+ if (tagCount > 5)
431
+ return false;
432
+ return (REGEX.HEADING_MARKER.test(content) ||
433
+ REGEX.LIST_MARKER.test(content) ||
434
+ content.includes('```'));
435
+ }
436
+ function formatFetchedAt(value) {
437
+ const date = new Date(value);
438
+ if (Number.isNaN(date.getTime()))
439
+ return value;
440
+ const formatter = new Intl.DateTimeFormat(config.i18n.locale, {
441
+ day: '2-digit',
442
+ month: '2-digit',
443
+ year: 'numeric',
444
+ });
445
+ return formatter.format(date);
481
446
  }
482
447
  export function buildMetadataFooter(metadata, fallbackUrl) {
483
448
  if (!metadata)
@@ -492,14 +457,11 @@ export function buildMetadataFooter(metadata, fallbackUrl) {
492
457
  if (url)
493
458
  parts.push(`[_Original Source_](${url})`);
494
459
  if (metadata.fetchedAt) {
495
- const formattedDate = formatFetchedDate(metadata.fetchedAt);
496
- parts.push(`_${formattedDate}_`);
460
+ parts.push(`_${formatFetchedAt(metadata.fetchedAt)}_`);
497
461
  }
498
- if (parts.length > 0) {
462
+ if (parts.length > 0)
499
463
  lines.push(` ${parts.join(' | ')}`);
500
- }
501
- if (metadata.description) {
464
+ if (metadata.description)
502
465
  lines.push(` <sub>${metadata.description}</sub>`);
503
- }
504
466
  return lines.join('\n');
505
467
  }