@j0hanz/superfetch 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -17
- package/dist/config/index.js +11 -6
- package/dist/http/auth.js +161 -2
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/mcp-routes.d.ts +8 -2
- package/dist/http/mcp-routes.js +101 -8
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +5 -114
- package/dist/http/mcp-sessions.d.ts +43 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/rate-limit.js +2 -2
- package/dist/http/server-middleware.d.ts +6 -1
- package/dist/http/server-middleware.js +3 -117
- package/dist/http/server-shutdown.js +1 -1
- package/dist/http/server.d.ts +10 -0
- package/dist/http/server.js +508 -11
- package/dist/http/session-cleanup.js +8 -5
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +31 -30
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +33 -33
- package/dist/server.js +1 -1
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache.d.ts +5 -4
- package/dist/services/cache.js +49 -45
- package/dist/services/extractor.js +49 -38
- package/dist/services/fetcher/agents.js +1 -1
- package/dist/services/fetcher/dns-selection.js +1 -1
- package/dist/services/fetcher/interceptors.js +29 -60
- package/dist/services/fetcher/redirects.js +12 -4
- package/dist/services/fetcher/response.js +18 -8
- package/dist/services/fetcher.d.ts +21 -0
- package/dist/services/fetcher.js +532 -13
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -3
- package/dist/tools/handlers/fetch-single.shared.js +131 -2
- package/dist/tools/handlers/fetch-url.tool.d.ts +6 -0
- package/dist/tools/handlers/fetch-url.tool.js +48 -6
- package/dist/tools/utils/content-shaping.js +19 -4
- package/dist/tools/utils/content-transform.d.ts +4 -1
- package/dist/tools/utils/content-transform.js +110 -96
- package/dist/tools/utils/fetch-pipeline.js +47 -56
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +135 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +2 -0
- package/dist/transformers/markdown.js +185 -0
- package/dist/transformers/markdown.transformer.js +2 -189
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.d.ts +9 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/filename-generator.js +14 -3
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.js +12 -17
- package/dist/utils/url-validator.js +33 -21
- package/package.json +7 -5
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
 * Picks the line terminator used to split and re-join a document.
 *
 * @param {string} content - Raw document text.
 * @returns {string} `'\r\n'` when the text contains at least one CRLF pair, `'\n'` otherwise.
 */
function detectLineEnding(content) {
    return content.includes('\r\n') ? '\r\n' : '\n';
}

/**
 * Locates a `---` delimited frontmatter block at the top of a document.
 *
 * Leading blank lines (and a leading BOM, which `trim` treats as whitespace)
 * are skipped so that detection agrees with `hasFrontmatter`, which is given
 * already-trimmed text.
 *
 * @param {string} content - Raw document text.
 * @returns {{ lineEnding: string, lines: string[], startIndex: number, endIndex: number } | null}
 *   The split lines plus the indexes of the opening and closing delimiters,
 *   or `null` when no complete frontmatter block is found.
 */
function findFrontmatterLines(content) {
    const lineEnding = detectLineEnding(content);
    const lines = content.split(lineEnding);
    const startIndex = lines.findIndex((line) => line.trim() !== '');
    if (startIndex === -1 || lines[startIndex].trimStart() !== '---')
        return null;
    const endIndex = lines.indexOf('---', startIndex + 1);
    if (endIndex === -1)
        return null;
    return { lineEnding, lines, startIndex, endIndex };
}

/**
 * Trims a value and removes one pair of matching single or double quotes.
 *
 * @param {string} value - Raw frontmatter value.
 * @returns {string} The unquoted, trimmed value.
 */
function stripOptionalQuotes(value) {
    const trimmed = value.trim();
    if (trimmed.length < 2)
        return trimmed;
    const first = trimmed[0];
    const last = trimmed[trimmed.length - 1];
    if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
        return trimmed.slice(1, -1).trim();
    }
    return trimmed;
}

/**
 * Splits a `key: value` frontmatter line at its first colon.
 *
 * @param {string} line - One line from inside the frontmatter block.
 * @returns {{ key: string, value: string } | null} Lower-cased key and raw
 *   value, or `null` for blank lines and lines without a key before the colon.
 */
function parseFrontmatterEntry(line) {
    const trimmed = line.trim();
    if (!trimmed)
        return null;
    const separatorIndex = trimmed.indexOf(':');
    // A colon at index 0 means there is no key; -1 means there is no colon.
    if (separatorIndex <= 0)
        return null;
    const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
    const value = trimmed.slice(separatorIndex + 1);
    return { key, value };
}

/**
 * @param {string} key - Lower-cased frontmatter key.
 * @returns {boolean} True for the keys treated as a document title.
 */
function isTitleKey(key) {
    return key === 'title' || key === 'name';
}

/**
 * Escapes a value for use inside a YAML double-quoted scalar.
 *
 * @param {string} value - Text to embed between double quotes.
 * @returns {string} The text with backslashes, quotes and line breaks escaped.
 */
function escapeYamlDoubleQuoted(value) {
    return String(value)
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"')
        .replace(/\r/g, '\\r')
        .replace(/\n/g, '\\n');
}

/**
 * Reads the title from a document's frontmatter.
 *
 * @param {string} content - Raw markdown text.
 * @returns {string | undefined} The first non-empty `title`/`name` value with
 *   surrounding quotes removed, or `undefined` when absent.
 */
export function extractTitleFromRawMarkdown(content) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter)
        return undefined;
    const { lines, startIndex, endIndex } = frontmatter;
    const entry = lines
        .slice(startIndex + 1, endIndex)
        .map((line) => parseFrontmatterEntry(line))
        .find((parsed) => parsed !== null && isTitleKey(parsed.key));
    if (!entry)
        return undefined;
    const value = stripOptionalQuotes(entry.value);
    return value || undefined;
}

/**
 * Ensures the document's frontmatter carries a `source` entry.
 *
 * - No frontmatter: a new block containing only `source` is prepended, using
 *   the document's own line ending.
 * - Frontmatter without `source`: the entry is inserted just before the
 *   closing delimiter.
 * - Frontmatter that already has a `source:` line: returned unchanged.
 *
 * @param {string} content - Raw markdown text.
 * @param {string} url - Source URL; escaped so it stays a valid quoted scalar.
 * @returns {string} The document with a `source` frontmatter entry.
 */
export function addSourceToMarkdown(content, url) {
    const sourceLine = `source: "${escapeYamlDoubleQuoted(url)}"`;
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        return ['---', sourceLine, '---', '', content].join(detectLineEnding(content));
    }
    const { lineEnding, lines, startIndex, endIndex } = frontmatter;
    const hasSource = lines
        .slice(startIndex + 1, endIndex)
        .some((line) => line.trimStart().toLowerCase().startsWith('source:'));
    if (hasSource)
        return content;
    const updatedLines = [
        ...lines.slice(0, endIndex),
        sourceLine,
        ...lines.slice(endIndex),
    ];
    return updatedLines.join(lineEnding);
}

/**
 * Cheap check for an opening frontmatter delimiter.
 *
 * @param {string} trimmed - Document text; the check only looks at its very
 *   first characters, so callers are expected to pass trimmed text.
 * @returns {boolean} True when the text starts with `---` followed by LF or CRLF.
 */
export function hasFrontmatter(trimmed) {
    return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Heuristically reports whether `content` looks like Markdown.
 *
 * Per the accompanying implementation, this is true when the text contains a
 * line starting with an ATX heading marker, a line starting with a `-`, `*`
 * or `+` bullet, or at least two triple-backtick fences.
 */
export declare function looksLikeMarkdown(content: string): boolean;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
// One to six `#` followed by whitespace at the start of any line.
const HEADING_PATTERN = /^#{1,6}\s/m;
// A `-`, `*` or `+` bullet followed by whitespace at the start of any line.
const LIST_PATTERN = /^(?:[-*+])\s/m;
const FENCE = '```';

/** True when some line opens with an ATX heading marker. */
const containsMarkdownHeading = (content) => HEADING_PATTERN.test(content);

/** True when some line opens with an unindented bullet marker. */
const containsMarkdownList = (content) => LIST_PATTERN.test(content);

/** True when the text holds at least two non-overlapping triple-backtick fences. */
const containsFencedCodeBlock = (content) => {
    const opening = content.indexOf(FENCE);
    return opening !== -1 && content.indexOf(FENCE, opening + FENCE.length) !== -1;
};

/**
 * Heuristic Markdown detector.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when the text shows a heading, a bullet list item,
 *   or a pair of code fences.
 */
export function looksLikeMarkdown(content) {
    if (containsMarkdownHeading(content)) {
        return true;
    }
    if (containsMarkdownList(content)) {
        return true;
    }
    return containsFencedCodeBlock(content);
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Heuristically reports whether `content` looks like Markdown.
 *
 * Per the accompanying implementation, this is true when the text contains a
 * line starting with an ATX heading marker, a line starting with a `-`, `*`
 * or `+` bullet, or at least two triple-backtick fences.
 */
export declare function looksLikeMarkdown(content: string): boolean;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
// One to six `#` followed by whitespace at the start of any line.
const HEADING_PATTERN = /^#{1,6}\s/m;
// A `-`, `*` or `+` bullet followed by whitespace at the start of any line.
const LIST_PATTERN = /^(?:[-*+])\s/m;
const FENCE = '```';

/** True when some line opens with an ATX heading marker. */
const containsMarkdownHeading = (content) => HEADING_PATTERN.test(content);

/** True when some line opens with an unindented bullet marker. */
const containsMarkdownList = (content) => LIST_PATTERN.test(content);

/** True when the text holds at least two non-overlapping triple-backtick fences. */
const containsFencedCodeBlock = (content) => {
    const opening = content.indexOf(FENCE);
    return opening !== -1 && content.indexOf(FENCE, opening + FENCE.length) !== -1;
};

/**
 * Heuristic Markdown detector.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean} True when the text shows a heading, a bullet list item,
 *   or a pair of code fences.
 */
export function looksLikeMarkdown(content) {
    if (containsMarkdownHeading(content)) {
        return true;
    }
    if (containsMarkdownList(content)) {
        return true;
    }
    return containsFencedCodeBlock(content);
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
 * Picks the line terminator used to split and re-join a document.
 *
 * @param {string} content - Raw document text.
 * @returns {string} `'\r\n'` when the text contains at least one CRLF pair, `'\n'` otherwise.
 */
function detectLineEnding(content) {
    return content.includes('\r\n') ? '\r\n' : '\n';
}

/**
 * Locates a `---` delimited frontmatter block at the top of a document.
 *
 * Leading blank lines (and a leading BOM, which `trim` treats as whitespace)
 * are skipped so that detection agrees with `hasFrontmatter`, which is given
 * already-trimmed text.
 *
 * @param {string} content - Raw document text.
 * @returns {{ lineEnding: string, lines: string[], startIndex: number, endIndex: number } | null}
 *   The split lines plus the indexes of the opening and closing delimiters,
 *   or `null` when no complete frontmatter block is found.
 */
function findFrontmatterLines(content) {
    const lineEnding = detectLineEnding(content);
    const lines = content.split(lineEnding);
    const startIndex = lines.findIndex((line) => line.trim() !== '');
    if (startIndex === -1 || lines[startIndex].trimStart() !== '---')
        return null;
    const endIndex = lines.indexOf('---', startIndex + 1);
    if (endIndex === -1)
        return null;
    return { lineEnding, lines, startIndex, endIndex };
}

/**
 * Trims a value and removes one pair of matching single or double quotes.
 *
 * @param {string} value - Raw frontmatter value.
 * @returns {string} The unquoted, trimmed value.
 */
function stripOptionalQuotes(value) {
    const trimmed = value.trim();
    if (trimmed.length < 2)
        return trimmed;
    const first = trimmed[0];
    const last = trimmed[trimmed.length - 1];
    if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
        return trimmed.slice(1, -1).trim();
    }
    return trimmed;
}

/**
 * Splits a `key: value` frontmatter line at its first colon.
 *
 * @param {string} line - One line from inside the frontmatter block.
 * @returns {{ key: string, value: string } | null} Lower-cased key and raw
 *   value, or `null` for blank lines and lines without a key before the colon.
 */
function parseFrontmatterEntry(line) {
    const trimmed = line.trim();
    if (!trimmed)
        return null;
    const separatorIndex = trimmed.indexOf(':');
    // A colon at index 0 means there is no key; -1 means there is no colon.
    if (separatorIndex <= 0)
        return null;
    const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
    const value = trimmed.slice(separatorIndex + 1);
    return { key, value };
}

/**
 * @param {string} key - Lower-cased frontmatter key.
 * @returns {boolean} True for the keys treated as a document title.
 */
function isTitleKey(key) {
    return key === 'title' || key === 'name';
}

/**
 * Escapes a value for use inside a YAML double-quoted scalar.
 *
 * @param {string} value - Text to embed between double quotes.
 * @returns {string} The text with backslashes, quotes and line breaks escaped.
 */
function escapeYamlDoubleQuoted(value) {
    return String(value)
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"')
        .replace(/\r/g, '\\r')
        .replace(/\n/g, '\\n');
}

/**
 * Reads the title from a document's frontmatter.
 *
 * @param {string} content - Raw markdown text.
 * @returns {string | undefined} The first non-empty `title`/`name` value with
 *   surrounding quotes removed, or `undefined` when absent.
 */
export function extractTitleFromRawMarkdown(content) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter)
        return undefined;
    const { lines, startIndex, endIndex } = frontmatter;
    const entry = lines
        .slice(startIndex + 1, endIndex)
        .map((line) => parseFrontmatterEntry(line))
        .find((parsed) => parsed !== null && isTitleKey(parsed.key));
    if (!entry)
        return undefined;
    const value = stripOptionalQuotes(entry.value);
    return value || undefined;
}

/**
 * Ensures the document's frontmatter carries a `source` entry.
 *
 * - No frontmatter: a new block containing only `source` is prepended, using
 *   the document's own line ending.
 * - Frontmatter without `source`: the entry is inserted just before the
 *   closing delimiter.
 * - Frontmatter that already has a `source:` line: returned unchanged.
 *
 * @param {string} content - Raw markdown text.
 * @param {string} url - Source URL; escaped so it stays a valid quoted scalar.
 * @returns {string} The document with a `source` frontmatter entry.
 */
export function addSourceToMarkdown(content, url) {
    const sourceLine = `source: "${escapeYamlDoubleQuoted(url)}"`;
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        return ['---', sourceLine, '---', '', content].join(detectLineEnding(content));
    }
    const { lineEnding, lines, startIndex, endIndex } = frontmatter;
    const hasSource = lines
        .slice(startIndex + 1, endIndex)
        .some((line) => line.trimStart().toLowerCase().startsWith('source:'));
    if (hasSource)
        return content;
    const updatedLines = [
        ...lines.slice(0, endIndex),
        sourceLine,
        ...lines.slice(endIndex),
    ];
    return updatedLines.join(lineEnding);
}

/**
 * Cheap check for an opening frontmatter delimiter.
 *
 * @param {string} trimmed - Document text; the check only looks at its very
 *   first characters, so callers are expected to pass trimmed text.
 * @returns {boolean} True when the text starts with `---` followed by LF or CRLF.
 */
export function hasFrontmatter(trimmed) {
    return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
}
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import { logDebug } from '../../services/logger.js';
|
|
2
|
+
import { isRawTextContentUrl } from '../../utils/url-transformer.js';
|
|
3
|
+
// One to six `#` followed by whitespace at the start of any line.
const HEADING_PATTERN = /^#{1,6}\s/m;
// A `-`, `*` or `+` bullet followed by whitespace at the start of any line.
const LIST_PATTERN = /^(?:[-*+])\s/m;
// Text that opens with a doctype or an <html> tag.
const HTML_DOCUMENT_PATTERN = /^(<!doctype|<html)/i;

/** True when some line opens with an ATX heading marker. */
function containsMarkdownHeading(content) {
    return HEADING_PATTERN.test(content);
}

/** True when some line opens with an unindented bullet marker. */
function containsMarkdownList(content) {
    return LIST_PATTERN.test(content);
}

/** True when the text holds at least two non-overlapping triple-backtick fences. */
function containsFencedCodeBlock(content) {
    const first = content.indexOf('```');
    if (first === -1)
        return false;
    return content.includes('```', first + 3);
}

/**
 * Heuristic Markdown detector: heading, bullet list item, or a pair of fences.
 *
 * @param {string} content - Text to inspect.
 * @returns {boolean}
 */
function looksLikeMarkdown(content) {
    return (containsMarkdownHeading(content) ||
        containsMarkdownList(content) ||
        containsFencedCodeBlock(content));
}

/**
 * Picks the line terminator used to split and re-join a document.
 *
 * @param {string} content - Raw document text.
 * @returns {string} `'\r\n'` when the text contains at least one CRLF pair, `'\n'` otherwise.
 */
function detectLineEnding(content) {
    return content.includes('\r\n') ? '\r\n' : '\n';
}

/**
 * Locates a `---` delimited frontmatter block at the top of a document.
 *
 * `isRawTextContent` decides that frontmatter exists from the trimmed text,
 * while the payload builder passes the untrimmed text here. Leading blank
 * lines (and a leading BOM, which `trim` treats as whitespace) are therefore
 * skipped so both views agree.
 *
 * @param {string} content - Raw document text.
 * @returns {{ lineEnding: string, lines: string[], startIndex: number, endIndex: number } | null}
 *   The split lines plus the indexes of the opening and closing delimiters,
 *   or `null` when no complete frontmatter block is found.
 */
function findFrontmatterLines(content) {
    const lineEnding = detectLineEnding(content);
    const lines = content.split(lineEnding);
    const startIndex = lines.findIndex((line) => line.trim() !== '');
    if (startIndex === -1 || lines[startIndex].trimStart() !== '---')
        return null;
    const endIndex = lines.indexOf('---', startIndex + 1);
    if (endIndex === -1)
        return null;
    return { lineEnding, lines, startIndex, endIndex };
}

/**
 * Trims a value and removes one pair of matching single or double quotes.
 *
 * @param {string} value - Raw frontmatter value.
 * @returns {string} The unquoted, trimmed value.
 */
function stripOptionalQuotes(value) {
    const trimmed = value.trim();
    if (trimmed.length < 2)
        return trimmed;
    const first = trimmed[0];
    const last = trimmed[trimmed.length - 1];
    if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
        return trimmed.slice(1, -1).trim();
    }
    return trimmed;
}

/**
 * Splits a `key: value` frontmatter line at its first colon.
 *
 * @param {string} line - One line from inside the frontmatter block.
 * @returns {{ key: string, value: string } | null} Lower-cased key and raw
 *   value, or `null` for blank lines and lines without a key before the colon.
 */
function parseFrontmatterEntry(line) {
    const trimmed = line.trim();
    if (!trimmed)
        return null;
    const separatorIndex = trimmed.indexOf(':');
    // A colon at index 0 means there is no key; -1 means there is no colon.
    if (separatorIndex <= 0)
        return null;
    const key = trimmed.slice(0, separatorIndex).trim().toLowerCase();
    const value = trimmed.slice(separatorIndex + 1);
    return { key, value };
}

/**
 * @param {string} key - Lower-cased frontmatter key.
 * @returns {boolean} True for the keys treated as a document title.
 */
function isTitleKey(key) {
    return key === 'title' || key === 'name';
}

/**
 * Escapes a value for use inside a YAML double-quoted scalar.
 *
 * @param {string} value - Text to embed between double quotes.
 * @returns {string} The text with backslashes, quotes and line breaks escaped.
 */
function escapeYamlDoubleQuoted(value) {
    return String(value)
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"')
        .replace(/\r/g, '\\r')
        .replace(/\n/g, '\\n');
}

/**
 * Reads the title from a document's frontmatter.
 *
 * @param {string} content - Raw markdown text.
 * @returns {string | undefined} The first non-empty `title`/`name` value with
 *   surrounding quotes removed, or `undefined` when absent.
 */
function extractTitleFromRawMarkdown(content) {
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter)
        return undefined;
    const { lines, startIndex, endIndex } = frontmatter;
    const entry = lines
        .slice(startIndex + 1, endIndex)
        .map((line) => parseFrontmatterEntry(line))
        .find((parsed) => parsed !== null && isTitleKey(parsed.key));
    if (!entry)
        return undefined;
    const value = stripOptionalQuotes(entry.value);
    return value || undefined;
}

/**
 * Ensures the document's frontmatter carries a `source` entry.
 *
 * - No frontmatter: a new block containing only `source` is prepended, using
 *   the document's own line ending.
 * - Frontmatter without `source`: the entry is inserted just before the
 *   closing delimiter.
 * - Frontmatter that already has a `source:` line: returned unchanged.
 *
 * @param {string} content - Raw markdown text.
 * @param {string} url - Source URL; escaped so it stays a valid quoted scalar.
 * @returns {string} The document with a `source` frontmatter entry.
 */
function addSourceToMarkdown(content, url) {
    const sourceLine = `source: "${escapeYamlDoubleQuoted(url)}"`;
    const frontmatter = findFrontmatterLines(content);
    if (!frontmatter) {
        return ['---', sourceLine, '---', '', content].join(detectLineEnding(content));
    }
    const { lineEnding, lines, startIndex, endIndex } = frontmatter;
    const hasSource = lines
        .slice(startIndex + 1, endIndex)
        .some((line) => line.trimStart().toLowerCase().startsWith('source:'));
    if (hasSource)
        return content;
    const updatedLines = [
        ...lines.slice(0, endIndex),
        sourceLine,
        ...lines.slice(endIndex),
    ];
    return updatedLines.join(lineEnding);
}

/**
 * Cheap check for an opening frontmatter delimiter.
 *
 * @param {string} trimmed - Already-trimmed document text.
 * @returns {boolean} True when the text starts with `---` followed by LF or CRLF.
 */
function hasFrontmatter(trimmed) {
    return trimmed.startsWith('---\n') || trimmed.startsWith('---\r\n');
}

/**
 * @param {string} trimmed - Already-trimmed document text.
 * @returns {boolean} True when the text opens with a doctype or `<html`.
 */
function looksLikeHtmlDocument(trimmed) {
    return HTML_DOCUMENT_PATTERN.test(trimmed);
}

/**
 * Counts opening occurrences of a fixed set of common HTML tags.
 *
 * @param {string} content - Text to scan.
 * @returns {number} Number of matches (case-insensitive).
 */
function countCommonHtmlTags(content) {
    const matches = content.match(/<(html|head|body|div|span|script|style|meta|link)\b/gi) ??
        [];
    return matches.length;
}

/**
 * Decides whether a response body should be kept as raw text/markdown.
 *
 * A body qualifies when it is not an HTML document and either starts with
 * frontmatter, or looks like Markdown while containing at most two common
 * HTML tags.
 *
 * @param {string} content - Response body.
 * @returns {boolean}
 */
function isRawTextContent(content) {
    const trimmed = content.trim();
    const isHtmlDocument = looksLikeHtmlDocument(trimmed);
    const hasMarkdownFrontmatter = hasFrontmatter(trimmed);
    const hasTooManyHtmlTags = countCommonHtmlTags(content) > 2;
    const isMarkdown = looksLikeMarkdown(content);
    return (!isHtmlDocument &&
        (hasMarkdownFrontmatter || (!hasTooManyHtmlTags && isMarkdown)));
}

/**
 * Builds the markdown text and title for a raw document.
 *
 * @param {{ rawContent: string, url: string, includeMetadata: boolean }} params
 * @returns {{ content: string, title: string | undefined }} The text (with a
 *   `source` frontmatter entry when `includeMetadata` is truthy) and the
 *   title read from the original frontmatter.
 */
function buildRawMarkdownPayload({ rawContent, url, includeMetadata, }) {
    const title = extractTitleFromRawMarkdown(rawContent);
    const content = includeMetadata
        ? addSourceToMarkdown(rawContent, url)
        : rawContent;
    return { content, title };
}

/**
 * Returns the body unchanged (apart from optional `source` metadata) when it
 * is raw text rather than HTML.
 *
 * @param {{ html: string, url: string, includeMetadata: boolean }} params
 * @returns {{ markdown: string, title: string | undefined, truncated: boolean } | null}
 *   The raw payload, or `null` when neither the URL nor the body indicates
 *   raw text, so the caller can fall back to another path.
 */
export function tryTransformRawContent({ html, url, includeMetadata, }) {
    if (!isRawTextContentUrl(url) && !isRawTextContent(html)) {
        return null;
    }
    logDebug('Preserving raw markdown content', { url: url.substring(0, 80) });
    const { content, title } = buildRawMarkdownPayload({
        rawContent: html,
        url,
        includeMetadata,
    });
    return {
        markdown: content,
        title,
        truncated: false,
    };
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { CODE_BLOCK } from '../../config/formatting.js';
|
|
2
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../../utils/code-language.js';
|
|
3
|
+
import { isRecord } from '../../utils/guards.js';
|
|
4
|
+
/**
 * Duck-type guard: a record that exposes a callable `getAttribute`.
 *
 * @param {unknown} node - Candidate node.
 * @returns {boolean}
 */
function isElement(node) {
    if (!isRecord(node)) {
        return false;
    }
    return 'getAttribute' in node && typeof node.getAttribute === 'function';
}

/**
 * Filter predicate: a `PRE` node whose first child is `CODE`, while the
 * converter is configured for fenced code blocks.
 *
 * @param {{ nodeName: string, firstChild?: { nodeName: string } | null }} node
 * @param {{ codeBlockStyle: string }} options - Converter options.
 * @returns {boolean}
 */
function isFencedCodeBlock(node, options) {
    if (options.codeBlockStyle !== 'fenced') {
        return false;
    }
    if (node.nodeName !== 'PRE') {
        return false;
    }
    return node.firstChild?.nodeName === 'CODE';
}

/**
 * Reads the two attributes consulted for an explicit language hint.
 * Missing attributes are reported as empty strings.
 */
function readCodeAttributes(codeNode) {
    const className = codeNode.getAttribute('class') ?? '';
    const dataLanguage = codeNode.getAttribute('data-language') ?? '';
    return { className, dataLanguage };
}

/**
 * Chooses the language label for a code block: the attribute-derived value
 * when one is returned, otherwise the value detected from the code text,
 * otherwise an empty string.
 */
function resolveCodeLanguage(codeNode, code) {
    const attributes = readCodeAttributes(codeNode);
    const fromAttributes = resolveLanguageFromAttributes(attributes.className, attributes.dataLanguage);
    return fromAttributes ?? detectLanguageFromCode(code) ?? '';
}

/**
 * Replacement callback body: formats the `CODE` child's text via
 * `CODE_BLOCK.format`, or yields an empty string when the first child is
 * not element-like.
 */
function formatFencedCodeBlock(node) {
    const codeNode = node.firstChild;
    if (isElement(codeNode)) {
        const code = codeNode.textContent || '';
        return CODE_BLOCK.format(code, resolveCodeLanguage(codeNode, code));
    }
    return '';
}

/**
 * Registers the `fencedCodeBlockWithLanguage` rule on a converter instance.
 *
 * @param {{ addRule: (name: string, rule: object) => unknown }} instance
 */
export function addFencedCodeRule(instance) {
    const rule = {
        filter: (node, options) => isFencedCodeBlock(node, options),
        replacement: (_content, node) => formatFencedCodeBlock(node),
    };
    instance.addRule('fencedCodeBlockWithLanguage', rule);
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { FRONTMATTER_DELIMITER, joinLines } from '../../config/formatting.js';
|
|
2
|
+
// Characters that are unsafe somewhere inside a plain (unquoted) YAML scalar.
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
// Digits and dots only: would be read back as a number.
const YAML_NUMERIC = /^[\d.]+$/;
// Words YAML parsers may read back as booleans or null.
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off)$/i;
// Indicator characters that may not (or should not) start a plain scalar:
// `-` (sequence entry / negative number), `%` (directive), `@` and `` ` ``
// (reserved), `~` (null).
const YAML_LEADING_INDICATORS = /^[-%@`~]/;
const ESCAPE_PATTERNS = {
    backslash: /\\/g,
    quote: /"/g,
    newline: /\n/g,
    carriageReturn: /\r/g,
    tab: /\t/g,
};
// A value is quoted as soon as any one of these checks matches.
const YAML_QUOTE_CHECKS = [
    (input) => YAML_SPECIAL_CHARS.test(input),
    (input) => YAML_LEADING_INDICATORS.test(input),
    (input) => input.startsWith(' ') || input.endsWith(' '),
    (input) => input === '',
    (input) => YAML_NUMERIC.test(input),
    (input) => YAML_RESERVED_WORDS.test(input),
];

/**
 * @param {string} value - Candidate scalar.
 * @returns {boolean} True when the value cannot safely be written unquoted.
 */
function needsYamlQuotes(value) {
    return YAML_QUOTE_CHECKS.some((check) => check(value));
}

/**
 * Serialises a string as a YAML scalar.
 *
 * Safe values are returned untouched; everything else is wrapped in double
 * quotes with backslashes, quotes, line feeds, carriage returns and tabs
 * escaped so the scalar stays on one line.
 *
 * @param {string} value - Text to serialise.
 * @returns {string} Plain or double-quoted scalar text.
 */
function escapeYamlValue(value) {
    if (!needsYamlQuotes(value)) {
        return value;
    }
    // Backslash must be handled first so later escapes are not doubled.
    const escaped = value
        .replace(ESCAPE_PATTERNS.backslash, '\\\\')
        .replace(ESCAPE_PATTERNS.quote, '\\"')
        .replace(ESCAPE_PATTERNS.newline, '\\n')
        .replace(ESCAPE_PATTERNS.carriageReturn, '\\r')
        .replace(ESCAPE_PATTERNS.tab, '\\t');
    return `"${escaped}"`;
}

/**
 * Appends `key: value` to `lines`, skipping falsy values.
 *
 * @param {string[]} lines - Frontmatter lines being accumulated (mutated).
 * @param {string} key - Field name.
 * @param {string | undefined} value - Field value.
 */
function appendFrontmatterField(lines, key, value) {
    if (!value)
        return;
    lines.push(`${key}: ${escapeYamlValue(value)}`);
}

/**
 * Builds a frontmatter block with optional `title` and `source` fields.
 *
 * @param {{ title?: string, url?: string } | null | undefined} metadata
 * @returns {string} The joined block, or an empty string when no metadata is given.
 */
export function buildFrontmatter(metadata) {
    if (!metadata)
        return '';
    const lines = [FRONTMATTER_DELIMITER];
    appendFrontmatterField(lines, 'title', metadata.title);
    appendFrontmatterField(lines, 'source', metadata.url);
    lines.push(FRONTMATTER_DELIMITER);
    return joinLines(lines);
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { isRecord } from '../../utils/guards.js';
|
|
2
|
+
// Tags whose whole subtree is dropped from the converted output.
const STRUCTURAL_TAGS = new Set([
    'script',
    'style',
    'noscript',
    'iframe',
    'nav',
    'footer',
    'aside',
    'header',
    'form',
    'button',
    'input',
    'select',
    'textarea',
]);
// `role` attribute values treated as noise.
const NAVIGATION_ROLES = new Set([
    'navigation',
    'banner',
    'complementary',
    'contentinfo',
    'tree',
    'menubar',
    'menu',
]);
// Substrings of a class or id that mark an element as noise.
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;
// `fixed` / `sticky` as a word inside a class list. The look-behind skips the
// Tailwind utilities `table-fixed` (table layout) and `bg-fixed` (background
// attachment), which do not position the element and would otherwise cause
// real content such as tables to be dropped.
const FIXED_PATTERN = /(?<!\b(?:table|bg)-)\b(fixed|sticky)\b/;
// `z-40` … `z-50` utility classes.
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
const ISOLATE_PATTERN = /\bisolate\b/;

/**
 * Duck-type guard: a record that exposes a callable `getAttribute`.
 *
 * @param {unknown} node - Candidate node.
 * @returns {boolean}
 */
function isElement(node) {
    return (isRecord(node) &&
        'getAttribute' in node &&
        typeof node.getAttribute === 'function');
}

/**
 * @param {string} tagName - Lower-cased tag name.
 * @returns {boolean} True for the structural tags plus `svg` and `canvas`.
 */
function isStructuralNoiseTag(tagName) {
    return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
}

/**
 * @returns {boolean} True when the element has a `hidden` attribute or
 *   `aria-hidden="true"`.
 */
function isElementHidden(element) {
    return (element.getAttribute('hidden') !== null ||
        element.getAttribute('aria-hidden') === 'true');
}

/**
 * @param {string | null} role - Value of the `role` attribute.
 * @returns {boolean} True when the role is one of the noise roles.
 */
function hasNoiseRole(role) {
    return role !== null && NAVIGATION_ROLES.has(role);
}

/**
 * @returns {boolean} True when the lower-cased class or id contains one of
 *   the promo/overlay keywords.
 */
function matchesPromoIdOrClass(className, id) {
    const combined = `${className} ${id}`.toLowerCase();
    return PROMO_PATTERN.test(combined);
}

/** True when the class list has both a high `z-*` class and `isolate`. */
function matchesHighZIsolate(className) {
    return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
}

/** True for fixed/sticky class names or the high-z + isolate combination. */
function matchesFixedOrHighZIsolate(className) {
    return FIXED_PATTERN.test(className) || matchesHighZIsolate(className);
}

/**
 * Collects the attributes the noise checks look at.
 * Missing `class`/`id` become empty strings; a missing `role` stays `null`.
 */
function readElementMetadata(element) {
    return {
        tagName: element.tagName.toLowerCase(),
        className: element.getAttribute('class') ?? '',
        id: element.getAttribute('id') ?? '',
        role: element.getAttribute('role'),
        isHidden: isElementHidden(element),
    };
}

/**
 * Decides whether an element should be removed from the output.
 *
 * @param {{ tagName: string, getAttribute: (name: string) => string | null }} node
 * @returns {boolean} True when any tag, visibility, role or class/id check matches.
 */
function isNoiseElement(node) {
    const metadata = readElementMetadata(node);
    return (isStructuralNoiseTag(metadata.tagName) ||
        metadata.isHidden ||
        hasNoiseRole(metadata.role) ||
        matchesFixedOrHighZIsolate(metadata.className) ||
        matchesPromoIdOrClass(metadata.className, metadata.id));
}

/** Filter predicate: element-like nodes that are classified as noise. */
function isNoiseNode(node) {
    return isElement(node) && isNoiseElement(node);
}

/**
 * Registers the `removeNoise` rule, which replaces noise nodes with nothing.
 *
 * @param {{ addRule: (name: string, rule: object) => unknown }} instance
 */
export function addNoiseRule(instance) {
    instance.addRule('removeNoise', {
        filter: (node) => isNoiseNode(node),
        replacement: () => '',
    });
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
|
|
2
|
+
import { addFencedCodeRule } from './fenced-code-rule.js';
|
|
3
|
+
import { addNoiseRule } from './noise-rule.js';
|
|
4
|
+
// Module-level cache for the shared converter; filled on first use.
let turndownInstance = null;

/**
 * Builds a converter with ATX headings, fenced code blocks, `_` emphasis and
 * `-` bullets, then registers the noise and fenced-code rules in that order.
 *
 * @returns {TurndownService} A freshly configured converter.
 */
function createTurndownInstance() {
    const options = {
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        emDelimiter: '_',
        bulletListMarker: '-',
    };
    const service = new TurndownService(options);
    // Registration order is kept as-is: noise rule first, fenced-code rule second.
    for (const register of [addNoiseRule, addFencedCodeRule]) {
        register(service);
    }
    return service;
}

/**
 * Returns the shared converter, creating it on the first call.
 *
 * @returns {TurndownService} The cached converter instance.
 */
export function getTurndown() {
    if (turndownInstance == null) {
        turndownInstance = createTurndownInstance();
    }
    return turndownInstance;
}
|