@j0hanz/superfetch 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -17
- package/dist/config/index.js +11 -6
- package/dist/http/auth.js +161 -2
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/mcp-routes.d.ts +8 -2
- package/dist/http/mcp-routes.js +101 -8
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +5 -114
- package/dist/http/mcp-sessions.d.ts +43 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/rate-limit.js +2 -2
- package/dist/http/server-middleware.d.ts +6 -1
- package/dist/http/server-middleware.js +3 -117
- package/dist/http/server-shutdown.js +1 -1
- package/dist/http/server.d.ts +10 -0
- package/dist/http/server.js +508 -11
- package/dist/http/session-cleanup.js +8 -5
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +31 -30
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +33 -33
- package/dist/server.js +1 -1
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache.d.ts +5 -4
- package/dist/services/cache.js +49 -45
- package/dist/services/extractor.js +49 -38
- package/dist/services/fetcher/agents.js +1 -1
- package/dist/services/fetcher/dns-selection.js +1 -1
- package/dist/services/fetcher/interceptors.js +29 -60
- package/dist/services/fetcher/redirects.js +12 -4
- package/dist/services/fetcher/response.js +18 -8
- package/dist/services/fetcher.d.ts +21 -0
- package/dist/services/fetcher.js +532 -13
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -3
- package/dist/tools/handlers/fetch-single.shared.js +131 -2
- package/dist/tools/handlers/fetch-url.tool.d.ts +6 -0
- package/dist/tools/handlers/fetch-url.tool.js +48 -6
- package/dist/tools/utils/content-shaping.js +19 -4
- package/dist/tools/utils/content-transform.d.ts +4 -1
- package/dist/tools/utils/content-transform.js +110 -96
- package/dist/tools/utils/fetch-pipeline.js +47 -56
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +135 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +2 -0
- package/dist/transformers/markdown.js +185 -0
- package/dist/transformers/markdown.transformer.js +2 -189
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.d.ts +9 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/filename-generator.js +14 -3
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.js +12 -17
- package/dist/utils/url-validator.js +33 -21
- package/package.json +7 -5
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
|
|
2
|
+
import { CODE_BLOCK, FRONTMATTER_DELIMITER, joinLines, } from '../config/formatting.js';
|
|
3
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../utils/code-language.js';
|
|
4
|
+
import { isRecord } from '../utils/guards.js';
|
|
5
|
+
/** Characters that force double-quoting when they appear anywhere in a value. */
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
/**
 * Values a YAML parser would resolve to a number instead of a string:
 * optionally signed digit/dot runs with an optional exponent, hex and octal
 * literals, and the special floats `.inf` / `.nan`.
 */
const YAML_NUMERIC = /^[-+]?(?:[\d.]+(?:e[-+]?\d+)?|0x[\da-f]+|0o[0-7]+|\.(?:inf|nan))$/i;
/** Plain scalars that YAML resolves to booleans or null (`~` is the null shorthand). */
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off|~)$/i;
/**
 * Leading characters that are unsafe at the start of a plain scalar:
 * `-` can open a sequence entry, `%` a directive, and `@` / backtick are
 * reserved indicators.
 */
const YAML_LEADING_INDICATORS = /^[-%@`]/;
/** Characters that must be backslash-escaped inside a double-quoted scalar. */
const ESCAPE_PATTERNS = {
    backslash: /\\/g,
    quote: /"/g,
    newline: /\n/g,
    carriageReturn: /\r/g,
    tab: /\t/g,
};
/** Predicates; a value is quoted as soon as any one of them matches. */
const YAML_QUOTE_CHECKS = [
    (input) => YAML_SPECIAL_CHARS.test(input),
    (input) => YAML_LEADING_INDICATORS.test(input),
    (input) => input.startsWith(' ') || input.endsWith(' '),
    (input) => input === '',
    (input) => YAML_NUMERIC.test(input),
    (input) => YAML_RESERVED_WORDS.test(input),
];
/**
 * Decides whether a string can be written as a plain YAML scalar.
 *
 * @param {string} value Raw frontmatter value.
 * @returns {boolean} `true` when the value must be double-quoted.
 */
function needsYamlQuotes(value) {
    return YAML_QUOTE_CHECKS.some((check) => check(value));
}
/**
 * Renders a string as a YAML scalar, double-quoting and escaping it only
 * when a plain scalar would be ambiguous or invalid.
 *
 * @param {string} value Raw frontmatter value.
 * @returns {string} The value unchanged, or a double-quoted escaped form.
 */
function escapeYamlValue(value) {
    if (!needsYamlQuotes(value)) {
        return value;
    }
    // Backslashes go first so the escapes added below are not doubled.
    const escaped = value
        .replace(ESCAPE_PATTERNS.backslash, '\\\\')
        .replace(ESCAPE_PATTERNS.quote, '\\"')
        .replace(ESCAPE_PATTERNS.newline, '\\n')
        // A raw CR inside a quoted scalar would be folded as a line break.
        .replace(ESCAPE_PATTERNS.carriageReturn, '\\r')
        .replace(ESCAPE_PATTERNS.tab, '\\t');
    return `"${escaped}"`;
}
|
|
35
|
+
/**
 * Pushes a `key: value` frontmatter line onto `lines`, skipping falsy values.
 *
 * @param {string[]} lines Accumulator that is mutated in place.
 * @param {string} key Frontmatter key, written verbatim.
 * @param {string | undefined} value Raw value; escaped before it is written.
 */
function appendFrontmatterField(lines, key, value) {
    if (value) {
        const rendered = escapeYamlValue(value);
        lines.push(`${key}: ${rendered}`);
    }
}
|
|
40
|
+
/**
 * Builds the frontmatter block (delimiter, `title`, `source`, delimiter).
 *
 * @param {{ title?: string, url?: string } | undefined} metadata Page metadata.
 * @returns {string} The joined frontmatter, or `''` when no metadata is given.
 */
function buildFrontmatter(metadata) {
    if (!metadata) {
        return '';
    }
    const fields = [];
    appendFrontmatterField(fields, 'title', metadata.title);
    appendFrontmatterField(fields, 'source', metadata.url);
    return joinLines([FRONTMATTER_DELIMITER, ...fields, FRONTMATTER_DELIMITER]);
}
|
|
49
|
+
/**
 * Duck-type check: the node is a record exposing a callable `getAttribute`.
 *
 * @param {unknown} node Candidate DOM node.
 * @returns {boolean} `true` when attributes can be read from the node.
 */
function isElement(node) {
    if (!isRecord(node)) {
        return false;
    }
    if (!('getAttribute' in node)) {
        return false;
    }
    return typeof node.getAttribute === 'function';
}
|
|
54
|
+
/**
 * Turndown filter: matches `<pre>` nodes whose first child is `<code>`,
 * and only when fenced code blocks are enabled.
 *
 * @param {{ nodeName: string, firstChild?: { nodeName: string } | null }} node Node under test.
 * @param {{ codeBlockStyle?: string }} options Turndown options.
 * @returns {boolean} `true` when the fenced-code rule should handle the node.
 */
function isFencedCodeBlock(node, options) {
    if (options.codeBlockStyle !== 'fenced') {
        return false;
    }
    if (node.nodeName !== 'PRE') {
        return false;
    }
    const child = node.firstChild;
    return child?.nodeName === 'CODE';
}
|
|
59
|
+
/**
 * Turndown replacement: renders the `<code>` child of a `<pre>` as a fenced
 * block tagged with the resolved language.
 *
 * @param {{ firstChild?: unknown }} node The `<pre>` node.
 * @returns {string} The formatted block, or `''` when the first child is not an element.
 */
function formatFencedCodeBlock(node) {
    const { firstChild: codeElement } = node;
    if (isElement(codeElement)) {
        const source = codeElement.textContent || '';
        const languageTag = resolveCodeLanguage(codeElement, source);
        return CODE_BLOCK.format(source, languageTag);
    }
    return '';
}
|
|
67
|
+
/**
 * Picks the fence language: element attributes first, then content
 * detection, then an empty string.
 *
 * @param {{ getAttribute(name: string): string | null }} codeNode The `<code>` element.
 * @param {string} code Text content of the element.
 * @returns {string} Language identifier, or `''` when none is found.
 */
function resolveCodeLanguage(codeNode, code) {
    const attributes = readCodeAttributes(codeNode);
    const fromAttributes = resolveLanguageFromAttributes(attributes.className, attributes.dataLanguage);
    if (fromAttributes !== null && fromAttributes !== undefined) {
        return fromAttributes;
    }
    return detectLanguageFromCode(code) ?? '';
}
|
|
72
|
+
function readCodeAttributes(codeNode) {
|
|
73
|
+
return {
|
|
74
|
+
className: codeNode.getAttribute('class') ?? '',
|
|
75
|
+
dataLanguage: codeNode.getAttribute('data-language') ?? '',
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
/**
 * Registers the `fencedCodeBlockWithLanguage` rule on a Turndown instance.
 *
 * @param {{ addRule(name: string, rule: object): unknown }} instance Turndown service.
 */
function addFencedCodeRule(instance) {
    const rule = {
        filter: (node, options) => isFencedCodeBlock(node, options),
        replacement: (_content, node) => formatFencedCodeBlock(node),
    };
    instance.addRule('fencedCodeBlockWithLanguage', rule);
}
|
|
84
|
+
const STRUCTURAL_TAGS = new Set([
|
|
85
|
+
'script',
|
|
86
|
+
'style',
|
|
87
|
+
'noscript',
|
|
88
|
+
'iframe',
|
|
89
|
+
'nav',
|
|
90
|
+
'footer',
|
|
91
|
+
'aside',
|
|
92
|
+
'header',
|
|
93
|
+
'form',
|
|
94
|
+
'button',
|
|
95
|
+
'input',
|
|
96
|
+
'select',
|
|
97
|
+
'textarea',
|
|
98
|
+
]);
|
|
99
|
+
const NAVIGATION_ROLES = new Set([
|
|
100
|
+
'navigation',
|
|
101
|
+
'banner',
|
|
102
|
+
'complementary',
|
|
103
|
+
'contentinfo',
|
|
104
|
+
'tree',
|
|
105
|
+
'menubar',
|
|
106
|
+
'menu',
|
|
107
|
+
]);
|
|
108
|
+
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;
|
|
109
|
+
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
110
|
+
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
|
|
111
|
+
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
112
|
+
function isStructuralNoiseTag(tagName) {
|
|
113
|
+
return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
|
|
114
|
+
}
|
|
115
|
+
function isElementHidden(element) {
|
|
116
|
+
return (element.getAttribute('hidden') !== null ||
|
|
117
|
+
element.getAttribute('aria-hidden') === 'true');
|
|
118
|
+
}
|
|
119
|
+
function hasNoiseRole(role) {
|
|
120
|
+
return role !== null && NAVIGATION_ROLES.has(role);
|
|
121
|
+
}
|
|
122
|
+
function matchesPromoIdOrClass(className, id) {
|
|
123
|
+
const combined = `${className} ${id}`.toLowerCase();
|
|
124
|
+
return PROMO_PATTERN.test(combined);
|
|
125
|
+
}
|
|
126
|
+
function matchesHighZIsolate(className) {
|
|
127
|
+
return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
|
|
128
|
+
}
|
|
129
|
+
function matchesFixedOrHighZIsolate(className) {
|
|
130
|
+
return FIXED_PATTERN.test(className) || matchesHighZIsolate(className);
|
|
131
|
+
}
|
|
132
|
+
function readElementMetadata(element) {
|
|
133
|
+
return {
|
|
134
|
+
tagName: element.tagName.toLowerCase(),
|
|
135
|
+
className: element.getAttribute('class') ?? '',
|
|
136
|
+
id: element.getAttribute('id') ?? '',
|
|
137
|
+
role: element.getAttribute('role'),
|
|
138
|
+
isHidden: isElementHidden(element),
|
|
139
|
+
};
|
|
140
|
+
}
|
|
141
|
+
function isNoiseElement(node) {
|
|
142
|
+
const metadata = readElementMetadata(node);
|
|
143
|
+
return (isStructuralNoiseTag(metadata.tagName) ||
|
|
144
|
+
metadata.isHidden ||
|
|
145
|
+
hasNoiseRole(metadata.role) ||
|
|
146
|
+
matchesFixedOrHighZIsolate(metadata.className) ||
|
|
147
|
+
matchesPromoIdOrClass(metadata.className, metadata.id));
|
|
148
|
+
}
|
|
149
|
+
/**
 * Turndown filter predicate: only element-like nodes can be noise.
 *
 * @param {unknown} node Candidate node.
 * @returns {boolean} `true` when the node is an element flagged as noise.
 */
function isNoiseNode(node) {
    if (!isElement(node)) {
        return false;
    }
    return isNoiseElement(node);
}
|
|
152
|
+
/**
 * Registers the `removeNoise` rule, which replaces noise nodes with nothing.
 *
 * @param {{ addRule(name: string, rule: object): unknown }} instance Turndown service.
 */
function addNoiseRule(instance) {
    const rule = {
        filter: (node) => isNoiseNode(node),
        replacement: () => '',
    };
    instance.addRule('removeNoise', rule);
}
|
|
158
|
+
/** Module-level cache for the Turndown service; created on first use. */
let turndownInstance = null;
/**
 * Creates a Turndown service with this module's Markdown style options and
 * custom rules (noise removal first, then fenced code).
 *
 * @returns {TurndownService} A newly configured service.
 */
function createTurndownInstance() {
    const options = {
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        emDelimiter: '_',
        bulletListMarker: '-',
    };
    const service = new TurndownService(options);
    addNoiseRule(service);
    addFencedCodeRule(service);
    return service;
}
/**
 * Returns the cached Turndown service, creating it on the first call.
 *
 * @returns {TurndownService} The cached service.
 */
function getTurndown() {
    if (turndownInstance === null || turndownInstance === undefined) {
        turndownInstance = createTurndownInstance();
    }
    return turndownInstance;
}
|
|
174
|
+
/**
 * Converts HTML to Markdown, prefixed by frontmatter when metadata is given.
 *
 * @param {string} html HTML source; when falsy only the frontmatter is returned.
 * @param {{ title?: string, url?: string } | undefined} metadata Optional page metadata.
 * @returns {string} Frontmatter plus converted content; frontmatter alone if conversion throws.
 */
export function htmlToMarkdown(html, metadata) {
    const header = buildFrontmatter(metadata);
    if (html) {
        try {
            const body = getTurndown().turndown(html).trim();
            return header ? `${header}\n${body}` : body;
        }
        catch {
            // Conversion is best-effort: fall through and return the frontmatter only.
        }
    }
    return header;
}
|
|
@@ -1,187 +1,5 @@
|
|
|
1
|
-
import
|
|
2
|
-
import {
|
|
3
|
-
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../utils/code-language.js';
|
|
4
|
-
import { isRecord } from '../utils/guards.js';
|
|
5
|
-
let turndownInstance = null;
|
|
6
|
-
function createTurndownInstance() {
|
|
7
|
-
const instance = new TurndownService({
|
|
8
|
-
headingStyle: 'atx',
|
|
9
|
-
codeBlockStyle: 'fenced',
|
|
10
|
-
emDelimiter: '_',
|
|
11
|
-
bulletListMarker: '-',
|
|
12
|
-
});
|
|
13
|
-
addNoiseRule(instance);
|
|
14
|
-
addFencedCodeRule(instance);
|
|
15
|
-
return instance;
|
|
16
|
-
}
|
|
17
|
-
function getTurndown() {
|
|
18
|
-
turndownInstance ??= createTurndownInstance();
|
|
19
|
-
return turndownInstance;
|
|
20
|
-
}
|
|
21
|
-
function isElement(node) {
|
|
22
|
-
if (!isRecord(node))
|
|
23
|
-
return false;
|
|
24
|
-
return 'getAttribute' in node && typeof node.getAttribute === 'function';
|
|
25
|
-
}
|
|
26
|
-
const STRUCTURAL_TAGS = new Set([
|
|
27
|
-
'script',
|
|
28
|
-
'style',
|
|
29
|
-
'noscript',
|
|
30
|
-
'iframe',
|
|
31
|
-
'nav',
|
|
32
|
-
'footer',
|
|
33
|
-
'aside',
|
|
34
|
-
'header',
|
|
35
|
-
'form',
|
|
36
|
-
'button',
|
|
37
|
-
'input',
|
|
38
|
-
'select',
|
|
39
|
-
'textarea',
|
|
40
|
-
]);
|
|
41
|
-
const NAVIGATION_ROLES = new Set([
|
|
42
|
-
'navigation',
|
|
43
|
-
'banner',
|
|
44
|
-
'complementary',
|
|
45
|
-
'contentinfo',
|
|
46
|
-
'tree',
|
|
47
|
-
'menubar',
|
|
48
|
-
'menu',
|
|
49
|
-
]);
|
|
50
|
-
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;
|
|
51
|
-
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
52
|
-
const HIGH_Z_PATTERN = /\bz-(?:4[0-9]|50)\b/;
|
|
53
|
-
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
54
|
-
function isStructuralNoiseTag(tagName) {
|
|
55
|
-
return (STRUCTURAL_TAGS.has(tagName) || tagName === 'svg' || tagName === 'canvas');
|
|
56
|
-
}
|
|
57
|
-
function isElementHidden(element) {
|
|
58
|
-
return (element.getAttribute('hidden') !== null ||
|
|
59
|
-
element.getAttribute('aria-hidden') === 'true');
|
|
60
|
-
}
|
|
61
|
-
function hasNoiseRole(role) {
|
|
62
|
-
return role ? NAVIGATION_ROLES.has(role) : false;
|
|
63
|
-
}
|
|
64
|
-
function matchesPromoIdOrClass(className, id) {
|
|
65
|
-
const combined = `${className} ${id}`.toLowerCase();
|
|
66
|
-
return PROMO_PATTERN.test(combined);
|
|
67
|
-
}
|
|
68
|
-
function matchesHighZIsolate(className) {
|
|
69
|
-
return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
|
|
70
|
-
}
|
|
71
|
-
function matchesFixedOrHighZIsolate(className) {
|
|
72
|
-
if (FIXED_PATTERN.test(className))
|
|
73
|
-
return true;
|
|
74
|
-
return matchesHighZIsolate(className);
|
|
75
|
-
}
|
|
76
|
-
function addNoiseRule(instance) {
|
|
77
|
-
instance.addRule('removeNoise', {
|
|
78
|
-
filter: (node) => isNoiseNode(node),
|
|
79
|
-
replacement: () => '',
|
|
80
|
-
});
|
|
81
|
-
}
|
|
82
|
-
function isNoiseNode(node) {
|
|
83
|
-
if (!isElement(node))
|
|
84
|
-
return false;
|
|
85
|
-
return isNoiseElement(node);
|
|
86
|
-
}
|
|
87
|
-
function readElementMetadata(element) {
|
|
88
|
-
return {
|
|
89
|
-
tagName: element.tagName.toLowerCase(),
|
|
90
|
-
className: element.getAttribute('class') ?? '',
|
|
91
|
-
id: element.getAttribute('id') ?? '',
|
|
92
|
-
role: element.getAttribute('role'),
|
|
93
|
-
isHidden: isElementHidden(element),
|
|
94
|
-
};
|
|
95
|
-
}
|
|
96
|
-
function isNoiseElement(node) {
|
|
97
|
-
const metadata = readElementMetadata(node);
|
|
98
|
-
if (isStructuralNoiseTag(metadata.tagName))
|
|
99
|
-
return true;
|
|
100
|
-
if (metadata.isHidden)
|
|
101
|
-
return true;
|
|
102
|
-
if (hasNoiseRole(metadata.role))
|
|
103
|
-
return true;
|
|
104
|
-
if (matchesFixedOrHighZIsolate(metadata.className))
|
|
105
|
-
return true;
|
|
106
|
-
return matchesPromoIdOrClass(metadata.className, metadata.id);
|
|
107
|
-
}
|
|
108
|
-
function addFencedCodeRule(instance) {
|
|
109
|
-
instance.addRule('fencedCodeBlockWithLanguage', {
|
|
110
|
-
filter: (node, options) => isFencedCodeBlock(node, options),
|
|
111
|
-
replacement: (_content, node) => formatFencedCodeBlock(node),
|
|
112
|
-
});
|
|
113
|
-
}
|
|
114
|
-
function isFencedCodeBlock(node, options) {
|
|
115
|
-
if (options.codeBlockStyle !== 'fenced')
|
|
116
|
-
return false;
|
|
117
|
-
if (node.nodeName !== 'PRE')
|
|
118
|
-
return false;
|
|
119
|
-
const { firstChild } = node;
|
|
120
|
-
if (!firstChild)
|
|
121
|
-
return false;
|
|
122
|
-
return firstChild.nodeName === 'CODE';
|
|
123
|
-
}
|
|
124
|
-
function formatFencedCodeBlock(node) {
|
|
125
|
-
const codeNode = node.firstChild;
|
|
126
|
-
if (!isElement(codeNode))
|
|
127
|
-
return '';
|
|
128
|
-
const code = codeNode.textContent || '';
|
|
129
|
-
const language = resolveCodeLanguage(codeNode, code);
|
|
130
|
-
return CODE_BLOCK.format(code, language);
|
|
131
|
-
}
|
|
132
|
-
function resolveCodeLanguage(codeNode, code) {
|
|
133
|
-
const { className, dataLanguage } = readCodeAttributes(codeNode);
|
|
134
|
-
const attributeLanguage = resolveLanguageFromAttributes(className, dataLanguage);
|
|
135
|
-
return attributeLanguage ?? detectLanguageFromCode(code) ?? '';
|
|
136
|
-
}
|
|
137
|
-
function readCodeAttributes(codeNode) {
|
|
138
|
-
return {
|
|
139
|
-
className: codeNode.getAttribute('class') ?? '',
|
|
140
|
-
dataLanguage: codeNode.getAttribute('data-language') ?? '',
|
|
141
|
-
};
|
|
142
|
-
}
|
|
143
|
-
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
|
|
144
|
-
const YAML_NUMERIC = /^[\d.]+$/;
|
|
145
|
-
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off)$/i;
|
|
146
|
-
const ESCAPE_PATTERNS = {
|
|
147
|
-
backslash: /\\/g,
|
|
148
|
-
quote: /"/g,
|
|
149
|
-
newline: /\n/g,
|
|
150
|
-
tab: /\t/g,
|
|
151
|
-
};
|
|
152
|
-
function needsYamlQuotes(value) {
|
|
153
|
-
const checks = [
|
|
154
|
-
(input) => YAML_SPECIAL_CHARS.test(input),
|
|
155
|
-
(input) => input.startsWith(' ') || input.endsWith(' '),
|
|
156
|
-
(input) => input === '',
|
|
157
|
-
(input) => YAML_NUMERIC.test(input),
|
|
158
|
-
(input) => YAML_RESERVED_WORDS.test(input),
|
|
159
|
-
];
|
|
160
|
-
return checks.some((check) => check(value));
|
|
161
|
-
}
|
|
162
|
-
function escapeYamlValue(value) {
|
|
163
|
-
if (!needsYamlQuotes(value)) {
|
|
164
|
-
return value;
|
|
165
|
-
}
|
|
166
|
-
const escaped = value
|
|
167
|
-
.replace(ESCAPE_PATTERNS.backslash, '\\\\')
|
|
168
|
-
.replace(ESCAPE_PATTERNS.quote, '\\"')
|
|
169
|
-
.replace(ESCAPE_PATTERNS.newline, '\\n')
|
|
170
|
-
.replace(ESCAPE_PATTERNS.tab, '\\t');
|
|
171
|
-
return `"${escaped}"`;
|
|
172
|
-
}
|
|
173
|
-
function appendFrontmatterField(lines, key, value) {
|
|
174
|
-
if (!value)
|
|
175
|
-
return;
|
|
176
|
-
lines.push(`${key}: ${escapeYamlValue(value)}`);
|
|
177
|
-
}
|
|
178
|
-
function createFrontmatter(metadata) {
|
|
179
|
-
const lines = [FRONTMATTER_DELIMITER];
|
|
180
|
-
appendFrontmatterField(lines, 'title', metadata.title);
|
|
181
|
-
appendFrontmatterField(lines, 'source', metadata.url);
|
|
182
|
-
lines.push(FRONTMATTER_DELIMITER);
|
|
183
|
-
return joinLines(lines);
|
|
184
|
-
}
|
|
1
|
+
import { buildFrontmatter } from './markdown/frontmatter.js';
|
|
2
|
+
import { getTurndown } from './markdown/turndown-instance.js';
|
|
185
3
|
export function htmlToMarkdown(html, metadata) {
|
|
186
4
|
const frontmatter = buildFrontmatter(metadata);
|
|
187
5
|
if (!html)
|
|
@@ -194,8 +12,3 @@ export function htmlToMarkdown(html, metadata) {
|
|
|
194
12
|
return frontmatter;
|
|
195
13
|
}
|
|
196
14
|
}
|
|
197
|
-
function buildFrontmatter(metadata) {
|
|
198
|
-
if (!metadata)
|
|
199
|
-
return '';
|
|
200
|
-
return createFrontmatter(metadata);
|
|
201
|
-
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/** Reports whether any non-empty line of `code` looks like a shebang, a shell prompt line, or a known shell / package-manager command. */
export declare function detectBash(code: string): boolean;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { splitLines } from './code-language-parsing.js';
|
|
2
|
+
/** Executables that count as shell usage when followed by one of `BASH_VERBS`. */
const BASH_PACKAGE_MANAGERS = ['npm', 'yarn', 'pnpm', 'npx', 'brew', 'apt', 'pip', 'cargo', 'go'];
/** Sub-commands that must directly follow a package manager name. */
const BASH_VERBS = ['install', 'add', 'run', 'build', 'start'];
/** Commands that mark a line as shell when they are its first word. */
const BASH_COMMANDS = ['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo'];
|
|
15
|
+
/**
 * Scans `code` line by line and reports whether any line looks like shell.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` as soon as one line is a shell indicator.
 */
export function detectBash(code) {
    for (const line of splitLines(code)) {
        // Trim both ends: with CRLF input a trailing '\r' would otherwise
        // defeat the exact-match checks (e.g. "ls\r" !== "ls").
        const trimmed = line.trim();
        if (!trimmed)
            continue;
        if (isBashIndicator(trimmed))
            return true;
    }
    return false;
}
|
|
26
|
+
/**
 * Checks whether `line` is exactly one of `commands` or starts with one
 * followed by a space.
 *
 * @param {string} line Line with leading whitespace already removed.
 * @param {string[]} commands Command names to look for.
 * @returns {boolean} `true` on the first matching command.
 */
function startsWithCommand(line, commands) {
    for (const name of commands) {
        if (line === name || line.startsWith(`${name} `)) {
            return true;
        }
    }
    return false;
}
|
|
29
|
+
/**
 * Applies the shell heuristics in order: shebang, prompt prefix, known
 * command, package-manager invocation.
 *
 * @param {string} line Non-empty line with leading whitespace removed.
 * @returns {boolean} `true` when any heuristic matches.
 */
function isBashIndicator(line) {
    if (isShebang(line)) {
        return true;
    }
    if (isPromptLine(line)) {
        return true;
    }
    if (startsWithCommand(line, BASH_COMMANDS)) {
        return true;
    }
    return startsWithPackageManagerCommand(line);
}
|
|
35
|
+
/**
 * @param {string} line Line with leading whitespace removed.
 * @returns {boolean} `true` when the line begins with `#!`.
 */
function isShebang(line) {
    return line.slice(0, 2) === '#!';
}
|
|
38
|
+
/**
 * @param {string} line Line with leading whitespace removed.
 * @returns {boolean} `true` when the line begins with `$ ` or `# `.
 */
function isPromptLine(line) {
    const head = line.slice(0, 2);
    return head === '$ ' || head === '# ';
}
|
|
41
|
+
/**
 * Matches `<manager> <verb>` at the start of the line, for example
 * `npm install` or `cargo build --release`.
 *
 * @param {string} line Line with leading whitespace removed.
 * @returns {boolean} `true` when a known manager is directly followed by a known verb.
 */
function startsWithPackageManagerCommand(line) {
    for (const manager of BASH_PACKAGE_MANAGERS) {
        const prefix = `${manager} `;
        if (!line.startsWith(prefix)) {
            continue;
        }
        const remainder = line.slice(prefix.length);
        for (const verb of BASH_VERBS) {
            if (remainder === verb || remainder.startsWith(`${verb} `)) {
                return true;
            }
        }
    }
    return false;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { CODE_DETECTORS } from './code-language-detectors.js';
|
|
2
|
+
import { extractLanguageFromClassName, resolveLanguageFromDataAttribute, } from './code-language-parsing.js';
|
|
3
|
+
/**
 * Runs the content detectors in their declared order and returns the
 * language of the first one that matches.
 *
 * @param {string} code Snippet to classify.
 * @returns {string | undefined} Detected language, or `undefined` when none match.
 */
export function detectLanguageFromCode(code) {
    const match = CODE_DETECTORS.find((detector) => detector.detect(code));
    return match ? match.language : undefined;
}
|
|
10
|
+
/**
 * Resolves a language from element attributes; the class name takes
 * priority and the data attribute is the fallback.
 *
 * @param {string} className Value of the `class` attribute.
 * @param {string} dataLang Value of the `data-language` attribute.
 * @returns {string | undefined} Language identifier, or `undefined`.
 */
export function resolveLanguageFromAttributes(className, dataLang) {
    const fromClass = extractLanguageFromClassName(className);
    if (fromClass !== null && fromClass !== undefined) {
        return fromClass;
    }
    return resolveLanguageFromDataAttribute(dataLang);
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import { detectBash } from './code-language-bash.js';
|
|
2
|
+
import { containsJsxTag, containsWord, splitLines, } from './code-language-parsing.js';
|
|
3
|
+
/** Type names whose `: name` / `:name` annotation form marks a snippet as TypeScript. */
const TYPE_HINTS = ['string', 'number', 'boolean', 'void', 'any', 'unknown', 'never'];
/** Lower-case tag openers that mark a snippet as HTML. */
const HTML_TAGS = ['<!doctype', '<html', '<head', '<body', '<div', '<span', '<p', '<a', '<script', '<style'];
/** Lower-case statement keywords that mark a snippet as SQL. */
const SQL_KEYWORDS = ['select', 'insert', 'update', 'delete', 'create', 'alter', 'drop'];
const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
const PYTHON_WORD_REGEX = /\b(?:def|class|import|from)\b/;
const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
/**
 * Detectors in priority order. Consumers take the first match, so earlier
 * entries win when several heuristics apply to the same snippet.
 */
export const CODE_DETECTORS = [
    { language: 'jsx', detect: detectJsx },
    { language: 'typescript', detect: detectTypescript },
    { language: 'rust', detect: detectRust },
    { language: 'javascript', detect: detectJavascript },
    { language: 'python', detect: detectPython },
    { language: 'bash', detect: detectBash },
    { language: 'css', detect: detectCss },
    { language: 'html', detect: detectHtml },
    { language: 'json', detect: detectJson },
    { language: 'yaml', detect: detectYaml },
    { language: 'sql', detect: detectSql },
    { language: 'go', detect: detectGo },
];
|
|
51
|
+
/**
 * JSX heuristic: a `className=` attribute, a `jsx:` marker, a React import,
 * or a tag that opens with an uppercase letter.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when any JSX marker is present.
 */
function detectJsx(code) {
    const lowered = code.toLowerCase();
    const markers = ['classname=', 'jsx:', "from 'react'", 'from "react"'];
    if (markers.some((marker) => lowered.includes(marker))) {
        return true;
    }
    // Component tags are case-sensitive, so the original casing is checked.
    return containsJsxTag(code);
}
|
|
62
|
+
/**
 * TypeScript heuristic: the words `interface` or `type`, or a primitive
 * type annotation such as `: string` / `:number`.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when any TypeScript marker is present.
 */
function detectTypescript(code) {
    const lowered = code.toLowerCase();
    if (containsWord(lowered, 'interface') || containsWord(lowered, 'type')) {
        return true;
    }
    for (const hint of TYPE_HINTS) {
        if (lowered.includes(`: ${hint}`) || lowered.includes(`:${hint}`)) {
            return true;
        }
    }
    return false;
}
|
|
70
|
+
/**
 * Rust heuristic: an `fn` / `impl` / `struct` / `enum` keyword, `let mut`,
 * or a `use ` statement together with a `::` path.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when any Rust marker is present.
 */
function detectRust(code) {
    const lowered = code.toLowerCase();
    if (RUST_WORD_REGEX.test(lowered)) {
        return true;
    }
    if (lowered.includes('let mut')) {
        return true;
    }
    return lowered.includes('use ') && lowered.includes('::');
}
|
|
76
|
+
/**
 * JavaScript heuristic: any keyword from `JS_WORD_REGEX` in the lower-cased code.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when a JavaScript keyword is present.
 */
function detectJavascript(code) {
    return JS_WORD_REGEX.test(code.toLowerCase());
}
|
|
80
|
+
/**
 * Python heuristic: a `def` / `class` / `import` / `from` keyword, a
 * `print(` call, or a `__name__` reference.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when any Python marker is present.
 */
function detectPython(code) {
    const lowered = code.toLowerCase();
    if (PYTHON_WORD_REGEX.test(lowered)) {
        return true;
    }
    return lowered.includes('print(') || lowered.includes('__name__');
}
|
|
86
|
+
/**
 * CSS heuristic: an at-rule (`@media`, `@import`, `@keyframes`), or any
 * line that looks like a class/id selector or a `property: value;` pair.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when any CSS marker is present.
 */
function detectCss(code) {
    if (CSS_DIRECTIVE_REGEX.test(code.toLowerCase())) {
        return true;
    }
    return splitLines(code)
        .map((line) => line.trimStart())
        .some((candidate) => candidate !== '' &&
        (isCssSelectorLine(candidate) || isCssPropertyLine(candidate)));
}
|
|
100
|
+
/**
 * HTML heuristic: the lower-cased code contains one of the `HTML_TAGS` openers.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when a known tag opener is present.
 */
function detectHtml(code) {
    const lowered = code.toLowerCase();
    for (const opener of HTML_TAGS) {
        if (lowered.includes(opener)) {
            return true;
        }
    }
    return false;
}
|
|
104
|
+
/**
 * JSON heuristic: the first non-whitespace character is `{` or `[`.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when the snippet opens like a JSON object or array.
 */
function detectJson(code) {
    // Indexing an empty string yields undefined, which matches neither opener.
    const first = code.trimStart()[0];
    return first === '{' || first === '[';
}
|
|
110
|
+
/**
 * YAML heuristic: some line has a first colon that is not its first
 * character and is followed by a space or tab (`key: value`).
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when a key/value-looking line is found.
 */
function detectYaml(code) {
    return splitLines(code).some((rawLine) => {
        const entry = rawLine.trim();
        const separator = entry.indexOf(':');
        // Covers empty lines, lines without a colon, and a leading colon.
        if (separator <= 0) {
            return false;
        }
        const follower = entry[separator + 1];
        return follower === ' ' || follower === '\t';
    });
}
|
|
125
|
+
/**
 * SQL heuristic: one of `SQL_KEYWORDS` appears as a whole word in the
 * lower-cased code.
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when a SQL keyword is present.
 */
function detectSql(code) {
    const lowered = code.toLowerCase();
    for (const keyword of SQL_KEYWORDS) {
        if (containsWord(lowered, keyword)) {
            return true;
        }
    }
    return false;
}
|
|
129
|
+
/**
 * Go heuristic: the words `package` or `func`, or a double-quoted import
 * (`import "`).
 *
 * @param {string} code Snippet to classify.
 * @returns {boolean} `true` when any Go marker is present.
 */
function detectGo(code) {
    const lowered = code.toLowerCase();
    if (containsWord(lowered, 'package')) {
        return true;
    }
    if (containsWord(lowered, 'func')) {
        return true;
    }
    return lowered.includes('import "');
}
|
|
135
|
+
/**
 * @param {string} line Line with leading whitespace removed.
 * @returns {boolean} `true` when the line starts with `.` or `#` and contains `{`.
 */
function isCssSelectorLine(line) {
    const opensWithSelector = line.startsWith('.') || line.startsWith('#');
    return opensWithSelector && line.includes('{');
}
|
|
140
|
+
/**
 * @param {string} line Line with leading whitespace removed.
 * @returns {boolean} `true` when the line contains both `:` and `;`.
 */
function isCssPropertyLine(line) {
    if (!line.includes(':')) {
        return false;
    }
    return line.includes(';');
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
/** Returns `true` when `code` contains a `<` immediately followed by an uppercase ASCII letter. */
export declare function containsJsxTag(code: string): boolean;
|
|
2
|
+
/** Returns `true` when `word` occurs in `source` with no ASCII letter, digit, or underscore directly before or after it. */
export declare function containsWord(source: string, word: string): boolean;
|
|
3
|
+
/** Splits `content` into an array of lines at newline characters. */
export declare function splitLines(content: string): string[];
|
|
4
|
+
/** Returns the suffix of the first whitespace-separated class token that starts with `language-`, `lang-`, or `highlight-` (prefix matched case-insensitively), or `undefined` when no token matches. */
export declare function extractLanguageFromClassName(className: string): string | undefined;
|
|
5
|
+
/** Returns the trimmed value when it is non-empty and consists only of ASCII letters, digits, or underscores; otherwise `undefined`. */
export declare function resolveLanguageFromDataAttribute(dataLang: string): string | undefined;
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
 * Looks for the start of a component-style tag: `<` immediately followed
 * by an uppercase ASCII letter.
 *
 * @param {string} code Snippet to scan, in its original casing.
 * @returns {boolean} `true` when such a tag opener exists.
 */
export function containsJsxTag(code) {
    return /<[A-Z]/.test(code);
}
|
|
13
|
+
/**
 * Whole-word search: reports whether `word` occurs in `source` with no word
 * character (ASCII letter, digit, underscore) directly before or after it.
 *
 * @param {string} source Text to search.
 * @param {string} word Word to look for; an empty word never matches.
 * @returns {boolean} `true` when a whole-word occurrence exists.
 */
export function containsWord(source, word) {
    // An empty needle makes indexOf return the same index forever, which
    // would spin this loop without end.
    if (!word)
        return false;
    let startIndex = source.indexOf(word);
    while (startIndex !== -1) {
        const before = startIndex === 0 ? '' : source[startIndex - 1];
        const afterIndex = startIndex + word.length;
        const after = afterIndex >= source.length ? '' : source[afterIndex];
        if (!isWordChar(before) && !isWordChar(after))
            return true;
        // Advance one character, not word.length, so an overlapping
        // candidate is still examined.
        startIndex = source.indexOf(word, startIndex + 1);
    }
    return false;
}
/**
 * Splits text into lines, accepting both LF and CRLF line endings so that
 * no trailing `\r` is left on the returned lines.
 *
 * @param {string} content Text to split.
 * @returns {string[]} The lines, without their terminators.
 */
export function splitLines(content) {
    return content.split(/\r?\n/);
}
/** Class-token prefixes that carry a language name, e.g. `language-ts`. */
const LANGUAGE_CLASS_PREFIXES = ['language-', 'lang-', 'highlight-'];
/**
 * Extracts a language from a `class` attribute: the suffix of the first
 * token that starts with a known prefix (matched case-insensitively) and
 * has a non-empty suffix.
 *
 * @param {string} className Raw `class` attribute value.
 * @returns {string | undefined} The suffix in its original casing, or `undefined`.
 */
export function extractLanguageFromClassName(className) {
    const tokens = className.match(/\S+/g);
    if (!tokens)
        return undefined;
    for (const token of tokens) {
        const lower = token.toLowerCase();
        const prefix = LANGUAGE_CLASS_PREFIXES.find((candidate) => lower.startsWith(candidate));
        if (prefix === undefined)
            continue;
        const language = token.slice(prefix.length);
        // A bare "language-" carries no name. Returning '' here would stop
        // callers that use `??` from falling back to other sources.
        if (language)
            return language;
    }
    return undefined;
}
/**
 * Validates a `data-language` attribute value.
 *
 * @param {string} dataLang Raw attribute value.
 * @returns {string | undefined} The trimmed value when it is non-empty and
 * made only of word characters, otherwise `undefined`.
 */
export function resolveLanguageFromDataAttribute(dataLang) {
    const trimmed = dataLang.trim();
    if (!trimmed)
        return undefined;
    for (const char of trimmed) {
        if (!isWordChar(char))
            return undefined;
    }
    return trimmed;
}
/**
 * @param {string | undefined} char Single character, or an empty/undefined value.
 * @returns {boolean} `true` for ASCII digits, ASCII letters, and `_`.
 */
function isWordChar(char) {
    if (!char)
        return false;
    const code = char.charCodeAt(0);
    return ((code >= 48 && code <= 57) || // 0-9
        (code >= 65 && code <= 90) || // A-Z
        (code >= 97 && code <= 122) || // a-z
        char === '_');
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export declare function containsJsxTag(code: string): boolean;
|
|
2
|
+
export declare function containsWord(source: string, word: string): boolean;
|
|
3
|
+
export declare function splitLines(content: string): string[];
|
|
4
|
+
export declare function extractLanguageFromClassName(className: string): string | undefined;
|
|
5
|
+
export declare function resolveLanguageFromDataAttribute(dataLang: string): string | undefined;
|