@j0hanz/superfetch 1.2.5 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +131 -156
- package/dist/config/auth-config.d.ts +16 -0
- package/dist/config/auth-config.js +53 -0
- package/dist/config/constants.d.ts +11 -13
- package/dist/config/constants.js +1 -3
- package/dist/config/env-parsers.d.ts +7 -0
- package/dist/config/env-parsers.js +84 -0
- package/dist/config/formatting.d.ts +2 -2
- package/dist/config/index.d.ts +47 -53
- package/dist/config/index.js +35 -64
- package/dist/config/types/content.d.ts +1 -49
- package/dist/config/types/runtime.d.ts +8 -16
- package/dist/config/types/tools.d.ts +2 -28
- package/dist/http/accept-policy.d.ts +3 -0
- package/dist/http/accept-policy.js +45 -0
- package/dist/http/async-handler.d.ts +2 -0
- package/dist/http/async-handler.js +5 -0
- package/dist/http/auth-introspection.d.ts +2 -0
- package/dist/http/auth-introspection.js +141 -0
- package/dist/http/auth-static.d.ts +2 -0
- package/dist/http/auth-static.js +23 -0
- package/dist/http/auth.d.ts +3 -2
- package/dist/http/auth.js +254 -23
- package/dist/http/cors.d.ts +6 -6
- package/dist/http/cors.js +7 -42
- package/dist/http/download-routes.d.ts +0 -12
- package/dist/http/download-routes.js +21 -58
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/jsonrpc-http.d.ts +2 -0
- package/dist/http/jsonrpc-http.js +10 -0
- package/dist/http/mcp-routes.d.ts +8 -3
- package/dist/http/mcp-routes.js +137 -31
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-helpers.d.ts +0 -1
- package/dist/http/mcp-session-helpers.js +1 -1
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-transport.d.ts +7 -0
- package/dist/http/mcp-session-transport.js +57 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +15 -137
- package/dist/http/mcp-sessions.d.ts +43 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/mcp-validation.d.ts +1 -0
- package/dist/http/mcp-validation.js +11 -10
- package/dist/http/protocol-policy.d.ts +2 -0
- package/dist/http/protocol-policy.js +31 -0
- package/dist/http/rate-limit.js +7 -4
- package/dist/http/server-config.d.ts +1 -0
- package/dist/http/server-config.js +40 -0
- package/dist/http/server-middleware.d.ts +7 -9
- package/dist/http/server-middleware.js +9 -70
- package/dist/http/server-shutdown.d.ts +4 -0
- package/dist/http/server-shutdown.js +43 -0
- package/dist/http/server.d.ts +10 -0
- package/dist/http/server.js +546 -61
- package/dist/http/session-cleanup.js +8 -5
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +32 -33
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +67 -125
- package/dist/resources/index.js +0 -82
- package/dist/server.js +50 -29
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache-keys.d.ts +7 -0
- package/dist/services/cache-keys.js +57 -0
- package/dist/services/cache.d.ts +4 -9
- package/dist/services/cache.js +77 -139
- package/dist/services/context.d.ts +0 -1
- package/dist/services/context.js +0 -7
- package/dist/services/extractor.js +55 -116
- package/dist/services/fetcher/agents.d.ts +2 -2
- package/dist/services/fetcher/agents.js +35 -96
- package/dist/services/fetcher/dns-selection.d.ts +2 -0
- package/dist/services/fetcher/dns-selection.js +72 -0
- package/dist/services/fetcher/interceptors.d.ts +0 -22
- package/dist/services/fetcher/interceptors.js +18 -32
- package/dist/services/fetcher/redirects.js +16 -7
- package/dist/services/fetcher/response.js +79 -34
- package/dist/services/fetcher.d.ts +22 -3
- package/dist/services/fetcher.js +544 -44
- package/dist/services/fifo-queue.d.ts +8 -0
- package/dist/services/fifo-queue.js +25 -0
- package/dist/services/logger.js +2 -2
- package/dist/services/metadata-collector.d.ts +1 -9
- package/dist/services/metadata-collector.js +71 -2
- package/dist/services/transform-worker-pool.d.ts +4 -14
- package/dist/services/transform-worker-pool.js +177 -129
- package/dist/services/transform-worker-types.d.ts +32 -0
- package/dist/services/transform-worker-types.js +14 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +3 -4
- package/dist/tools/handlers/fetch-markdown.tool.js +20 -72
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -22
- package/dist/tools/handlers/fetch-single.shared.js +175 -89
- package/dist/tools/handlers/fetch-url.tool.d.ts +7 -1
- package/dist/tools/handlers/fetch-url.tool.js +84 -119
- package/dist/tools/index.js +21 -40
- package/dist/tools/schemas.d.ts +1 -51
- package/dist/tools/schemas.js +1 -107
- package/dist/tools/utils/cached-markdown.d.ts +5 -0
- package/dist/tools/utils/cached-markdown.js +46 -0
- package/dist/tools/utils/content-shaping.d.ts +4 -0
- package/dist/tools/utils/content-shaping.js +67 -0
- package/dist/tools/utils/content-transform.d.ts +5 -17
- package/dist/tools/utils/content-transform.js +134 -114
- package/dist/tools/utils/fetch-pipeline.d.ts +0 -8
- package/dist/tools/utils/fetch-pipeline.js +57 -63
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/inline-content.d.ts +1 -2
- package/dist/tools/utils/inline-content.js +4 -7
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +135 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +2 -0
- package/dist/transformers/markdown.js +185 -0
- package/dist/transformers/markdown.transformer.js +5 -117
- package/dist/utils/cached-payload.d.ts +7 -0
- package/dist/utils/cached-payload.js +36 -0
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.d.ts +9 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/error-utils.js +1 -1
- package/dist/utils/filename-generator.js +34 -12
- package/dist/utils/guards.d.ts +1 -0
- package/dist/utils/guards.js +3 -0
- package/dist/utils/header-normalizer.d.ts +0 -3
- package/dist/utils/header-normalizer.js +3 -3
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +14 -46
- package/dist/utils/url-transformer.d.ts +7 -0
- package/dist/utils/url-transformer.js +147 -0
- package/dist/utils/url-validator.d.ts +1 -2
- package/dist/utils/url-validator.js +53 -114
- package/dist/workers/content-transform.worker.d.ts +1 -0
- package/dist/workers/content-transform.worker.js +40 -0
- package/package.json +17 -18
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { isRecord } from '../../utils/guards.js';
|
|
2
|
+
/** Lower-case tag names that isStructuralNoiseTag treats as noise. */
const STRUCTURAL_TAGS = new Set([
    'script', 'style', 'noscript', 'iframe',
    'nav', 'footer', 'aside', 'header',
    'form', 'button', 'input', 'select', 'textarea',
]);
/** Exact `role` attribute values that hasNoiseRole treats as noise. */
const NAVIGATION_ROLES = new Set([
    'navigation', 'banner', 'complementary', 'contentinfo',
    'tree', 'menubar', 'menu',
]);
/** Substrings looked for in the lower-cased "class id" text of an element. */
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;
/** Matches the words `fixed` or `sticky` inside a class attribute. */
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
/** Matches class words `z-40` through `z-50`. */
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
/** Matches the class word `isolate`. */
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
30
|
+
/**
 * Duck-type check for a DOM-like element: a record that exposes a callable
 * `getAttribute`.
 * @param {unknown} node - Candidate node.
 * @returns {boolean} True when `node.getAttribute` can be called.
 */
function isElement(node) {
    if (!isRecord(node)) {
        return false;
    }
    if (!('getAttribute' in node)) {
        return false;
    }
    return typeof node.getAttribute === 'function';
}
|
|
35
|
+
/**
 * Tells whether a (lower-case) tag name is always discarded.
 * @param {string} tagName - Tag name to look up.
 * @returns {boolean} True for `svg`, `canvas`, or any member of STRUCTURAL_TAGS.
 */
function isStructuralNoiseTag(tagName) {
    if (tagName === 'svg' || tagName === 'canvas') {
        return true;
    }
    return STRUCTURAL_TAGS.has(tagName);
}
|
|
38
|
+
/**
 * Checks the two hiding markers this module understands.
 * @param {{getAttribute(name: string): string | null}} element - Element to inspect.
 * @returns {boolean} True when a `hidden` attribute is present or
 *   `aria-hidden` equals the string "true".
 */
function isElementHidden(element) {
    if (element.getAttribute('hidden') !== null) {
        return true;
    }
    return element.getAttribute('aria-hidden') === 'true';
}
|
|
42
|
+
/**
 * Tests a raw `role` attribute value against NAVIGATION_ROLES.
 * @param {string | null} role - Attribute value, or null when absent.
 * @returns {boolean} True only for an exact match with a listed role.
 */
function hasNoiseRole(role) {
    if (role === null) {
        return false;
    }
    return NAVIGATION_ROLES.has(role);
}
|
|
45
|
+
/**
 * Looks for a PROMO_PATTERN keyword in the element's class and id.
 * @param {string} className - Value of the `class` attribute.
 * @param {string} id - Value of the `id` attribute.
 * @returns {boolean} True when the lower-cased "class id" text matches.
 */
function matchesPromoIdOrClass(className, id) {
    // Join with a single space so class and id cannot fuse into a new keyword.
    const haystack = ''.concat(className, ' ', id).toLowerCase();
    return PROMO_PATTERN.test(haystack);
}
|
|
49
|
+
/**
 * Requires both a HIGH_Z_PATTERN match and an ISOLATE_PATTERN match.
 * @param {string} className - Value of the `class` attribute.
 * @returns {boolean} True when both patterns match.
 */
function matchesHighZIsolate(className) {
    if (!HIGH_Z_PATTERN.test(className)) {
        return false;
    }
    return ISOLATE_PATTERN.test(className);
}
|
|
52
|
+
/**
 * Flags class lists matching FIXED_PATTERN or the high-z + isolate pair.
 * @param {string} className - Value of the `class` attribute.
 * @returns {boolean} True when either condition holds.
 */
function matchesFixedOrHighZIsolate(className) {
    if (FIXED_PATTERN.test(className)) {
        return true;
    }
    return matchesHighZIsolate(className);
}
|
|
55
|
+
/**
 * Collects the attributes the noise checks look at.
 * @param {object} element - Element exposing `tagName` and `getAttribute`.
 * @returns {{tagName: string, className: string, id: string, role: string | null, isHidden: boolean}}
 *   Lower-cased tag name, class/id (empty string when absent), raw role, and
 *   the result of isElementHidden.
 */
function readElementMetadata(element) {
    const tagName = element.tagName.toLowerCase();
    const className = element.getAttribute('class') ?? '';
    const id = element.getAttribute('id') ?? '';
    const role = element.getAttribute('role');
    const isHidden = isElementHidden(element);
    return { tagName, className, id, role, isHidden };
}
|
|
64
|
+
/**
 * Runs every noise heuristic against one element.
 * @param {object} node - Element accepted by readElementMetadata.
 * @returns {boolean} True as soon as one check (tag, hidden, role,
 *   fixed/high-z class, promo class/id) matches.
 */
function isNoiseElement(node) {
    const { tagName, isHidden, role, className, id } = readElementMetadata(node);
    if (isStructuralNoiseTag(tagName)) {
        return true;
    }
    if (isHidden) {
        return true;
    }
    if (hasNoiseRole(role)) {
        return true;
    }
    if (matchesFixedOrHighZIsolate(className)) {
        return true;
    }
    return matchesPromoIdOrClass(className, id);
}
|
|
72
|
+
/**
 * Filter predicate: only element-like nodes can be noise.
 * @param {unknown} node - Node handed over by the rule filter.
 * @returns {boolean} True when the node is an element and isNoiseElement accepts it.
 */
function isNoiseNode(node) {
    return isElement(node) ? isNoiseElement(node) : false;
}
|
|
75
|
+
/**
 * Registers the `removeNoise` rule: nodes accepted by isNoiseNode are replaced
 * with an empty string.
 * @param {{addRule(name: string, rule: object): unknown}} instance - Object
 *   receiving the rule (a TurndownService in this package).
 */
export function addNoiseRule(instance) {
    const rule = {
        filter: (node) => isNoiseNode(node),
        replacement: () => '',
    };
    instance.addRule('removeNoise', rule);
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
|
|
2
|
+
import { addFencedCodeRule } from './fenced-code-rule.js';
|
|
3
|
+
import { addNoiseRule } from './noise-rule.js';
|
|
4
|
+
let turndownInstance = null;
|
|
5
|
+
/**
 * Builds a TurndownService configured for ATX headings, fenced code blocks,
 * `_` emphasis and `-` bullets, then installs the noise rule followed by the
 * fenced-code rule.
 * @returns {TurndownService} The configured service.
 */
function createTurndownInstance() {
    const options = {
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        emDelimiter: '_',
        bulletListMarker: '-',
    };
    const service = new TurndownService(options);
    addNoiseRule(service);
    addFencedCodeRule(service);
    return service;
}
|
|
16
|
+
/**
 * Returns the module-level Turndown service, creating it on first use.
 * @returns {TurndownService} The cached instance.
 */
export function getTurndown() {
    if (turndownInstance === null || turndownInstance === undefined) {
        turndownInstance = createTurndownInstance();
    }
    return turndownInstance;
}
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
|
|
2
|
+
import { CODE_BLOCK, FRONTMATTER_DELIMITER, joinLines, } from '../config/formatting.js';
|
|
3
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../utils/code-language.js';
|
|
4
|
+
import { isRecord } from '../utils/guards.js';
|
|
5
|
+
/** Characters that force double-quoting wherever they appear in a value. */
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
/**
 * Leading characters that cannot start a plain YAML scalar and are not already
 * covered by YAML_SPECIAL_CHARS: `@`, backtick, `%`, and a `-` that is followed
 * by whitespace or ends the value (block-sequence indicator).
 */
const YAML_LEADING_INDICATOR = /^(?:[@`%]|-(?:\s|$))/;
/** Values a YAML parser would read as a number (optional sign and exponent). */
const YAML_NUMERIC = /^[+-]?[\d.]+(?:e[+-]?\d+)?$/i;
/** Values a YAML parser would read as a boolean or null (`~` is null). */
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off|~)$/i;
/** Characters that must be backslash-escaped inside a double-quoted scalar. */
const ESCAPE_PATTERNS = {
    backslash: /\\/g,
    quote: /"/g,
    newline: /\n/g,
    carriageReturn: /\r/g,
    tab: /\t/g,
};
/** Predicates; a value is quoted as soon as one of them returns true. */
const YAML_QUOTE_CHECKS = [
    (input) => YAML_SPECIAL_CHARS.test(input),
    (input) => YAML_LEADING_INDICATOR.test(input),
    (input) => input.startsWith(' ') || input.endsWith(' '),
    (input) => input === '',
    (input) => YAML_NUMERIC.test(input),
    (input) => YAML_RESERVED_WORDS.test(input),
];
/**
 * Decides whether a string can be written as a plain scalar.
 * @param {string} value - Candidate scalar.
 * @returns {boolean} True when the value must be double-quoted.
 */
function needsYamlQuotes(value) {
    return YAML_QUOTE_CHECKS.some((check) => check(value));
}
/**
 * Renders a string as a YAML scalar, double-quoting and escaping it only when
 * a plain scalar would be invalid or would be parsed as another type.
 * @param {string} value - Text to emit.
 * @returns {string} The value unchanged, or a double-quoted escaped form.
 */
function escapeYamlValue(value) {
    if (!needsYamlQuotes(value)) {
        return value;
    }
    // Backslashes first, so the escapes added below are not escaped again.
    const escaped = value
        .replace(ESCAPE_PATTERNS.backslash, '\\\\')
        .replace(ESCAPE_PATTERNS.quote, '\\"')
        .replace(ESCAPE_PATTERNS.newline, '\\n')
        // A raw CR inside a quoted scalar is treated as a line break by YAML parsers.
        .replace(ESCAPE_PATTERNS.carriageReturn, '\\r')
        .replace(ESCAPE_PATTERNS.tab, '\\t');
    return `"${escaped}"`;
}
|
|
35
|
+
/**
 * Pushes a `key: value` line onto `lines` when `value` is truthy.
 * @param {string[]} lines - Accumulator, mutated in place.
 * @param {string} key - Field name.
 * @param {string | undefined} value - Field value; falsy values are skipped.
 */
function appendFrontmatterField(lines, key, value) {
    if (value) {
        const rendered = escapeYamlValue(value);
        lines.push(`${key}: ${rendered}`);
    }
}
|
|
40
|
+
/**
 * Builds the frontmatter block for a page.
 * @param {{title?: string, url?: string} | undefined} metadata - Page metadata.
 * @returns {string} Empty string without metadata; otherwise the delimiter,
 *   optional `title` and `source` lines, and the closing delimiter, joined by
 *   joinLines.
 */
function buildFrontmatter(metadata) {
    if (!metadata) {
        return '';
    }
    const fields = [];
    appendFrontmatterField(fields, 'title', metadata.title);
    appendFrontmatterField(fields, 'source', metadata.url);
    return joinLines([FRONTMATTER_DELIMITER, ...fields, FRONTMATTER_DELIMITER]);
}
|
|
49
|
+
/**
 * Duck-type check for a DOM-like element: a record that exposes a callable
 * `getAttribute`.
 * @param {unknown} node - Candidate node.
 * @returns {boolean} True when `node.getAttribute` can be called.
 */
function isElement(node) {
    if (!isRecord(node)) {
        return false;
    }
    if (!('getAttribute' in node)) {
        return false;
    }
    return typeof node.getAttribute === 'function';
}
|
|
54
|
+
/**
 * Rule filter for `<pre><code>` blocks.
 * @param {{nodeName: string, firstChild?: {nodeName: string} | null}} node - Candidate node.
 * @param {{codeBlockStyle?: string}} options - Converter options.
 * @returns {boolean} True when fenced style is selected, the node is a PRE
 *   and its first child is a CODE node.
 */
function isFencedCodeBlock(node, options) {
    if (options.codeBlockStyle !== 'fenced') {
        return false;
    }
    if (node.nodeName !== 'PRE') {
        return false;
    }
    return node.firstChild?.nodeName === 'CODE';
}
|
|
59
|
+
/**
 * Renders a PRE node's first child as a fenced code block.
 * @param {{firstChild: unknown}} node - The PRE node.
 * @returns {string} Output of CODE_BLOCK.format for the child's text and
 *   resolved language, or an empty string when the child is not an element.
 */
function formatFencedCodeBlock(node) {
    const { firstChild: codeElement } = node;
    if (isElement(codeElement)) {
        const source = codeElement.textContent || '';
        const language = resolveCodeLanguage(codeElement, source);
        return CODE_BLOCK.format(source, language);
    }
    return '';
}
|
|
67
|
+
/**
 * Picks the fence language for a code element: attribute hints first,
 * content detection second, empty string last.
 * @param {object} codeNode - The CODE element.
 * @param {string} code - Text content of the element.
 * @returns {string} Language identifier, or '' when nothing is found.
 */
function resolveCodeLanguage(codeNode, code) {
    const attributes = readCodeAttributes(codeNode);
    const fromAttributes = resolveLanguageFromAttributes(attributes.className, attributes.dataLanguage);
    if (fromAttributes !== null && fromAttributes !== undefined) {
        return fromAttributes;
    }
    return detectLanguageFromCode(code) ?? '';
}
|
|
72
|
+
/**
 * Reads the two attributes that can carry a language hint.
 * @param {{getAttribute(name: string): string | null}} codeNode - The CODE element.
 * @returns {{className: string, dataLanguage: string}} `class` and
 *   `data-language` values, each '' when absent.
 */
function readCodeAttributes(codeNode) {
    const className = codeNode.getAttribute('class') ?? '';
    const dataLanguage = codeNode.getAttribute('data-language') ?? '';
    return { className, dataLanguage };
}
|
|
78
|
+
/**
 * Registers the `fencedCodeBlockWithLanguage` rule: nodes accepted by
 * isFencedCodeBlock are rendered through formatFencedCodeBlock.
 * @param {{addRule(name: string, rule: object): unknown}} instance - Object receiving the rule.
 */
function addFencedCodeRule(instance) {
    const rule = {
        filter: (node, options) => isFencedCodeBlock(node, options),
        replacement: (_content, node) => formatFencedCodeBlock(node),
    };
    instance.addRule('fencedCodeBlockWithLanguage', rule);
}
|
|
84
|
+
/** Lower-case tag names that isStructuralNoiseTag treats as noise. */
const STRUCTURAL_TAGS = new Set([
    'script', 'style', 'noscript', 'iframe',
    'nav', 'footer', 'aside', 'header',
    'form', 'button', 'input', 'select', 'textarea',
]);
/** Exact `role` attribute values that hasNoiseRole treats as noise. */
const NAVIGATION_ROLES = new Set([
    'navigation', 'banner', 'complementary', 'contentinfo',
    'tree', 'menubar', 'menu',
]);
/** Substrings looked for in the lower-cased "class id" text of an element. */
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;
/** Matches the words `fixed` or `sticky` inside a class attribute. */
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
/** Matches class words `z-40` through `z-50`. */
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
/** Matches the class word `isolate`. */
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
112
|
+
/**
 * Tells whether a (lower-case) tag name is always discarded.
 * @param {string} tagName - Tag name to look up.
 * @returns {boolean} True for `svg`, `canvas`, or any member of STRUCTURAL_TAGS.
 */
function isStructuralNoiseTag(tagName) {
    if (tagName === 'svg' || tagName === 'canvas') {
        return true;
    }
    return STRUCTURAL_TAGS.has(tagName);
}
|
|
115
|
+
/**
 * Checks the two hiding markers this module understands.
 * @param {{getAttribute(name: string): string | null}} element - Element to inspect.
 * @returns {boolean} True when a `hidden` attribute is present or
 *   `aria-hidden` equals the string "true".
 */
function isElementHidden(element) {
    if (element.getAttribute('hidden') !== null) {
        return true;
    }
    return element.getAttribute('aria-hidden') === 'true';
}
|
|
119
|
+
/**
 * Tests a raw `role` attribute value against NAVIGATION_ROLES.
 * @param {string | null} role - Attribute value, or null when absent.
 * @returns {boolean} True only for an exact match with a listed role.
 */
function hasNoiseRole(role) {
    if (role === null) {
        return false;
    }
    return NAVIGATION_ROLES.has(role);
}
|
|
122
|
+
/**
 * Looks for a PROMO_PATTERN keyword in the element's class and id.
 * @param {string} className - Value of the `class` attribute.
 * @param {string} id - Value of the `id` attribute.
 * @returns {boolean} True when the lower-cased "class id" text matches.
 */
function matchesPromoIdOrClass(className, id) {
    // Join with a single space so class and id cannot fuse into a new keyword.
    const haystack = ''.concat(className, ' ', id).toLowerCase();
    return PROMO_PATTERN.test(haystack);
}
|
|
126
|
+
/**
 * Requires both a HIGH_Z_PATTERN match and an ISOLATE_PATTERN match.
 * @param {string} className - Value of the `class` attribute.
 * @returns {boolean} True when both patterns match.
 */
function matchesHighZIsolate(className) {
    if (!HIGH_Z_PATTERN.test(className)) {
        return false;
    }
    return ISOLATE_PATTERN.test(className);
}
|
|
129
|
+
/**
 * Flags class lists matching FIXED_PATTERN or the high-z + isolate pair.
 * @param {string} className - Value of the `class` attribute.
 * @returns {boolean} True when either condition holds.
 */
function matchesFixedOrHighZIsolate(className) {
    if (FIXED_PATTERN.test(className)) {
        return true;
    }
    return matchesHighZIsolate(className);
}
|
|
132
|
+
/**
 * Collects the attributes the noise checks look at.
 * @param {object} element - Element exposing `tagName` and `getAttribute`.
 * @returns {{tagName: string, className: string, id: string, role: string | null, isHidden: boolean}}
 *   Lower-cased tag name, class/id (empty string when absent), raw role, and
 *   the result of isElementHidden.
 */
function readElementMetadata(element) {
    const tagName = element.tagName.toLowerCase();
    const className = element.getAttribute('class') ?? '';
    const id = element.getAttribute('id') ?? '';
    const role = element.getAttribute('role');
    const isHidden = isElementHidden(element);
    return { tagName, className, id, role, isHidden };
}
|
|
141
|
+
/**
 * Runs every noise heuristic against one element.
 * @param {object} node - Element accepted by readElementMetadata.
 * @returns {boolean} True as soon as one check (tag, hidden, role,
 *   fixed/high-z class, promo class/id) matches.
 */
function isNoiseElement(node) {
    const { tagName, isHidden, role, className, id } = readElementMetadata(node);
    if (isStructuralNoiseTag(tagName)) {
        return true;
    }
    if (isHidden) {
        return true;
    }
    if (hasNoiseRole(role)) {
        return true;
    }
    if (matchesFixedOrHighZIsolate(className)) {
        return true;
    }
    return matchesPromoIdOrClass(className, id);
}
|
|
149
|
+
/**
 * Filter predicate: only element-like nodes can be noise.
 * @param {unknown} node - Node handed over by the rule filter.
 * @returns {boolean} True when the node is an element and isNoiseElement accepts it.
 */
function isNoiseNode(node) {
    return isElement(node) ? isNoiseElement(node) : false;
}
|
|
152
|
+
/**
 * Registers the `removeNoise` rule: nodes accepted by isNoiseNode are replaced
 * with an empty string.
 * @param {{addRule(name: string, rule: object): unknown}} instance - Object receiving the rule.
 */
function addNoiseRule(instance) {
    const rule = {
        filter: (node) => isNoiseNode(node),
        replacement: () => '',
    };
    instance.addRule('removeNoise', rule);
}
|
|
158
|
+
let turndownInstance = null;
|
|
159
|
+
/**
 * Builds a TurndownService configured for ATX headings, fenced code blocks,
 * `_` emphasis and `-` bullets, then installs the noise rule followed by the
 * fenced-code rule.
 * @returns {TurndownService} The configured service.
 */
function createTurndownInstance() {
    const options = {
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        emDelimiter: '_',
        bulletListMarker: '-',
    };
    const service = new TurndownService(options);
    addNoiseRule(service);
    addFencedCodeRule(service);
    return service;
}
|
|
170
|
+
/**
 * Returns the module-level Turndown service, creating it on first use.
 * @returns {TurndownService} The cached instance.
 */
function getTurndown() {
    if (turndownInstance === null || turndownInstance === undefined) {
        turndownInstance = createTurndownInstance();
    }
    return turndownInstance;
}
|
|
174
|
+
/**
 * Converts HTML to Markdown, optionally prefixed with a frontmatter block.
 * Conversion is best-effort: if the converter throws, only the frontmatter
 * (possibly an empty string) is returned.
 * @param {string} html - Markup to convert; falsy input skips conversion.
 * @param {{title?: string, url?: string} | undefined} metadata - Passed to buildFrontmatter.
 * @returns {string} Frontmatter and trimmed Markdown joined by a newline, or
 *   whichever of the two is available.
 */
export function htmlToMarkdown(html, metadata) {
    const frontmatter = buildFrontmatter(metadata);
    if (!html) {
        return frontmatter;
    }
    let content;
    try {
        content = getTurndown().turndown(html).trim();
    }
    catch {
        return frontmatter;
    }
    if (!frontmatter) {
        return content;
    }
    return `${frontmatter}\n${content}`;
}
|
|
@@ -1,126 +1,14 @@
|
|
|
1
|
-
import
|
|
2
|
-
import {
|
|
3
|
-
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../utils/code-language.js';
|
|
4
|
-
let turndownInstance = null;
|
|
5
|
-
function getTurndown() {
|
|
6
|
-
if (turndownInstance)
|
|
7
|
-
return turndownInstance;
|
|
8
|
-
turndownInstance = createTurndownInstance();
|
|
9
|
-
return turndownInstance;
|
|
10
|
-
}
|
|
11
|
-
function createTurndownInstance() {
|
|
12
|
-
const instance = new TurndownService({
|
|
13
|
-
headingStyle: 'atx',
|
|
14
|
-
codeBlockStyle: 'fenced',
|
|
15
|
-
emDelimiter: '_',
|
|
16
|
-
bulletListMarker: '-',
|
|
17
|
-
});
|
|
18
|
-
addNoiseRule(instance);
|
|
19
|
-
addFencedCodeRule(instance);
|
|
20
|
-
return instance;
|
|
21
|
-
}
|
|
22
|
-
function addNoiseRule(instance) {
|
|
23
|
-
instance.addRule('removeNoise', {
|
|
24
|
-
filter: ['script', 'style', 'noscript', 'nav', 'footer', 'aside', 'iframe'],
|
|
25
|
-
replacement: () => '',
|
|
26
|
-
});
|
|
27
|
-
}
|
|
28
|
-
function addFencedCodeRule(instance) {
|
|
29
|
-
instance.addRule('fencedCodeBlockWithLanguage', {
|
|
30
|
-
filter: (node, options) => isFencedCodeBlock(node, options),
|
|
31
|
-
replacement: (_content, node) => formatFencedCodeBlock(node),
|
|
32
|
-
});
|
|
33
|
-
}
|
|
34
|
-
function isFencedCodeBlock(node, options) {
|
|
35
|
-
if (options.codeBlockStyle !== 'fenced')
|
|
36
|
-
return false;
|
|
37
|
-
if (node.nodeName !== 'PRE')
|
|
38
|
-
return false;
|
|
39
|
-
const { firstChild } = node;
|
|
40
|
-
if (!firstChild)
|
|
41
|
-
return false;
|
|
42
|
-
return firstChild.nodeName === 'CODE';
|
|
43
|
-
}
|
|
44
|
-
function isElement(node) {
|
|
45
|
-
return (node !== null &&
|
|
46
|
-
typeof node === 'object' &&
|
|
47
|
-
'getAttribute' in node &&
|
|
48
|
-
typeof node.getAttribute === 'function');
|
|
49
|
-
}
|
|
50
|
-
function formatFencedCodeBlock(node) {
|
|
51
|
-
const codeNode = node.firstChild;
|
|
52
|
-
if (!isElement(codeNode))
|
|
53
|
-
return '';
|
|
54
|
-
const code = codeNode.textContent || '';
|
|
55
|
-
const language = resolveCodeLanguage(codeNode, code);
|
|
56
|
-
return CODE_BLOCK.format(code, language);
|
|
57
|
-
}
|
|
58
|
-
function resolveCodeLanguage(codeNode, code) {
|
|
59
|
-
const className = codeNode.getAttribute('class') ?? '';
|
|
60
|
-
const dataLang = codeNode.getAttribute('data-language') ?? '';
|
|
61
|
-
const attributeLanguage = resolveLanguageFromAttributes(className, dataLang);
|
|
62
|
-
return attributeLanguage ?? detectLanguageFromCode(code) ?? '';
|
|
63
|
-
}
|
|
64
|
-
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
|
|
65
|
-
const YAML_NUMERIC = /^[\d.]+$/;
|
|
66
|
-
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off)$/i;
|
|
67
|
-
const ESCAPE_PATTERNS = {
|
|
68
|
-
backslash: /\\/g,
|
|
69
|
-
quote: /"/g,
|
|
70
|
-
newline: /\n/g,
|
|
71
|
-
tab: /\t/g,
|
|
72
|
-
};
|
|
73
|
-
function needsYamlQuotes(value) {
|
|
74
|
-
const checks = [
|
|
75
|
-
(input) => YAML_SPECIAL_CHARS.test(input),
|
|
76
|
-
(input) => input.startsWith(' ') || input.endsWith(' '),
|
|
77
|
-
(input) => input === '',
|
|
78
|
-
(input) => YAML_NUMERIC.test(input),
|
|
79
|
-
(input) => YAML_RESERVED_WORDS.test(input),
|
|
80
|
-
];
|
|
81
|
-
return checks.some((check) => check(value));
|
|
82
|
-
}
|
|
83
|
-
function escapeYamlValue(value) {
|
|
84
|
-
if (!needsYamlQuotes(value)) {
|
|
85
|
-
return value;
|
|
86
|
-
}
|
|
87
|
-
const escaped = value
|
|
88
|
-
.replace(ESCAPE_PATTERNS.backslash, '\\\\')
|
|
89
|
-
.replace(ESCAPE_PATTERNS.quote, '\\"')
|
|
90
|
-
.replace(ESCAPE_PATTERNS.newline, '\\n')
|
|
91
|
-
.replace(ESCAPE_PATTERNS.tab, '\\t');
|
|
92
|
-
return `"${escaped}"`;
|
|
93
|
-
}
|
|
94
|
-
function createFrontmatter(metadata) {
|
|
95
|
-
const lines = [FRONTMATTER_DELIMITER];
|
|
96
|
-
if (metadata.title) {
|
|
97
|
-
lines.push(`title: ${escapeYamlValue(metadata.title)}`);
|
|
98
|
-
}
|
|
99
|
-
if (metadata.url) {
|
|
100
|
-
lines.push(`source: ${escapeYamlValue(metadata.url)}`);
|
|
101
|
-
}
|
|
102
|
-
lines.push(FRONTMATTER_DELIMITER);
|
|
103
|
-
return joinLines(lines);
|
|
104
|
-
}
|
|
105
|
-
function convertHtmlToMarkdown(html) {
|
|
106
|
-
return getTurndown().turndown(html).trim();
|
|
107
|
-
}
|
|
108
|
-
function buildFrontmatterBlock(metadata) {
|
|
109
|
-
return metadata ? createFrontmatter(metadata) : '';
|
|
110
|
-
}
|
|
1
|
+
import { buildFrontmatter } from './markdown/frontmatter.js';
|
|
2
|
+
import { getTurndown } from './markdown/turndown-instance.js';
|
|
111
3
|
export function htmlToMarkdown(html, metadata) {
|
|
112
|
-
const frontmatter =
|
|
113
|
-
if (!
|
|
4
|
+
const frontmatter = buildFrontmatter(metadata);
|
|
5
|
+
if (!html)
|
|
114
6
|
return frontmatter;
|
|
115
|
-
}
|
|
116
7
|
try {
|
|
117
|
-
const content =
|
|
8
|
+
const content = getTurndown().turndown(html).trim();
|
|
118
9
|
return frontmatter ? `${frontmatter}\n${content}` : content;
|
|
119
10
|
}
|
|
120
11
|
catch {
|
|
121
12
|
return frontmatter;
|
|
122
13
|
}
|
|
123
14
|
}
|
|
124
|
-
function isValidHtmlInput(html) {
|
|
125
|
-
return Boolean(html && typeof html === 'string');
|
|
126
|
-
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/** Shape accepted for a parsed cache entry: three optional string fields. */
export interface CachedPayload {
|
|
2
|
+
content?: string;
|
|
3
|
+
markdown?: string;
|
|
4
|
+
title?: string;
|
|
5
|
+
}
|
|
6
|
+
/** Parses a JSON string into a CachedPayload; yields null when parsing fails or the parsed value does not pass the shape check. */
export declare function parseCachedPayload(raw: string): CachedPayload | null;
|
|
7
|
+
/** Returns `payload.markdown` when it is a string, otherwise `payload.content` when it is a string, otherwise null. */
export declare function resolveCachedPayloadContent(payload: CachedPayload): string | null;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { isRecord } from './guards.js';
|
|
2
|
+
/**
 * Parses a cached JSON string and validates its shape.
 * @param {string} raw - Serialized payload.
 * @returns {object | null} The parsed object when isCachedPayload accepts it;
 *   null when parsing or validation fails.
 */
export function parseCachedPayload(raw) {
    try {
        const candidate = JSON.parse(raw);
        if (isCachedPayload(candidate)) {
            return candidate;
        }
    }
    catch {
        // Invalid JSON is reported as "no payload".
    }
    return null;
}
|
|
11
|
+
/**
 * Picks the text to serve from a cached payload, preferring `markdown`.
 * @param {{markdown?: string, content?: string}} payload - Parsed payload.
 * @returns {string | null} `markdown` if it is a string, else `content` if it
 *   is a string, else null.
 */
export function resolveCachedPayloadContent(payload) {
    return typeof payload.markdown === 'string'
        ? payload.markdown
        : typeof payload.content === 'string'
            ? payload.content
            : null;
}
|
|
20
|
+
/**
 * Checks that `value[key]` is either missing or a string.
 * @param {object} value - Object to inspect.
 * @param {string} key - Property name.
 * @returns {boolean} True for undefined or string properties.
 */
function hasOptionalStringProperty(value, key) {
    const kind = typeof value[key];
    return kind === 'undefined' || kind === 'string';
}
|
|
26
|
+
/**
 * Shape check for a cached payload: a record whose `content`, `markdown` and
 * `title` properties are each absent or a string.
 * @param {unknown} value - Parsed JSON value.
 * @returns {boolean} True when the value has the expected shape.
 */
function isCachedPayload(value) {
    if (!isRecord(value)) {
        return false;
    }
    return ['content', 'markdown', 'title'].every((key) => hasOptionalStringProperty(value, key));
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/** Reports whether any non-blank line of `code` is recognised as a shell indicator (shebang, prompt prefix, or listed command). */
export declare function detectBash(code: string): boolean;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { splitLines } from './code-language-parsing.js';
|
|
2
|
+
/** First words accepted by startsWithPackageManagerCommand. */
const BASH_PACKAGE_MANAGERS = [
    'npm', 'yarn', 'pnpm', 'npx', 'brew',
    'apt', 'pip', 'cargo', 'go',
];
/** Second words that, following one of the names above, mark a command line. */
const BASH_VERBS = [
    'install',
    'add',
    'run',
    'build',
    'start',
];
/** Commands that mark a line as shell input on their own. */
const BASH_COMMANDS = [
    'sudo',
    'chmod',
    'mkdir',
    'cd',
    'ls',
    'cat',
    'echo',
];
|
|
15
|
+
/**
 * Scans a snippet line by line for shell indicators.
 * @param {string} code - Snippet text.
 * @returns {boolean} True when a non-blank line (leading whitespace removed)
 *   satisfies isBashIndicator.
 */
export function detectBash(code) {
    for (const rawLine of splitLines(code)) {
        const candidate = rawLine.trimStart();
        if (candidate && isBashIndicator(candidate)) {
            return true;
        }
    }
    return false;
}
|
|
26
|
+
/**
 * Checks whether a line is one of the commands, alone or followed by a space.
 * @param {string} line - Line with leading whitespace already removed.
 * @param {string[]} commands - Command names to look for.
 * @returns {boolean} True on the first matching command.
 */
function startsWithCommand(line, commands) {
    for (const command of commands) {
        if (line === command) {
            return true;
        }
        if (line.startsWith(`${command} `)) {
            return true;
        }
    }
    return false;
}
|
|
29
|
+
/**
 * Combines the individual shell heuristics for one line.
 * @param {string} line - Line with leading whitespace already removed.
 * @returns {boolean} True for a shebang, a prompt prefix, a BASH_COMMANDS
 *   command, or a package-manager command.
 */
function isBashIndicator(line) {
    if (isShebang(line) || isPromptLine(line)) {
        return true;
    }
    if (startsWithCommand(line, BASH_COMMANDS)) {
        return true;
    }
    return startsWithPackageManagerCommand(line);
}
|
|
35
|
+
/**
 * @param {string} line - Line to test.
 * @returns {boolean} True when the line begins with `#!`.
 */
function isShebang(line) {
    const marker = '#!';
    return line.startsWith(marker);
}
|
|
38
|
+
/**
 * @param {string} line - Line to test.
 * @returns {boolean} True when the line begins with `$ ` or `# `
 *   (presumably user and root shell prompts).
 */
function isPromptLine(line) {
    return ['$ ', '# '].some((prefix) => line.startsWith(prefix));
}
|
|
41
|
+
/**
 * Recognises "<package manager> <verb>" at the start of a line, where the verb
 * ends the line or is followed by a space.
 * @param {string} line - Line with leading whitespace already removed.
 * @returns {boolean} True when a BASH_PACKAGE_MANAGERS name is followed by a
 *   BASH_VERBS verb.
 */
function startsWithPackageManagerCommand(line) {
    for (const manager of BASH_PACKAGE_MANAGERS) {
        const prefix = `${manager} `;
        if (!line.startsWith(prefix)) {
            continue;
        }
        const rest = line.slice(prefix.length);
        if (BASH_VERBS.some((verb) => rest === verb || rest.startsWith(`${verb} `))) {
            return true;
        }
    }
    return false;
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { CODE_DETECTORS } from './code-language-detectors.js';
|
|
2
|
+
import { extractLanguageFromClassName, resolveLanguageFromDataAttribute, } from './code-language-parsing.js';
|
|
3
|
+
/**
 * Runs the detectors in CODE_DETECTORS order and reports the first hit.
 * @param {string} code - Snippet text.
 * @returns {string | undefined} Language of the first matching detector, or
 *   undefined when none matches.
 */
export function detectLanguageFromCode(code) {
    const match = CODE_DETECTORS.find(({ detect }) => detect(code));
    return match?.language;
}
|
|
10
|
+
/**
 * Resolves a language from element attributes, preferring the class name.
 * @param {string} className - Value of the `class` attribute.
 * @param {string} dataLang - Value of the `data-language` attribute.
 * @returns {string | undefined} The class-derived language when it is not
 *   nullish; otherwise whatever resolveLanguageFromDataAttribute returns.
 */
export function resolveLanguageFromAttributes(className, dataLang) {
    const fromClass = extractLanguageFromClassName(className);
    if (fromClass !== null && fromClass !== undefined) {
        return fromClass;
    }
    return resolveLanguageFromDataAttribute(dataLang);
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import { detectBash } from './code-language-bash.js';
|
|
2
|
+
import { containsJsxTag, containsWord, splitLines, } from './code-language-parsing.js';
|
|
3
|
+
/** Type names looked for after a colon by detectTypescript. */
const TYPE_HINTS = ['string', 'number', 'boolean', 'void', 'any', 'unknown', 'never'];
/** Lower-case tag openers looked for by detectHtml. */
const HTML_TAGS = [
    '<!doctype', '<html', '<head', '<body', '<div',
    '<span', '<p', '<a', '<script', '<style',
];
/** Lower-case statement keywords looked for by detectSql. */
const SQL_KEYWORDS = ['select', 'insert', 'update', 'delete', 'create', 'alter', 'drop'];
|
|
33
|
+
/** Keywords that mark a (lower-cased) snippet as JavaScript. */
const JS_WORD_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
/**
 * Keywords that mark a (lower-cased) snippet as Python.
 * A bare `from` is deliberately not listed: `from x import y` still matches
 * through `import`, while SQL such as `select * from users` no longer does and
 * can reach the SQL detector, which runs after the Python one.
 */
const PYTHON_WORD_REGEX = /\b(?:def|class|import)\b/;
/** Keywords that mark a (lower-cased) snippet as Rust. */
const RUST_WORD_REGEX = /\b(?:fn|impl|struct|enum)\b/;
/** At-rules that mark a (lower-cased) snippet as CSS. */
const CSS_DIRECTIVE_REGEX = /@media|@import|@keyframes/;
|
|
37
|
+
/**
 * Detectors in priority order; the first whose `detect` returns true decides
 * the language, so earlier entries win over later ones.
 */
export const CODE_DETECTORS = [
    ['jsx', detectJsx],
    ['typescript', detectTypescript],
    ['rust', detectRust],
    ['javascript', detectJavascript],
    ['python', detectPython],
    ['bash', detectBash],
    ['css', detectCss],
    ['html', detectHtml],
    ['json', detectJson],
    ['yaml', detectYaml],
    ['sql', detectSql],
    ['go', detectGo],
].map(([language, detect]) => ({ language, detect }));
|
|
51
|
+
/**
 * JSX heuristic: a known marker in the lower-cased text, or a JSX tag in the
 * original text.
 * @param {string} code - Snippet text.
 * @returns {boolean} True when the snippet looks like JSX.
 */
function detectJsx(code) {
    const lower = code.toLowerCase();
    const markers = ['classname=', 'jsx:', "from 'react'", 'from "react"'];
    if (markers.some((marker) => lower.includes(marker))) {
        return true;
    }
    return containsJsxTag(code);
}
|
|
62
|
+
/**
 * TypeScript heuristic. Matches declaration-shaped uses of `interface` and
 * `type` rather than the bare words, so JSON `"type"` keys, HTML `type=`
 * attributes and calls such as `type(x)` are not mistaken for TypeScript.
 * @param {string} code - Snippet text.
 * @returns {boolean} True for `interface Name`, `type Name =`,
 *   `import type` / `export type`, or a `: <hint>` annotation using one of
 *   TYPE_HINTS.
 */
function detectTypescript(code) {
    const lower = code.toLowerCase();
    // `interface Name`
    if (/\binterface\s+[a-z_$][\w$]*/.test(lower)) {
        return true;
    }
    // `import type ...` / `export type ...`
    if (/\b(?:import|export)\s+type\b/.test(lower)) {
        return true;
    }
    // `type Name =` and `type Name<T> =`
    if (/\btype\s+[a-z_$][\w$]*\s*(?:<[^\n]*>)?\s*=/.test(lower)) {
        return true;
    }
    return TYPE_HINTS.some((hint) => lower.includes(`: ${hint}`) || lower.includes(`:${hint}`));
}
|
|
70
|
+
/**
 * Rust heuristic on the lower-cased snippet.
 * @param {string} code - Snippet text.
 * @returns {boolean} True for a RUST_WORD_REGEX keyword, `let mut`, or a
 *   snippet containing both `use ` and `::`.
 */
function detectRust(code) {
    const lower = code.toLowerCase();
    if (RUST_WORD_REGEX.test(lower)) {
        return true;
    }
    if (lower.includes('let mut')) {
        return true;
    }
    return lower.includes('use ') && lower.includes('::');
}
|
|
76
|
+
/**
 * JavaScript heuristic: any JS_WORD_REGEX keyword in the lower-cased snippet.
 * @param {string} code - Snippet text.
 * @returns {boolean} True when a keyword is present.
 */
function detectJavascript(code) {
    return JS_WORD_REGEX.test(code.toLowerCase());
}
|
|
80
|
+
/**
 * Python heuristic on the lower-cased snippet.
 * @param {string} code - Snippet text.
 * @returns {boolean} True for a PYTHON_WORD_REGEX keyword, `print(`, or `__name__`.
 */
function detectPython(code) {
    const lower = code.toLowerCase();
    if (PYTHON_WORD_REGEX.test(lower)) {
        return true;
    }
    if (lower.includes('print(')) {
        return true;
    }
    return lower.includes('__name__');
}
|
|
86
|
+
/**
 * CSS heuristic: an at-rule anywhere, or a line that looks like a selector or
 * a property declaration.
 * @param {string} code - Snippet text.
 * @returns {boolean} True when the snippet looks like CSS.
 */
function detectCss(code) {
    if (CSS_DIRECTIVE_REGEX.test(code.toLowerCase())) {
        return true;
    }
    for (const rawLine of splitLines(code)) {
        const candidate = rawLine.trimStart();
        if (!candidate) {
            continue;
        }
        if (isCssSelectorLine(candidate)) {
            return true;
        }
        if (isCssPropertyLine(candidate)) {
            return true;
        }
    }
    return false;
}
|
|
100
|
+
/**
 * HTML heuristic: any HTML_TAGS opener in the lower-cased snippet.
 * @param {string} code - Snippet text.
 * @returns {boolean} True when an opener is found.
 */
function detectHtml(code) {
    const lower = code.toLowerCase();
    for (const tag of HTML_TAGS) {
        if (lower.includes(tag)) {
            return true;
        }
    }
    return false;
}
|
|
104
|
+
/**
 * JSON heuristic: the first non-whitespace character is `{` or `[`.
 * @param {string} code - Snippet text.
 * @returns {boolean} True when the snippet opens an object or array.
 */
function detectJson(code) {
    const first = code.trimStart().charAt(0);
    return first === '{' || first === '[';
}
|
|
110
|
+
/**
 * YAML heuristic: some trimmed line has a colon that is not its first
 * character and is followed by a space or tab.
 * @param {string} code - Snippet text.
 * @returns {boolean} True when such a `key: value` line exists.
 */
function detectYaml(code) {
    for (const rawLine of splitLines(code)) {
        const entry = rawLine.trim();
        const colonAt = entry.indexOf(':');
        // Covers blank lines (-1) and lines that start with the colon (0).
        if (colonAt <= 0) {
            continue;
        }
        const next = entry.charAt(colonAt + 1);
        if (next === ' ' || next === '\t') {
            return true;
        }
    }
    return false;
}
|
|
125
|
+
/**
 * SQL heuristic: any SQL_KEYWORDS entry appears as a word in the lower-cased
 * snippet (word matching is delegated to containsWord).
 * @param {string} code - Snippet text.
 * @returns {boolean} True when a keyword is found.
 */
function detectSql(code) {
    const lower = code.toLowerCase();
    for (const keyword of SQL_KEYWORDS) {
        if (containsWord(lower, keyword)) {
            return true;
        }
    }
    return false;
}
|
|
129
|
+
/**
 * Go heuristic on the lower-cased snippet.
 * @param {string} code - Snippet text.
 * @returns {boolean} True for the words `package` or `func`, or the text `import "`.
 */
function detectGo(code) {
    const lower = code.toLowerCase();
    if (containsWord(lower, 'package')) {
        return true;
    }
    if (containsWord(lower, 'func')) {
        return true;
    }
    return lower.includes('import "');
}
|
|
135
|
+
/**
 * @param {string} line - Line with leading whitespace already removed.
 * @returns {boolean} True when the line starts with `.` or `#` and contains `{`.
 */
function isCssSelectorLine(line) {
    const first = line.charAt(0);
    return (first === '.' || first === '#') && line.includes('{');
}
|
|
140
|
+
/**
 * @param {string} line - Line to test.
 * @returns {boolean} True when the line contains both `:` and `;`.
 */
function isCssPropertyLine(line) {
    return [':', ';'].every((mark) => line.includes(mark));
}
|