@j0hanz/superfetch 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -46
- package/dist/cache.d.ts +42 -0
- package/dist/cache.js +565 -0
- package/dist/config/env-parsers.d.ts +1 -0
- package/dist/config/env-parsers.js +12 -0
- package/dist/config/index.d.ts +7 -0
- package/dist/config/index.js +20 -8
- package/dist/config/types/content.d.ts +1 -0
- package/dist/config.d.ts +77 -0
- package/dist/config.js +261 -0
- package/dist/crypto.d.ts +2 -0
- package/dist/crypto.js +32 -0
- package/dist/errors.d.ts +10 -0
- package/dist/errors.js +28 -0
- package/dist/fetch.d.ts +40 -0
- package/dist/fetch.js +910 -0
- package/dist/http/auth.js +161 -2
- package/dist/http/base-middleware.d.ts +7 -0
- package/dist/http/base-middleware.js +143 -0
- package/dist/http/cors.d.ts +0 -5
- package/dist/http/cors.js +0 -6
- package/dist/http/download-routes.js +6 -2
- package/dist/http/error-handler.d.ts +2 -0
- package/dist/http/error-handler.js +55 -0
- package/dist/http/host-allowlist.d.ts +3 -0
- package/dist/http/host-allowlist.js +117 -0
- package/dist/http/mcp-routes.d.ts +8 -2
- package/dist/http/mcp-routes.js +101 -8
- package/dist/http/mcp-session-eviction.d.ts +3 -0
- package/dist/http/mcp-session-eviction.js +24 -0
- package/dist/http/mcp-session-init.d.ts +7 -0
- package/dist/http/mcp-session-init.js +94 -0
- package/dist/http/mcp-session-slots.d.ts +17 -0
- package/dist/http/mcp-session-slots.js +55 -0
- package/dist/http/mcp-session-transport-init.d.ts +7 -0
- package/dist/http/mcp-session-transport-init.js +41 -0
- package/dist/http/mcp-session-types.d.ts +5 -0
- package/dist/http/mcp-session-types.js +1 -0
- package/dist/http/mcp-session.d.ts +9 -9
- package/dist/http/mcp-session.js +5 -114
- package/dist/http/mcp-sessions.d.ts +41 -0
- package/dist/http/mcp-sessions.js +392 -0
- package/dist/http/rate-limit.js +2 -2
- package/dist/http/server-middleware.d.ts +6 -1
- package/dist/http/server-middleware.js +3 -117
- package/dist/http/server-shutdown.js +1 -1
- package/dist/http/server-tuning.d.ts +9 -0
- package/dist/http/server-tuning.js +45 -0
- package/dist/http/server.js +206 -9
- package/dist/http/session-cleanup.js +8 -5
- package/dist/http.d.ts +78 -0
- package/dist/http.js +1437 -0
- package/dist/index.js +3 -3
- package/dist/mcp.d.ts +3 -0
- package/dist/mcp.js +94 -0
- package/dist/middleware/error-handler.d.ts +1 -1
- package/dist/middleware/error-handler.js +31 -30
- package/dist/observability.d.ts +16 -0
- package/dist/observability.js +78 -0
- package/dist/resources/cached-content-params.d.ts +5 -0
- package/dist/resources/cached-content-params.js +36 -0
- package/dist/resources/cached-content.js +33 -33
- package/dist/server.js +21 -6
- package/dist/services/cache-events.d.ts +8 -0
- package/dist/services/cache-events.js +19 -0
- package/dist/services/cache.d.ts +5 -4
- package/dist/services/cache.js +49 -45
- package/dist/services/context.d.ts +2 -0
- package/dist/services/context.js +3 -0
- package/dist/services/extractor.d.ts +1 -0
- package/dist/services/extractor.js +77 -40
- package/dist/services/fetcher/agents.js +1 -1
- package/dist/services/fetcher/dns-selection.js +1 -1
- package/dist/services/fetcher/interceptors.js +29 -60
- package/dist/services/fetcher/redirects.js +12 -4
- package/dist/services/fetcher/response.js +18 -8
- package/dist/services/fetcher.d.ts +23 -0
- package/dist/services/fetcher.js +553 -13
- package/dist/services/logger.js +4 -1
- package/dist/services/telemetry.d.ts +19 -0
- package/dist/services/telemetry.js +43 -0
- package/dist/services/transform-worker-pool.d.ts +10 -3
- package/dist/services/transform-worker-pool.js +213 -184
- package/dist/tools/handlers/fetch-single.shared.d.ts +11 -3
- package/dist/tools/handlers/fetch-single.shared.js +131 -2
- package/dist/tools/handlers/fetch-url.tool.d.ts +6 -0
- package/dist/tools/handlers/fetch-url.tool.js +56 -12
- package/dist/tools/index.d.ts +1 -0
- package/dist/tools/index.js +13 -1
- package/dist/tools/schemas.d.ts +2 -0
- package/dist/tools/schemas.js +8 -0
- package/dist/tools/utils/content-shaping.js +19 -4
- package/dist/tools/utils/content-transform-core.d.ts +5 -0
- package/dist/tools/utils/content-transform-core.js +180 -0
- package/dist/tools/utils/content-transform-workers.d.ts +1 -0
- package/dist/tools/utils/content-transform-workers.js +1 -0
- package/dist/tools/utils/content-transform.d.ts +2 -1
- package/dist/tools/utils/content-transform.js +37 -136
- package/dist/tools/utils/fetch-pipeline.js +47 -56
- package/dist/tools/utils/frontmatter.d.ts +3 -0
- package/dist/tools/utils/frontmatter.js +73 -0
- package/dist/tools/utils/markdown-heuristics.d.ts +1 -0
- package/dist/tools/utils/markdown-heuristics.js +19 -0
- package/dist/tools/utils/markdown-signals.d.ts +1 -0
- package/dist/tools/utils/markdown-signals.js +19 -0
- package/dist/tools/utils/raw-markdown-frontmatter.d.ts +3 -0
- package/dist/tools/utils/raw-markdown-frontmatter.js +73 -0
- package/dist/tools/utils/raw-markdown.d.ts +6 -0
- package/dist/tools/utils/raw-markdown.js +149 -0
- package/dist/tools.d.ts +104 -0
- package/dist/tools.js +421 -0
- package/dist/transform.d.ts +69 -0
- package/dist/transform.js +1509 -0
- package/dist/transformers/markdown/fenced-code-rule.d.ts +2 -0
- package/dist/transformers/markdown/fenced-code-rule.js +38 -0
- package/dist/transformers/markdown/frontmatter.d.ts +2 -0
- package/dist/transformers/markdown/frontmatter.js +45 -0
- package/dist/transformers/markdown/noise-rule.d.ts +2 -0
- package/dist/transformers/markdown/noise-rule.js +80 -0
- package/dist/transformers/markdown/turndown-instance.d.ts +2 -0
- package/dist/transformers/markdown/turndown-instance.js +19 -0
- package/dist/transformers/markdown.d.ts +5 -0
- package/dist/transformers/markdown.js +314 -0
- package/dist/transformers/markdown.transformer.js +2 -189
- package/dist/utils/cancellation.d.ts +1 -0
- package/dist/utils/cancellation.js +18 -0
- package/dist/utils/code-language-bash.d.ts +1 -0
- package/dist/utils/code-language-bash.js +48 -0
- package/dist/utils/code-language-core.d.ts +2 -0
- package/dist/utils/code-language-core.js +13 -0
- package/dist/utils/code-language-detectors.d.ts +5 -0
- package/dist/utils/code-language-detectors.js +142 -0
- package/dist/utils/code-language-helpers.d.ts +5 -0
- package/dist/utils/code-language-helpers.js +62 -0
- package/dist/utils/code-language-parsing.d.ts +5 -0
- package/dist/utils/code-language-parsing.js +62 -0
- package/dist/utils/code-language.js +250 -46
- package/dist/utils/error-details.d.ts +3 -0
- package/dist/utils/error-details.js +12 -0
- package/dist/utils/filename-generator.js +14 -3
- package/dist/utils/host-normalizer.d.ts +1 -0
- package/dist/utils/host-normalizer.js +37 -0
- package/dist/utils/ip-address.d.ts +4 -0
- package/dist/utils/ip-address.js +6 -0
- package/dist/utils/tool-error-handler.js +12 -17
- package/dist/utils/url-redactor.d.ts +1 -0
- package/dist/utils/url-redactor.js +13 -0
- package/dist/utils/url-validator.js +35 -20
- package/dist/workers/transform-worker.js +82 -38
- package/package.json +13 -10
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { CODE_BLOCK } from '../../config/formatting.js';
|
|
2
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../../utils/code-language.js';
|
|
3
|
+
import { isRecord } from '../../utils/guards.js';
|
|
4
|
+
/**
 * Duck-typed element check: the value must pass `isRecord` and expose a
 * callable `getAttribute` member.
 *
 * @param {unknown} node - Candidate DOM node.
 * @returns {boolean} Whether attribute lookups can be performed on `node`.
 */
function isElement(node) {
    const exposesGetAttribute = isRecord(node) && 'getAttribute' in node;
    return exposesGetAttribute && typeof node.getAttribute === 'function';
}
|
|
9
|
+
/**
 * Decides whether a node is a `<pre>` whose first child is a `<code>` element,
 * and the converter is configured for fenced code blocks.
 *
 * @param {{ nodeName: string, firstChild?: { nodeName: string } | null }} node
 * @param {{ codeBlockStyle: string }} options - Converter options.
 * @returns {boolean}
 */
function isFencedCodeBlock(node, options) {
    if (options.codeBlockStyle !== 'fenced') {
        return false;
    }
    if (node.nodeName !== 'PRE') {
        return false;
    }
    return node.firstChild?.nodeName === 'CODE';
}
|
|
14
|
+
/**
 * Renders a `<pre><code>` pair as a fenced code block.
 *
 * @param {{ firstChild: unknown }} node - The `<pre>` node.
 * @returns {string} The formatted block, or `''` when the first child does
 *   not pass the element check.
 */
function formatFencedCodeBlock(node) {
    const { firstChild } = node;
    if (isElement(firstChild)) {
        // Missing or empty text content is treated as empty source.
        const source = firstChild.textContent || '';
        const language = resolveCodeLanguage(firstChild, source);
        return CODE_BLOCK.format(source, language);
    }
    return '';
}
|
|
22
|
+
/**
 * Chooses the language label for a code block: the element's attributes are
 * consulted first, then detection from the code text, then `''`.
 *
 * @param {{ getAttribute(name: string): string | null }} codeNode
 * @param {string} code - Text content of the code element.
 * @returns {string}
 */
function resolveCodeLanguage(codeNode, code) {
    const attributes = readCodeAttributes(codeNode);
    const fromAttributes = resolveLanguageFromAttributes(attributes.className, attributes.dataLanguage);
    if (fromAttributes != null) {
        return fromAttributes;
    }
    return detectLanguageFromCode(code) ?? '';
}
|
|
27
|
+
/**
 * Reads the `class` and `data-language` attributes of a code element,
 * substituting `''` for attributes that are null or undefined.
 *
 * @param {{ getAttribute(name: string): string | null }} codeNode
 * @returns {{ className: string, dataLanguage: string }}
 */
function readCodeAttributes(codeNode) {
    const className = codeNode.getAttribute('class') ?? '';
    const dataLanguage = codeNode.getAttribute('data-language') ?? '';
    return { className, dataLanguage };
}
|
|
33
|
+
/**
 * Registers the `fencedCodeBlockWithLanguage` rule on a converter instance.
 * The rule matches fenced `<pre><code>` blocks and replaces them with a
 * language-tagged code block.
 *
 * @param {{ addRule(name: string, rule: object): unknown }} instance
 * @returns {void}
 */
export function addFencedCodeRule(instance) {
    const filter = (node, options) => isFencedCodeBlock(node, options);
    const replacement = (_content, node) => formatFencedCodeBlock(node);
    instance.addRule('fencedCodeBlockWithLanguage', { filter, replacement });
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { FRONTMATTER_DELIMITER, joinLines } from '../../config/formatting.js';
|
|
2
|
+
// Characters that force quoting when they appear anywhere in the value.
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
// Values a YAML parser could read as a number: optional sign, digits/dots with
// an optional exponent, hex/octal literals, and the `.inf` / `.nan` specials.
const YAML_NUMERIC = /^[+-]?(?:[\d.]+(?:e[+-]?\d+)?|0x[\da-f]+|0o[0-7]+|\.(?:inf|nan))$/i;
// Values a YAML parser could read as a boolean or null (`~` is the null shorthand).
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off|~)$/i;
// Leading indicators that cannot start a plain scalar: a sequence dash followed
// by whitespace (or standing alone), a directive `%`, and the reserved `@` / backtick.
const YAML_LEADING_INDICATOR = /^(?:-(?:\s|$)|[%@`])/;
const ESCAPE_PATTERNS = {
    backslash: /\\/g,
    quote: /"/g,
    newline: /\n/g,
    carriageReturn: /\r/g,
    tab: /\t/g,
};
const YAML_QUOTE_CHECKS = [
    (input) => YAML_SPECIAL_CHARS.test(input),
    (input) => input.startsWith(' ') || input.endsWith(' '),
    (input) => input === '',
    (input) => YAML_NUMERIC.test(input),
    (input) => YAML_RESERVED_WORDS.test(input),
    (input) => YAML_LEADING_INDICATOR.test(input),
];
/**
 * Reports whether a string must be double-quoted to survive a YAML round trip
 * as the same string.
 *
 * @param {string} value
 * @returns {boolean}
 */
function needsYamlQuotes(value) {
    return YAML_QUOTE_CHECKS.some((check) => check(value));
}
/**
 * Serialises a string as a YAML scalar. Values that are safe as plain scalars
 * are returned unchanged; everything else is wrapped in double quotes with
 * backslashes, quotes, line feeds, carriage returns and tabs escaped.
 *
 * @param {string} value
 * @returns {string}
 */
function escapeYamlValue(value) {
    if (!needsYamlQuotes(value)) {
        return value;
    }
    // Backslashes go first so the escapes added below are not double-escaped.
    const escaped = value
        .replace(ESCAPE_PATTERNS.backslash, '\\\\')
        .replace(ESCAPE_PATTERNS.quote, '\\"')
        .replace(ESCAPE_PATTERNS.newline, '\\n')
        .replace(ESCAPE_PATTERNS.carriageReturn, '\\r')
        .replace(ESCAPE_PATTERNS.tab, '\\t');
    return `"${escaped}"`;
}
|
|
32
|
+
/**
 * Appends a `key: value` line to `lines`, YAML-escaping the value.
 * Falsy values (empty string, null, undefined) add nothing.
 *
 * @param {string[]} lines - Accumulator, mutated in place.
 * @param {string} key
 * @param {string | null | undefined} value
 * @returns {void}
 */
function appendFrontmatterField(lines, key, value) {
    if (value) {
        lines.push(`${key}: ${escapeYamlValue(value)}`);
    }
}
|
|
37
|
+
/**
 * Builds a frontmatter block with `title` and `source` fields taken from
 * `metadata.title` and `metadata.url`.
 *
 * @param {{ title?: string, url?: string } | null | undefined} metadata
 * @returns {string} The delimited block, or `''` when `metadata` is falsy.
 */
export function buildFrontmatter(metadata) {
    if (!metadata) {
        return '';
    }
    const fields = [
        ['title', metadata.title],
        ['source', metadata.url],
    ];
    const lines = [FRONTMATTER_DELIMITER];
    for (const [key, value] of fields) {
        appendFrontmatterField(lines, key, value);
    }
    lines.push(FRONTMATTER_DELIMITER);
    return joinLines(lines);
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { isRecord } from '../../utils/guards.js';
|
|
2
|
+
// Lower-case tag names that are always treated as noise.
const STRUCTURAL_TAGS = new Set([
    // Non-content and embedded resources.
    'script', 'style', 'noscript', 'iframe',
    // Page chrome.
    'nav', 'footer', 'aside', 'header',
    // Interactive controls.
    'form', 'button', 'input', 'select', 'textarea',
]);
// ARIA `role` values that mark an element as noise.
const NAVIGATION_ROLES = new Set([
    'navigation', 'banner', 'complementary', 'contentinfo',
    'tree', 'menubar', 'menu',
]);
// Substrings looked for in the lower-cased "class id" string of an element.
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;
// Whole-word `fixed` / `sticky` class tokens.
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
// Class tokens `z-40` through `z-50`.
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
// Whole-word `isolate` class token.
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
30
|
+
/**
 * Duck-typed element check: the value must pass `isRecord` and expose a
 * callable `getAttribute` member.
 *
 * @param {unknown} node - Candidate DOM node.
 * @returns {boolean} Whether attribute lookups can be performed on `node`.
 */
function isElement(node) {
    const exposesGetAttribute = isRecord(node) && 'getAttribute' in node;
    return exposesGetAttribute && typeof node.getAttribute === 'function';
}
|
|
35
|
+
/**
 * Tells whether a lower-cased tag name is always noise: a member of
 * `STRUCTURAL_TAGS`, or `svg` / `canvas`.
 *
 * @param {string} tagName - Lower-case tag name.
 * @returns {boolean}
 */
function isStructuralNoiseTag(tagName) {
    if (STRUCTURAL_TAGS.has(tagName)) {
        return true;
    }
    return tagName === 'svg' || tagName === 'canvas';
}
|
|
38
|
+
/**
 * Tells whether an element is hidden: it carries a `hidden` attribute (any
 * value), or `aria-hidden` is exactly `"true"`.
 *
 * @param {{ getAttribute(name: string): string | null }} element
 * @returns {boolean}
 */
function isElementHidden(element) {
    if (element.getAttribute('hidden') !== null) {
        return true;
    }
    return element.getAttribute('aria-hidden') === 'true';
}
|
|
42
|
+
/**
 * Tells whether a `role` attribute value is one of `NAVIGATION_ROLES`.
 *
 * @param {string | null} role - Raw attribute value; `null` when absent.
 * @returns {boolean}
 */
function hasNoiseRole(role) {
    if (role === null) {
        return false;
    }
    return NAVIGATION_ROLES.has(role);
}
|
|
45
|
+
/**
 * Tests the lower-cased "class id" string of an element against
 * `PROMO_PATTERN`.
 *
 * @param {string} className
 * @param {string} id
 * @returns {boolean}
 */
function matchesPromoIdOrClass(className, id) {
    const haystack = `${className} ${id}`;
    return PROMO_PATTERN.test(haystack.toLowerCase());
}
|
|
49
|
+
/**
 * True when the class list matches both `HIGH_Z_PATTERN` and
 * `ISOLATE_PATTERN`.
 *
 * @param {string} className
 * @returns {boolean}
 */
function matchesHighZIsolate(className) {
    if (!HIGH_Z_PATTERN.test(className)) {
        return false;
    }
    return ISOLATE_PATTERN.test(className);
}
|
|
52
|
+
/**
 * True when the class list matches `FIXED_PATTERN`, or matches the combined
 * high z-index plus `isolate` check.
 *
 * @param {string} className
 * @returns {boolean}
 */
function matchesFixedOrHighZIsolate(className) {
    if (FIXED_PATTERN.test(className)) {
        return true;
    }
    return matchesHighZIsolate(className);
}
|
|
55
|
+
/**
 * Collects the element properties used by the noise checks.
 *
 * @param {{ tagName: string, getAttribute(name: string): string | null }} element
 * @returns {{ tagName: string, className: string, id: string, role: string | null, isHidden: boolean }}
 *   `tagName` is lower-cased; `className` and `id` default to `''`; `role`
 *   is returned as read from the attribute.
 */
function readElementMetadata(element) {
    const readAttribute = (name) => element.getAttribute(name);
    const tagName = element.tagName.toLowerCase();
    const className = readAttribute('class') ?? '';
    const id = readAttribute('id') ?? '';
    const role = readAttribute('role');
    const isHidden = isElementHidden(element);
    return { tagName, className, id, role, isHidden };
}
|
|
64
|
+
/**
 * Classifies an element as noise when any check fires, in this order:
 * structural tag, hidden, noise role, fixed/sticky or high-z isolate classes,
 * promo-like class or id.
 *
 * @param {{ tagName: string, getAttribute(name: string): string | null }} node
 * @returns {boolean}
 */
function isNoiseElement(node) {
    const { tagName, className, id, role, isHidden } = readElementMetadata(node);
    if (isStructuralNoiseTag(tagName) || isHidden) {
        return true;
    }
    if (hasNoiseRole(role)) {
        return true;
    }
    if (matchesFixedOrHighZIsolate(className)) {
        return true;
    }
    return matchesPromoIdOrClass(className, id);
}
|
|
72
|
+
/**
 * Noise check for arbitrary nodes: values that fail the element check are
 * never noise.
 *
 * @param {unknown} node
 * @returns {boolean}
 */
function isNoiseNode(node) {
    const element = isElement(node);
    return element ? isNoiseElement(node) : element;
}
|
|
75
|
+
/**
 * Registers the `removeNoise` rule on a converter instance. Nodes classified
 * as noise are replaced with an empty string.
 *
 * @param {{ addRule(name: string, rule: object): unknown }} instance
 * @returns {void}
 */
export function addNoiseRule(instance) {
    const filter = (node) => isNoiseNode(node);
    const replacement = () => '';
    instance.addRule('removeNoise', { filter, replacement });
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
|
|
2
|
+
import { addFencedCodeRule } from './fenced-code-rule.js';
|
|
3
|
+
import { addNoiseRule } from './noise-rule.js';
|
|
4
|
+
// Lazily created converter shared by every caller of getTurndown().
let turndownInstance = null;
/**
 * Builds a TurndownService configured for ATX headings, fenced code blocks,
 * `_` emphasis and `-` bullets, then registers the noise rule followed by the
 * fenced-code rule.
 *
 * @returns {TurndownService}
 */
function createTurndownInstance() {
    const service = new TurndownService({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        emDelimiter: '_',
        bulletListMarker: '-',
    });
    // Registration order is kept: noise rule first, fenced-code rule second.
    for (const registerRule of [addNoiseRule, addFencedCodeRule]) {
        registerRule(service);
    }
    return service;
}
/**
 * Returns the shared converter, creating it on first use.
 *
 * @returns {TurndownService}
 */
export function getTurndown() {
    if (turndownInstance === null || turndownInstance === undefined) {
        turndownInstance = createTurndownInstance();
    }
    return turndownInstance;
}
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import { NodeHtmlMarkdown, } from 'node-html-markdown';
|
|
3
|
+
import { CODE_BLOCK, FRONTMATTER_DELIMITER, joinLines, } from '../config/formatting.js';
|
|
4
|
+
import { FetchError } from '../errors/app-error.js';
|
|
5
|
+
import { endTransformStage, startTransformStage, } from '../services/telemetry.js';
|
|
6
|
+
import { throwIfAborted } from '../utils/cancellation.js';
|
|
7
|
+
import { detectLanguageFromCode, resolveLanguageFromAttributes, } from '../utils/code-language.js';
|
|
8
|
+
import { isRecord } from '../utils/guards.js';
|
|
9
|
+
// Characters that force quoting when they appear anywhere in the value.
const YAML_SPECIAL_CHARS = /[:[\]{}"\r\t'|>&*!?,#]|\n/;
// Values a YAML parser could read as a number: optional sign, digits/dots with
// an optional exponent, hex/octal literals, and the `.inf` / `.nan` specials.
const YAML_NUMERIC = /^[+-]?(?:[\d.]+(?:e[+-]?\d+)?|0x[\da-f]+|0o[0-7]+|\.(?:inf|nan))$/i;
// Values a YAML parser could read as a boolean or null (`~` is the null shorthand).
const YAML_RESERVED_WORDS = /^(true|false|null|yes|no|on|off|~)$/i;
// Leading indicators that cannot start a plain scalar: a sequence dash followed
// by whitespace (or standing alone), a directive `%`, and the reserved `@` / backtick.
const YAML_LEADING_INDICATOR = /^(?:-(?:\s|$)|[%@`])/;
const ESCAPE_PATTERNS = {
    backslash: /\\/g,
    quote: /"/g,
    newline: /\n/g,
    carriageReturn: /\r/g,
    tab: /\t/g,
};
const YAML_QUOTE_CHECKS = [
    (input) => YAML_SPECIAL_CHARS.test(input),
    (input) => input.startsWith(' ') || input.endsWith(' '),
    (input) => input === '',
    (input) => YAML_NUMERIC.test(input),
    (input) => YAML_RESERVED_WORDS.test(input),
    (input) => YAML_LEADING_INDICATOR.test(input),
];
/**
 * Reports whether a string must be double-quoted to survive a YAML round trip
 * as the same string.
 *
 * @param {string} value
 * @returns {boolean}
 */
function needsYamlQuotes(value) {
    return YAML_QUOTE_CHECKS.some((check) => check(value));
}
/**
 * Serialises a string as a YAML scalar. Values that are safe as plain scalars
 * are returned unchanged; everything else is wrapped in double quotes with
 * backslashes, quotes, line feeds, carriage returns and tabs escaped.
 *
 * @param {string} value
 * @returns {string}
 */
function escapeYamlValue(value) {
    if (!needsYamlQuotes(value)) {
        return value;
    }
    // Backslashes go first so the escapes added below are not double-escaped.
    const escaped = value
        .replace(ESCAPE_PATTERNS.backslash, '\\\\')
        .replace(ESCAPE_PATTERNS.quote, '\\"')
        .replace(ESCAPE_PATTERNS.newline, '\\n')
        .replace(ESCAPE_PATTERNS.carriageReturn, '\\r')
        .replace(ESCAPE_PATTERNS.tab, '\\t');
    return `"${escaped}"`;
}
|
|
39
|
+
/**
 * Appends a `key: value` line to `lines`, YAML-escaping the value.
 * Falsy values (empty string, null, undefined) add nothing.
 *
 * @param {string[]} lines - Accumulator, mutated in place.
 * @param {string} key
 * @param {string | null | undefined} value
 * @returns {void}
 */
function appendFrontmatterField(lines, key, value) {
    if (value) {
        lines.push(`${key}: ${escapeYamlValue(value)}`);
    }
}
|
|
44
|
+
/**
 * Builds a frontmatter block from page metadata. Fields are written in the
 * order title, source (from `metadata.url`), author, description, fetchedAt.
 *
 * @param {{ title?: string, url?: string, author?: string, description?: string, fetchedAt?: string } | null | undefined} metadata
 * @returns {string} The delimited block, or `''` when `metadata` is falsy.
 */
function buildFrontmatter(metadata) {
    if (!metadata) {
        return '';
    }
    const fields = [
        ['title', metadata.title],
        ['source', metadata.url],
        ['author', metadata.author],
        ['description', metadata.description],
        ['fetchedAt', metadata.fetchedAt],
    ];
    const lines = [FRONTMATTER_DELIMITER];
    for (const [key, value] of fields) {
        appendFrontmatterField(lines, key, value);
    }
    lines.push(FRONTMATTER_DELIMITER);
    return joinLines(lines);
}
|
|
56
|
+
/**
 * Duck-typed element check: the value must pass `isRecord` and expose a
 * callable `getAttribute` member.
 *
 * @param {unknown} node - Candidate DOM node.
 * @returns {boolean} Whether attribute lookups can be performed on `node`.
 */
function isElement(node) {
    const exposesGetAttribute = isRecord(node) && 'getAttribute' in node;
    return exposesGetAttribute && typeof node.getAttribute === 'function';
}
|
|
61
|
+
// Lower-case tag names that are always treated as noise.
const STRUCTURAL_TAGS = new Set([
    // Non-content and embedded resources.
    'script', 'style', 'noscript', 'iframe',
    // Page chrome.
    'nav', 'footer', 'aside', 'header',
    // Interactive controls.
    'form', 'button', 'input', 'select', 'textarea',
]);
// ARIA `role` values that mark an element as noise.
const NAVIGATION_ROLES = new Set([
    'navigation', 'banner', 'complementary', 'contentinfo',
    'tree', 'menubar', 'menu',
]);
// Substrings looked for in the lower-cased "class id" string of an element.
const PROMO_PATTERN = /banner|promo|announcement|cta|callout|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast/;
// Whole-word `fixed` / `sticky` class tokens.
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
// Class tokens `z-40` through `z-50`.
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
// Whole-word `isolate` class token.
const ISOLATE_PATTERN = /\bisolate\b/;
// An opening doctype, <html>, <head> or <body> tag, case-insensitive.
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
90
|
+
// Lower-case substrings whose presence means the HTML may contain an element
// that the noise checks would remove. This is a cheap pre-filter only: a false
// positive costs one parse, while a false negative skips noise removal.
const NOISE_MARKERS = [
    // Opening tags that are always noise.
    '<script',
    '<style',
    '<noscript',
    '<iframe',
    '<nav',
    '<footer',
    '<aside',
    '<header',
    '<form',
    '<button',
    '<input',
    '<select',
    '<textarea',
    '<svg',
    '<canvas',
    // Hidden elements.
    ' aria-hidden="true"',
    " aria-hidden='true'",
    ' hidden',
    // Noise roles, with either quote style.
    ' role="navigation"',
    " role='navigation'",
    ' role="banner"',
    " role='banner'",
    ' role="complementary"',
    " role='complementary'",
    ' role="contentinfo"',
    " role='contentinfo'",
    ' role="tree"',
    " role='tree'",
    ' role="menubar"',
    " role='menubar'",
    ' role="menu"',
    " role='menu'",
    // Promo-like class/id fragments. These are bare substrings (no leading
    // space) so the first token of an attribute (`class="banner"`) and
    // hyphenated names (`cookie-banner`) are detected, mirroring the
    // substring matching applied to class/id by the element-level check.
    'banner',
    'promo',
    'announcement',
    'cta',
    'callout',
    'advert',
    'newsletter',
    'subscribe',
    'cookie',
    'consent',
    'popup',
    'modal',
    'overlay',
    'toast',
    // Layout class tokens; bare so that `class="fixed ..."` is detected.
    'fixed',
    'sticky',
    'z-50',
    'z-4',
    'isolate',
];
/**
 * Cheap pre-filter: reports whether the HTML contains any noise marker.
 * Matching is case-insensitive because the input is lower-cased first.
 *
 * @param {string} html
 * @returns {boolean}
 */
function mayContainNoise(html) {
    const haystack = html.toLowerCase();
    return NOISE_MARKERS.some((marker) => haystack.includes(marker));
}
|
|
147
|
+
/**
 * Tells whether the markup matches `HTML_DOCUMENT_MARKERS`.
 *
 * @param {string} html
 * @returns {boolean}
 */
function isFullDocumentHtml(html) {
    const looksLikeDocument = HTML_DOCUMENT_MARKERS.test(html);
    return looksLikeDocument;
}
|
|
150
|
+
/**
 * Tells whether a lower-cased tag name is always noise: a member of
 * `STRUCTURAL_TAGS`, or `svg` / `canvas`.
 *
 * @param {string} tagName - Lower-case tag name.
 * @returns {boolean}
 */
function isStructuralNoiseTag(tagName) {
    if (STRUCTURAL_TAGS.has(tagName)) {
        return true;
    }
    return tagName === 'svg' || tagName === 'canvas';
}
|
|
153
|
+
/**
 * Tells whether an element is hidden: it carries a `hidden` attribute (any
 * value), or `aria-hidden` is exactly `"true"`.
 *
 * @param {{ getAttribute(name: string): string | null }} element
 * @returns {boolean}
 */
function isElementHidden(element) {
    if (element.getAttribute('hidden') !== null) {
        return true;
    }
    return element.getAttribute('aria-hidden') === 'true';
}
|
|
157
|
+
/**
 * Tells whether a `role` attribute value is one of `NAVIGATION_ROLES`.
 *
 * @param {string | null} role - Raw attribute value; `null` when absent.
 * @returns {boolean}
 */
function hasNoiseRole(role) {
    if (role === null) {
        return false;
    }
    return NAVIGATION_ROLES.has(role);
}
|
|
160
|
+
/**
 * Tests the lower-cased "class id" string of an element against
 * `PROMO_PATTERN`.
 *
 * @param {string} className
 * @param {string} id
 * @returns {boolean}
 */
function matchesPromoIdOrClass(className, id) {
    const haystack = `${className} ${id}`;
    return PROMO_PATTERN.test(haystack.toLowerCase());
}
|
|
164
|
+
/**
 * True when the class list matches both `HIGH_Z_PATTERN` and
 * `ISOLATE_PATTERN`.
 *
 * @param {string} className
 * @returns {boolean}
 */
function matchesHighZIsolate(className) {
    if (!HIGH_Z_PATTERN.test(className)) {
        return false;
    }
    return ISOLATE_PATTERN.test(className);
}
|
|
167
|
+
/**
 * True when the class list matches `FIXED_PATTERN`, or matches the combined
 * high z-index plus `isolate` check.
 *
 * @param {string} className
 * @returns {boolean}
 */
function matchesFixedOrHighZIsolate(className) {
    if (FIXED_PATTERN.test(className)) {
        return true;
    }
    return matchesHighZIsolate(className);
}
|
|
170
|
+
/**
 * Collects the element properties used by the noise checks.
 *
 * @param {{ tagName: string, getAttribute(name: string): string | null }} element
 * @returns {{ tagName: string, className: string, id: string, role: string | null, isHidden: boolean }}
 *   `tagName` is lower-cased; `className` and `id` default to `''`; `role`
 *   is returned as read from the attribute.
 */
function readElementMetadata(element) {
    const readAttribute = (name) => element.getAttribute(name);
    const tagName = element.tagName.toLowerCase();
    const className = readAttribute('class') ?? '';
    const id = readAttribute('id') ?? '';
    const role = readAttribute('role');
    const isHidden = isElementHidden(element);
    return { tagName, className, id, role, isHidden };
}
|
|
179
|
+
/**
 * Classifies an element as noise when any check fires, in this order:
 * structural tag, hidden, noise role, fixed/sticky or high-z isolate classes,
 * promo-like class or id.
 *
 * @param {{ tagName: string, getAttribute(name: string): string | null }} node
 * @returns {boolean}
 */
function isNoiseElement(node) {
    const { tagName, className, id, role, isHidden } = readElementMetadata(node);
    if (isStructuralNoiseTag(tagName) || isHidden) {
        return true;
    }
    if (hasNoiseRole(role)) {
        return true;
    }
    if (matchesFixedOrHighZIsolate(className)) {
        return true;
    }
    return matchesPromoIdOrClass(className, id);
}
|
|
187
|
+
/**
 * Strips noise elements from an HTML string.
 *
 * The input is returned untouched when it neither looks like a full document
 * nor contains any noise marker. Otherwise it is parsed; noise elements are
 * removed only when a marker was found, and the serialised result is
 * returned. Any error while parsing or serialising falls back to the
 * original input.
 *
 * @param {string} html
 * @returns {string}
 */
function removeNoiseFromHtml(html) {
    // The marker scan lower-cases and searches the whole input, so run it
    // once and reuse the answer for both decisions below.
    const hasNoiseMarkers = mayContainNoise(html);
    if (!hasNoiseMarkers && !isFullDocumentHtml(html)) {
        return html;
    }
    try {
        const { document } = parseHTML(html);
        if (hasNoiseMarkers) {
            const nodes = Array.from(document.querySelectorAll('*'));
            // Walk the snapshot backwards, i.e. in reverse of the order
            // returned by querySelectorAll.
            for (let index = nodes.length - 1; index >= 0; index -= 1) {
                const node = nodes[index];
                if (!node) {
                    continue;
                }
                if (isElement(node) && isNoiseElement(node)) {
                    node.remove();
                }
            }
        }
        // Serialisation preference: body contents, then the whole document,
        // then the root element, then the untouched input.
        const { body } = document;
        if (body?.innerHTML) {
            return body.innerHTML;
        }
        if (typeof document.toString === 'function') {
            return document.toString();
        }
        const { documentElement } = document;
        if (documentElement?.outerHTML) {
            return documentElement.outerHTML;
        }
        return html;
    }
    catch {
        // Best effort: a parse or serialisation failure leaves the input as-is.
        return html;
    }
}
|
|
221
|
+
/**
 * Wraps text in an inline-code span. The fence is one backtick longer than
 * the longest backtick run inside the content; when the content contains
 * backticks, a single space pads both sides.
 *
 * @param {string} content
 * @returns {string}
 */
function buildInlineCode(content) {
    const backtickRuns = content.match(/`+/g) ?? [];
    let longestRun = 0;
    for (const run of backtickRuns) {
        if (run.length > longestRun) {
            longestRun = run.length;
        }
    }
    const fence = '`'.repeat(longestRun + 1);
    const pad = longestRun > 0 ? ' ' : '';
    return `${fence}${pad}${content}${pad}${fence}`;
}
|
|
228
|
+
/**
 * Tells whether a parent node is a code-block container, i.e. its tag name
 * is `PRE` or `WRAPPED-PRE` (compared case-insensitively).
 *
 * @param {unknown} parent
 * @returns {boolean}
 */
function isCodeBlock(parent) {
    if (!isRecord(parent)) {
        return false;
    }
    const { tagName } = parent;
    if (typeof tagName !== 'string') {
        return false;
    }
    const upperTag = tagName.toUpperCase();
    return upperTag === 'PRE' || upperTag === 'WRAPPED-PRE';
}
|
|
234
|
+
/**
 * Creates the custom translator table passed to the markdown converter. It
 * contains a single `code` translator that renders inline code spans, or
 * fenced, language-tagged blocks when the parent is a code-block container.
 *
 * @returns {{ code: (ctx: unknown) => object }}
 */
function createCodeTranslator() {
    // Config for `<code>` outside a code block (or when no usable context is
    // supplied): emit an inline code span.
    const inlineCodeConfig = () => ({
        spaceIfRepeatingChar: true,
        noEscape: true,
        postprocess: ({ content }) => buildInlineCode(content),
    });
    const translateCode = (ctx) => {
        if (!isRecord(ctx)) {
            return inlineCodeConfig();
        }
        const { node, parent, visitor } = ctx;
        if (!isCodeBlock(parent)) {
            return inlineCodeConfig();
        }
        const canReadAttributes = isRecord(node) && typeof node.getAttribute === 'function';
        const readAttribute = (name) => (canReadAttributes ? node.getAttribute(name) : undefined) ?? '';
        const attributeLanguage = resolveLanguageFromAttributes(readAttribute('class'), readAttribute('data-language'));
        // Reuse the converter's code-block translators for children when the
        // visitor exposes them.
        const converter = isRecord(visitor) ? visitor.instance : null;
        const codeBlockTranslators = isRecord(converter) && isRecord(converter.codeBlockTranslators)
            ? converter.codeBlockTranslators
            : null;
        const blockConfig = {
            noEscape: true,
            preserveWhitespace: true,
        };
        if (codeBlockTranslators) {
            blockConfig.childTranslators = codeBlockTranslators;
        }
        blockConfig.postprocess = ({ content }) => {
            const language = attributeLanguage ?? detectLanguageFromCode(content) ?? '';
            return CODE_BLOCK.format(content, language);
        };
        return blockConfig;
    };
    return { code: translateCode };
}
|
|
278
|
+
// Lazily created converter shared by every call to getMarkdownConverter().
let markdownInstance = null;
/**
 * Builds a NodeHtmlMarkdown converter with the project's code fence, fenced
 * code blocks, `_` emphasis, `-` bullets and the custom code translator.
 *
 * @returns {NodeHtmlMarkdown}
 */
function createMarkdownInstance() {
    const converterOptions = {
        codeFence: CODE_BLOCK.fence,
        codeBlockStyle: 'fenced',
        emDelimiter: '_',
        bulletMarker: '-',
    };
    const customTranslators = createCodeTranslator();
    return new NodeHtmlMarkdown(converterOptions, customTranslators);
}
/**
 * Returns the shared converter, creating it on first use.
 *
 * @returns {NodeHtmlMarkdown}
 */
function getMarkdownConverter() {
    if (markdownInstance === null || markdownInstance === undefined) {
        markdownInstance = createMarkdownInstance();
    }
    return markdownInstance;
}
|
|
291
|
+
/**
 * Runs `task` inside a telemetry stage, ending the stage even when the task
 * throws.
 *
 * @template T
 * @param {string} url - URL the stage is attributed to.
 * @param {string} stage - Stage name.
 * @param {() => T} task
 * @returns {T}
 */
function runMarkdownStage(url, stage, task) {
    const context = startTransformStage(url, stage);
    try {
        return task();
    }
    finally {
        endTransformStage(context);
    }
}
/**
 * Converts HTML to Markdown, prefixed with frontmatter built from `metadata`.
 *
 * Noise is stripped first, then the cleaned HTML is translated. The abort
 * signal is checked before, between and after the two stages.
 *
 * @param {string} html - Source HTML; falsy input yields the frontmatter only.
 * @param {object} [metadata] - Page metadata used for the frontmatter and as
 *   the fallback source of the URL.
 * @param {{ url?: string, signal?: AbortSignal }} [options]
 * @returns {string} Frontmatter and content joined by a newline, or the
 *   content alone when there is no frontmatter.
 * @throws {FetchError} Rethrown unchanged when raised during conversion. Any
 *   other error is swallowed and the frontmatter alone is returned.
 */
export function htmlToMarkdown(html, metadata, options) {
    const url = options?.url ?? metadata?.url ?? '';
    const frontmatter = buildFrontmatter(metadata);
    if (!html) {
        return frontmatter;
    }
    try {
        throwIfAborted(options?.signal, url, 'markdown:begin');
        const cleanedHtml = runMarkdownStage(url, 'markdown:noise', () => removeNoiseFromHtml(html));
        throwIfAborted(options?.signal, url, 'markdown:cleaned');
        const content = runMarkdownStage(url, 'markdown:translate', () => getMarkdownConverter().translate(cleanedHtml).trim());
        throwIfAborted(options?.signal, url, 'markdown:translated');
        return frontmatter ? `${frontmatter}\n${content}` : content;
    }
    catch (error) {
        if (error instanceof FetchError) {
            throw error;
        }
        // Deliberate best effort: a conversion failure degrades to the
        // frontmatter alone instead of failing the whole request.
        return frontmatter;
    }
}
|