@j0hanz/fetch-url-mcp 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +2 -3
- package/dist/cli.js +1 -2
- package/dist/http/auth.d.ts +5 -3
- package/dist/http/auth.js +64 -15
- package/dist/http/health.d.ts +1 -2
- package/dist/http/health.js +7 -18
- package/dist/http/helpers.d.ts +3 -4
- package/dist/http/helpers.js +21 -21
- package/dist/http/native.d.ts +0 -1
- package/dist/http/native.js +34 -26
- package/dist/http/rate-limit.d.ts +0 -1
- package/dist/http/rate-limit.js +3 -4
- package/dist/index.d.ts +0 -1
- package/dist/index.js +17 -18
- package/dist/lib/{markdown-cleanup.d.ts → content.d.ts} +4 -2
- package/dist/lib/content.js +1356 -0
- package/dist/lib/core.d.ts +253 -0
- package/dist/lib/core.js +1228 -0
- package/dist/lib/{tool-pipeline.d.ts → fetch-pipeline.d.ts} +1 -2
- package/dist/lib/{tool-pipeline.js → fetch-pipeline.js} +10 -19
- package/dist/lib/{fetch.d.ts → http.d.ts} +7 -9
- package/dist/lib/{fetch.js → http.js} +706 -944
- package/dist/lib/mcp-tools.d.ts +28 -0
- package/dist/lib/mcp-tools.js +107 -0
- package/dist/lib/{tool-progress.d.ts → progress.d.ts} +0 -1
- package/dist/lib/{tool-progress.js → progress.js} +8 -13
- package/dist/lib/task-handlers.d.ts +5 -0
- package/dist/lib/{mcp.js → task-handlers.js} +56 -12
- package/dist/lib/url.d.ts +70 -0
- package/dist/lib/url.js +686 -0
- package/dist/lib/utils.d.ts +58 -0
- package/dist/lib/utils.js +304 -0
- package/dist/prompts/index.d.ts +0 -1
- package/dist/prompts/index.js +0 -1
- package/dist/resources/index.d.ts +0 -1
- package/dist/resources/index.js +74 -33
- package/dist/resources/instructions.d.ts +0 -1
- package/dist/resources/instructions.js +2 -2
- package/dist/schemas/inputs.d.ts +0 -1
- package/dist/schemas/inputs.js +2 -3
- package/dist/schemas/outputs.d.ts +0 -1
- package/dist/schemas/outputs.js +1 -2
- package/dist/server.d.ts +0 -1
- package/dist/server.js +16 -26
- package/dist/tasks/execution.d.ts +0 -1
- package/dist/tasks/execution.js +27 -24
- package/dist/tasks/manager.d.ts +7 -3
- package/dist/tasks/manager.js +53 -34
- package/dist/tasks/owner.d.ts +1 -2
- package/dist/tasks/owner.js +1 -2
- package/dist/tasks/tool-registry.d.ts +1 -2
- package/dist/tasks/tool-registry.js +0 -1
- package/dist/tools/fetch-url.d.ts +1 -2
- package/dist/tools/fetch-url.js +39 -31
- package/dist/tools/index.d.ts +0 -1
- package/dist/tools/index.js +0 -1
- package/dist/transform/html-translators.d.ts +1 -0
- package/dist/transform/html-translators.js +454 -0
- package/dist/transform/metadata.d.ts +4 -0
- package/dist/transform/metadata.js +183 -0
- package/dist/transform/transform.d.ts +0 -1
- package/dist/transform/transform.js +24 -641
- package/dist/transform/types.d.ts +9 -11
- package/dist/transform/types.js +0 -1
- package/dist/transform/worker-pool.d.ts +0 -1
- package/dist/transform/worker-pool.js +7 -16
- package/dist/transform/workers/shared.d.ts +0 -1
- package/dist/transform/workers/shared.js +1 -2
- package/dist/transform/workers/transform-child.d.ts +0 -1
- package/dist/transform/workers/transform-child.js +0 -1
- package/dist/transform/workers/transform-worker.d.ts +0 -1
- package/dist/transform/workers/transform-worker.js +0 -1
- package/package.json +6 -3
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js.map +0 -1
- package/dist/http/auth.d.ts.map +0 -1
- package/dist/http/auth.js.map +0 -1
- package/dist/http/health.d.ts.map +0 -1
- package/dist/http/health.js.map +0 -1
- package/dist/http/helpers.d.ts.map +0 -1
- package/dist/http/helpers.js.map +0 -1
- package/dist/http/native.d.ts.map +0 -1
- package/dist/http/native.js.map +0 -1
- package/dist/http/rate-limit.d.ts.map +0 -1
- package/dist/http/rate-limit.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/lib/cache.d.ts +0 -54
- package/dist/lib/cache.d.ts.map +0 -1
- package/dist/lib/cache.js +0 -264
- package/dist/lib/cache.js.map +0 -1
- package/dist/lib/config.d.ts +0 -143
- package/dist/lib/config.d.ts.map +0 -1
- package/dist/lib/config.js +0 -476
- package/dist/lib/config.js.map +0 -1
- package/dist/lib/crypto.d.ts +0 -4
- package/dist/lib/crypto.d.ts.map +0 -1
- package/dist/lib/crypto.js +0 -56
- package/dist/lib/crypto.js.map +0 -1
- package/dist/lib/dom-noise-removal.d.ts +0 -2
- package/dist/lib/dom-noise-removal.d.ts.map +0 -1
- package/dist/lib/dom-noise-removal.js +0 -494
- package/dist/lib/dom-noise-removal.js.map +0 -1
- package/dist/lib/download.d.ts +0 -4
- package/dist/lib/download.d.ts.map +0 -1
- package/dist/lib/download.js +0 -106
- package/dist/lib/download.js.map +0 -1
- package/dist/lib/errors.d.ts +0 -14
- package/dist/lib/errors.d.ts.map +0 -1
- package/dist/lib/errors.js +0 -72
- package/dist/lib/errors.js.map +0 -1
- package/dist/lib/fetch-content.d.ts +0 -5
- package/dist/lib/fetch-content.d.ts.map +0 -1
- package/dist/lib/fetch-content.js +0 -164
- package/dist/lib/fetch-content.js.map +0 -1
- package/dist/lib/fetch-stream.d.ts +0 -5
- package/dist/lib/fetch-stream.d.ts.map +0 -1
- package/dist/lib/fetch-stream.js +0 -29
- package/dist/lib/fetch-stream.js.map +0 -1
- package/dist/lib/fetch.d.ts.map +0 -1
- package/dist/lib/fetch.js.map +0 -1
- package/dist/lib/host-normalization.d.ts +0 -2
- package/dist/lib/host-normalization.d.ts.map +0 -1
- package/dist/lib/host-normalization.js +0 -91
- package/dist/lib/host-normalization.js.map +0 -1
- package/dist/lib/ip-blocklist.d.ts +0 -9
- package/dist/lib/ip-blocklist.d.ts.map +0 -1
- package/dist/lib/ip-blocklist.js +0 -79
- package/dist/lib/ip-blocklist.js.map +0 -1
- package/dist/lib/json.d.ts +0 -2
- package/dist/lib/json.d.ts.map +0 -1
- package/dist/lib/json.js +0 -45
- package/dist/lib/json.js.map +0 -1
- package/dist/lib/language-detection.d.ts +0 -3
- package/dist/lib/language-detection.d.ts.map +0 -1
- package/dist/lib/language-detection.js +0 -355
- package/dist/lib/language-detection.js.map +0 -1
- package/dist/lib/markdown-cleanup.d.ts.map +0 -1
- package/dist/lib/markdown-cleanup.js +0 -532
- package/dist/lib/markdown-cleanup.js.map +0 -1
- package/dist/lib/mcp-lifecycle.d.ts +0 -5
- package/dist/lib/mcp-lifecycle.d.ts.map +0 -1
- package/dist/lib/mcp-lifecycle.js +0 -51
- package/dist/lib/mcp-lifecycle.js.map +0 -1
- package/dist/lib/mcp-validator.d.ts +0 -17
- package/dist/lib/mcp-validator.d.ts.map +0 -1
- package/dist/lib/mcp-validator.js +0 -45
- package/dist/lib/mcp-validator.js.map +0 -1
- package/dist/lib/mcp.d.ts +0 -4
- package/dist/lib/mcp.d.ts.map +0 -1
- package/dist/lib/mcp.js.map +0 -1
- package/dist/lib/observability.d.ts +0 -23
- package/dist/lib/observability.d.ts.map +0 -1
- package/dist/lib/observability.js +0 -238
- package/dist/lib/observability.js.map +0 -1
- package/dist/lib/server-tuning.d.ts +0 -15
- package/dist/lib/server-tuning.d.ts.map +0 -1
- package/dist/lib/server-tuning.js +0 -49
- package/dist/lib/server-tuning.js.map +0 -1
- package/dist/lib/session.d.ts +0 -45
- package/dist/lib/session.d.ts.map +0 -1
- package/dist/lib/session.js +0 -263
- package/dist/lib/session.js.map +0 -1
- package/dist/lib/timer-utils.d.ts +0 -13
- package/dist/lib/timer-utils.d.ts.map +0 -1
- package/dist/lib/timer-utils.js +0 -44
- package/dist/lib/timer-utils.js.map +0 -1
- package/dist/lib/tool-errors.d.ts +0 -12
- package/dist/lib/tool-errors.d.ts.map +0 -1
- package/dist/lib/tool-errors.js +0 -55
- package/dist/lib/tool-errors.js.map +0 -1
- package/dist/lib/tool-pipeline.d.ts.map +0 -1
- package/dist/lib/tool-pipeline.js.map +0 -1
- package/dist/lib/tool-progress.d.ts.map +0 -1
- package/dist/lib/tool-progress.js.map +0 -1
- package/dist/lib/type-guards.d.ts +0 -16
- package/dist/lib/type-guards.d.ts.map +0 -1
- package/dist/lib/type-guards.js +0 -13
- package/dist/lib/type-guards.js.map +0 -1
- package/dist/prompts/index.d.ts.map +0 -1
- package/dist/prompts/index.js.map +0 -1
- package/dist/resources/index.d.ts.map +0 -1
- package/dist/resources/index.js.map +0 -1
- package/dist/resources/instructions.d.ts.map +0 -1
- package/dist/resources/instructions.js.map +0 -1
- package/dist/schemas/inputs.d.ts.map +0 -1
- package/dist/schemas/inputs.js.map +0 -1
- package/dist/schemas/outputs.d.ts.map +0 -1
- package/dist/schemas/outputs.js.map +0 -1
- package/dist/server.d.ts.map +0 -1
- package/dist/server.js.map +0 -1
- package/dist/tasks/execution.d.ts.map +0 -1
- package/dist/tasks/execution.js.map +0 -1
- package/dist/tasks/manager.d.ts.map +0 -1
- package/dist/tasks/manager.js.map +0 -1
- package/dist/tasks/owner.d.ts.map +0 -1
- package/dist/tasks/owner.js.map +0 -1
- package/dist/tasks/tool-registry.d.ts.map +0 -1
- package/dist/tasks/tool-registry.js.map +0 -1
- package/dist/tools/fetch-url.d.ts.map +0 -1
- package/dist/tools/fetch-url.js.map +0 -1
- package/dist/tools/index.d.ts.map +0 -1
- package/dist/tools/index.js.map +0 -1
- package/dist/transform/transform.d.ts.map +0 -1
- package/dist/transform/transform.js.map +0 -1
- package/dist/transform/types.d.ts.map +0 -1
- package/dist/transform/types.js.map +0 -1
- package/dist/transform/worker-pool.d.ts.map +0 -1
- package/dist/transform/worker-pool.js.map +0 -1
- package/dist/transform/workers/shared.d.ts.map +0 -1
- package/dist/transform/workers/shared.js.map +0 -1
- package/dist/transform/workers/transform-child.d.ts.map +0 -1
- package/dist/transform/workers/transform-child.js.map +0 -1
- package/dist/transform/workers/transform-worker.d.ts.map +0 -1
- package/dist/transform/workers/transform-worker.js.map +0 -1
|
@@ -0,0 +1,1356 @@
|
|
|
1
|
+
import { parseHTML } from 'linkedom';
|
|
2
|
+
import {} from '../transform/types.js';
|
|
3
|
+
import { config, logDebug } from './core.js';
|
|
4
|
+
import { throwIfAborted } from './utils.js';
|
|
5
|
+
/** Number of characters sampled from the head and from the tail of large HTML by `mayContainNoise`. */
const NOISE_SCAN_LIMIT = 50_000;
/** Serialized markup must be longer than this (after trimming) for `serialize` to return it. */
const MIN_BODY_CONTENT_LENGTH = 100;
/** Dialogs whose text is longer than this are preserved by `shouldPreserve`. */
const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
/** `<nav>`/`<footer>` elements with at least this much trimmed text are preserved. */
const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
/** Matches markup that opens a document-level tag (doctype, html, head, body). */
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
/** Matches markup that opens a common block-level container tag. */
const HTML_FRAGMENT_MARKERS = /<\s*(?:article|main|section|div|nav|footer|header|aside|table|ul|ol)\b/i;
/**
 * Cheap textual pre-scan patterns; if any matches, the HTML is parsed and
 * cleaned. They only need to be a superset of what the DOM-level scoring
 * can act on.
 */
const NOISE_PATTERNS = [
  /<\s*(?:script|style|noscript|iframe|nav|footer|header|form|button|input|select|textarea|svg|canvas)\b/i,
  /[\s"']role\s*=\s*['"]?(?:navigation|banner|complementary|contentinfo|tree|menubar|menu)['"]?/i,
  /[\s"'](?:aria-hidden\s*=\s*['"]?true['"]?|hidden)/i,
  /[\s"'](?:banner|promo|announcement|cta|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast)\b/i,
  // `z-4\d?` (rather than a bare `z-4`) so that z-40..z-49 are seen here too:
  // FIXED_OR_HIGH_Z_PATTERN scores those classes, and `z-4\b` could never
  // match them because no word boundary sits between the two digits.
  /[\s"'](?:fixed|sticky|z-50|z-4\d?|isolate|breadcrumb|pagination)\b/i,
];
/** Class/id fragments that mark a `<header>` as site chrome. */
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
/** Utility classes for fixed/sticky positioning or a z-index of 40-50 (case-sensitive). */
const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50)|isolate)\b/;
|
|
20
|
+
/** URL prefixes (compared lowercased) that `processUrlElement` leaves unresolved. */
const SKIP_URL_PREFIXES = ['#', 'java' + 'script:', 'mailto:', 'tel:', 'data:', 'blob:'];

/** Tags that always count as structural noise candidates. */
const BASE_STRUCTURAL_TAGS = new Set([
  'script', 'style', 'noscript', 'iframe',
  'form', 'button', 'input', 'select', 'textarea',
]);

/** Tags scored as noise whenever the nav-footer category is enabled. */
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer']);

/** ARIA roles scored as navigation/chrome by `scoreNavFooter`. */
const NAVIGATION_ROLES = new Set([
  'navigation', 'banner', 'complementary', 'contentinfo', 'tree',
  'menubar', 'menu', 'dialog', 'alertdialog', 'search',
]);

/** ARIA roles that make `isInteractive` treat an element as an interactive widget. */
const INTERACTIVE_CONTENT_ROLES = new Set([
  'tabpanel', 'tab', 'tablist', 'dialog', 'alertdialog', 'menu',
  'menuitem', 'option', 'listbox', 'combobox', 'tooltip', 'alert',
]);

/** Promo tokens that are always part of the base matcher. */
const PROMO_TOKENS_ALWAYS = [
  'banner', 'promo', 'announcement', 'cta', 'advert', 'ads',
  'sponsor', 'recommend', 'breadcrumb', 'pagination', 'pager', 'taglist',
];

/** Promo tokens added only when `aggressiveMode` is on. */
const PROMO_TOKENS_AGGRESSIVE = ['ad', 'related', 'comment'];

/** Promo tokens contributed by each optional noise category. */
const PROMO_TOKENS_BY_CATEGORY = {
  'cookie-banners': ['cookie', 'consent', 'popup', 'modal', 'overlay', 'toast'],
  newsletters: ['newsletter', 'subscribe'],
  'social-share': ['share', 'social'],
};

/** CSS selectors used for the unconditional/base removal pass. */
const BASE_NOISE_SELECTORS = {
  navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
  cookieBanners: '[role="dialog"]',
  hidden: '[style*="display: none"],[style*="display:none"],[hidden],[aria-hidden="true"]',
};

/** A regex that can never match; returned by `buildTokenRegex` for an empty token set. */
const NO_MATCH_REGEX = /a^/i;

// Memoized result of `getContext`, plus the config object it was built from.
let cachedContext;
let lastConfigRef;
|
|
94
|
+
/**
 * Escapes every regex metacharacter in `value` so it can be embedded in a
 * RegExp source and match literally.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text with each metacharacter prefixed by a backslash.
 */
function escapeRegexLiteral(value) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metaCharacters, (meta) => `\\${meta}`);
}
|
|
97
|
+
/**
 * Builds a case-insensitive regex matching any of `tokens` when the token is
 * delimited by the string edges or by a character other than a-z / 0-9.
 *
 * @param {Set<string>} tokens - Tokens to match literally.
 * @returns {RegExp} The combined matcher, or `NO_MATCH_REGEX` for an empty set.
 */
function buildTokenRegex(tokens) {
  if (tokens.size === 0) {
    return NO_MATCH_REGEX;
  }
  const alternatives = Array.from(tokens, (token) => escapeRegexLiteral(token)).join('|');
  return new RegExp(`(?:^|[^a-z0-9])(?:${alternatives})(?:$|[^a-z0-9])`, 'i');
}
|
|
102
|
+
/**
 * Adds every entry of `tokens` to the `target` set (mutates `target`).
 *
 * @param {Set<string>} target - Set receiving the tokens.
 * @param {Iterable<string>} tokens - Tokens to add.
 */
function addTokens(target, tokens) {
  for (const entry of tokens) {
    target.add(entry);
  }
}
|
|
106
|
+
/**
 * Compiles the promo-token matchers for the current configuration.
 *
 * The base matcher covers the always-on tokens, tokens of each enabled
 * category, and any non-empty (lowercased, trimmed) `extraTokens`. The
 * aggressive matcher covers `PROMO_TOKENS_AGGRESSIVE` only when
 * `aggressiveMode` is set; otherwise it never matches.
 *
 * @param {object} currentConfig - Reads `aggressiveMode` and `extraTokens`.
 * @param {object} flags - Reads `cookieBanners`, `newsletters`, `socialShare`.
 * @returns {{ base: RegExp, aggressive: RegExp }} The compiled matchers.
 */
function getPromoMatchers(currentConfig, flags) {
  const base = new Set(PROMO_TOKENS_ALWAYS);
  const aggressive = new Set(currentConfig.aggressiveMode ? PROMO_TOKENS_AGGRESSIVE : []);

  const categoryToggles = [
    [flags.cookieBanners, 'cookie-banners'],
    [flags.newsletters, 'newsletters'],
    [flags.socialShare, 'social-share'],
  ];
  for (const [isOn, category] of categoryToggles) {
    if (isOn) {
      addTokens(base, PROMO_TOKENS_BY_CATEGORY[category]);
    }
  }

  for (const rawToken of currentConfig.extraTokens) {
    const normalized = rawToken.toLowerCase().trim();
    if (normalized) {
      base.add(normalized);
    }
  }

  return {
    base: buildTokenRegex(base),
    aggressive: buildTokenRegex(aggressive),
  };
}
|
|
131
|
+
/**
 * Returns the noise-removal context derived from `config.noiseRemoval`.
 *
 * The result is memoized in `cachedContext` and rebuilt only when
 * `config.noiseRemoval` is a different object than the one last seen.
 *
 * @returns {object} Context with `flags`, `structuralTags`, `weights`,
 *   `promoMatchers`, `promoEnabled`, `extraSelectors`, `baseSelector` and
 *   `candidateSelector`.
 */
function getContext() {
  const noiseConfig = config.noiseRemoval;
  const isCacheFresh = cachedContext !== undefined && lastConfigRef === noiseConfig;
  if (isCacheFresh) {
    return cachedContext;
  }

  // Normalize the configured category names (lowercase, trimmed, non-empty).
  const enabledCategories = new Set();
  for (const rawCategory of noiseConfig.enabledCategories) {
    const lowered = rawCategory.toLowerCase().trim();
    const { locale } = config.i18n;
    const normalized = locale ? lowered.toLocaleLowerCase(locale) : lowered;
    if (normalized) {
      enabledCategories.add(normalized);
    }
  }

  const flags = {
    navFooter: enabledCategories.has('nav-footer'),
    cookieBanners: enabledCategories.has('cookie-banners'),
    newsletters: enabledCategories.has('newsletters'),
    socialShare: enabledCategories.has('social-share'),
  };

  // svg/canvas are structural noise unless the config asks to keep them.
  const structuralTags = new Set(
    noiseConfig.preserveSvgCanvas
      ? BASE_STRUCTURAL_TAGS
      : [...BASE_STRUCTURAL_TAGS, 'svg', 'canvas'],
  );

  const promoMatchers = getPromoMatchers(noiseConfig, flags);

  const extraSelectors = [];
  for (const rawSelector of noiseConfig.extraSelectors) {
    const selector = rawSelector.trim();
    if (selector.length > 0) {
      extraSelectors.push(selector);
    }
  }

  // Selector for the first removal pass.
  const baseSelector = [
    BASE_NOISE_SELECTORS.hidden,
    ...(flags.navFooter ? [BASE_NOISE_SELECTORS.navFooter] : []),
    ...(flags.cookieBanners ? [BASE_NOISE_SELECTORS.cookieBanners] : []),
  ].join(',');

  // Selector for elements that are scored individually.
  const attributeSelectors = ['[class]', '[id]', '[role]', '[style]'];
  const candidateSelector = [
    ...structuralTags,
    ...ALWAYS_NOISE_TAGS,
    'aside',
    'header',
    ...attributeSelectors,
  ].join(',');

  cachedContext = {
    flags,
    structuralTags,
    weights: noiseConfig.weights,
    promoMatchers,
    promoEnabled: flags.cookieBanners || flags.newsletters || flags.socialShare,
    extraSelectors,
    baseSelector,
    candidateSelector,
  };
  lastConfigRef = noiseConfig;
  return cachedContext;
}
|
|
188
|
+
/**
 * Reports whether an element looks like part of an interactive widget:
 * an interactive ARIA role, a `data-state` of inactive/closed, a
 * `data-orientation` of horizontal/vertical, or one of the
 * `data-accordion-item` / `data-radix-collection-item` marker attributes.
 *
 * @param {Element} element - Element to inspect.
 * @param {string|null} role - The element's `role` attribute value.
 * @returns {boolean} True when any of the signals above is present.
 */
function isInteractive(element, role) {
  if (role && INTERACTIVE_CONTENT_ROLES.has(role)) {
    return true;
  }

  const state = element.getAttribute('data-state');
  if (state === 'inactive' || state === 'closed') {
    return true;
  }

  const orientation = element.getAttribute('data-orientation');
  if (orientation === 'horizontal' || orientation === 'vertical') {
    return true;
  }

  if (element.hasAttribute('data-accordion-item')) {
    return true;
  }
  return element.hasAttribute('data-radix-collection-item');
}
|
|
200
|
+
/**
 * Walks from `element` up through `parentElement` and reports whether the
 * element itself or any ancestor is an `<article>`, a `<main>`, or carries
 * `role="main"`.
 *
 * @param {Element|null} element - Starting element.
 * @returns {boolean} True when a primary-content container is found.
 */
function isWithinPrimaryContent(element) {
  for (let node = element; node; node = node.parentElement) {
    const tag = node.tagName.toLowerCase();
    const isPrimaryTag = tag === 'article' || tag === 'main';
    if (isPrimaryTag || node.getAttribute('role') === 'main') {
      return true;
    }
  }
  return false;
}
|
|
212
|
+
/**
 * Decides whether an element that would otherwise be removed should be kept.
 *
 * - Dialogs (`role` dialog/alertdialog) are kept when they sit inside primary
 *   content, hold more than DIALOG_MIN_CHARS_FOR_PRESERVATION characters of
 *   text, or contain a heading.
 * - `<nav>`/`<footer>` are kept when they contain an article/main/section/
 *   `[role="main"]` descendant, or their trimmed text reaches
 *   NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION characters.
 *
 * @param {Element} element - Removal candidate.
 * @param {string} tagName - Lowercased tag name of `element`.
 * @returns {boolean} True when the element must not be removed.
 */
function shouldPreserve(element, tagName) {
  const role = element.getAttribute('role');
  const isDialog = role === 'dialog' || role === 'alertdialog';
  if (isDialog) {
    return (
      isWithinPrimaryContent(element) ||
      (element.textContent || '').length > DIALOG_MIN_CHARS_FOR_PRESERVATION ||
      element.querySelector('h1,h2,h3,h4,h5,h6') !== null
    );
  }

  if (tagName !== 'nav' && tagName !== 'footer') {
    return false;
  }
  if (element.querySelector('article,main,section,[role="main"]')) {
    return true;
  }
  const visibleText = (element.textContent || '').trim();
  return visibleText.length >= NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION;
}
|
|
232
|
+
/**
 * Removes each still-attached node in `nodes` unless `shouldPreserve` vetoes
 * it. The list is walked from last to first.
 *
 * @param {ArrayLike<Element>} nodes - Candidate nodes (e.g. a NodeList).
 */
function removeNodes(nodes) {
  let index = nodes.length;
  while (index-- > 0) {
    const candidate = nodes[index];
    if (!candidate?.parentNode) {
      continue; // missing entry or already detached
    }
    if (shouldPreserve(candidate, candidate.tagName.toLowerCase())) {
      continue;
    }
    candidate.remove();
  }
}
|
|
240
|
+
/**
 * Computes the navigation/footer contribution to an element's noise score.
 * Each of the following adds `weights.structural`:
 *  - the tag is in ALWAYS_NOISE_TAGS;
 *  - the tag is `header` and it has a navigation role or a class/id matching
 *    HEADER_NOISE_PATTERN;
 *  - it has a navigation role, except for `<aside role="complementary">`.
 *
 * @param {string} tagName - Lowercased tag name.
 * @param {string|null} role - `role` attribute value.
 * @param {string} className - `class` attribute value.
 * @param {string} id - `id` attribute value.
 * @param {{ structural: number }} weights - Score weights.
 * @returns {number} The accumulated score.
 */
function scoreNavFooter(tagName, role, className, id, weights) {
  const hasNavigationRole = Boolean(role) && NAVIGATION_ROLES.has(role);
  let total = 0;

  if (ALWAYS_NOISE_TAGS.has(tagName)) {
    total += weights.structural;
  }

  const isBoilerplateHeader =
    tagName === 'header' &&
    (hasNavigationRole || HEADER_NOISE_PATTERN.test(`${className} ${id}`));
  if (isBoilerplateHeader) {
    total += weights.structural;
  }

  const isComplementaryAside = tagName === 'aside' && role === 'complementary';
  if (hasNavigationRole && !isComplementaryAside) {
    total += weights.structural;
  }

  return total;
}
|
|
259
|
+
/**
 * Collects the attributes the noise scorer needs from an element.
 *
 * @param {Element} element - Element to inspect.
 * @returns {{ tagName: string, className: string, id: string,
 *   role: string|null, style: string|null, isInteractive: boolean,
 *   isHidden: boolean }} `isHidden` is true for a `hidden` attribute,
 *   `aria-hidden="true"`, or an inline `display:none` / `visibility:hidden`.
 */
function extractElementMetadata(element) {
  const readAttribute = (name) => element.getAttribute(name);
  const role = readAttribute('role');
  const style = readAttribute('style');
  const hasHidingStyle =
    style !== null &&
    /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i.test(style);

  return {
    tagName: element.tagName.toLowerCase(),
    className: readAttribute('class') ?? '',
    id: readAttribute('id') ?? '',
    role,
    style,
    isInteractive: isInteractive(element, role),
    isHidden:
      element.hasAttribute('hidden') ||
      readAttribute('aria-hidden') === 'true' ||
      hasHidingStyle,
  };
}
|
|
280
|
+
/**
 * Scores an element against the enabled noise heuristics and reports whether
 * the total reaches `weights.threshold`.
 *
 * Contributions: structural tag (non-interactive only), nav/footer scoring
 * (when that category is on), hidden (non-interactive only), sticky/fixed
 * class names, and promo tokens in class or id (when promo is enabled).
 * Aggressive promo tokens only count outside primary content.
 *
 * @param {Element} element - Candidate element.
 * @param {object} context - Context produced by `getContext`.
 * @returns {boolean} True when the element should be treated as noise.
 */
function isNoiseElement(element, context) {
  const {
    tagName,
    role,
    className,
    id,
    isInteractive: interactive,
    isHidden,
  } = extractElementMetadata(element);
  const { weights } = context;
  let total = 0;

  if (!interactive && context.structuralTags.has(tagName)) {
    total += weights.structural;
  }

  if (context.flags.navFooter) {
    total += scoreNavFooter(tagName, role, className, id, weights);
  }

  if (isHidden && !interactive) {
    total += weights.hidden;
  }

  if (FIXED_OR_HIGH_Z_PATTERN.test(className)) {
    total += weights.stickyFixed;
  }

  if (context.promoEnabled) {
    const { aggressive, base } = context.promoMatchers;
    const hitsAggressive = aggressive.test(className) || aggressive.test(id);
    // An aggressive hit is decided solely by location; base tokens are only
    // consulted when no aggressive token matched.
    const isPromo = hitsAggressive
      ? !isWithinPrimaryContent(element)
      : base.test(className) || base.test(id);
    if (isPromo) {
      total += weights.promo;
    }
  }

  return total >= weights.threshold;
}
|
|
314
|
+
/**
 * Removes `<div>` descendants of a heading whose class contains "absolute",
 * whose inline style mentions "position", or that have `tabindex="-1"`.
 * Divs are visited from last to first; already-detached ones are skipped.
 *
 * @param {Element} h - Heading element.
 */
function cleanHeadingWrapperDivs(h) {
  const wrappers = [...h.querySelectorAll('div')].reverse();
  for (const wrapper of wrappers) {
    if (!wrapper?.parentNode) {
      continue;
    }
    const classNames = wrapper.getAttribute('class') ?? '';
    const inlineStyle = wrapper.getAttribute('style') ?? '';
    const isOverlay =
      classNames.includes('absolute') ||
      inlineStyle.includes('position') ||
      wrapper.getAttribute('tabindex') === '-1';
    if (isOverlay) {
      wrapper.remove();
    }
  }
}
|
|
329
|
+
/**
 * Removes anchors inside a heading that point at a fragment (`href` starting
 * with "#") and have no text once whitespace and zero-width spaces are
 * stripped. Anchors are visited from last to first.
 *
 * @param {Element} h - Heading element.
 */
function cleanHeadingAnchors(h) {
  const links = [...h.querySelectorAll('a')].reverse();
  for (const link of links) {
    if (!link?.parentNode) {
      continue;
    }
    const target = link.getAttribute('href') ?? '';
    const visibleText = (link.textContent || '').replace(/[\u200B\s]/g, '');
    const isEmptyFragmentLink = target.startsWith('#') && visibleText.length === 0;
    if (isEmptyFragmentLink) {
      link.remove();
    }
  }
}
|
|
342
|
+
/**
 * Strips zero-width spaces (U+200B) from every text node under a heading.
 *
 * @param {Element} h - Heading element used as the walker root.
 * @param {Document} document - Document providing `createTreeWalker`.
 */
function cleanHeadingZeroWidth(h, document) {
  const SHOW_TEXT = 4;
  const textWalker = document.createTreeWalker(h, SHOW_TEXT);
  for (let textNode = textWalker.nextNode(); textNode; textNode = textWalker.nextNode()) {
    if (!textNode.textContent?.includes('\u200B')) {
      continue;
    }
    textNode.textContent = textNode.textContent.replace(/\u200B/g, '');
  }
}
|
|
351
|
+
/**
 * Cleans every attached h1-h6 in the document: drops overlay wrapper divs,
 * empty fragment anchors, and zero-width spaces.
 *
 * @param {Document} document - Document to clean in place.
 */
function cleanHeadings(document) {
  const headingSelector = 'h1,h2,h3,h4,h5,h6';
  for (const heading of document.querySelectorAll(headingSelector)) {
    if (heading.parentNode) {
      cleanHeadingWrapperDivs(heading);
      cleanHeadingAnchors(heading);
      cleanHeadingZeroWidth(heading, document);
    }
  }
}
|
|
362
|
+
/**
 * Removes noise from a parsed document in three passes: heading cleanup,
 * selector-based removal (base selector, then configured extra selectors),
 * and per-element scoring of the candidate selector.
 *
 * @param {Document} document - Document mutated in place.
 * @param {object} context - Context produced by `getContext`.
 * @param {AbortSignal} [signal] - Checked every 500 candidates.
 * @throws {Error} 'Noise removal aborted' when `signal` is aborted during
 *   the scoring pass.
 */
function stripNoise(document, context, signal) {
  cleanHeadings(document);

  // Pass 1: selector-driven removal.
  const { baseSelector, extraSelectors } = context;
  removeNodes(document.querySelectorAll(baseSelector));
  if (extraSelectors.length > 0) {
    removeNodes(document.querySelectorAll(extraSelectors.join(',')));
  }

  // Pass 2: score each candidate, walking the list from last to first.
  const candidates = document.querySelectorAll(context.candidateSelector);
  let index = candidates.length;
  while (index-- > 0) {
    if (index % 500 === 0 && signal?.aborted) {
      throw new Error('Noise removal aborted');
    }
    const candidate = candidates[index];
    if (!candidate || !candidate.parentNode) {
      continue;
    }
    if (shouldPreserve(candidate, candidate.tagName.toLowerCase())) {
      continue;
    }
    if (isNoiseElement(candidate, context)) {
      candidate.remove();
    }
  }
}
|
|
393
|
+
/**
 * Splits a `srcset` attribute value into its image candidates following the
 * HTML parsing rules: a URL is a run of non-whitespace characters (so commas
 * inside a URL are kept), trailing commas on a URL end the candidate, and
 * descriptors run up to the next comma that is not inside parentheses.
 *
 * @param {string} srcset - Raw attribute value.
 * @returns {Array<{ url: string, descriptor: string }>} Candidates in order.
 */
function splitSrcsetCandidates(srcset) {
  const isSpace = (ch) => /\s/.test(ch);
  const candidates = [];
  const total = srcset.length;
  let pos = 0;

  while (pos < total) {
    // Skip separators between candidates.
    while (pos < total && (srcset[pos] === ',' || isSpace(srcset[pos]))) {
      pos += 1;
    }
    if (pos >= total) {
      break;
    }

    const urlStart = pos;
    while (pos < total && !isSpace(srcset[pos])) {
      pos += 1;
    }
    let url = srcset.slice(urlStart, pos);
    let descriptor = '';

    if (url.endsWith(',')) {
      // "a.jpg, b.jpg": the comma terminates a candidate with no descriptor.
      let end = url.length;
      while (end > 0 && url[end - 1] === ',') {
        end -= 1;
      }
      url = url.slice(0, end);
    } else {
      const descriptorStart = pos;
      let depth = 0;
      while (pos < total) {
        const ch = srcset[pos];
        if (ch === '(') {
          depth += 1;
        } else if (ch === ')' && depth > 0) {
          depth -= 1;
        } else if (ch === ',' && depth === 0) {
          break;
        }
        pos += 1;
      }
      descriptor = srcset.slice(descriptorStart, pos).trim().replace(/\s+/g, ' ');
    }

    if (url) {
      candidates.push({ url, descriptor });
    }
  }
  return candidates;
}

/**
 * Rewrites one URL-bearing attribute of an element to an absolute URL.
 *
 * For `srcset` attributes every candidate URL is resolved against `base` and
 * the list is re-joined with ", ". URLs that contain commas (CDN transform
 * paths, data: URIs) stay intact, because candidates are split per the HTML
 * srcset rules rather than on every comma. For plain attributes the value is
 * resolved unless it starts with one of SKIP_URL_PREFIXES. Values that cannot
 * be parsed as URLs are left as they are. Detached elements and empty
 * attributes are ignored.
 *
 * @param {Element} el - Element to update in place.
 * @param {string} attr - Attribute name (`href`, `src`, `srcset`).
 * @param {URL} base - Base URL for resolution.
 * @param {boolean} isSrcset - Whether `attr` holds a srcset candidate list.
 */
function processUrlElement(el, attr, base, isSrcset) {
  if (!el.parentNode) {
    return;
  }
  const value = el.getAttribute(attr);
  if (!value) {
    return;
  }

  if (isSrcset) {
    const candidates = splitSrcsetCandidates(value);
    if (candidates.length === 0) {
      return;
    }
    const rewritten = candidates
      .map(({ url, descriptor }) => {
        let resolved = url;
        try {
          resolved = new URL(url, base).href;
        } catch {
          /* unparseable candidate: keep the original URL text */
        }
        return descriptor ? `${resolved} ${descriptor}` : resolved;
      })
      .join(', ');
    el.setAttribute(attr, rewritten);
    return;
  }

  // Normalize once instead of once per prefix.
  const normalized = value.trim().toLowerCase();
  if (SKIP_URL_PREFIXES.some((prefix) => normalized.startsWith(prefix))) {
    return;
  }
  try {
    el.setAttribute(attr, new URL(value, base).href);
  } catch {
    /* unparseable URL: leave the attribute untouched */
  }
}
|
|
429
|
+
/**
 * Resolves the URLs of all `a[href]`, `img[src]` and `source[srcset]`
 * elements in the document against `baseUrlStr`. Does nothing when
 * `baseUrlStr` is not a valid absolute URL.
 *
 * @param {Document} document - Document mutated in place.
 * @param {string} baseUrlStr - Absolute base URL.
 */
function resolveUrls(document, baseUrlStr) {
  let base;
  try {
    base = new URL(baseUrlStr);
  } catch {
    return;
  }

  // tag name -> [attribute, isSrcset]
  const attributeByTag = new Map([
    ['a', ['href', false]],
    ['img', ['src', false]],
    ['source', ['srcset', true]],
  ]);

  for (const el of document.querySelectorAll('a[href],img[src],source[srcset]')) {
    const rule = attributeByTag.get(el.tagName.toLowerCase());
    if (rule === undefined) {
      continue;
    }
    const [attribute, isSrcset] = rule;
    processUrlElement(el, attribute, base, isSrcset);
  }
}
|
|
448
|
+
/**
 * Serializes the cleaned document, preferring the body's inner HTML, then
 * the whole document's outer HTML, and finally `fallback` when both are
 * MIN_BODY_CONTENT_LENGTH characters or shorter after trimming.
 *
 * @param {Document} document - Cleaned document.
 * @param {string} fallback - Value returned when the markup is too short.
 * @returns {string} The chosen markup.
 */
function serialize(document, fallback) {
  const isSubstantial = (markup) => markup.trim().length > MIN_BODY_CONTENT_LENGTH;

  const bodyMarkup = document.body.innerHTML;
  if (isSubstantial(bodyMarkup)) {
    return bodyMarkup;
  }

  const documentMarkup = document.documentElement.outerHTML;
  return isSubstantial(documentMarkup) ? documentMarkup : fallback;
}
|
|
457
|
+
/**
 * Reports whether the markup contains a document-level tag
 * (doctype, html, head or body).
 *
 * @param {string} html - Markup to test.
 * @returns {boolean} True when HTML_DOCUMENT_MARKERS matches.
 */
function isFullDocumentHtml(html) {
  const hasDocumentMarker = HTML_DOCUMENT_MARKERS.exec(html) !== null;
  return hasDocumentMarker;
}
|
|
460
|
+
/**
 * Cheap textual check for noise markers. Inputs longer than NOISE_SCAN_LIMIT
 * are sampled: only the first and last NOISE_SCAN_LIMIT characters are
 * scanned, joined by a newline.
 *
 * @param {string} html - Markup to scan.
 * @returns {boolean} True when any NOISE_PATTERNS entry matches the sample.
 */
function mayContainNoise(html) {
  let sample = html;
  if (html.length > NOISE_SCAN_LIMIT) {
    const head = html.substring(0, NOISE_SCAN_LIMIT);
    const tail = html.substring(html.length - NOISE_SCAN_LIMIT);
    sample = `${head}\n${tail}`;
  }

  for (const pattern of NOISE_PATTERNS) {
    if (pattern.test(sample)) {
      return true;
    }
  }
  return false;
}
|
|
466
|
+
/**
 * Removes boilerplate/noise from HTML and returns the cleaned markup.
 *
 * The input is returned untouched when it shows no document markers, no
 * noise markers and no block-level fragment markers. Otherwise the markup is
 * parsed (unless a pre-parsed `document` is supplied), cleaned, optionally
 * has its URLs resolved against `baseUrl`, and is re-serialized. This is
 * best-effort: any failure, including an abort raised during cleaning,
 * results in the original `html` being returned.
 *
 * @param {string} html - Source markup; also the fallback return value.
 * @param {Document} [document] - Pre-parsed document to clean in place.
 * @param {string} [baseUrl] - Base URL for resolving relative links.
 * @param {AbortSignal} [signal] - Abort signal forwarded to the cleaner.
 * @returns {string} Cleaned markup, or `html` when nothing was done.
 */
export function removeNoiseFromHtml(html, document, baseUrl, signal) {
  const shouldParse =
    isFullDocumentHtml(html) ||
    mayContainNoise(html) ||
    HTML_FRAGMENT_MARKERS.test(html);
  if (!shouldParse) {
    return html;
  }

  try {
    const context = getContext();
    if (config.noiseRemoval.debug) {
      // Report every enabled category, not just nav-footer, so the audit
      // line reflects the configuration that is actually applied.
      const { flags } = context;
      const categories = [];
      if (flags.navFooter) categories.push('nav-footer');
      if (flags.cookieBanners) categories.push('cookie-banners');
      if (flags.newsletters) categories.push('newsletters');
      if (flags.socialShare) categories.push('social-share');
      logDebug('Noise removal audit enabled', { categories });
    }

    const doc = document ?? parseHTML(html).document;
    stripNoise(doc, context, signal);
    if (baseUrl) {
      resolveUrls(doc, baseUrl);
    }
    return serialize(doc, html);
  } catch (error) {
    // Best-effort: fall back to the untouched input, but leave a trace so
    // failures are diagnosable instead of silently swallowed.
    logDebug('Noise removal failed; returning original HTML', {
      error: error instanceof Error ? error.message : String(error),
    });
    return html;
  }
}
|
|
489
|
+
/**
 * Wraps a code snippet and lazily computes (then caches) derived views used
 * by the language detectors: a lowercased copy, the split lines, and a copy
 * with leading whitespace removed.
 */
class DetectionContext {
  code;
  _lower;
  _lines;
  _trimmedStart;

  /** @param {string} code - Snippet to analyse. */
  constructor(code) {
    this.code = code;
  }

  /** Lowercased snippet, computed on first access. */
  get lower() {
    if (this._lower == null) {
      this._lower = this.code.toLowerCase();
    }
    return this._lower;
  }

  /** Snippet split on LF / CRLF, computed on first access. */
  get lines() {
    if (this._lines == null) {
      this._lines = this.code.split(/\r?\n/);
    }
    return this._lines;
  }

  /** Snippet without leading whitespace, computed on first access. */
  get trimmedStart() {
    if (this._trimmedStart == null) {
      this._trimmedStart = this.code.trimStart();
    }
    return this._trimmedStart;
  }
}
|
|
510
|
+
/** First words that mark a line as a shell command in `isBashLine`. */
const BASH_COMMANDS = new Set(['sudo', 'chmod', 'mkdir', 'cd', 'ls', 'cat', 'echo']);

/** Package managers / tool runners that count as bash when followed by a known verb. */
const BASH_PACKAGE_MANAGERS = ['npm', 'yarn', 'pnpm', 'npx', 'brew', 'apt', 'pip', 'cargo', 'go'];

/** Verbs that, following a package manager, mark a line as a shell command. */
const BASH_VERBS = new Set(['install', 'add', 'run', 'build', 'start']);

/** Type-annotation fragments (with and without a space after the colon). */
const TYPESCRIPT_HINTS = [
  ': string', ':string',
  ': number', ':number',
  ': boolean', ':boolean',
  ': void', ':void',
  ': any', ':any',
  ': unknown', ':unknown',
  ': never', ':never',
];

/** Opening-tag prefixes of common HTML elements. */
const HTML_TAGS = [
  '<!doctype', '<html', '<head', '<body', '<div',
  '<span', '<p', '<a', '<script', '<style',
];

/** Rust item keywords. */
const RUST_REGEX = /\b(?:fn|impl|struct|enum)\b/;
/** JavaScript declaration/module keywords. */
const JS_REGEX = /\b(?:const|let|var|function|class|async|await|export|import)\b/;
/** Python-specific keywords (multiline so `pass$` can end any line). */
const PYTHON_UNIQUE_REGEX = /\b(?:def |elif |except |finally:|yield |lambda |raise |pass$)/m;
/** Tokens that signal JavaScript source. */
const JS_SIGNAL_REGEX = /\b(?:const |let |var |function |require\(|=>|===|!==|console\.)/;
/** CSS at-rules. */
const CSS_REGEX = /@media|@import|@keyframes/;
/** A line that starts with a lowercase `property:` declaration. */
const CSS_PROPERTY_REGEX = /^\s*[a-z][\w-]*\s*:/;
|
|
565
|
+
/**
 * Reports whether the code contains a `<` immediately followed by an
 * uppercase ASCII letter (A-Z), i.e. something shaped like a JSX component
 * tag.
 *
 * @param {string} code - Source text to scan.
 * @returns {boolean} True when such a pair exists.
 */
function containsJsxTag(code) {
  let searchFrom = 0;
  for (;;) {
    const ltIndex = code.indexOf('<', searchFrom);
    // No '<' left, or it is the final character with nothing after it.
    if (ltIndex === -1 || ltIndex >= code.length - 1) {
      return false;
    }
    const following = code.charCodeAt(ltIndex + 1);
    if (following >= 65 && following <= 90) {
      return true; // A-Z
    }
    searchFrom = ltIndex + 1;
  }
}
|
|
576
|
+
/**
 * Heuristically decides whether a single line looks like a shell command.
 *
 * A line qualifies when (ignoring leading whitespace) it starts with `#!`,
 * `$ ` or `# `, when its first space-delimited word is in BASH_COMMANDS, or
 * when its first word is a known package manager and the second word is in
 * BASH_VERBS.
 *
 * @param {string} line - One line of code.
 * @returns {boolean} True when the line looks like bash.
 */
function isBashLine(line) {
  const text = line.trimStart();
  if (text.length === 0) {
    return false;
  }

  const shellPrefixes = ['#!', '$ ', '# '];
  if (shellPrefixes.some((prefix) => text.startsWith(prefix))) {
    return true;
  }

  // Words are delimited by single spaces; `secondWord` is undefined when the
  // line has no space at all.
  const [firstWord, secondWord] = text.split(' ');
  if (BASH_COMMANDS.has(firstWord)) {
    return true;
  }

  return (
    BASH_PACKAGE_MANAGERS.includes(firstWord) &&
    secondWord !== undefined &&
    BASH_VERBS.has(secondWord)
  );
}
|
|
601
|
+
/**
 * Reports whether any of the given lines is recognised by `isBashLine`.
 *
 * @param {string[]} lines - Lines of the snippet.
 * @returns {boolean} True as soon as one line matches.
 */
function detectBashIndicators(lines) {
    for (const candidate of lines) {
        if (isBashLine(candidate)) {
            return true;
        }
    }
    return false;
}
|
|
604
|
+
/**
 * Looks for a line that resembles CSS.
 *
 * A line counts when it either starts with `.` or `#` and contains `{`
 * (class/id selector opening a rule), or contains `;`, begins with a
 * `property:` name per `CSS_PROPERTY_REGEX`, and contains no `(`.
 *
 * @param {string[]} lines - Lines of the snippet.
 * @returns {boolean} True when at least one line matches.
 */
function detectCssStructure(lines) {
    return lines.some((raw) => {
        const text = raw.trimStart();
        if (text === '') {
            return false;
        }
        const opensSelector = (text.startsWith('.') || text.startsWith('#')) && text.includes('{');
        if (opensSelector) {
            return true;
        }
        // Cheap substring checks stay around the regex test, in the same order.
        return text.includes(';') && CSS_PROPERTY_REGEX.test(text) && !text.includes('(');
    });
}
|
|
621
|
+
/**
 * Looks for a `key: value` shaped line: after trimming, the first `:` must not
 * be the first character and must be followed directly by a space or a tab.
 *
 * @param {string[]} lines - Lines of the snippet.
 * @returns {boolean} True when at least one line has that shape.
 */
function detectYamlStructure(lines) {
    return lines.some((raw) => {
        const text = raw.trim();
        const separator = text.indexOf(':');
        // -1 (no colon, including blank lines) and 0 (leading colon) are both rejected.
        if (separator < 1) {
            return false;
        }
        const following = text.charAt(separator + 1);
        return following === ' ' || following === '\t';
    });
}
|
|
636
|
+
// Language detectors consumed by detectLanguageFromCode. Each entry has the
// language label, a weight (the highest-weight matching entry wins) and a
// `match` predicate that receives the detection context (`ctx.code`,
// `ctx.lower`, `ctx.lines`, `ctx.trimmedStart` are the fields read here).
// Entries are listed in descending weight order.
const LANGUAGES = [
    {
        lang: 'rust',
        weight: 25,
        match: (ctx) => {
            if (ctx.lower.includes('let mut'))
                return true;
            if (RUST_REGEX.test(ctx.lower))
                return true;
            // `use ...` together with a `::` path separator anywhere in the text.
            return ctx.lower.includes('use ') && ctx.lower.includes('::');
        },
    },
    {
        lang: 'go',
        weight: 22,
        match: (ctx) => {
            if (ctx.lower.includes('import "'))
                return true;
            return /\b(?:package|func)\b/.test(ctx.lower);
        },
    },
    {
        lang: 'jsx',
        weight: 22,
        match: (ctx) => {
            const l = ctx.lower;
            if (l.includes('classname=') ||
                l.includes('jsx:') ||
                l.includes("from 'react'") ||
                l.includes('from "react"')) {
                return true;
            }
            // Needs original casing: looks for `<` followed by an uppercase letter.
            return containsJsxTag(ctx.code);
        },
    },
    {
        lang: 'typescript',
        weight: 20,
        match: (ctx) => {
            if (/\b(?:interface|type)\b/.test(ctx.lower))
                return true;
            const l = ctx.lower;
            // Type-annotation fragments such as ': string'.
            for (const hint of TYPESCRIPT_HINTS) {
                if (l.includes(hint))
                    return true;
            }
            return false;
        },
    },
    {
        lang: 'sql',
        weight: 20,
        match: (ctx) => {
            const l = ctx.lower;
            return /\b(?:select|insert|update|delete|create|alter|drop)\b/.test(l);
        },
    },
    {
        lang: 'python',
        weight: 18,
        match: (ctx) => {
            const l = ctx.lower;
            if (l.includes('print(') || l.includes('__name__'))
                return true;
            if (l.includes('self.') || l.includes('elif '))
                return true;
            // Check for Python's None/True/False using original case (they are capitalized in Python)
            if (ctx.code.includes('None') ||
                ctx.code.includes('True') ||
                ctx.code.includes('False')) {
                return true;
            }
            // Python-unique keywords that JS doesn't have
            if (PYTHON_UNIQUE_REGEX.test(l))
                return true;
            // Shared keywords (import, from, class) — only match if no JS signals present
            if (/\b(?:import|from|class)\b/.test(l) &&
                !JS_SIGNAL_REGEX.test(l) &&
                !l.includes('{') &&
                !l.includes("from '")) {
                return true;
            }
            return false;
        },
    },
    {
        lang: 'css',
        weight: 18,
        match: (ctx) => {
            if (CSS_REGEX.test(ctx.lower))
                return true;
            return detectCssStructure(ctx.lines);
        },
    },
    {
        lang: 'bash',
        weight: 15,
        match: (ctx) => detectBashIndicators(ctx.lines),
    },
    {
        lang: 'yaml',
        weight: 15,
        match: (ctx) => detectYamlStructure(ctx.lines),
    },
    {
        lang: 'javascript',
        weight: 15,
        match: (ctx) => JS_REGEX.test(ctx.lower),
    },
    {
        lang: 'html',
        weight: 12,
        match: (ctx) => {
            const l = ctx.lower;
            for (const tag of HTML_TAGS) {
                if (l.includes(tag))
                    return true;
            }
            return false;
        },
    },
    {
        lang: 'json',
        weight: 10,
        match: (ctx) => {
            // Only the first non-whitespace character is inspected.
            const s = ctx.trimmedStart;
            return s.startsWith('{') || s.startsWith('[');
        },
    },
];
|
|
766
|
+
/**
 * Extracts a language name from an element's `class` attribute.
 *
 * Tokens prefixed with `language-`, `lang-` or `highlight-` (prefix compared
 * case-insensitively) win; the returned name keeps the token's original
 * casing. Otherwise, when the literal token `hljs` is present, the first token
 * that is neither `hljs` nor `hljs-*` is returned.
 *
 * A token that is only a prefix (e.g. `language-`) carries no language and is
 * ignored, so the function returns `undefined` instead of an empty string and
 * callers using `??` can still fall back to other sources.
 *
 * @param {string | null | undefined} className - Raw class attribute value.
 * @returns {string | undefined} The language name, or undefined when none is found.
 */
function extractLanguageFromClassName(className) {
    if (!className)
        return undefined;
    const tokens = className.match(/\S+/g);
    if (!tokens)
        return undefined;
    const prefixes = ['language-', 'lang-', 'highlight-'];
    // Fast path: prefixed tokens, in document order.
    for (const token of tokens) {
        const lower = token.toLowerCase();
        for (const prefix of prefixes) {
            if (lower.startsWith(prefix) && token.length > prefix.length) {
                return token.slice(prefix.length);
            }
        }
    }
    // Fallback: highlight.js marks the element with `hljs` and adds the
    // language as a plain class next to it.
    if (!tokens.includes('hljs'))
        return undefined;
    return tokens.find((token) => {
        const lower = token.toLowerCase();
        return (lower !== 'hljs' &&
            !lower.startsWith('hljs-') &&
            !prefixes.includes(lower));
    });
}
|
|
792
|
+
/**
 * Validates a language name taken from a data attribute.
 *
 * The value is trimmed and accepted only when it consists solely of ASCII
 * letters, digits and underscores. A missing attribute (`undefined`/`null`)
 * is treated as "no language" instead of throwing.
 *
 * @param {string | null | undefined} dataLang - Raw attribute value.
 * @returns {string | undefined} The trimmed name, or undefined when absent or invalid.
 */
function resolveLanguageFromDataAttribute(dataLang) {
    if (typeof dataLang !== 'string')
        return undefined;
    const trimmed = dataLang.trim();
    // Without the `u`/`i` flags `\w` is exactly [A-Za-z0-9_]; the empty string fails `+`.
    return /^\w+$/.test(trimmed) ? trimmed : undefined;
}
|
|
810
|
+
/**
 * Resolves a code block's language from its attributes, preferring the class
 * name and falling back to the data attribute only when the class name yields
 * `undefined`/`null`.
 *
 * @param {string} className - Raw class attribute value.
 * @param {string} dataLang - Raw data attribute value.
 * @returns {string | undefined} The resolved language, if any.
 */
export function resolveLanguageFromAttributes(className, dataLang) {
    const fromClass = extractLanguageFromClassName(className);
    if (fromClass !== undefined && fromClass !== null) {
        return fromClass;
    }
    return resolveLanguageFromDataAttribute(dataLang);
}
|
|
814
|
+
/**
 * Guesses the language of a code snippet by running every detector in
 * `LANGUAGES` and keeping the matching one with the highest weight. Evaluation
 * stops early once a detector with weight 25 or more has matched.
 *
 * @param {string} code - Snippet text.
 * @returns {string | undefined} The detected language label, or undefined for
 *   empty/whitespace-only input or when nothing matches.
 */
export function detectLanguageFromCode(code) {
    if (!code) {
        return undefined;
    }
    // Anything above char code 32 counts as visible content.
    let cursor = 0;
    while (cursor < code.length && code.charCodeAt(cursor) <= 32) {
        cursor += 1;
    }
    if (cursor >= code.length) {
        return undefined;
    }
    const ctx = new DetectionContext(code);
    let winner;
    let winningWeight = -1;
    for (const candidate of LANGUAGES) {
        if (!candidate.match(ctx) || candidate.weight <= winningWeight) {
            continue;
        }
        winningWeight = candidate.weight;
        winner = candidate.lang;
        if (winningWeight >= 25) {
            break;
        }
    }
    return winner;
}
|
|
842
|
+
// Lines longer than this are never considered heading candidates.
const MAX_LINE_LENGTH = 80;
// Shared patterns for markdown cleanup. Patterns carrying the `g` flag are only
// passed to String.prototype.replace in this file, which does not depend on
// `lastIndex` state.
const REGEX = {
    // ATX heading marker (1-6 `#` followed by whitespace) at a line start.
    HEADING_MARKER: /^#{1,6}\s/m,
    HEADING_STRICT: /^#{1,6}\s+/m,
    // A heading marker with nothing but spaces/tabs/NBSP after it.
    EMPTY_HEADING_LINE: /^#{1,6}[ \t\u00A0]*$/,
    // Opening code fence; group 1 captures the run of backticks or tildes.
    FENCE_START: /^\s*(`{3,}|~{3,})/,
    LIST_MARKER: /^(?:[-*+])\s/m,
    // A `- [text](#anchor)` list item, i.e. a table-of-contents entry.
    TOC_LINK: /^- \[[^\]]+\]\(#[^)]+\)\s*$/,
    TOC_HEADING: /^(?:#{1,6}\s+)?(?:table of contents|contents)\s*$/i,
    HTML_DOC_START: /^(<!doctype|<html)/i,
    // Whole lines that are skip links or a "Was this page helpful" prompt.
    COMBINED_LINE_REMOVALS: /^(?:\[Skip to (?:main )?(?:content|navigation)\]\(#[^)]*\)|\[Skip link\]\(#[^)]*\)|Was this page helpful\??)\s*$/gim,
    // Anchor links whose text is empty, whitespace or zero-width spaces.
    ZERO_WIDTH_ANCHOR: /\[(?:\s|\u200B)*\]\(#[^)]*\)[ \t]*/g,
    // `key: "value"` immediately followed by another `key:` with no separator.
    CONCATENATED_PROPS: /([a-z_][a-z0-9_]{0,30}\??:\s+)([\u0022\u201C][^\u0022\u201C\u201D]*[\u0022\u201D])([a-z_][a-z0-9_]{0,30}\??:)/g,
    DOUBLE_NEWLINE_REDUCER: /\n{3,}/g,
    SOURCE_KEY: /^source:\s/im,
    // A heading line directly followed by a non-empty line.
    HEADING_SPACING: /(^#{1,6}\s[^\n]*)\n([^\n])/gm,
    // A single-word heading fused with an opening code fence.
    HEADING_CODE_BLOCK: /(^#{1,6}\s+\w+)```/gm,
    // Two links placed back to back: `](...)[`.
    SPACING_LINK_FIX: /\]\(([^)]+)\)\[/g,
    // A link or inline code span directly followed by an alphanumeric character.
    SPACING_ADJ_COMBINED: /(?:\]\([^)]+\)|`[^`]+`)(?=[A-Za-z0-9])/g,
    SPACING_CODE_DASH: /(`[^`]+`)\s*\\-\s*/g,
    // Backslash-escaped `[`, `]` or `.`.
    SPACING_ESCAPES: /\\([[\].])/g,
    // A non-list, non-indented line directly followed by a list item.
    SPACING_LIST_NUM_COMBINED: /^((?![-*+] |\d+\. |[ \t]).+)\n((?:[-*+]|\d+\.) )/gm,
    // Space-indented list item; group 1 is the indent, group 2 the marker.
    NESTED_LIST_INDENT: /^( +)((?:[-*+])|\d+\.)\s/gm,
    // Either a backtick-delimited code span (first alternative) or a
    // `/* ... */` style comment, optionally with escaped asterisks.
    TYPEDOC_COMMENT: /(`+)(?:(?!\1)[\s\S])*?\1|\s?\/\\?\*[\s\S]*?\\?\*\//g,
};
|
|
867
|
+
// Configured heading keywords, lowercased with the configured locale so lookups
// can lowercase their input the same way.
const HEADING_KEYWORDS = new Set(config.markdownCleanup.headingKeywords.map((value) => value.toLocaleLowerCase(config.i18n.locale)));
// Admonition-style prefixes ("Note: ...") followed by actual text.
const SPECIAL_PREFIXES = /^(?:example|note|tip|warning|important|caution):\s+\S/i;
// Upper bound (exclusive offset from the heading) for the lookahead when inspecting a TOC block.
const TOC_SCAN_LIMIT = 20;
// Maximum number of non-empty lines sampled from a TOC block.
const TOC_MAX_NON_EMPTY = 12;
// Share of sampled lines that must be TOC links (compared with a strict "greater than").
const TOC_LINK_RATIO_THRESHOLD = 0.8;
// Line prefixes checked by isTypeDocArtifactLine.
const TYPEDOC_PREFIXES = [
    'Defined in:',
    'Returns:',
    'Since:',
    'See also:',
];
|
|
878
|
+
/**
 * Builds a stage checker bound to the caller's abort signal and URL.
 *
 * The signal and URL are captured once, when the checker is created; a missing
 * URL becomes the empty string.
 *
 * @param {{ signal?: unknown, url?: string } | undefined} options - Optional cleanup options.
 * @returns {(stage: string) => void} Function that forwards the captured
 *   values and the given stage name to `throwIfAborted`.
 */
function createAbortChecker(options) {
    const hasOptions = options !== undefined && options !== null;
    const capturedSignal = hasOptions ? options.signal : undefined;
    const capturedUrl = (hasOptions ? options.url : undefined) ?? '';
    return function checkStage(stage) {
        throwIfAborted(capturedSignal, capturedUrl, stage);
    };
}
|
|
885
|
+
/**
 * Picks the line ending to use for text inserted into `content`.
 *
 * @param {string} content - Document text.
 * @returns {'\r\n' | '\n'} CRLF when the content contains at least one CRLF, LF otherwise.
 */
function getLineEnding(content) {
    if (content.indexOf('\r\n') !== -1) {
        return '\r\n';
    }
    return '\n';
}
|
|
888
|
+
/**
 * Reports whether a line is missing or contains only whitespace.
 *
 * @param {string | undefined} line - Line to inspect.
 * @returns {boolean} True for `undefined` and for whitespace-only strings.
 */
function isBlank(line) {
    if (line === undefined) {
        return true;
    }
    return line.trim() === '';
}
|
|
891
|
+
/**
 * Reports whether a non-blank line follows `startIndex`.
 *
 * The search is bounded: only indices below `startIndex + 50` (and below the
 * array length) are inspected.
 *
 * @param {Array<string | undefined>} lines - All lines of the buffer.
 * @param {number} startIndex - Index of the line whose successors are checked.
 * @returns {boolean} True when a non-blank line is found inside the window.
 */
function hasFollowingContent(lines, startIndex) {
    const windowEnd = Math.min(lines.length, startIndex + 50);
    let cursor = startIndex + 1;
    while (cursor < windowEnd) {
        if (!isBlank(lines[cursor])) {
            return true;
        }
        cursor += 1;
    }
    return false;
}
|
|
899
|
+
/**
 * Decides whether a trimmed line reads like a heading.
 *
 * - Lines longer than `MAX_LINE_LENGTH` never qualify.
 * - A line without a space qualifies when it starts with A-Z and its
 *   locale-lowercased form is in `HEADING_KEYWORDS`.
 * - Otherwise the line must have 2-6 whitespace-separated words, each being
 *   either Capitalized (`[A-Z][a-z]*`) or one of the connector words
 *   and/or/the/of/in/for/to/a, with at least two Capitalized words.
 *
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {boolean} True when the line looks like a title or keyword.
 */
function isTitleCaseOrKeyword(trimmed) {
    if (trimmed.length > MAX_LINE_LENGTH) {
        return false;
    }
    const isSingleWord = !trimmed.includes(' ');
    if (isSingleWord) {
        const startsUppercase = /^[A-Z]/.test(trimmed);
        return startsUppercase
            ? HEADING_KEYWORDS.has(trimmed.toLocaleLowerCase(config.i18n.locale))
            : false;
    }
    const words = trimmed.split(/\s+/);
    if (words.length < 2 || words.length > 6) {
        return false;
    }
    const capitalized = /^[A-Z][a-z]*$/;
    const connector = /^(?:and|or|the|of|in|for|to|a)$/i;
    let capitalizedWords = 0;
    for (const word of words) {
        if (!word) {
            continue;
        }
        if (capitalized.test(word)) {
            capitalizedWords += 1;
        }
        else if (!connector.test(word)) {
            // Any other token (lowercase word, number, punctuation) disqualifies the line.
            return false;
        }
    }
    return capitalizedWords >= 2;
}
|
|
927
|
+
/**
 * Chooses the markdown heading prefix to put in front of a plain line.
 *
 * Returns `null` when the line is too long, already is markdown syntax
 * (heading, list item, numbered item, or a lone link), ends with `.`, `!` or
 * `?`, or does not look like a title. Lines matching `SPECIAL_PREFIXES` get
 * `'### '` for "Example:" and `'## '` for the others.
 *
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {'## ' | '### ' | null} The prefix to prepend, or null for no promotion.
 */
function getHeadingPrefix(trimmed) {
    if (trimmed.length > MAX_LINE_LENGTH) {
        return null;
    }
    // Only lines starting with #, -, *, +, [ or a digit can be existing markdown syntax.
    const lead = trimmed.charAt(0);
    const mayBeMarkdownSyntax = lead !== '' &&
        ('#-*+['.includes(lead) || (lead >= '0' && lead <= '9'));
    if (mayBeMarkdownSyntax) {
        const syntaxPatterns = [
            REGEX.HEADING_MARKER,
            REGEX.LIST_MARKER,
            /^\d+\.\s/,
            /^\[.*\]\(.*\)$/,
        ];
        if (syntaxPatterns.some((pattern) => pattern.test(trimmed))) {
            return null;
        }
    }
    if (SPECIAL_PREFIXES.test(trimmed)) {
        if (/^example:\s/i.test(trimmed)) {
            return '### ';
        }
        return '## ';
    }
    // Sentence-ending punctuation marks prose rather than a title.
    const tail = trimmed.charAt(trimmed.length - 1);
    if (tail === '.' || tail === '!' || tail === '?') {
        return null;
    }
    if (isTitleCaseOrKeyword(trimmed)) {
        return '## ';
    }
    return null;
}
|
|
955
|
+
/**
 * Samples the lines after a table-of-contents heading and counts how many are
 * TOC links.
 *
 * Blank lines are ignored. Sampling stops at the next heading marker, after
 * `TOC_MAX_NON_EMPTY` non-empty lines, or at index
 * `headingIndex + TOC_SCAN_LIMIT` (exclusive), whichever comes first.
 *
 * @param {string[]} lines - All lines of the buffer.
 * @param {number} headingIndex - Index of the TOC heading line.
 * @returns {{ total: number, linkCount: number, nonLinkCount: number }} Counts
 *   of sampled non-empty lines, TOC links, and everything else.
 */
function getTocBlockStats(lines, headingIndex) {
    const scanEnd = Math.min(lines.length, headingIndex + TOC_SCAN_LIMIT);
    let linkCount = 0;
    let nonLinkCount = 0;
    for (let idx = headingIndex + 1; idx < scanEnd; idx += 1) {
        const text = lines[idx]?.trim();
        if (!text) {
            continue;
        }
        if (REGEX.HEADING_MARKER.test(text)) {
            break;
        }
        if (REGEX.TOC_LINK.test(text)) {
            linkCount += 1;
        }
        else {
            nonLinkCount += 1;
        }
        if (linkCount + nonLinkCount >= TOC_MAX_NON_EMPTY) {
            break;
        }
    }
    return { total: linkCount + nonLinkCount, linkCount, nonLinkCount };
}
|
|
979
|
+
/**
 * Finds where a table-of-contents block ends.
 *
 * Starting at `startIndex`, blank lines and TOC link lines are passed over.
 *
 * @param {Array<string | undefined>} lines - All lines of the buffer.
 * @param {number} startIndex - First index to inspect.
 * @returns {number} Index of the first non-blank line that is not a TOC link,
 *   or `lines.length` when none remains.
 */
function skipTocLines(lines, startIndex) {
    let cursor = startIndex;
    while (cursor < lines.length) {
        const raw = lines[cursor];
        const text = raw === undefined ? '' : raw.trim();
        if (text !== '' && !REGEX.TOC_LINK.test(text)) {
            return cursor;
        }
        cursor += 1;
    }
    return lines.length;
}
|
|
992
|
+
/**
 * Recognises lines made of one of the `TYPEDOC_PREFIXES` followed by a
 * bold-code value, i.e. text starting with `**\`` and containing `\`**`.
 *
 * Only the first matching prefix is considered.
 *
 * @param {string} line - Line to inspect.
 * @returns {boolean} True when the line has that shape.
 */
function isTypeDocArtifactLine(line) {
    const text = line.trim();
    const prefix = TYPEDOC_PREFIXES.find((candidate) => text.startsWith(candidate));
    if (prefix === undefined) {
        return false;
    }
    const remainder = text.slice(prefix.length).trimStart();
    return remainder.startsWith('**`') && remainder.includes('`**');
}
|
|
1004
|
+
/**
 * Turns a stand-alone line into a markdown heading when it qualifies.
 *
 * The line must be an "orphan" (first line, or preceded by a missing/blank
 * line), must receive a prefix from `getHeadingPrefix`, and — unless it matches
 * `SPECIAL_PREFIXES` — must be followed by some content.
 *
 * @param {string[]} lines - All lines of the buffer.
 * @param {number} i - Index of the candidate line.
 * @param {string} trimmed - The candidate line, trimmed.
 * @returns {string | null} The prefixed heading text, or null when the line is left alone.
 */
function tryPromoteOrphan(lines, i, trimmed) {
    const previous = lines[i - 1];
    const followsText = i !== 0 && Boolean(previous) && previous.trim().length > 0;
    if (followsText) {
        return null;
    }
    const prefix = getHeadingPrefix(trimmed);
    if (!prefix) {
        return null;
    }
    // Admonition-style lines are promoted even when nothing follows them.
    const needsFollowingContent = !SPECIAL_PREFIXES.test(trimmed);
    if (needsFollowingContent && !hasFollowingContent(lines, i)) {
        return null;
    }
    return prefix + trimmed;
}
|
|
1017
|
+
/**
 * Determines whether the line at `i` starts a table-of-contents block that
 * should be dropped.
 *
 * The block is dropped only when TOC removal is enabled, the line matches
 * `REGEX.TOC_HEADING`, the sampled block is non-empty, contains no non-link
 * lines, and its link ratio exceeds `TOC_LINK_RATIO_THRESHOLD`. The abort
 * check runs right before the block is skipped.
 *
 * @param {string[]} lines - All lines of the buffer.
 * @param {number} i - Index of the candidate heading line.
 * @param {string} trimmed - The candidate line, trimmed.
 * @param {boolean} removeToc - Whether TOC removal is enabled.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Abort options.
 * @returns {number | null} Index at which processing should resume, or null to keep the line.
 */
function shouldSkipAsToc(lines, i, trimmed, removeToc, options) {
    const isTocHeading = removeToc && REGEX.TOC_HEADING.test(trimmed);
    if (!isTocHeading) {
        return null;
    }
    const stats = getTocBlockStats(lines, i);
    const isPureLinkBlock = stats.total !== 0 && stats.nonLinkCount <= 0;
    if (!isPureLinkBlock) {
        return null;
    }
    if (stats.linkCount / stats.total <= TOC_LINK_RATIO_THRESHOLD) {
        return null;
    }
    throwIfAborted(options?.signal, options?.url ?? '', 'markdown:cleanup:toc');
    return skipTocLines(lines, i + 1);
}
|
|
1029
|
+
/**
 * Line-level cleanup pass for a run of non-code lines.
 *
 * Drops empty heading lines, skips table-of-contents blocks (when enabled in
 * config) and promotes orphan title lines to headings (when enabled in
 * config). All other lines are kept unchanged.
 *
 * @param {Array<string | undefined>} lines - Lines of the text buffer.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Abort options.
 * @returns {string} The surviving lines joined with `\n`.
 */
function preprocessLines(lines, options) {
    const kept = [];
    const total = lines.length;
    const promoteOrphans = config.markdownCleanup.promoteOrphanHeadings;
    const dropTocBlocks = config.markdownCleanup.removeTocBlocks;
    const checkAbort = createAbortChecker(options);
    // Index at which processing resumes after a skipped TOC block.
    let resumeAt = -1;
    let index = -1;
    while (++index < total) {
        const original = lines[index];
        if (index < resumeAt || original === undefined) {
            continue;
        }
        const trimmed = original.trim();
        if (REGEX.EMPTY_HEADING_LINE.test(trimmed)) {
            continue;
        }
        const tocEnd = shouldSkipAsToc(lines, index, trimmed, dropTocBlocks, options);
        if (tocEnd !== null) {
            resumeAt = tocEnd;
            continue;
        }
        if (promoteOrphans && trimmed.length > 0) {
            checkAbort('markdown:cleanup:promote');
            kept.push(tryPromoteOrphan(lines, index, trimmed) || original);
        }
        else {
            kept.push(original);
        }
    }
    return kept.join('\n');
}
|
|
1060
|
+
/**
 * Cleans one run of non-code lines: line-level preprocessing followed by the
 * text-wide regex fixes.
 *
 * @param {string[]} lines - Lines of the text buffer.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Abort options.
 * @returns {string} The cleaned text, or `''` for an empty buffer.
 */
function processTextBuffer(lines, options) {
    return lines.length === 0
        ? ''
        : applyGlobalRegexes(preprocessLines(lines, options), options);
}
|
|
1066
|
+
/**
 * Applies the text-wide cleanup stages in a fixed order: heading spacing,
 * TypeDoc artifact removal (if enabled), skip-link removal (if enabled),
 * spacing normalisation, nested-list indentation, and separation of
 * concatenated `key: "value"` properties. An abort check precedes each stage.
 *
 * @param {string} text - Preprocessed text of one non-code run.
 * @param {{ signal?: unknown, url?: string } | undefined} options - Abort options.
 * @returns {string} The cleaned text.
 */
function applyGlobalRegexes(text, options) {
    const checkAbort = createAbortChecker(options);
    // Runs [pattern, replacement] rules one after another, in order.
    const applyRules = (input, rules) => rules.reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), input);
    checkAbort('markdown:cleanup:headings');
    let result = applyRules(text, [
        [REGEX.HEADING_SPACING, '$1\n\n$2'],
        [REGEX.HEADING_CODE_BLOCK, '$1\n\n```'],
    ]);
    if (config.markdownCleanup.removeTypeDocComments) {
        checkAbort('markdown:cleanup:typedoc');
        const withoutArtifacts = result
            .split('\n')
            .filter((line) => !isTypeDocArtifactLine(line));
        // Matches that start with a backtick are code spans and are kept as-is;
        // only the comment alternative is removed.
        result = withoutArtifacts
            .join('\n')
            .replace(REGEX.TYPEDOC_COMMENT, (match) => (match.startsWith('`') ? match : ''));
    }
    if (config.markdownCleanup.removeSkipLinks) {
        checkAbort('markdown:cleanup:skip-links');
        result = applyRules(result, [
            [REGEX.ZERO_WIDTH_ANCHOR, ''],
            [REGEX.COMBINED_LINE_REMOVALS, ''],
        ]);
    }
    checkAbort('markdown:cleanup:spacing');
    result = applyRules(result, [
        [REGEX.SPACING_LINK_FIX, ']($1)\n\n['],
        [REGEX.SPACING_ADJ_COMBINED, '$& '],
        [REGEX.SPACING_CODE_DASH, '$1 - '],
        [REGEX.SPACING_ESCAPES, '$1'],
        [REGEX.SPACING_LIST_NUM_COMBINED, '$1\n\n$2'],
        [REGEX.DOUBLE_NEWLINE_REDUCER, '\n\n'],
    ]);
    result = normalizeNestedListIndentation(result);
    checkAbort('markdown:cleanup:properties');
    // At most three passes; stop as soon as a pass changes nothing.
    let remainingPasses = 3;
    while (remainingPasses > 0) {
        const separated = result.replace(REGEX.CONCATENATED_PROPS, '$1$2\n\n$3');
        if (separated === result) {
            break;
        }
        result = separated;
        remainingPasses -= 1;
    }
    return result;
}
|
|
1108
|
+
/**
 * Doubles the indentation of space-indented list items so that each 2-space
 * nesting step becomes 4 spaces. Items with an odd number of leading spaces are
 * left untouched; for rewritten items the whitespace after the marker becomes
 * a single space.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text with normalised list indentation.
 */
function normalizeNestedListIndentation(text) {
    const reindent = (whole, indent, marker) => {
        const width = indent.length;
        const isEvenNesting = width >= 2 && width % 2 === 0;
        if (!isEvenNesting) {
            return whole;
        }
        return `${' '.repeat(width * 2)}${marker} `;
    };
    return text.replace(REGEX.NESTED_LIST_INDENT, reindent);
}
|
|
1117
|
+
/**
 * Cleans conversion artifacts from markdown while leaving fenced code blocks
 * untouched.
 *
 * The content is split into lines (LF or CRLF). Lines inside fenced code blocks
 * are copied verbatim; each run of lines outside fences is passed through
 * `processTextBuffer`. Segments are re-joined with `\n` and the result is
 * trimmed. An unterminated fence keeps everything after it verbatim.
 *
 * @param {string} content - Markdown text.
 * @param {{ signal?: unknown, url?: string }} [options] - Abort options forwarded to the cleanup stages.
 * @returns {string} The cleaned markdown, or `''` for empty input.
 */
export function cleanupMarkdownArtifacts(content, options) {
    if (!content)
        return '';
    const checkAbort = createAbortChecker(options);
    checkAbort('markdown:cleanup:begin');
    const len = content.length;
    let lastIndex = 0;
    // The opening fence run (e.g. '```' or '~~~~') while inside a code block, else null.
    let fenceMarker = null;
    const segments = [];
    let buffer = [];
    while (lastIndex < len) {
        let nextIndex = content.indexOf('\n', lastIndex);
        let line;
        if (nextIndex === -1) {
            line = content.slice(lastIndex);
            nextIndex = len;
        }
        else {
            // Drop the '\r' of a CRLF pair so lines never carry it.
            const hasCarriageReturn = nextIndex > lastIndex && content.charCodeAt(nextIndex - 1) === 13;
            line = content.slice(lastIndex, hasCarriageReturn ? nextIndex - 1 : nextIndex);
            nextIndex++; // Skip \n
        }
        if (fenceMarker) {
            segments.push(line);
            if (isClosingFenceLine(line, fenceMarker)) {
                fenceMarker = null;
            }
        }
        else {
            const match = REGEX.FENCE_START.exec(line);
            const newMarker = match ? (match[1] ?? '```') : null;
            if (!newMarker) {
                buffer.push(line);
            }
            else {
                // Flush the text collected so far before the code block starts.
                if (buffer.length > 0) {
                    segments.push(processTextBuffer(buffer, options));
                    buffer = [];
                }
                segments.push(line);
                fenceMarker = newMarker;
            }
        }
        lastIndex = nextIndex;
    }
    if (buffer.length > 0) {
        segments.push(processTextBuffer(buffer, options));
    }
    return segments.join('\n').trim();
}
/**
 * Checks whether `line` closes a code block opened with `fenceMarker`.
 * A closing fence uses the same character as the opening fence, is at least as
 * long, and is followed only by whitespace; a longer run is therefore accepted.
 */
function isClosingFenceLine(line, fenceMarker) {
    const text = line.trimStart();
    const fenceChar = fenceMarker.charAt(0);
    let runLength = 0;
    while (text.charAt(runLength) === fenceChar) {
        runLength += 1;
    }
    return runLength >= fenceMarker.length && text.slice(runLength).trim() === '';
}
|
|
1173
|
+
/**
 * Locates a YAML frontmatter block at the very start of `content`.
 *
 * The block must open with a `---` line (LF or CRLF) and close with a `---`
 * line using the same line ending. The closing fence must sit at the start of
 * a line, so a value that merely ends in `---` does not terminate the block.
 * A closing `---` that is the last line of the content without a trailing
 * newline is accepted as well.
 *
 * @param {string} content - Document text.
 * @returns {{ start: number, end: number, linesStart: number, linesEnd: number, lineEnding: string } | null}
 *   Offsets of the block (`linesStart`..`linesEnd` is the body between the
 *   fences, `end` is just past the closing fence line, clamped to the content
 *   length) and the line ending in use, or null when there is no frontmatter.
 */
function detectFrontmatter(content) {
    const len = content.length;
    if (len < 4)
        return null;
    let lineEnding = null;
    if (content.startsWith('---\n')) {
        lineEnding = '\n';
    }
    else if (content.startsWith('---\r\n')) {
        lineEnding = '\r\n';
    }
    if (!lineEnding)
        return null;
    const fenceLen = 3 + lineEnding.length;
    const fence = `---${lineEnding}`;
    const NEWLINE = 10;
    // Skip occurrences that are not preceded by a newline (i.e. mid-line dashes).
    let closeIndex = content.indexOf(fence, fenceLen);
    while (closeIndex !== -1 && content.charCodeAt(closeIndex - 1) !== NEWLINE) {
        closeIndex = content.indexOf(fence, closeIndex + 1);
    }
    if (closeIndex === -1) {
        // Closing fence as the final line, without a trailing line ending.
        const tailIndex = len - 3;
        const closesAtEnd = tailIndex >= fenceLen &&
            content.endsWith('---') &&
            content.charCodeAt(tailIndex - 1) === NEWLINE;
        if (!closesAtEnd)
            return null;
        closeIndex = tailIndex;
    }
    return {
        start: 0,
        end: Math.min(len, closeIndex + fenceLen),
        linesStart: fenceLen,
        linesEnd: closeIndex,
        lineEnding,
    };
}
|
|
1201
|
+
/**
 * Splits a frontmatter line at its first `:` into a lowercased key and a
 * trimmed value.
 *
 * @param {string} line - One line of the frontmatter body.
 * @returns {{ key: string, value: string } | null} The entry, or null when the
 *   line is blank, has no colon, or starts with a colon.
 */
function parseFrontmatterEntry(line) {
    const text = line.trim();
    const separator = text.indexOf(':');
    // Covers blank lines too: indexOf yields -1 for them.
    if (separator < 1) {
        return null;
    }
    const key = text.slice(0, separator).trim().toLowerCase();
    const value = text.slice(separator + 1).trim();
    return { key, value };
}
|
|
1211
|
+
/**
 * Removes one pair of matching surrounding quotes (`"` or `'`) from a
 * frontmatter value and trims what is left. Values without matching quotes are
 * returned unchanged.
 *
 * @param {string} val - Frontmatter value.
 * @returns {string} The unquoted value.
 */
function stripFrontmatterQuotes(val) {
    const isWrappedIn = (quote) => val.startsWith(quote) && val.endsWith(quote);
    const isQuoted = isWrappedIn('"') || isWrappedIn("'");
    return isQuoted ? val.slice(1, -1).trim() : val;
}
|
|
1219
|
+
/**
 * Searches the frontmatter body for the first `title` or `name` entry with a
 * non-empty value (after quote stripping).
 *
 * @param {string} content - Full document text.
 * @param {{ linesStart: number, linesEnd: number, lineEnding: string }} fm -
 *   Frontmatter location as returned by `detectFrontmatter`.
 * @returns {string | undefined} The title, or undefined when none is found.
 */
function scanFrontmatterForTitle(content, fm) {
    const body = content.slice(fm.linesStart, fm.linesEnd);
    for (const rawLine of body.split(fm.lineEnding)) {
        const entry = parseFrontmatterEntry(rawLine);
        if (!entry || (entry.key !== 'title' && entry.key !== 'name')) {
            continue;
        }
        const cleaned = stripFrontmatterQuotes(entry.value);
        if (cleaned) {
            return cleaned;
        }
    }
    return undefined;
}
|
|
1239
|
+
/**
 * Reads a title from the first non-blank line of the document.
 *
 * Only lines that begin within the first 5000 characters are considered. The
 * first non-blank line decides the outcome: a heading yields its text with the
 * marker removed, anything else yields undefined.
 *
 * @param {string} content - Document text.
 * @returns {string | undefined} The heading text, or undefined.
 */
function scanBodyForTitle(content) {
    const total = content.length;
    const scanLimit = Math.min(total, 5000);
    for (let lineStart = 0; lineStart < scanLimit;) {
        const newlineAt = content.indexOf('\n', lineStart);
        const lineStop = newlineAt === -1 ? total : newlineAt;
        // trim() also removes the '\r' left over from a CRLF line ending.
        const text = content.slice(lineStart, lineStop).trim();
        if (text !== '') {
            if (!REGEX.HEADING_STRICT.test(text)) {
                return undefined;
            }
            const heading = text.replace(REGEX.HEADING_MARKER, '').trim();
            return heading || undefined;
        }
        lineStart = lineStop + 1;
    }
    return undefined;
}
|
|
1262
|
+
/**
 * Extracts a document title from raw markdown: a frontmatter `title`/`name`
 * entry when available, otherwise the leading heading of the body.
 *
 * @param {string} content - Raw markdown text.
 * @returns {string | undefined} The title, or undefined when none is found.
 */
export function extractTitleFromRawMarkdown(content) {
    const frontmatter = detectFrontmatter(content);
    const frontmatterTitle = frontmatter
        ? scanFrontmatterForTitle(content, frontmatter)
        : undefined;
    return frontmatterTitle || scanBodyForTitle(content);
}
|
|
1271
|
+
/**
 * Records the source URL in a markdown document.
 *
 * - Markdown metadata format, no frontmatter: unless a line starting with
 *   `source:` already exists, a `Source: <url>` line is inserted after the
 *   first heading line (any level), or prepended when there is no heading.
 * - Otherwise, no frontmatter: a new frontmatter block with a quoted `source`
 *   entry is prepended.
 * - Existing frontmatter: a quoted `source` entry is appended to it unless one
 *   is already present.
 *
 * The URL is written as a YAML double-quoted scalar, so both backslashes and
 * double quotes are escaped (backslashes first, to avoid re-escaping the
 * escapes added for quotes).
 *
 * @param {string} content - Markdown text.
 * @param {string} url - Source URL to record.
 * @returns {string} The content with the source recorded, or unchanged when it already is.
 */
export function addSourceToMarkdown(content, url) {
    const fm = detectFrontmatter(content);
    const useMarkdownFormat = config.transform.metadataFormat === 'markdown';
    if (useMarkdownFormat && !fm) {
        if (REGEX.SOURCE_KEY.test(content))
            return content;
        const lineEnding = getLineEnding(content);
        const firstHeadingMatch = REGEX.HEADING_MARKER.exec(content);
        if (firstHeadingMatch) {
            const headingIndex = firstHeadingMatch.index;
            const lineEndIndex = content.indexOf(lineEnding, headingIndex);
            const insertPos = lineEndIndex === -1 ? content.length : lineEndIndex + lineEnding.length;
            const injection = `${lineEnding}Source: ${url}${lineEnding}`;
            return content.slice(0, insertPos) + injection + content.slice(insertPos);
        }
        return `Source: ${url}${lineEnding}${lineEnding}${content}`;
    }
    const quoteForYaml = (value) => value.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
    if (!fm) {
        const lineEnding = getLineEnding(content);
        const escapedUrl = quoteForYaml(url);
        return `---${lineEnding}source: "${escapedUrl}"${lineEnding}---${lineEnding}${lineEnding}${content}`;
    }
    const fmBody = content.slice(fm.linesStart, fm.linesEnd);
    if (REGEX.SOURCE_KEY.test(fmBody))
        return content;
    const escapedUrl = quoteForYaml(url);
    // Insert right before the closing fence, using the frontmatter's own line ending.
    const injection = `source: "${escapedUrl}"${fm.lineEnding}`;
    return content.slice(0, fm.linesEnd) + injection + content.slice(fm.linesEnd);
}
|
|
1300
|
+
/**
 * Counts occurrences of common HTML opening tags (html, head, body, div, span,
 * script, style, meta, link; case-insensitive), stopping as soon as the count
 * exceeds `limit`.
 *
 * @param {string} content - Text to scan.
 * @param {number} limit - Threshold; counting stops at `limit + 1`.
 * @returns {number} The number of tags seen, at most `limit + 1`; 0 when `limit` is not positive.
 */
function countCommonTags(content, limit) {
    if (limit <= 0) {
        return 0;
    }
    const tagPattern = /<(html|head|body|div|span|script|style|meta|link)\b/gi;
    let seen = 0;
    // matchAll is lazy, so breaking out avoids scanning the rest of the text.
    for (const _hit of content.matchAll(tagPattern)) {
        seen += 1;
        if (seen > limit) {
            break;
        }
    }
    return seen;
}
|
|
1312
|
+
/**
 * Heuristically decides whether `content` is raw text/markdown rather than HTML.
 *
 * A document starting with `<!doctype` or `<html` is never raw text; one with
 * frontmatter always is. Otherwise, more than five common HTML tags rule it
 * out, and the content must show a markdown heading, a list marker, or a
 * triple-backtick fence.
 *
 * @param {string} content - Fetched body text.
 * @returns {boolean} True when the content looks like raw markdown/text.
 */
export function isRawTextContent(content) {
    const trimmed = content.trim();
    if (REGEX.HTML_DOC_START.test(trimmed)) {
        return false;
    }
    if (detectFrontmatter(trimmed) !== null) {
        return true;
    }
    const maxTags = 5;
    if (countCommonTags(content, maxTags) > maxTags) {
        return false;
    }
    const hasHeading = REGEX.HEADING_MARKER.test(content);
    if (hasHeading) {
        return true;
    }
    const hasListItem = REGEX.LIST_MARKER.test(content);
    return hasListItem ? true : content.includes('```');
}
|
|
1325
|
+
/**
 * Formats a fetch timestamp as a numeric date (2-digit day and month, numeric
 * year) using the configured locale.
 *
 * @param {string} value - Timestamp accepted by the `Date` constructor.
 * @returns {string} The formatted date, or `value` unchanged when it cannot be parsed.
 */
function formatFetchedAt(value) {
    const parsed = new Date(value);
    const isValidDate = !Number.isNaN(parsed.getTime());
    if (!isValidDate) {
        return value;
    }
    const dateParts = {
        day: '2-digit',
        month: '2-digit',
        year: 'numeric',
    };
    return new Intl.DateTimeFormat(config.i18n.locale, dateParts).format(parsed);
}
|
|
1336
|
+
/**
 * Builds the markdown footer appended to fetched content.
 *
 * The footer starts with a `---` rule and a blank line. When available, an
 * italic title, italic author, an "Original Source" link and the italic fetch
 * date follow on one line joined by ` | `, then the description wrapped in
 * `<sub>`.
 *
 * @param {{ title?: string, author?: string, url?: string, fetchedAt?: string, description?: string } | null | undefined} metadata -
 *   Page metadata.
 * @param {string} [fallbackUrl] - URL used when `metadata.url` is empty.
 * @returns {string} The footer text, or `''` when there is no metadata.
 */
export function buildMetadataFooter(metadata, fallbackUrl) {
    if (!metadata) {
        return '';
    }
    const sourceUrl = metadata.url || fallbackUrl;
    const italic = (text) => `_${text}_`;
    // Falsy placeholders mark absent fields and are filtered out below.
    const parts = [
        metadata.title ? italic(metadata.title) : null,
        metadata.author ? italic(metadata.author) : null,
        sourceUrl ? `[_Original Source_](${sourceUrl})` : null,
        metadata.fetchedAt ? italic(formatFetchedAt(metadata.fetchedAt)) : null,
    ].filter((part) => part !== null);
    const footerLines = ['---', ''];
    if (parts.length > 0) {
        footerLines.push(` ${parts.join(' | ')}`);
    }
    if (metadata.description) {
        footerLines.push(` <sub>${metadata.description}</sub>`);
    }
    return footerLines.join('\n');
}
|