@j0hanz/superfetch 2.5.2 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +356 -223
- package/dist/assets/logo.svg +24837 -24835
- package/dist/cache.d.ts +28 -20
- package/dist/cache.js +292 -514
- package/dist/config.d.ts +41 -7
- package/dist/config.js +298 -148
- package/dist/crypto.js +25 -12
- package/dist/dom-noise-removal.js +379 -421
- package/dist/errors.d.ts +2 -2
- package/dist/errors.js +25 -8
- package/dist/fetch.d.ts +18 -16
- package/dist/fetch.js +1132 -526
- package/dist/host-normalization.js +40 -10
- package/dist/http-native.js +628 -287
- package/dist/index.js +67 -7
- package/dist/instructions.md +44 -30
- package/dist/ip-blocklist.d.ts +8 -0
- package/dist/ip-blocklist.js +65 -0
- package/dist/json.js +14 -9
- package/dist/language-detection.d.ts +2 -11
- package/dist/language-detection.js +289 -280
- package/dist/markdown-cleanup.d.ts +0 -1
- package/dist/markdown-cleanup.js +391 -429
- package/dist/mcp-validator.js +4 -2
- package/dist/mcp.js +184 -135
- package/dist/observability.js +89 -21
- package/dist/resources.js +16 -6
- package/dist/server-tuning.d.ts +2 -0
- package/dist/server-tuning.js +25 -23
- package/dist/session.d.ts +1 -0
- package/dist/session.js +41 -33
- package/dist/tasks.d.ts +2 -0
- package/dist/tasks.js +91 -9
- package/dist/timer-utils.d.ts +5 -0
- package/dist/timer-utils.js +20 -0
- package/dist/tools.d.ts +28 -5
- package/dist/tools.js +317 -183
- package/dist/transform-types.d.ts +5 -1
- package/dist/transform.d.ts +3 -2
- package/dist/transform.js +1138 -421
- package/dist/type-guards.d.ts +1 -0
- package/dist/type-guards.js +7 -0
- package/dist/workers/transform-child.d.ts +1 -0
- package/dist/workers/transform-child.js +118 -0
- package/dist/workers/transform-worker.js +87 -78
- package/package.json +21 -13
|
@@ -1,117 +1,32 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* DOM noise removal utilities for cleaning HTML before markdown conversion.
|
|
3
|
-
* Removes navigation, ads, popups, and other non-content elements.
|
|
4
|
-
*/
|
|
5
1
|
import { parseHTML } from 'linkedom';
|
|
6
2
|
import { config } from './config.js';
|
|
7
|
-
import {
|
|
8
|
-
|
|
9
|
-
* DOM guards & small helpers
|
|
10
|
-
* ------------------------------------------------------------------------------------------------- */
|
|
11
|
-
function isElement(node) {
|
|
12
|
-
return (isObject(node) &&
|
|
13
|
-
'getAttribute' in node &&
|
|
14
|
-
typeof node.getAttribute === 'function');
|
|
15
|
-
}
|
|
16
|
-
function isNodeListLike(value) {
|
|
17
|
-
return (isObject(value) &&
|
|
18
|
-
typeof value.length === 'number');
|
|
19
|
-
}
|
|
20
|
-
function getNodeListItem(nodes, index) {
|
|
21
|
-
if ('item' in nodes && typeof nodes.item === 'function') {
|
|
22
|
-
return nodes.item(index);
|
|
23
|
-
}
|
|
24
|
-
return nodes[index] ?? null;
|
|
25
|
-
}
|
|
26
|
-
/**
|
|
27
|
-
* Remove nodes from a list/iterable.
|
|
28
|
-
* - For NodeList-like collections we iterate backwards to be safe with live collections.
|
|
29
|
-
* - For iterables we snapshot into an array first.
|
|
30
|
-
*/
|
|
31
|
-
function removeNodes(nodes, shouldRemove) {
|
|
32
|
-
if (isNodeListLike(nodes)) {
|
|
33
|
-
for (let i = nodes.length - 1; i >= 0; i -= 1) {
|
|
34
|
-
const node = getNodeListItem(nodes, i);
|
|
35
|
-
if (node && shouldRemove(node))
|
|
36
|
-
node.remove();
|
|
37
|
-
}
|
|
38
|
-
return;
|
|
39
|
-
}
|
|
40
|
-
for (const node of nodes) {
|
|
41
|
-
if (shouldRemove(node))
|
|
42
|
-
node.remove();
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
/* -------------------------------------------------------------------------------------------------
|
|
46
|
-
* Fast-path parsing heuristics
|
|
47
|
-
* ------------------------------------------------------------------------------------------------- */
|
|
48
|
-
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
49
|
-
function isFullDocumentHtml(html) {
|
|
50
|
-
return HTML_DOCUMENT_MARKERS.test(html);
|
|
51
|
-
}
|
|
3
|
+
import { logDebug } from './observability.js';
|
|
4
|
+
// --- Constants & Pre-compiled Regex ---
|
|
52
5
|
const NOISE_SCAN_LIMIT = 50_000;
|
|
53
|
-
const
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
'
|
|
62
|
-
'
|
|
63
|
-
'
|
|
64
|
-
'
|
|
65
|
-
'<textarea',
|
|
66
|
-
'<svg',
|
|
67
|
-
'<canvas',
|
|
68
|
-
' aria-hidden="true"',
|
|
69
|
-
" aria-hidden='true'",
|
|
70
|
-
' hidden',
|
|
71
|
-
' role="navigation"',
|
|
72
|
-
" role='navigation'",
|
|
73
|
-
' role="banner"',
|
|
74
|
-
" role='banner'",
|
|
75
|
-
' role="complementary"',
|
|
76
|
-
" role='complementary'",
|
|
77
|
-
' role="contentinfo"',
|
|
78
|
-
" role='contentinfo'",
|
|
79
|
-
' role="tree"',
|
|
80
|
-
" role='tree'",
|
|
81
|
-
' role="menubar"',
|
|
82
|
-
" role='menubar'",
|
|
83
|
-
' role="menu"',
|
|
84
|
-
" role='menu'",
|
|
85
|
-
' banner',
|
|
86
|
-
' promo',
|
|
87
|
-
' announcement',
|
|
88
|
-
' cta',
|
|
89
|
-
' advert',
|
|
90
|
-
' newsletter',
|
|
91
|
-
' subscribe',
|
|
92
|
-
' cookie',
|
|
93
|
-
' consent',
|
|
94
|
-
' popup',
|
|
95
|
-
' modal',
|
|
96
|
-
' overlay',
|
|
97
|
-
' toast',
|
|
98
|
-
' fixed',
|
|
99
|
-
' sticky',
|
|
100
|
-
' z-50',
|
|
101
|
-
' z-4',
|
|
102
|
-
' isolate',
|
|
103
|
-
' breadcrumb',
|
|
104
|
-
' pagination',
|
|
6
|
+
const MIN_BODY_CONTENT_LENGTH = 100;
|
|
7
|
+
const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
8
|
+
const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
|
|
9
|
+
// Merged markers for fast rejection
|
|
10
|
+
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
|
|
11
|
+
// Split into smaller regexes to stay within sonarjs/regex-complexity limit
|
|
12
|
+
const NOISE_PATTERNS = [
|
|
13
|
+
/<\s*(?:script|style|noscript|iframe|nav|footer|header|form|button|input|select|textarea|svg|canvas)\b/i,
|
|
14
|
+
/[\s"']role\s*=\s*['"]?(?:navigation|banner|complementary|contentinfo|tree|menubar|menu)['"]?/i,
|
|
15
|
+
/[\s"'](?:aria-hidden\s*=\s*['"]?true['"]?|hidden)/i,
|
|
16
|
+
/[\s"'](?:banner|promo|announcement|cta|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast)\b/i,
|
|
17
|
+
/[\s"'](?:fixed|sticky|z-50|z-4|isolate|breadcrumb|pagination)\b/i,
|
|
105
18
|
];
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
19
|
+
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
20
|
+
const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50)|isolate)\b/;
|
|
21
|
+
const SKIP_URL_PREFIXES = [
|
|
22
|
+
'#',
|
|
23
|
+
'java' + 'script:',
|
|
24
|
+
'mailto:',
|
|
25
|
+
'tel:',
|
|
26
|
+
'data:',
|
|
27
|
+
'blob:',
|
|
28
|
+
];
|
|
29
|
+
const BASE_STRUCTURAL_TAGS = new Set([
|
|
115
30
|
'script',
|
|
116
31
|
'style',
|
|
117
32
|
'noscript',
|
|
@@ -121,41 +36,8 @@ const STRUCTURAL_TAGS = new Set([
|
|
|
121
36
|
'input',
|
|
122
37
|
'select',
|
|
123
38
|
'textarea',
|
|
124
|
-
'svg',
|
|
125
|
-
'canvas',
|
|
126
39
|
]);
|
|
127
40
|
const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer']);
|
|
128
|
-
const BASE_NOISE_SELECTORS = [
|
|
129
|
-
'nav',
|
|
130
|
-
'footer',
|
|
131
|
-
'header[class*="site"]',
|
|
132
|
-
'header[class*="nav"]',
|
|
133
|
-
'header[class*="menu"]',
|
|
134
|
-
'[role="banner"]',
|
|
135
|
-
'[role="navigation"]',
|
|
136
|
-
'[role="dialog"]',
|
|
137
|
-
'[style*="display: none"]',
|
|
138
|
-
'[style*="display:none"]',
|
|
139
|
-
'[hidden]',
|
|
140
|
-
'[aria-hidden="true"]',
|
|
141
|
-
];
|
|
142
|
-
const BASE_NOISE_SELECTOR = BASE_NOISE_SELECTORS.join(',');
|
|
143
|
-
const CANDIDATE_NOISE_SELECTOR = [
|
|
144
|
-
...STRUCTURAL_TAGS,
|
|
145
|
-
...ALWAYS_NOISE_TAGS,
|
|
146
|
-
'aside',
|
|
147
|
-
'header',
|
|
148
|
-
'[class]',
|
|
149
|
-
'[id]',
|
|
150
|
-
'[role]',
|
|
151
|
-
'[style]',
|
|
152
|
-
].join(',');
|
|
153
|
-
function buildNoiseSelector(extraSelectors) {
|
|
154
|
-
const extra = extraSelectors.filter((s) => s.trim().length > 0);
|
|
155
|
-
return extra.length === 0
|
|
156
|
-
? BASE_NOISE_SELECTOR
|
|
157
|
-
: `${BASE_NOISE_SELECTOR},${extra.join(',')}`;
|
|
158
|
-
}
|
|
159
41
|
const NAVIGATION_ROLES = new Set([
|
|
160
42
|
'navigation',
|
|
161
43
|
'banner',
|
|
@@ -182,334 +64,410 @@ const INTERACTIVE_CONTENT_ROLES = new Set([
|
|
|
182
64
|
'tooltip',
|
|
183
65
|
'alert',
|
|
184
66
|
]);
|
|
185
|
-
const
|
|
67
|
+
const PROMO_TOKENS_ALWAYS = [
|
|
186
68
|
'banner',
|
|
187
69
|
'promo',
|
|
188
70
|
'announcement',
|
|
189
71
|
'cta',
|
|
190
72
|
'advert',
|
|
191
|
-
'ad',
|
|
192
73
|
'ads',
|
|
193
74
|
'sponsor',
|
|
194
|
-
'newsletter',
|
|
195
|
-
'subscribe',
|
|
196
|
-
'cookie',
|
|
197
|
-
'consent',
|
|
198
|
-
'popup',
|
|
199
|
-
'modal',
|
|
200
|
-
'overlay',
|
|
201
|
-
'toast',
|
|
202
|
-
'share',
|
|
203
|
-
'social',
|
|
204
|
-
'related',
|
|
205
75
|
'recommend',
|
|
206
|
-
'comment',
|
|
207
76
|
'breadcrumb',
|
|
208
77
|
'pagination',
|
|
209
78
|
'pager',
|
|
210
79
|
'taglist',
|
|
211
80
|
];
|
|
212
|
-
const
|
|
213
|
-
const
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
return
|
|
81
|
+
const PROMO_TOKENS_AGGRESSIVE = ['ad', 'related', 'comment'];
|
|
82
|
+
const PROMO_TOKENS_BY_CATEGORY = {
|
|
83
|
+
'cookie-banners': ['cookie', 'consent', 'popup', 'modal', 'overlay', 'toast'],
|
|
84
|
+
newsletters: ['newsletter', 'subscribe'],
|
|
85
|
+
'social-share': ['share', 'social'],
|
|
86
|
+
};
|
|
87
|
+
const BASE_NOISE_SELECTORS = {
|
|
88
|
+
navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
|
|
89
|
+
cookieBanners: '[role="dialog"]',
|
|
90
|
+
hidden: '[style*="display: none"],[style*="display:none"],[hidden],[aria-hidden="true"]',
|
|
91
|
+
};
|
|
92
|
+
const NO_MATCH_REGEX = /a^/i;
|
|
93
|
+
// --- State Cache ---
|
|
94
|
+
let cachedContext;
|
|
95
|
+
let lastConfigRef;
|
|
96
|
+
// --- Helpers Inlined/Optimized ---
|
|
97
|
+
/**
 * Escape regular-expression metacharacters so a string can be embedded in a
 * RegExp source and matched literally.
 *
 * @param {string} value - Raw text that should be matched verbatim.
 * @returns {string} `value` with every metacharacter prefixed by a backslash.
 */
function escapeRegexLiteral(value) {
    return value.replace(/[.*+?^${}()|[\]\\]/g, (metaChar) => `\\${metaChar}`);
}
|
|
100
|
+
/**
 * Compile a set of tokens into one case-insensitive matcher.
 *
 * A token only matches when it is delimited on both sides by the start/end of
 * the string or by a character that is not a letter or digit, so "ad" matches
 * "top-ad" but not "header".
 *
 * @param {Set<string>} tokens - Tokens to match; each is escaped before use.
 * @returns {RegExp} The combined matcher, or NO_MATCH_REGEX when the set is empty.
 */
function buildTokenRegex(tokens) {
    if (tokens.size === 0) {
        return NO_MATCH_REGEX;
    }
    const alternation = Array.from(tokens, (token) => escapeRegexLiteral(token)).join('|');
    const source = `(?:^|[^a-z0-9])(?:${alternation})(?:$|[^a-z0-9])`;
    return new RegExp(source, 'i');
}
|
|
105
|
+
/**
 * Build the promo/advert token matchers for the current configuration.
 *
 * The base matcher covers PROMO_TOKENS_ALWAYS, the tokens of every enabled
 * category, and the user's extra tokens (lower-cased, trimmed, blanks dropped).
 * The aggressive matcher covers PROMO_TOKENS_AGGRESSIVE only when
 * `aggressiveMode` is on; otherwise it is built from an empty set.
 *
 * @param {object} currentConfig - Noise-removal config; reads `aggressiveMode` and `extraTokens`.
 * @param {object} flags - Category switches; reads `cookieBanners`, `newsletters`, `socialShare`.
 * @returns {{ base: RegExp, aggressive: RegExp }} Matchers produced by buildTokenRegex.
 */
function getPromoMatchers(currentConfig, flags) {
    const aggressiveTokens = new Set(currentConfig.aggressiveMode ? PROMO_TOKENS_AGGRESSIVE : []);
    // Order matters: it fixes the alternation order of the generated regex.
    const categoryTokens = [
        [flags.cookieBanners, 'cookie-banners'],
        [flags.newsletters, 'newsletters'],
        [flags.socialShare, 'social-share'],
    ].flatMap(([isOn, category]) => (isOn ? PROMO_TOKENS_BY_CATEGORY[category] : []));
    const customTokens = Array.from(currentConfig.extraTokens, (rawToken) => rawToken.toLowerCase().trim()).filter(Boolean);
    const baseTokens = new Set([
        ...PROMO_TOKENS_ALWAYS,
        ...categoryTokens,
        ...customTokens,
    ]);
    return {
        base: buildTokenRegex(baseTokens),
        aggressive: buildTokenRegex(aggressiveTokens),
    };
}
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
131
|
+
/**
 * Return the derived noise-removal context, rebuilding it only when
 * `config.noiseRemoval` is a different object than the one used last time.
 *
 * The context bundles the category flags, the structural tag set (with `svg`
 * and `canvas` added unless `preserveSvgCanvas` is set), scoring weights, promo
 * matchers, trimmed extra selectors, and the pre-joined base and candidate
 * selector strings.
 *
 * @returns {object} The cached or freshly built context.
 */
function getContext() {
    const currentConfig = config.noiseRemoval;
    const cacheIsFresh = Boolean(cachedContext) && lastConfigRef === currentConfig;
    if (cacheIsFresh) {
        return cachedContext;
    }
    const normalizeCategory = (rawCategory) => {
        const lowered = rawCategory.toLowerCase().trim();
        const { locale } = config.i18n;
        if (!locale) {
            return lowered;
        }
        return lowered.toLocaleLowerCase(locale);
    };
    const enabled = new Set(currentConfig.enabledCategories
        .map((rawCategory) => normalizeCategory(rawCategory))
        .filter(Boolean));
    const flags = {
        navFooter: enabled.has('nav-footer'),
        cookieBanners: enabled.has('cookie-banners'),
        newsletters: enabled.has('newsletters'),
        socialShare: enabled.has('social-share'),
    };
    // Copy the shared set so per-config additions never leak into the constant.
    const structuralTags = new Set(currentConfig.preserveSvgCanvas
        ? BASE_STRUCTURAL_TAGS
        : [...BASE_STRUCTURAL_TAGS, 'svg', 'canvas']);
    const promoMatchers = getPromoMatchers(currentConfig, flags);
    const extraSelectors = currentConfig.extraSelectors
        .map((selector) => selector.trim())
        .filter((selector) => selector.length > 0);
    const baseSelector = [
        BASE_NOISE_SELECTORS.hidden,
        ...(flags.navFooter ? [BASE_NOISE_SELECTORS.navFooter] : []),
        ...(flags.cookieBanners ? [BASE_NOISE_SELECTORS.cookieBanners] : []),
    ].join(',');
    const candidateParts = [...structuralTags, ...ALWAYS_NOISE_TAGS];
    candidateParts.push('aside', 'header', '[class]', '[id]', '[role]', '[style]');
    const candidateSelector = candidateParts.join(',');
    const promoEnabled = flags.cookieBanners || flags.newsletters || flags.socialShare;
    cachedContext = {
        flags,
        structuralTags,
        weights: currentConfig.weights,
        promoMatchers,
        promoEnabled,
        extraSelectors,
        baseSelector,
        candidateSelector,
    };
    lastConfigRef = currentConfig;
    return cachedContext;
}
|
|
189
|
+
// --- Hot Path Logic ---
|
|
190
|
+
/**
 * Decide whether an element looks like part of an interactive UI widget.
 *
 * It is interactive when its role is in INTERACTIVE_CONTENT_ROLES, its
 * `data-state` is "inactive" or "closed", its `data-orientation` is
 * "horizontal" or "vertical", or it carries `data-accordion-item` or
 * `data-radix-collection-item`.
 *
 * @param {Element} element - Element whose data attributes are inspected.
 * @param {string|null} role - The element's `role` attribute value, if any.
 * @returns {boolean} True when any interactive marker is present.
 */
function isInteractive(element, role) {
    if (role && INTERACTIVE_CONTENT_ROLES.has(role)) {
        return true;
    }
    const state = element.getAttribute('data-state');
    if (['inactive', 'closed'].includes(state)) {
        return true;
    }
    const orientation = element.getAttribute('data-orientation');
    if (['horizontal', 'vertical'].includes(orientation)) {
        return true;
    }
    return ['data-accordion-item', 'data-radix-collection-item'].some((marker) => element.hasAttribute(marker));
}
|
|
202
|
+
/**
 * Check whether an element, or any of its ancestors, marks primary content:
 * an `<article>`, a `<main>`, or anything with `role="main"`.
 *
 * @param {Element|null} element - Starting element; the walk follows `parentElement`.
 * @returns {boolean} True when a primary-content container is found on the way up.
 */
function isWithinPrimaryContent(element) {
    for (let ancestor = element; ancestor; ancestor = ancestor.parentElement) {
        const tag = ancestor.tagName.toLowerCase();
        const isPrimary = tag === 'article' ||
            tag === 'main' ||
            ancestor.getAttribute('role') === 'main';
        if (isPrimary) {
            return true;
        }
    }
    return false;
}
|
|
214
|
+
/**
 * Decide whether an element that matched a noise selector should be kept.
 *
 * Dialogs (`role="dialog"` / `"alertdialog"`) are kept when they sit inside
 * primary content, when their raw text is longer than
 * DIALOG_MIN_CHARS_FOR_PRESERVATION, or when they contain a heading. The dialog
 * rule is evaluated first and is final, whatever the tag name is.
 *
 * `<nav>` / `<footer>` elements are kept when they wrap an article, main,
 * section or `role="main"` element, or when their trimmed text reaches
 * NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION.
 *
 * @param {Element} element - Element under consideration.
 * @param {string} tagName - Lower-cased tag name of `element`.
 * @returns {boolean} True to keep the element, false to allow removal.
 */
function shouldPreserve(element, tagName) {
    const role = element.getAttribute('role');
    const isDialog = role === 'dialog' || role === 'alertdialog';
    if (isDialog) {
        if (isWithinPrimaryContent(element)) {
            return true;
        }
        const rawLength = (element.textContent || '').length;
        return (rawLength > DIALOG_MIN_CHARS_FOR_PRESERVATION ||
            element.querySelector('h1,h2,h3,h4,h5,h6') !== null);
    }
    const isNavOrFooter = tagName === 'nav' || tagName === 'footer';
    if (!isNavOrFooter) {
        return false;
    }
    if (element.querySelector('article,main,section,[role="main"]')) {
        return true;
    }
    const visibleText = (element.textContent || '').trim();
    return visibleText.length >= NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION;
}
|
|
234
|
+
/**
 * Remove every still-attached node in the collection unless shouldPreserve
 * says it must stay. The collection is walked from last to first.
 *
 * @param {ArrayLike<Element>} nodes - Indexed collection of elements (e.g. a NodeList).
 * @returns {void}
 */
function removeNodes(nodes) {
    let index = nodes.length;
    while (index-- > 0) {
        const candidate = nodes[index];
        // Missing entries and nodes already detached by an earlier removal are skipped.
        if (!candidate?.parentNode) {
            continue;
        }
        if (shouldPreserve(candidate, candidate.tagName.toLowerCase())) {
            continue;
        }
        candidate.remove();
    }
}
|
|
242
|
+
/**
 * Score how strongly an element looks like navigation / header / footer chrome.
 *
 * `weights.structural` is added once for each rule that applies:
 *  - the tag is in ALWAYS_NOISE_TAGS;
 *  - the tag is `header` and it has a navigation role or its class/id matches
 *    HEADER_NOISE_PATTERN;
 *  - the role is in NAVIGATION_ROLES, except for `<aside role="complementary">`.
 *
 * @param {string} tagName - Lower-cased tag name.
 * @param {string|null} role - Value of the `role` attribute.
 * @param {string} className - Value of the `class` attribute ('' when absent).
 * @param {string} id - Value of the `id` attribute ('' when absent).
 * @param {{ structural: number }} weights - Scoring weights.
 * @returns {number} Accumulated score.
 */
function scoreNavFooter(tagName, role, className, id, weights) {
    const hasNavigationRole = Boolean(role) && NAVIGATION_ROLES.has(role);
    let total = 0;
    if (ALWAYS_NOISE_TAGS.has(tagName)) {
        total += weights.structural;
    }
    const isBoilerplateHeader = tagName === 'header' &&
        (hasNavigationRole || HEADER_NOISE_PATTERN.test(`${className} ${id}`));
    if (isBoilerplateHeader) {
        total += weights.structural;
    }
    const isComplementaryAside = tagName === 'aside' && role === 'complementary';
    if (hasNavigationRole && !isComplementaryAside) {
        total += weights.structural;
    }
    return total;
}
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
261
|
+
/**
 * Score an element against the noise heuristics and report whether the total
 * reaches `weights.threshold`.
 *
 * Contributions, added in this order:
 *  - structural: tag is in `context.structuralTags` and the element is not interactive;
 *  - nav/footer: result of scoreNavFooter, only when the nav-footer flag is on;
 *  - hidden: `hidden`, `aria-hidden="true"` or an inline display:none /
 *    visibility:hidden style, and the element is not interactive;
 *  - sticky/fixed: class matches FIXED_OR_HIGH_Z_PATTERN;
 *  - promo: only when `context.promoEnabled`. An aggressive-token hit counts
 *    only outside primary content; base tokens are consulted only when no
 *    aggressive token matched.
 *
 * @param {Element} element - Candidate element.
 * @param {object} context - Context produced by getContext.
 * @returns {boolean} True when the element should be treated as noise.
 */
function isNoiseElement(element, context) {
    const { weights, structuralTags, flags, promoMatchers } = context;
    const tagName = element.tagName.toLowerCase();
    const className = element.getAttribute('class') ?? '';
    const id = element.getAttribute('id') ?? '';
    const role = element.getAttribute('role');
    const interactive = isInteractive(element, role);
    const inlineStyle = element.getAttribute('style');
    const hidden = element.hasAttribute('hidden') ||
        element.getAttribute('aria-hidden') === 'true' ||
        (inlineStyle !== null &&
            /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i.test(inlineStyle));
    let total = 0;
    if (!interactive && structuralTags.has(tagName)) {
        total += weights.structural;
    }
    if (flags.navFooter) {
        total += scoreNavFooter(tagName, role, className, id, weights);
    }
    if (hidden && !interactive) {
        total += weights.hidden;
    }
    if (FIXED_OR_HIGH_Z_PATTERN.test(className)) {
        total += weights.stickyFixed;
    }
    if (context.promoEnabled) {
        const { aggressive, base } = promoMatchers;
        const hitsAggressive = aggressive.test(className) || aggressive.test(id);
        // An aggressive hit suppresses the base check entirely.
        const countsAsPromo = hitsAggressive
            ? !isWithinPrimaryContent(element)
            : base.test(className) || base.test(id);
        if (countsAsPromo) {
            total += weights.promo;
        }
    }
    return total >= weights.threshold;
}
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
if (tag === 'a')
|
|
390
|
-
this.resolveAnchor(element, base);
|
|
391
|
-
else if (tag === 'img')
|
|
392
|
-
this.resolveImage(element, base);
|
|
393
|
-
else if (tag === 'source')
|
|
394
|
-
this.resolveSource(element, base);
|
|
304
|
+
/**
 * Remove `<div>` descendants of a heading whose class contains "absolute",
 * whose inline style mentions "position", or whose tabindex is "-1".
 *
 * @param {Element} h - Heading element to clean in place.
 * @returns {void}
 */
function cleanHeadingWrapperDivs(h) {
    const wrappers = h.querySelectorAll('div');
    let index = wrappers.length;
    while (index-- > 0) {
        const wrapper = wrappers[index];
        if (!wrapper?.parentNode) {
            continue;
        }
        const classAttr = wrapper.getAttribute('class') ?? '';
        const styleAttr = wrapper.getAttribute('style') ?? '';
        const isOverlayLike = classAttr.includes('absolute') ||
            styleAttr.includes('position') ||
            wrapper.getAttribute('tabindex') === '-1';
        if (isOverlayLike) {
            wrapper.remove();
        }
    }
}
|
|
319
|
+
/**
 * Remove anchors inside a heading that point at a fragment (`href` starting
 * with "#") and have no text once whitespace and zero-width spaces are
 * stripped.
 *
 * @param {Element} h - Heading element to clean in place.
 * @returns {void}
 */
function cleanHeadingAnchors(h) {
    const links = h.querySelectorAll('a');
    let index = links.length;
    while (index-- > 0) {
        const link = links[index];
        if (!link?.parentNode) {
            continue;
        }
        const target = link.getAttribute('href') ?? '';
        const visibleText = (link.textContent || '').replace(/[\u200B\s]/g, '');
        const isEmptyFragmentLink = target.startsWith('#') && visibleText.length === 0;
        if (isEmptyFragmentLink) {
            link.remove();
        }
    }
}
|
|
332
|
+
/**
 * Strip zero-width spaces (U+200B) from every text node under a heading.
 *
 * @param {Element} h - Heading element whose text nodes are rewritten in place.
 * @param {Document} document - Owner document, used to create the tree walker.
 * @returns {void}
 */
function cleanHeadingZeroWidth(h, document) {
    const SHOW_TEXT = 4; // whatToShow mask for text nodes
    const textWalker = document.createTreeWalker(h, SHOW_TEXT);
    for (let textNode = textWalker.nextNode(); textNode; textNode = textWalker.nextNode()) {
        if (textNode.textContent?.includes('\u200B')) {
            textNode.textContent = textNode.textContent.replace(/\u200B/g, '');
        }
    }
}
|
|
341
|
+
/**
 * Run the heading clean-up passes (wrapper divs, empty fragment anchors,
 * zero-width spaces) on every attached h1–h6 element of the document.
 *
 * @param {Document} document - Document whose headings are cleaned in place.
 * @returns {void}
 */
function cleanHeadings(document) {
    const passes = [
        (heading) => cleanHeadingWrapperDivs(heading),
        (heading) => cleanHeadingAnchors(heading),
        (heading) => cleanHeadingZeroWidth(heading, document),
    ];
    for (const heading of document.querySelectorAll('h1,h2,h3,h4,h5,h6')) {
        if (heading.parentNode) {
            for (const runPass of passes) {
                runPass(heading);
            }
        }
    }
}
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
const
|
|
447
|
-
|
|
448
|
-
return bodyInner;
|
|
449
|
-
const toStringFn = this.getDocumentToString(document);
|
|
450
|
-
if (toStringFn)
|
|
451
|
-
return toStringFn();
|
|
452
|
-
const outer = this.getDocumentElementOuterHtml(document);
|
|
453
|
-
if (outer)
|
|
454
|
-
return outer;
|
|
455
|
-
return fallbackHtml;
|
|
352
|
+
/**
 * Collect the nodes matched by the user-supplied extra selectors.
 *
 * The selectors are first queried as one combined list. If that query throws
 * (for example because one selector is not valid CSS), each selector is
 * queried on its own so that one bad entry does not disable the others;
 * selectors that still fail are reported through logDebug and skipped.
 *
 * @param {Document} document - Document to query.
 * @param {string[]} extraSelectors - Non-empty list of trimmed selectors.
 * @returns {ArrayLike<Element>} Matched nodes (may contain duplicates in the fallback path).
 */
function queryExtraNoiseNodes(document, extraSelectors) {
    try {
        return document.querySelectorAll(extraSelectors.join(','));
    }
    catch {
        const matched = [];
        for (const selector of extraSelectors) {
            try {
                matched.push(...document.querySelectorAll(selector));
            }
            catch (error) {
                logDebug('Skipping invalid noise selector', {
                    selector,
                    error: error instanceof Error ? error.message : String(error),
                });
            }
        }
        return matched;
    }
}
/**
 * Remove noise from a parsed document in place.
 *
 * Steps, in order: clean headings; remove everything matched by the base
 * selector; remove everything matched by the extra selectors; then score every
 * candidate element (last to first) and remove those that isNoiseElement
 * flags. shouldPreserve is honoured in every removal step.
 *
 * @param {Document} document - Document to mutate.
 * @param {object} context - Context produced by getContext; reads
 *   `baseSelector`, `extraSelectors` and `candidateSelector`.
 * @returns {void}
 */
function stripNoise(document, context) {
    cleanHeadings(document);
    const { baseSelector, extraSelectors, candidateSelector } = context;
    removeNodes(document.querySelectorAll(baseSelector));
    if (extraSelectors.length > 0) {
        // removeNodes skips detached nodes, so duplicates from the fallback path are harmless.
        removeNodes(queryExtraNoiseNodes(document, extraSelectors));
    }
    const candidates = document.querySelectorAll(candidateSelector);
    for (let i = candidates.length - 1; i >= 0; i--) {
        const node = candidates[i];
        // Skip holes and nodes already detached along with a removed ancestor.
        if (!node?.parentNode) {
            continue;
        }
        if (shouldPreserve(node, node.tagName.toLowerCase())) {
            continue;
        }
        if (isNoiseElement(node, context)) {
            node.remove();
        }
    }
}
|
|
380
|
+
/**
 * Report whether a URL starts with one of SKIP_URL_PREFIXES (compared after
 * trimming and lower-casing), meaning it must not be resolved against a base.
 */
function hasSkippedUrlPrefix(url) {
    const normalized = url.trim().toLowerCase();
    return SKIP_URL_PREFIXES.some((prefix) => normalized.startsWith(prefix));
}
/** Resolve `url` against `base`; returns null when it cannot be parsed. */
function toAbsoluteUrl(url, base) {
    try {
        return new URL(url, base).href;
    }
    catch {
        return null;
    }
}
/**
 * Split a srcset value into `{ url, descriptor }` candidates.
 *
 * The URL is the next run of non-whitespace characters, so commas inside a URL
 * (data: URIs, CDN transform paths) do not split it; trailing commas on that
 * run are separators. Otherwise the descriptor runs up to the next comma.
 */
function splitSrcsetCandidates(srcset) {
    const candidates = [];
    const isWhitespace = (ch) => /\s/.test(ch);
    const { length } = srcset;
    let pos = 0;
    while (pos < length) {
        while (pos < length && (srcset[pos] === ',' || isWhitespace(srcset[pos]))) {
            pos += 1;
        }
        if (pos >= length) {
            break;
        }
        const urlStart = pos;
        while (pos < length && !isWhitespace(srcset[pos])) {
            pos += 1;
        }
        let url = srcset.slice(urlStart, pos);
        let descriptor = '';
        if (url.endsWith(',')) {
            // "a.png, b.png": the comma terminates a candidate that has no descriptor.
            url = url.replace(/,+$/, '');
        }
        else {
            const descriptorStart = pos;
            while (pos < length && srcset[pos] !== ',') {
                pos += 1;
            }
            descriptor = srcset
                .slice(descriptorStart, pos)
                .trim()
                .split(/\s+/)
                .join(' ');
        }
        candidates.push({ url, descriptor });
    }
    return candidates;
}
/**
 * Rewrite one URL-bearing attribute of an element to an absolute URL.
 *
 * Detached elements and missing/empty attributes are left alone. For srcset
 * attributes every candidate URL is resolved individually and the attribute is
 * re-joined with ", ". URLs that start with a SKIP_URL_PREFIXES entry, or that
 * cannot be parsed, are kept as written.
 *
 * @param {Element} el - Element to update in place.
 * @param {string} attr - Attribute name (`href`, `src`, `srcset`, ...).
 * @param {URL} base - Base URL used for resolution.
 * @param {boolean} isSrcset - True when `attr` holds a srcset candidate list.
 * @returns {void}
 */
function processUrlElement(el, attr, base, isSrcset) {
    if (!el.parentNode) {
        return;
    }
    const val = el.getAttribute(attr);
    if (!val) {
        return;
    }
    if (isSrcset) {
        const candidates = splitSrcsetCandidates(val);
        if (candidates.length === 0) {
            return;
        }
        const rewritten = candidates
            .map(({ url, descriptor }) => {
            const absolute = hasSkippedUrlPrefix(url)
                ? url
                : (toAbsoluteUrl(url, base) ?? url);
            return descriptor ? `${absolute} ${descriptor}` : absolute;
        })
            .join(', ');
        el.setAttribute(attr, rewritten);
        return;
    }
    if (hasSkippedUrlPrefix(val)) {
        return;
    }
    const absolute = toAbsoluteUrl(val, base);
    if (absolute !== null) {
        el.setAttribute(attr, absolute);
    }
}
|
|
512
|
-
|
|
416
|
+
/**
 * Resolve relative URLs in a document against a base URL, in place.
 *
 * Handles `a[href]`, and both `src` and `srcset` on `<img>` and `<source>`,
 * so responsive-image candidates and media sources are made absolute as well.
 * Does nothing when `baseUrlStr` is not a parseable absolute URL.
 *
 * @param {Document} document - Document whose elements are updated.
 * @param {string} baseUrlStr - Absolute URL of the page the HTML came from.
 * @returns {void}
 */
function resolveUrls(document, baseUrlStr) {
    let base;
    try {
        base = new URL(baseUrlStr);
    }
    catch {
        return;
    }
    // tag -> [attribute name, whether the attribute is a srcset list]
    const urlAttributesByTag = new Map([
        ['a', [['href', false]]],
        ['img', [['src', false], ['srcset', true]]],
        ['source', [['src', false], ['srcset', true]]],
    ]);
    const elements = document.querySelectorAll('a[href],img[src],img[srcset],source[src],source[srcset]');
    for (const el of Array.from(elements)) {
        const rules = urlAttributesByTag.get(el.tagName.toLowerCase()) ?? [];
        for (const [attr, isSrcset] of rules) {
            if (el.hasAttribute(attr)) {
                processUrlElement(el, attr, base, isSrcset);
            }
        }
    }
}
|
|
435
|
+
/**
 * Serialize a cleaned document back to an HTML string.
 *
 * Prefers the body's inner HTML, then the root element's outer HTML, and
 * finally the caller's fallback; a candidate is used only when its trimmed
 * length exceeds MIN_BODY_CONTENT_LENGTH. A document without a body or root
 * element is treated as having empty content for that step instead of
 * throwing.
 *
 * @param {Document} document - Document to serialize.
 * @param {string} fallback - Returned when neither serialization is long enough.
 * @returns {string} Serialized HTML or `fallback`.
 */
function serialize(document, fallback) {
    const bodyHtml = document.body?.innerHTML ?? '';
    if (bodyHtml.trim().length > MIN_BODY_CONTENT_LENGTH) {
        return bodyHtml;
    }
    const outerHtml = document.documentElement?.outerHTML ?? '';
    if (outerHtml.trim().length > MIN_BODY_CONTENT_LENGTH) {
        return outerHtml;
    }
    return fallback;
}
|
|
444
|
+
/**
 * Report whether the markup contains a document-level marker as defined by
 * HTML_DOCUMENT_MARKERS.
 *
 * @param {string} html - Markup to inspect.
 * @returns {boolean} True when a marker is present.
 */
function isFullDocumentHtml(html) {
    const markerMatch = HTML_DOCUMENT_MARKERS.exec(html);
    return markerMatch !== null;
}
|
|
447
|
+
/**
 * Cheap pre-check: does the markup contain anything NOISE_PATTERNS recognises?
 *
 * Inputs longer than NOISE_SCAN_LIMIT are not scanned in full; only the first
 * and last NOISE_SCAN_LIMIT characters are tested, joined by a newline.
 *
 * @param {string} html - Markup to inspect.
 * @returns {boolean} True when at least one noise pattern matches the sample.
 */
function mayContainNoise(html) {
    let sample = html;
    if (html.length > NOISE_SCAN_LIMIT) {
        const head = html.slice(0, NOISE_SCAN_LIMIT);
        const tail = html.slice(html.length - NOISE_SCAN_LIMIT);
        sample = `${head}\n${tail}`;
    }
    for (const pattern of NOISE_PATTERNS) {
        if (pattern.test(sample)) {
            return true;
        }
    }
    return false;
}
|
|
513
453
|
/**
 * Strip noise from an HTML string and return the cleaned markup.
 *
 * The input is returned untouched when it is neither a full document nor
 * contains any noise marker. Otherwise the document is parsed (unless one is
 * supplied), stripped, optionally has its URLs resolved against `baseUrl`,
 * and is serialized. Any failure along the way is logged at debug level and
 * the original `html` is returned.
 *
 * @param {string} html - Source markup.
 * @param {Document} [document] - Pre-parsed document for `html`; parsed on demand when omitted.
 * @param {string} [baseUrl] - Base URL for resolving relative links; skipped when falsy.
 * @returns {string} Cleaned HTML, or the original `html` when nothing was done.
 */
export function removeNoiseFromHtml(html, document, baseUrl) {
    if (!isFullDocumentHtml(html) && !mayContainNoise(html)) {
        return html;
    }
    try {
        const context = getContext();
        if (config.noiseRemoval.debug) {
            // Report every enabled category, not only nav-footer.
            const categoryByFlag = {
                navFooter: 'nav-footer',
                cookieBanners: 'cookie-banners',
                newsletters: 'newsletters',
                socialShare: 'social-share',
            };
            logDebug('Noise removal audit enabled', {
                categories: Object.entries(categoryByFlag)
                    .filter(([flag]) => context.flags[flag])
                    .map(([, category]) => category),
            });
        }
        const doc = document ?? parseHTML(html).document;
        stripNoise(doc, context);
        if (baseUrl) {
            resolveUrls(doc, baseUrl);
        }
        return serialize(doc, html);
    }
    catch (error) {
        // Best effort: keep returning the input, but leave a trace of why.
        logDebug('Noise removal failed; returning original HTML', {
            error: error instanceof Error ? error.message : String(error),
        });
        return html;
    }
}
|