@j0hanz/superfetch 2.4.3 → 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache.d.ts +8 -8
- package/dist/cache.js +277 -264
- package/dist/crypto.js +4 -3
- package/dist/dom-noise-removal.js +355 -297
- package/dist/fetch.d.ts +13 -7
- package/dist/fetch.js +636 -690
- package/dist/http-native.js +535 -474
- package/dist/instructions.md +38 -27
- package/dist/language-detection.js +190 -153
- package/dist/markdown-cleanup.js +171 -158
- package/dist/mcp.js +161 -1
- package/dist/resources.d.ts +2 -0
- package/dist/resources.js +44 -0
- package/dist/session.js +144 -105
- package/dist/tasks.d.ts +37 -0
- package/dist/tasks.js +66 -0
- package/dist/tools.d.ts +6 -9
- package/dist/tools.js +166 -136
- package/dist/transform.d.ts +3 -1
- package/dist/transform.js +680 -778
- package/package.json +6 -6
|
@@ -5,43 +5,112 @@
|
|
|
5
5
|
import { parseHTML } from 'linkedom';
|
|
6
6
|
import { config } from './config.js';
|
|
7
7
|
import { isObject } from './type-guards.js';
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
8
|
+
/* -------------------------------------------------------------------------------------------------
|
|
9
|
+
* DOM guards & small helpers
|
|
10
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
11
11
|
/**
 * Duck-type check used before calling element APIs on a node.
 *
 * @param {unknown} node - Value to inspect.
 * @returns {boolean} True when `node` passes `isObject` and exposes a callable
 *   `getAttribute` member.
 */
function isElement(node) {
    if (!isObject(node))
        return false;
    if (!('getAttribute' in node))
        return false;
    return typeof node.getAttribute === 'function';
}
|
|
16
|
-
function
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
const { body } = document;
|
|
20
|
-
if (isObject(body) && typeof body.innerHTML === 'string') {
|
|
21
|
-
return body.innerHTML;
|
|
22
|
-
}
|
|
23
|
-
return undefined;
|
|
16
|
+
/**
 * Tell whether a value can be walked by numeric index.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True when `value` passes `isObject` and has a numeric `length`.
 */
function isNodeListLike(value) {
    if (!isObject(value))
        return false;
    return typeof value.length === 'number';
}
|
|
25
|
-
function
|
|
26
|
-
if (
|
|
27
|
-
return
|
|
28
|
-
if (typeof document.toString === 'function') {
|
|
29
|
-
return document.toString.bind(document);
|
|
20
|
+
/**
 * Read one entry from a NodeList-like collection.
 *
 * Uses the collection's `item(index)` method when it has a callable one,
 * otherwise falls back to plain index access.
 *
 * @param {object} nodes - Collection to read from.
 * @param {number} index - Zero-based position.
 * @returns {*} Whatever `item(index)` returns, or the indexed value with
 *   `null`/`undefined` normalised to `null`.
 */
function getNodeListItem(nodes, index) {
    const hasItemMethod = 'item' in nodes && typeof nodes.item === 'function';
    return hasItemMethod ? nodes.item(index) : (nodes[index] ?? null);
}
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
26
|
+
/**
 * Remove every node for which `shouldRemove` returns a truthy value.
 *
 * - NodeList-like collections (objects with a numeric `length`) are walked from
 *   the last index down to the first, so removing a node from a live collection
 *   cannot shift entries that have not been visited yet.
 * - Any other iterable is copied into an array before the first removal, so
 *   detaching a node cannot disturb the iteration.
 *
 * @param {ArrayLike<object> | Iterable<object>} nodes - Nodes to examine.
 * @param {(node: object) => boolean} shouldRemove - Predicate deciding removal.
 * @returns {void}
 */
function removeNodes(nodes, shouldRemove) {
    if (isNodeListLike(nodes)) {
        for (let i = nodes.length - 1; i >= 0; i -= 1) {
            const node = getNodeListItem(nodes, i);
            if (node && shouldRemove(node))
                node.remove();
        }
        return;
    }
    // Snapshot first: iterating a live iterable while removing from it can skip
    // entries, which is exactly what the contract above rules out.
    for (const node of [...nodes]) {
        if (shouldRemove(node))
            node.remove();
    }
}
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
+
/* -------------------------------------------------------------------------------------------------
|
|
46
|
+
* Fast-path parsing heuristics
|
|
47
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
48
|
+
// Opening `<!doctype`, `<html`, `<head` or `<body` (optional whitespace after `<`, case-insensitive).
const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
/**
 * Report whether the markup contains a document-level tag anywhere.
 *
 * @param {string} html - Markup to inspect.
 * @returns {boolean} True when a doctype/html/head/body opener is present.
 */
function isFullDocumentHtml(html) {
    const hasDocumentMarker = HTML_DOCUMENT_MARKERS.test(html);
    return hasDocumentMarker;
}
|
|
52
|
+
// Only this many leading characters are scanned by the fast-path check.
const NOISE_SCAN_LIMIT = 50_000;
// ARIA roles probed as ` role="…"` and ` role='…'`.
const NOISE_MARKER_ROLES = [
    'navigation',
    'banner',
    'complementary',
    'contentinfo',
    'tree',
    'menubar',
    'menu',
];
// Class/id tokens. Each is probed after a space, a quote, or `=` so that a
// token placed first in an attribute value (class="promo", id='cookie-bar',
// class=sticky) is found as well as one in the middle of a class list.
const NOISE_MARKER_TOKENS = [
    'banner',
    'promo',
    'announcement',
    'cta',
    'advert',
    'newsletter',
    'subscribe',
    'cookie',
    'consent',
    'popup',
    'modal',
    'overlay',
    'toast',
    'fixed',
    'sticky',
    'z-50',
    'z-4',
    'isolate',
    'breadcrumb',
    'pagination',
];
const NOISE_TOKEN_PREFIXES = [' ', '"', "'", '='];
// Lower-case substrings; finding any of them makes the markup worth parsing.
const NOISE_MARKERS = [
    '<script',
    '<style',
    '<noscript',
    '<iframe',
    '<nav',
    '<footer',
    '<header',
    '<form',
    '<button',
    '<input',
    '<select',
    '<textarea',
    '<svg',
    '<canvas',
    ' aria-hidden="true"',
    " aria-hidden='true'",
    ' hidden',
    ...NOISE_MARKER_ROLES.flatMap((role) => [` role="${role}"`, ` role='${role}'`]),
    ...NOISE_MARKER_TOKENS.flatMap((token) => NOISE_TOKEN_PREFIXES.map((prefix) => `${prefix}${token}`)),
];
/**
 * Cheap pre-parse heuristic: does the markup appear to contain removable noise?
 *
 * Only the first `NOISE_SCAN_LIMIT` characters are examined, lower-cased, and
 * searched for any entry of `NOISE_MARKERS`. A marker located entirely beyond
 * that prefix is not detected.
 *
 * @param {string} html - Markup to inspect.
 * @returns {boolean} True when at least one marker occurs in the scanned prefix.
 */
function mayContainNoise(html) {
    const sample = html.length > NOISE_SCAN_LIMIT ? html.slice(0, NOISE_SCAN_LIMIT) : html;
    const haystack = sample.toLowerCase();
    return NOISE_MARKERS.some((marker) => haystack.includes(marker));
}
|
|
111
|
+
/* -------------------------------------------------------------------------------------------------
|
|
112
|
+
* Noise selectors & classification
|
|
113
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
45
114
|
const STRUCTURAL_TAGS = new Set([
|
|
46
115
|
'script',
|
|
47
116
|
'style',
|
|
@@ -82,10 +151,10 @@ const CANDIDATE_NOISE_SELECTOR = [
|
|
|
82
151
|
'[style]',
|
|
83
152
|
].join(',');
|
|
84
153
|
/**
 * Build the selector for nodes that are removed unconditionally.
 *
 * @param {string[]} extraSelectors - Additional selectors; entries that are
 *   empty or whitespace-only are dropped.
 * @returns {string} `BASE_NOISE_SELECTOR`, followed by the surviving extras
 *   (comma-separated) when there are any.
 */
function buildNoiseSelector(extraSelectors) {
    const nonBlank = extraSelectors.filter((selector) => selector.trim().length > 0);
    if (nonBlank.length === 0)
        return BASE_NOISE_SELECTOR;
    return [BASE_NOISE_SELECTOR, ...nonBlank].join(',');
}
|
|
90
159
|
const NAVIGATION_ROLES = new Set([
|
|
91
160
|
'navigation',
|
|
@@ -140,214 +209,151 @@ const BASE_PROMO_TOKENS = [
|
|
|
140
209
|
'pager',
|
|
141
210
|
'taglist',
|
|
142
211
|
];
|
|
143
|
-
/**
|
|
144
|
-
* Get promo tokens merged with any user-configured extra tokens.
|
|
145
|
-
* Memoized because it is used in hot paths when scanning many nodes.
|
|
146
|
-
*/
|
|
147
|
-
let promoTokensCache = null;
|
|
148
|
-
function getPromoTokens() {
|
|
149
|
-
if (promoTokensCache)
|
|
150
|
-
return promoTokensCache;
|
|
151
|
-
const tokens = new Set(BASE_PROMO_TOKENS);
|
|
152
|
-
for (const token of config.noiseRemoval.extraTokens) {
|
|
153
|
-
const normalized = token.toLowerCase().trim();
|
|
154
|
-
if (normalized)
|
|
155
|
-
tokens.add(normalized);
|
|
156
|
-
}
|
|
157
|
-
promoTokensCache = tokens;
|
|
158
|
-
return tokens;
|
|
159
|
-
}
|
|
160
|
-
let promoRegexCache = null;
|
|
161
|
-
function getPromoRegex() {
|
|
162
|
-
if (promoRegexCache)
|
|
163
|
-
return promoRegexCache;
|
|
164
|
-
const tokens = Array.from(getPromoTokens());
|
|
165
|
-
const escaped = tokens.map((t) => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
|
|
166
|
-
const pattern = `(?:^|[^a-z0-9])(?:${escaped.join('|')})(?:$|[^a-z0-9])`;
|
|
167
|
-
promoRegexCache = new RegExp(pattern, 'i');
|
|
168
|
-
return promoRegexCache;
|
|
169
|
-
}
|
|
170
212
|
// Whole-word header boilerplate names (case-insensitive); tested against the
// lower-cased "<class> <id>" string of <header> elements.
const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
|
|
171
213
|
// Whole-word `fixed` or `sticky` in a class list (case-sensitive).
const FIXED_PATTERN = /\b(fixed|sticky)\b/;
|
|
172
214
|
// Whole-word `z-40` … `z-49` or `z-50` in a class list.
const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
|
|
173
215
|
// Whole-word `isolate` in a class list; only counts as noise together with HIGH_Z_PATTERN.
const ISOLATE_PATTERN = /\bisolate\b/;
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
'<nav',
|
|
181
|
-
'<footer',
|
|
182
|
-
'<header',
|
|
183
|
-
'<form',
|
|
184
|
-
'<button',
|
|
185
|
-
'<input',
|
|
186
|
-
'<select',
|
|
187
|
-
'<textarea',
|
|
188
|
-
'<svg',
|
|
189
|
-
'<canvas',
|
|
190
|
-
' aria-hidden="true"',
|
|
191
|
-
" aria-hidden='true'",
|
|
192
|
-
' hidden',
|
|
193
|
-
' role="navigation"',
|
|
194
|
-
" role='navigation'",
|
|
195
|
-
' role="banner"',
|
|
196
|
-
" role='banner'",
|
|
197
|
-
' role="complementary"',
|
|
198
|
-
" role='complementary'",
|
|
199
|
-
' role="contentinfo"',
|
|
200
|
-
" role='contentinfo'",
|
|
201
|
-
' role="tree"',
|
|
202
|
-
" role='tree'",
|
|
203
|
-
' role="menubar"',
|
|
204
|
-
" role='menubar'",
|
|
205
|
-
' role="menu"',
|
|
206
|
-
" role='menu'",
|
|
207
|
-
' banner',
|
|
208
|
-
' promo',
|
|
209
|
-
' announcement',
|
|
210
|
-
' cta',
|
|
211
|
-
' advert',
|
|
212
|
-
' newsletter',
|
|
213
|
-
' subscribe',
|
|
214
|
-
' cookie',
|
|
215
|
-
' consent',
|
|
216
|
-
' popup',
|
|
217
|
-
' modal',
|
|
218
|
-
' overlay',
|
|
219
|
-
' toast',
|
|
220
|
-
' fixed',
|
|
221
|
-
' sticky',
|
|
222
|
-
' z-50',
|
|
223
|
-
' z-4',
|
|
224
|
-
' isolate',
|
|
225
|
-
' breadcrumb',
|
|
226
|
-
' pagination',
|
|
227
|
-
];
|
|
228
|
-
// ─────────────────────────────────────────────────────────────────────────────
|
|
229
|
-
// Noise Detection Functions
|
|
230
|
-
// ─────────────────────────────────────────────────────────────────────────────
|
|
231
|
-
const NOISE_SCAN_LIMIT = 50_000;
|
|
232
|
-
function mayContainNoise(html) {
|
|
233
|
-
// Fast path: only scan a bounded prefix; parsing is the expensive step anyway.
|
|
234
|
-
// Most noise markers appear near the top of the document (nav, scripts, meta, etc.).
|
|
235
|
-
const sample = html.length > NOISE_SCAN_LIMIT ? html.slice(0, NOISE_SCAN_LIMIT) : html;
|
|
236
|
-
const haystack = sample.toLowerCase();
|
|
237
|
-
return NOISE_MARKERS.some((marker) => haystack.includes(marker));
|
|
238
|
-
}
|
|
239
|
-
function isFullDocumentHtml(html) {
|
|
240
|
-
return HTML_DOCUMENT_MARKERS.test(html);
|
|
241
|
-
}
|
|
242
|
-
function isStructuralNoiseTag(tagName) {
|
|
243
|
-
return STRUCTURAL_TAGS.has(tagName);
|
|
244
|
-
}
|
|
245
|
-
function isInteractiveComponent(element) {
|
|
246
|
-
const role = element.getAttribute('role');
|
|
247
|
-
if (role && INTERACTIVE_CONTENT_ROLES.has(role))
|
|
248
|
-
return true;
|
|
249
|
-
// Check for common UI framework data attributes that indicate managed visibility
|
|
250
|
-
const dataState = element.getAttribute('data-state');
|
|
251
|
-
if (dataState === 'inactive' || dataState === 'closed')
|
|
252
|
-
return true;
|
|
253
|
-
const dataOrientation = element.getAttribute('data-orientation');
|
|
254
|
-
if (dataOrientation === 'horizontal' || dataOrientation === 'vertical') {
|
|
255
|
-
return true;
|
|
216
|
+
/**
 * Detects promo/boilerplate tokens inside class names and ids.
 *
 * The token set is `BASE_PROMO_TOKENS` plus the lower-cased, trimmed, non-empty
 * entries of `config.noiseRemoval.extraTokens`. Both the set and the regular
 * expression derived from it are built on first use and then reused.
 */
class PromoDetector {
    tokenCache = null;
    regexCache = null;
    /**
     * @param {string} className - Raw `class` attribute value.
     * @param {string} id - Raw `id` attribute value.
     * @returns {boolean} True when either string contains a promo token that is
     *   delimited by a non-alphanumeric character or the string boundary.
     */
    matches(className, id) {
        const pattern = this.getRegex();
        if (pattern.test(className))
            return true;
        return pattern.test(id);
    }
    /**
     * @returns {Set<string>} Memoized token set (base tokens first, then extras).
     */
    getTokens() {
        if (!this.tokenCache) {
            const extras = Array.from(config.noiseRemoval.extraTokens, (token) => token.toLowerCase().trim())
                .filter((token) => token.length > 0);
            this.tokenCache = new Set([...BASE_PROMO_TOKENS, ...extras]);
        }
        return this.tokenCache;
    }
    /**
     * @returns {RegExp} Memoized case-insensitive matcher for any token.
     */
    getRegex() {
        if (!this.regexCache) {
            // Tokens are matched literally, so regex metacharacters are escaped.
            const alternatives = Array.from(this.getTokens(), (token) => token.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('|');
            this.regexCache = new RegExp(`(?:^|[^a-z0-9])(?:${alternatives})(?:$|[^a-z0-9])`, 'i');
        }
        return this.regexCache;
    }
}
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
245
|
+
/**
 * Decides whether a single element is page noise.
 *
 * An element is noise when any rule in `isNoise` fires; the rules are evaluated
 * in order and evaluation stops at the first hit.
 */
class NoiseClassifier {
    promo;
    /**
     * @param {{ matches(className: string, id: string): boolean }} promo - Promo token matcher.
     */
    constructor(promo) {
        this.promo = promo;
    }
    /**
     * @param {object} element - Element exposing `tagName` and `getAttribute`.
     * @returns {boolean} True when the element should be removed.
     */
    isNoise(element) {
        const meta = this.readMetadata(element);
        const rules = [
            () => this.isStructuralNoise(meta, element),
            () => ALWAYS_NOISE_TAGS.has(meta.tagName),
            () => this.isHeaderBoilerplate(meta),
            () => this.isHiddenNoise(meta, element),
            () => this.isRoleNoise(meta),
            () => this.matchesFixedOrHighZIsolate(meta.className),
            () => this.promo.matches(meta.className, meta.id),
        ];
        return rules.some((rule) => rule());
    }
    /**
     * Collect the attributes the rules look at; missing `class`/`id` become ''.
     */
    readMetadata(element) {
        const tagName = element.tagName.toLowerCase();
        const className = element.getAttribute('class') ?? '';
        const id = element.getAttribute('id') ?? '';
        const role = element.getAttribute('role');
        const isHidden = this.isHidden(element);
        return { tagName, className, id, role, isHidden };
    }
    /**
     * Structural tags are noise unless the element looks like an interactive
     * component; in that case this rule does not fire and the later rules decide.
     */
    isStructuralNoise(meta, element) {
        return STRUCTURAL_TAGS.has(meta.tagName) && !this.isInteractiveComponent(element);
    }
    /**
     * A `<header>` is boilerplate when it has a noise role or its class/id text
     * matches `HEADER_NOISE_PATTERN`.
     */
    isHeaderBoilerplate(meta) {
        if (meta.tagName !== 'header')
            return false;
        if (this.hasNoiseRole(meta.role))
            return true;
        const classAndId = `${meta.className} ${meta.id}`.toLowerCase();
        return HEADER_NOISE_PATTERN.test(classAndId);
    }
    /**
     * Hidden elements are noise, except interactive components, which are kept
     * (their visibility may be managed by UI framework state).
     */
    isHiddenNoise(meta, element) {
        return Boolean(meta.isHidden) && !this.isInteractiveComponent(element);
    }
    /**
     * Noise roles count, except `role="complementary"` on an `<aside>`.
     */
    isRoleNoise(meta) {
        const isComplementaryAside = meta.tagName === 'aside' && meta.role === 'complementary';
        return !isComplementaryAside && this.hasNoiseRole(meta.role);
    }
    hasNoiseRole(role) {
        if (role === null)
            return false;
        return NAVIGATION_ROLES.has(role);
    }
    /**
     * True for `fixed`/`sticky` classes, or a high z-index class combined with `isolate`.
     */
    matchesFixedOrHighZIsolate(className) {
        if (FIXED_PATTERN.test(className))
            return true;
        return HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className);
    }
    /**
     * Hidden via the `hidden` attribute, `aria-hidden="true"`, or an inline
     * `display: none` / `visibility: hidden` style.
     */
    isHidden(element) {
        if (element.getAttribute('hidden') !== null)
            return true;
        if (element.getAttribute('aria-hidden') === 'true')
            return true;
        const inlineStyle = element.getAttribute('style') ?? '';
        return /\bdisplay\s*:\s*none\b/i.test(inlineStyle) ||
            /\bvisibility\s*:\s*hidden\b/i.test(inlineStyle);
    }
    /**
     * Interactive when the role is in `INTERACTIVE_CONTENT_ROLES` or one of the
     * checked data attributes (`data-state`, `data-orientation`,
     * `data-accordion-item`, `data-radix-collection-item`) marks it as such.
     */
    isInteractiveComponent(element) {
        const role = element.getAttribute('role');
        if (role && INTERACTIVE_CONTENT_ROLES.has(role))
            return true;
        const state = element.getAttribute('data-state');
        if (state === 'inactive' || state === 'closed')
            return true;
        const orientation = element.getAttribute('data-orientation');
        if (orientation === 'horizontal' || orientation === 'vertical')
            return true;
        return element.getAttribute('data-accordion-item') !== null ||
            element.getAttribute('data-radix-collection-item') !== null;
    }
}
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
334
|
+
/**
 * Removes noise nodes from a parsed document in two passes.
 */
class NoiseStripper {
    classifier;
    /**
     * @param {{ isNoise(element: object): boolean }} classifier - Per-element noise test.
     */
    constructor(classifier) {
        this.classifier = classifier;
    }
    /**
     * Pass 1 removes every element matching the base selector plus the
     * configured extra selectors. Pass 2 removes candidate elements only when
     * the classifier flags them.
     *
     * @param {object} document - Document exposing `querySelectorAll`.
     * @returns {void}
     */
    strip(document) {
        const unconditionalSelector = buildNoiseSelector(config.noiseRemoval.extraSelectors);
        this.removeBySelector(document, unconditionalSelector, false);
        this.removeBySelector(document, CANDIDATE_NOISE_SELECTOR, true);
    }
    /**
     * @param {object} document - Document exposing `querySelectorAll`.
     * @param {string} selector - Selector of the nodes to consider.
     * @param {boolean} checkNoise - When true, consult the classifier before
     *   removing; when false, remove every matched element.
     * @returns {void}
     */
    removeBySelector(document, selector, checkNoise) {
        const matched = document.querySelectorAll(selector);
        const shouldRemove = (node) => {
            if (!isElement(node))
                return false;
            if (!checkNoise)
                return true;
            return this.classifier.isNoise(node);
        };
        removeNodes(matched, shouldRemove);
    }
}
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
354
|
+
/* -------------------------------------------------------------------------------------------------
|
|
355
|
+
* Relative URL resolution
|
|
356
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
351
357
|
const SKIP_URL_PREFIXES = [
|
|
352
358
|
'#',
|
|
353
359
|
'java' + 'script:',
|
|
@@ -360,9 +366,6 @@ function shouldSkipUrlResolution(url) {
|
|
|
360
366
|
const normalized = url.trim().toLowerCase();
|
|
361
367
|
return SKIP_URL_PREFIXES.some((prefix) => normalized.startsWith(prefix));
|
|
362
368
|
}
|
|
363
|
-
/**
|
|
364
|
-
* Safely resolve a relative URL to absolute using base URL.
|
|
365
|
-
*/
|
|
366
369
|
function tryResolveUrl(relativeUrl, baseUrl) {
|
|
367
370
|
try {
|
|
368
371
|
return new URL(relativeUrl, baseUrl).href;
|
|
@@ -371,87 +374,142 @@ function tryResolveUrl(relativeUrl, baseUrl) {
|
|
|
371
374
|
return null;
|
|
372
375
|
}
|
|
373
376
|
}
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
+
/**
 * Rewrites relative URLs in a parsed document to absolute ones.
 *
 * Handles `a[href]`, `img[src]` and `source[srcset]`. A URL that cannot be
 * resolved is left exactly as it was.
 */
class RelativeUrlResolver {
    /**
     * @param {object} document - Document exposing `querySelectorAll`.
     * @param {string} baseUrl - Absolute URL that relative references resolve against.
     * @returns {void} Nothing is changed when `baseUrl` is not a valid absolute URL.
     */
    resolve(document, baseUrl) {
        let base;
        try {
            base = new URL(baseUrl);
        }
        catch {
            // invalid base URL - skip resolution
            return;
        }
        for (const element of document.querySelectorAll('a[href], img[src], source[srcset]')) {
            const tag = element.tagName.toLowerCase();
            if (tag === 'a')
                this.resolveAnchor(element, base);
            else if (tag === 'img')
                this.resolveImage(element, base);
            else if (tag === 'source')
                this.resolveSource(element, base);
        }
    }
    /** Resolve an anchor's `href` unless it is empty or has a skipped prefix. */
    resolveAnchor(element, base) {
        const href = element.getAttribute('href');
        if (!href || shouldSkipUrlResolution(href))
            return;
        const resolved = tryResolveUrl(href, base);
        if (resolved)
            element.setAttribute('href', resolved);
    }
    /** Resolve an image's `src` unless it is empty or has a skipped prefix. */
    resolveImage(element, base) {
        const src = element.getAttribute('src');
        if (!src || shouldSkipUrlResolution(src))
            return;
        const resolved = tryResolveUrl(src, base);
        if (resolved)
            element.setAttribute('src', resolved);
    }
    /**
     * Resolve every URL in a `srcset` attribute, keeping its descriptors.
     *
     * As before, srcset URLs are never prefix-skipped: each one is always passed
     * to the resolver. The attribute is rewritten as `url descriptor` candidates
     * joined by ", ". An attribute that holds no candidate at all is left untouched.
     */
    resolveSource(element, base) {
        const srcset = element.getAttribute('srcset');
        if (!srcset)
            return;
        const candidates = this.parseSrcsetCandidates(srcset);
        if (candidates.length === 0)
            return;
        const resolved = candidates
            .map(({ url, descriptor }) => {
            const absolute = tryResolveUrl(url, base) || url;
            return descriptor ? `${absolute} ${descriptor}` : absolute;
        })
            .join(', ');
        element.setAttribute('srcset', resolved);
    }
    /**
     * Split a `srcset` value into candidates, following the HTML Standard's
     * srcset parsing rules: a URL is a run of non-whitespace characters, so a
     * comma inside it (data URIs, CDN transform paths) belongs to the URL. Only
     * a comma that ends the URL token, or one that follows the descriptors,
     * separates candidates. Parenthesised descriptors are not special-cased.
     *
     * @param {string} srcset - Raw attribute value.
     * @returns {{ url: string, descriptor: string }[]} Candidates in source order;
     *   `descriptor` is '' when the candidate has none.
     */
    parseSrcsetCandidates(srcset) {
        const isWhitespace = (ch) => /\s/.test(ch);
        const candidates = [];
        const end = srcset.length;
        let pos = 0;
        while (pos < end) {
            // Skip separators left over from the previous candidate.
            while (pos < end && (isWhitespace(srcset[pos]) || srcset[pos] === ','))
                pos += 1;
            if (pos >= end)
                break;
            const urlStart = pos;
            while (pos < end && !isWhitespace(srcset[pos]))
                pos += 1;
            const rawUrl = srcset.slice(urlStart, pos);
            if (rawUrl.endsWith(',')) {
                // "a.jpg, b.jpg": the trailing comma ends a descriptor-less candidate.
                candidates.push({ url: rawUrl.replace(/,+$/, ''), descriptor: '' });
                continue;
            }
            const descriptorStart = pos;
            while (pos < end && srcset[pos] !== ',')
                pos += 1;
            const descriptor = srcset.slice(descriptorStart, pos).trim().replace(/\s+/g, ' ');
            candidates.push({ url: rawUrl, descriptor });
        }
        return candidates;
    }
}
|
|
436
|
+
/* -------------------------------------------------------------------------------------------------
|
|
437
|
+
* Serialization
|
|
438
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
439
|
+
/**
 * Turns a (possibly stripped) document back into an HTML string.
 */
class DocumentSerializer {
    /**
     * Serialization order:
     * 1. `body.innerHTML`, but only when it holds substantial content (more than
     *    100 characters after trimming).
     * 2. The document's own `toString()`, when it defines one.
     * 3. `documentElement.outerHTML`.
     * 4. `fallbackHtml`.
     *
     * @param {object} document - Parsed document.
     * @param {string} fallbackHtml - Returned when nothing else is available.
     * @returns {string} Serialized HTML.
     */
    serialize(document, fallbackHtml) {
        const bodyInner = this.getBodyInnerHtml(document);
        if (bodyInner && bodyInner.trim().length > 100)
            return bodyInner;
        const toStringFn = this.getDocumentToString(document);
        if (toStringFn)
            return toStringFn();
        const outer = this.getDocumentElementOuterHtml(document);
        if (outer)
            return outer;
        return fallbackHtml;
    }
    /**
     * @returns {string | undefined} `document.body.innerHTML` when it is a string.
     */
    getBodyInnerHtml(document) {
        if (!isObject(document))
            return undefined;
        const { body } = document;
        if (isObject(body) &&
            typeof body.innerHTML === 'string') {
            return body.innerHTML;
        }
        return undefined;
    }
    /**
     * @returns {(() => string) | undefined} The document's `toString`, bound to
     *   it, or undefined when the only `toString` available is the one every
     *   object inherits from `Object.prototype`. That default yields
     *   "[object …]" rather than markup, and accepting it would make the
     *   `outerHTML` and `fallbackHtml` steps of `serialize` unreachable.
     *   NOTE(review): presumably the documents produced by `parseHTML` define
     *   their own `toString` — confirm against the parser in use.
     */
    getDocumentToString(document) {
        if (!isObject(document))
            return undefined;
        const fn = document.toString;
        if (typeof fn !== 'function' || fn === Object.prototype.toString)
            return undefined;
        return fn.bind(document);
    }
    /**
     * @returns {string | undefined} `document.documentElement.outerHTML` when it is a string.
     */
    getDocumentElementOuterHtml(document) {
        if (!isObject(document))
            return undefined;
        const docEl = document.documentElement;
        if (isObject(docEl) &&
            typeof docEl.outerHTML === 'string') {
            return docEl.outerHTML;
        }
        return undefined;
    }
}
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
486
|
+
/* -------------------------------------------------------------------------------------------------
|
|
487
|
+
* Public pipeline
|
|
488
|
+
* ------------------------------------------------------------------------------------------------- */
|
|
489
|
+
/**
 * Wires detection, stripping, URL resolution and serialization together.
 */
class HtmlNoiseRemovalPipeline {
    promo = new PromoDetector();
    classifier = new NoiseClassifier(this.promo);
    stripper = new NoiseStripper(this.classifier);
    urlResolver = new RelativeUrlResolver();
    serializer = new DocumentSerializer();
    /**
     * Strip noise from `html` and return the cleaned markup.
     *
     * The input is returned untouched when it is neither a full document nor
     * flagged by the noise heuristic, and also when any step throws
     * (best-effort: a failure never propagates to the caller).
     *
     * @param {string} html - Source markup.
     * @param {object} [document] - Already-parsed document; `html` is parsed
     *   when this is null or undefined.
     * @param {string} [baseUrl] - When truthy, relative URLs are resolved against it.
     * @returns {string} Cleaned markup, or `html` itself.
     */
    removeNoise(html, document, baseUrl) {
        if (!isFullDocumentHtml(html) && !mayContainNoise(html))
            return html;
        try {
            const workingDocument = document ?? parseHTML(html).document;
            this.stripper.strip(workingDocument);
            if (baseUrl)
                this.urlResolver.resolve(workingDocument, baseUrl);
            return this.serializer.serialize(workingDocument, html);
        }
        catch {
            return html;
        }
    }
}
|
|
512
|
+
// One pipeline instance, created at module load and shared by every call.
const pipeline = new HtmlNoiseRemovalPipeline();
/**
 * Public entry point: remove noise from HTML via the shared pipeline.
 *
 * @param {string} html - Source markup.
 * @param {object} [document] - Optional pre-parsed document.
 * @param {string} [baseUrl] - Optional base for resolving relative URLs.
 * @returns {string} Whatever `pipeline.removeNoise` returns for these arguments.
 */
export function removeNoiseFromHtml(html, document, baseUrl) {
    const cleaned = pipeline.removeNoise(html, document, baseUrl);
    return cleaned;
}
|