@j0hanz/superfetch 2.5.2 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/README.md +356 -223
  2. package/dist/assets/logo.svg +24837 -24835
  3. package/dist/cache.d.ts +28 -20
  4. package/dist/cache.js +292 -514
  5. package/dist/config.d.ts +41 -7
  6. package/dist/config.js +298 -148
  7. package/dist/crypto.js +25 -12
  8. package/dist/dom-noise-removal.js +379 -421
  9. package/dist/errors.d.ts +2 -2
  10. package/dist/errors.js +25 -8
  11. package/dist/fetch.d.ts +18 -16
  12. package/dist/fetch.js +1132 -526
  13. package/dist/host-normalization.js +40 -10
  14. package/dist/http-native.js +628 -287
  15. package/dist/index.js +67 -7
  16. package/dist/instructions.md +44 -30
  17. package/dist/ip-blocklist.d.ts +8 -0
  18. package/dist/ip-blocklist.js +65 -0
  19. package/dist/json.js +14 -9
  20. package/dist/language-detection.d.ts +2 -11
  21. package/dist/language-detection.js +289 -280
  22. package/dist/markdown-cleanup.d.ts +0 -1
  23. package/dist/markdown-cleanup.js +391 -429
  24. package/dist/mcp-validator.js +4 -2
  25. package/dist/mcp.js +184 -135
  26. package/dist/observability.js +89 -21
  27. package/dist/resources.js +16 -6
  28. package/dist/server-tuning.d.ts +2 -0
  29. package/dist/server-tuning.js +25 -23
  30. package/dist/session.d.ts +1 -0
  31. package/dist/session.js +41 -33
  32. package/dist/tasks.d.ts +2 -0
  33. package/dist/tasks.js +91 -9
  34. package/dist/timer-utils.d.ts +5 -0
  35. package/dist/timer-utils.js +20 -0
  36. package/dist/tools.d.ts +28 -5
  37. package/dist/tools.js +317 -183
  38. package/dist/transform-types.d.ts +5 -1
  39. package/dist/transform.d.ts +3 -2
  40. package/dist/transform.js +1138 -421
  41. package/dist/type-guards.d.ts +1 -0
  42. package/dist/type-guards.js +7 -0
  43. package/dist/workers/transform-child.d.ts +1 -0
  44. package/dist/workers/transform-child.js +118 -0
  45. package/dist/workers/transform-worker.js +87 -78
  46. package/package.json +21 -13
@@ -1,117 +1,32 @@
1
- /**
2
- * DOM noise removal utilities for cleaning HTML before markdown conversion.
3
- * Removes navigation, ads, popups, and other non-content elements.
4
- */
5
1
  import { parseHTML } from 'linkedom';
6
2
  import { config } from './config.js';
7
- import { isObject } from './type-guards.js';
8
- /* -------------------------------------------------------------------------------------------------
9
- * DOM guards & small helpers
10
- * ------------------------------------------------------------------------------------------------- */
11
- function isElement(node) {
12
- return (isObject(node) &&
13
- 'getAttribute' in node &&
14
- typeof node.getAttribute === 'function');
15
- }
16
- function isNodeListLike(value) {
17
- return (isObject(value) &&
18
- typeof value.length === 'number');
19
- }
20
- function getNodeListItem(nodes, index) {
21
- if ('item' in nodes && typeof nodes.item === 'function') {
22
- return nodes.item(index);
23
- }
24
- return nodes[index] ?? null;
25
- }
26
- /**
27
- * Remove nodes from a list/iterable.
28
- * - For NodeList-like collections we iterate backwards to be safe with live collections.
29
- * - For iterables we snapshot into an array first.
30
- */
31
- function removeNodes(nodes, shouldRemove) {
32
- if (isNodeListLike(nodes)) {
33
- for (let i = nodes.length - 1; i >= 0; i -= 1) {
34
- const node = getNodeListItem(nodes, i);
35
- if (node && shouldRemove(node))
36
- node.remove();
37
- }
38
- return;
39
- }
40
- for (const node of nodes) {
41
- if (shouldRemove(node))
42
- node.remove();
43
- }
44
- }
45
- /* -------------------------------------------------------------------------------------------------
46
- * Fast-path parsing heuristics
47
- * ------------------------------------------------------------------------------------------------- */
48
- const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
49
- function isFullDocumentHtml(html) {
50
- return HTML_DOCUMENT_MARKERS.test(html);
51
- }
3
+ import { logDebug } from './observability.js';
4
+ // --- Constants & Pre-compiled Regex ---
52
5
  const NOISE_SCAN_LIMIT = 50_000;
53
- const NOISE_MARKERS = [
54
- '<script',
55
- '<style',
56
- '<noscript',
57
- '<iframe',
58
- '<nav',
59
- '<footer',
60
- '<header',
61
- '<form',
62
- '<button',
63
- '<input',
64
- '<select',
65
- '<textarea',
66
- '<svg',
67
- '<canvas',
68
- ' aria-hidden="true"',
69
- " aria-hidden='true'",
70
- ' hidden',
71
- ' role="navigation"',
72
- " role='navigation'",
73
- ' role="banner"',
74
- " role='banner'",
75
- ' role="complementary"',
76
- " role='complementary'",
77
- ' role="contentinfo"',
78
- " role='contentinfo'",
79
- ' role="tree"',
80
- " role='tree'",
81
- ' role="menubar"',
82
- " role='menubar'",
83
- ' role="menu"',
84
- " role='menu'",
85
- ' banner',
86
- ' promo',
87
- ' announcement',
88
- ' cta',
89
- ' advert',
90
- ' newsletter',
91
- ' subscribe',
92
- ' cookie',
93
- ' consent',
94
- ' popup',
95
- ' modal',
96
- ' overlay',
97
- ' toast',
98
- ' fixed',
99
- ' sticky',
100
- ' z-50',
101
- ' z-4',
102
- ' isolate',
103
- ' breadcrumb',
104
- ' pagination',
6
+ const MIN_BODY_CONTENT_LENGTH = 100;
7
+ const DIALOG_MIN_CHARS_FOR_PRESERVATION = 500;
8
+ const NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION = 500;
9
+ // Merged markers for fast rejection
10
+ const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
11
+ // Split into smaller regexes to stay within sonarjs/regex-complexity limit
12
+ const NOISE_PATTERNS = [
13
+ /<\s*(?:script|style|noscript|iframe|nav|footer|header|form|button|input|select|textarea|svg|canvas)\b/i,
14
+ /[\s"']role\s*=\s*['"]?(?:navigation|banner|complementary|contentinfo|tree|menubar|menu)['"]?/i,
15
+ /[\s"'](?:aria-hidden\s*=\s*['"]?true['"]?|hidden)/i,
16
+ /[\s"'](?:banner|promo|announcement|cta|advert|newsletter|subscribe|cookie|consent|popup|modal|overlay|toast)\b/i,
17
+ /[\s"'](?:fixed|sticky|z-50|z-4|isolate|breadcrumb|pagination)\b/i,
105
18
  ];
106
- function mayContainNoise(html) {
107
- const sample = html.length > NOISE_SCAN_LIMIT ? html.slice(0, NOISE_SCAN_LIMIT) : html;
108
- const haystack = sample.toLowerCase();
109
- return NOISE_MARKERS.some((marker) => haystack.includes(marker));
110
- }
111
- /* -------------------------------------------------------------------------------------------------
112
- * Noise selectors & classification
113
- * ------------------------------------------------------------------------------------------------- */
114
- const STRUCTURAL_TAGS = new Set([
19
+ const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
20
+ const FIXED_OR_HIGH_Z_PATTERN = /\b(?:fixed|sticky|z-(?:4\d|50)|isolate)\b/;
21
+ const SKIP_URL_PREFIXES = [
22
+ '#',
23
+ 'java' + 'script:',
24
+ 'mailto:',
25
+ 'tel:',
26
+ 'data:',
27
+ 'blob:',
28
+ ];
29
+ const BASE_STRUCTURAL_TAGS = new Set([
115
30
  'script',
116
31
  'style',
117
32
  'noscript',
@@ -121,41 +36,8 @@ const STRUCTURAL_TAGS = new Set([
121
36
  'input',
122
37
  'select',
123
38
  'textarea',
124
- 'svg',
125
- 'canvas',
126
39
  ]);
127
40
  const ALWAYS_NOISE_TAGS = new Set(['nav', 'footer']);
128
- const BASE_NOISE_SELECTORS = [
129
- 'nav',
130
- 'footer',
131
- 'header[class*="site"]',
132
- 'header[class*="nav"]',
133
- 'header[class*="menu"]',
134
- '[role="banner"]',
135
- '[role="navigation"]',
136
- '[role="dialog"]',
137
- '[style*="display: none"]',
138
- '[style*="display:none"]',
139
- '[hidden]',
140
- '[aria-hidden="true"]',
141
- ];
142
- const BASE_NOISE_SELECTOR = BASE_NOISE_SELECTORS.join(',');
143
- const CANDIDATE_NOISE_SELECTOR = [
144
- ...STRUCTURAL_TAGS,
145
- ...ALWAYS_NOISE_TAGS,
146
- 'aside',
147
- 'header',
148
- '[class]',
149
- '[id]',
150
- '[role]',
151
- '[style]',
152
- ].join(',');
153
- function buildNoiseSelector(extraSelectors) {
154
- const extra = extraSelectors.filter((s) => s.trim().length > 0);
155
- return extra.length === 0
156
- ? BASE_NOISE_SELECTOR
157
- : `${BASE_NOISE_SELECTOR},${extra.join(',')}`;
158
- }
159
41
  const NAVIGATION_ROLES = new Set([
160
42
  'navigation',
161
43
  'banner',
@@ -182,334 +64,410 @@ const INTERACTIVE_CONTENT_ROLES = new Set([
182
64
  'tooltip',
183
65
  'alert',
184
66
  ]);
185
- const BASE_PROMO_TOKENS = [
67
+ const PROMO_TOKENS_ALWAYS = [
186
68
  'banner',
187
69
  'promo',
188
70
  'announcement',
189
71
  'cta',
190
72
  'advert',
191
- 'ad',
192
73
  'ads',
193
74
  'sponsor',
194
- 'newsletter',
195
- 'subscribe',
196
- 'cookie',
197
- 'consent',
198
- 'popup',
199
- 'modal',
200
- 'overlay',
201
- 'toast',
202
- 'share',
203
- 'social',
204
- 'related',
205
75
  'recommend',
206
- 'comment',
207
76
  'breadcrumb',
208
77
  'pagination',
209
78
  'pager',
210
79
  'taglist',
211
80
  ];
212
- const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
213
- const FIXED_PATTERN = /\b(fixed|sticky)\b/;
214
- const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
215
- const ISOLATE_PATTERN = /\bisolate\b/;
216
- class PromoDetector {
217
- tokenCache = null;
218
- regexCache = null;
219
- matches(className, id) {
220
- const regex = this.getRegex();
221
- return regex.test(className) || regex.test(id);
222
- }
223
- getTokens() {
224
- if (this.tokenCache)
225
- return this.tokenCache;
226
- const tokens = new Set(BASE_PROMO_TOKENS);
227
- for (const token of config.noiseRemoval.extraTokens) {
228
- const normalized = token.toLowerCase().trim();
229
- if (normalized)
230
- tokens.add(normalized);
231
- }
232
- this.tokenCache = tokens;
233
- return tokens;
81
+ const PROMO_TOKENS_AGGRESSIVE = ['ad', 'related', 'comment'];
82
+ const PROMO_TOKENS_BY_CATEGORY = {
83
+ 'cookie-banners': ['cookie', 'consent', 'popup', 'modal', 'overlay', 'toast'],
84
+ newsletters: ['newsletter', 'subscribe'],
85
+ 'social-share': ['share', 'social'],
86
+ };
87
+ const BASE_NOISE_SELECTORS = {
88
+ navFooter: 'nav,footer,header[class*="site"],header[class*="nav"],header[class*="menu"],[role="banner"],[role="navigation"]',
89
+ cookieBanners: '[role="dialog"]',
90
+ hidden: '[style*="display: none"],[style*="display:none"],[hidden],[aria-hidden="true"]',
91
+ };
92
+ const NO_MATCH_REGEX = /a^/i;
93
+ // --- State Cache ---
94
+ let cachedContext;
95
+ let lastConfigRef;
96
+ // --- Helpers Inlined/Optimized ---
97
+ function escapeRegexLiteral(value) {
98
+ return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
99
+ }
100
+ function buildTokenRegex(tokens) {
101
+ if (tokens.size === 0)
102
+ return NO_MATCH_REGEX;
103
+ return new RegExp(`(?:^|[^a-z0-9])(?:${[...tokens].map(escapeRegexLiteral).join('|')})(?:$|[^a-z0-9])`, 'i');
104
+ }
105
+ function getPromoMatchers(currentConfig, flags) {
106
+ const baseTokens = new Set(PROMO_TOKENS_ALWAYS);
107
+ const aggressiveTokens = new Set();
108
+ if (currentConfig.aggressiveMode) {
109
+ for (const t of PROMO_TOKENS_AGGRESSIVE)
110
+ aggressiveTokens.add(t);
234
111
  }
235
- getRegex() {
236
- if (this.regexCache)
237
- return this.regexCache;
238
- const tokens = [...this.getTokens()];
239
- const escaped = tokens.map((t) => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
240
- const pattern = `(?:^|[^a-z0-9])(?:${escaped.join('|')})(?:$|[^a-z0-9])`;
241
- this.regexCache = new RegExp(pattern, 'i');
242
- return this.regexCache;
112
+ if (flags.cookieBanners)
113
+ for (const t of PROMO_TOKENS_BY_CATEGORY['cookie-banners'])
114
+ baseTokens.add(t);
115
+ if (flags.newsletters)
116
+ for (const t of PROMO_TOKENS_BY_CATEGORY['newsletters'])
117
+ baseTokens.add(t);
118
+ if (flags.socialShare)
119
+ for (const t of PROMO_TOKENS_BY_CATEGORY['social-share'])
120
+ baseTokens.add(t);
121
+ for (const t of currentConfig.extraTokens) {
122
+ const n = t.toLowerCase().trim();
123
+ if (n)
124
+ baseTokens.add(n);
243
125
  }
126
+ return {
127
+ base: buildTokenRegex(baseTokens),
128
+ aggressive: buildTokenRegex(aggressiveTokens),
129
+ };
244
130
  }
245
- class NoiseClassifier {
246
- promo;
247
- constructor(promo) {
248
- this.promo = promo;
131
+ function getContext() {
132
+ const currentConfig = config.noiseRemoval;
133
+ if (cachedContext && lastConfigRef === currentConfig) {
134
+ return cachedContext;
249
135
  }
250
- isNoise(element) {
251
- const meta = this.readMetadata(element);
252
- if (this.isStructuralNoise(meta, element))
253
- return true;
254
- if (ALWAYS_NOISE_TAGS.has(meta.tagName))
255
- return true;
256
- if (this.isHeaderBoilerplate(meta))
257
- return true;
258
- if (this.isHiddenNoise(meta, element))
136
+ const enabled = new Set(currentConfig.enabledCategories
137
+ .map((c) => {
138
+ const s = c.toLowerCase().trim();
139
+ const { locale } = config.i18n;
140
+ return locale ? s.toLocaleLowerCase(locale) : s;
141
+ })
142
+ .filter(Boolean));
143
+ const isEnabled = (cat) => enabled.has(cat);
144
+ const flags = {
145
+ navFooter: isEnabled('nav-footer'),
146
+ cookieBanners: isEnabled('cookie-banners'),
147
+ newsletters: isEnabled('newsletters'),
148
+ socialShare: isEnabled('social-share'),
149
+ };
150
+ const structuralTags = new Set(BASE_STRUCTURAL_TAGS);
151
+ if (!currentConfig.preserveSvgCanvas) {
152
+ structuralTags.add('svg');
153
+ structuralTags.add('canvas');
154
+ }
155
+ const promoMatchers = getPromoMatchers(currentConfig, flags);
156
+ const extraSelectors = currentConfig.extraSelectors
157
+ .map((s) => s.trim())
158
+ .filter((s) => s.length > 0);
159
+ // Pre-build selectors
160
+ const selectors = [BASE_NOISE_SELECTORS.hidden];
161
+ if (flags.navFooter)
162
+ selectors.push(BASE_NOISE_SELECTORS.navFooter);
163
+ if (flags.cookieBanners)
164
+ selectors.push(BASE_NOISE_SELECTORS.cookieBanners);
165
+ const baseSelector = selectors.join(',');
166
+ const candidateSelector = [
167
+ ...structuralTags,
168
+ ...ALWAYS_NOISE_TAGS,
169
+ 'aside',
170
+ 'header',
171
+ '[class]',
172
+ '[id]',
173
+ '[role]',
174
+ '[style]',
175
+ ].join(',');
176
+ cachedContext = {
177
+ flags,
178
+ structuralTags,
179
+ weights: currentConfig.weights,
180
+ promoMatchers,
181
+ promoEnabled: flags.cookieBanners || flags.newsletters || flags.socialShare,
182
+ extraSelectors,
183
+ baseSelector,
184
+ candidateSelector,
185
+ };
186
+ lastConfigRef = currentConfig;
187
+ return cachedContext;
188
+ }
189
+ // --- Hot Path Logic ---
190
+ function isInteractive(element, role) {
191
+ if (role && INTERACTIVE_CONTENT_ROLES.has(role))
192
+ return true;
193
+ const ds = element.getAttribute('data-state');
194
+ if (ds === 'inactive' || ds === 'closed')
195
+ return true;
196
+ const dataOrientation = element.getAttribute('data-orientation');
197
+ if (dataOrientation === 'horizontal' || dataOrientation === 'vertical')
198
+ return true;
199
+ return (element.hasAttribute('data-accordion-item') ||
200
+ element.hasAttribute('data-radix-collection-item'));
201
+ }
202
+ function isWithinPrimaryContent(element) {
203
+ let current = element;
204
+ while (current) {
205
+ const tagName = current.tagName.toLowerCase();
206
+ if (tagName === 'article' || tagName === 'main')
259
207
  return true;
260
- if (this.isRoleNoise(meta))
208
+ if (current.getAttribute('role') === 'main')
261
209
  return true;
262
- if (this.matchesFixedOrHighZIsolate(meta.className))
210
+ current = current.parentElement;
211
+ }
212
+ return false;
213
+ }
214
+ function shouldPreserve(element, tagName) {
215
+ // Check Dialog
216
+ const role = element.getAttribute('role');
217
+ if (role === 'dialog' || role === 'alertdialog') {
218
+ if (isWithinPrimaryContent(element))
263
219
  return true;
264
- if (this.promo.matches(meta.className, meta.id))
220
+ const textLen = (element.textContent || '').length;
221
+ if (textLen > DIALOG_MIN_CHARS_FOR_PRESERVATION)
265
222
  return true;
266
- return false;
267
- }
268
- readMetadata(element) {
269
- return {
270
- tagName: element.tagName.toLowerCase(),
271
- className: element.getAttribute('class') ?? '',
272
- id: element.getAttribute('id') ?? '',
273
- role: element.getAttribute('role'),
274
- isHidden: this.isHidden(element),
275
- };
276
- }
277
- isStructuralNoise(meta, element) {
278
- if (!STRUCTURAL_TAGS.has(meta.tagName))
279
- return false;
280
- // Interactive structural components (dialogs, menus) are handled elsewhere.
281
- return !this.isInteractiveComponent(element);
223
+ return element.querySelector('h1,h2,h3,h4,h5,h6') !== null;
282
224
  }
283
- isHeaderBoilerplate(meta) {
284
- if (meta.tagName !== 'header')
285
- return false;
286
- if (this.hasNoiseRole(meta.role))
225
+ // Check Nav/Footer
226
+ if (tagName === 'nav' || tagName === 'footer') {
227
+ if (element.querySelector('article,main,section,[role="main"]'))
287
228
  return true;
288
- const combined = `${meta.className} ${meta.id}`.toLowerCase();
289
- return HEADER_NOISE_PATTERN.test(combined);
290
- }
291
- isHiddenNoise(meta, element) {
292
- if (!meta.isHidden)
293
- return false;
294
- // Don't remove hidden interactive components (they may be managed by UI framework state).
295
- return !this.isInteractiveComponent(element);
296
- }
297
- isRoleNoise(meta) {
298
- const isComplementaryAside = meta.tagName === 'aside' && meta.role === 'complementary';
299
- if (isComplementaryAside)
300
- return false;
301
- return this.hasNoiseRole(meta.role);
302
- }
303
- hasNoiseRole(role) {
304
- return role !== null && NAVIGATION_ROLES.has(role);
229
+ return ((element.textContent || '').trim().length >=
230
+ NAV_FOOTER_MIN_CHARS_FOR_PRESERVATION);
305
231
  }
306
- matchesFixedOrHighZIsolate(className) {
307
- return (FIXED_PATTERN.test(className) ||
308
- (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
232
+ return false;
233
+ }
234
+ function removeNodes(nodes) {
235
+ for (let i = nodes.length - 1; i >= 0; i--) {
236
+ const node = nodes[i];
237
+ if (node?.parentNode && !shouldPreserve(node, node.tagName.toLowerCase())) {
238
+ node.remove();
239
+ }
309
240
  }
310
- isHidden(element) {
311
- const style = element.getAttribute('style') ?? '';
312
- return (element.getAttribute('hidden') !== null ||
313
- element.getAttribute('aria-hidden') === 'true' ||
314
- /\bdisplay\s*:\s*none\b/i.test(style) ||
315
- /\bvisibility\s*:\s*hidden\b/i.test(style));
241
+ }
242
+ function scoreNavFooter(tagName, role, className, id, weights) {
243
+ let score = 0;
244
+ if (ALWAYS_NOISE_TAGS.has(tagName))
245
+ score += weights.structural;
246
+ // Header Boilerplate
247
+ if (tagName === 'header') {
248
+ if ((role && NAVIGATION_ROLES.has(role)) ||
249
+ HEADER_NOISE_PATTERN.test(`${className} ${id}`)) {
250
+ score += weights.structural;
251
+ }
316
252
  }
317
- isInteractiveComponent(element) {
318
- const role = element.getAttribute('role');
319
- if (role && INTERACTIVE_CONTENT_ROLES.has(role))
320
- return true;
321
- const dataState = element.getAttribute('data-state');
322
- if (dataState === 'inactive' || dataState === 'closed')
323
- return true;
324
- const dataOrientation = element.getAttribute('data-orientation');
325
- if (dataOrientation === 'horizontal' || dataOrientation === 'vertical')
326
- return true;
327
- if (element.getAttribute('data-accordion-item') !== null)
328
- return true;
329
- if (element.getAttribute('data-radix-collection-item') !== null)
330
- return true;
331
- return false;
253
+ // Role Noise
254
+ if (role && NAVIGATION_ROLES.has(role)) {
255
+ if (tagName !== 'aside' || role !== 'complementary') {
256
+ score += weights.structural;
257
+ }
332
258
  }
259
+ return score;
333
260
  }
334
- class NoiseStripper {
335
- classifier;
336
- constructor(classifier) {
337
- this.classifier = classifier;
261
+ function isNoiseElement(element, context) {
262
+ const tagName = element.tagName.toLowerCase();
263
+ const className = element.getAttribute('class') ?? '';
264
+ const id = element.getAttribute('id') ?? '';
265
+ const role = element.getAttribute('role');
266
+ const _isInteractive = isInteractive(element, role);
267
+ const style = element.getAttribute('style');
268
+ const isHidden = element.hasAttribute('hidden') ||
269
+ element.getAttribute('aria-hidden') === 'true' ||
270
+ (style !== null &&
271
+ /\b(?:display\s*:\s*none|visibility\s*:\s*hidden)\b/i.test(style));
272
+ let score = 0;
273
+ const { weights } = context;
274
+ // Structural
275
+ if (context.structuralTags.has(tagName) && !_isInteractive) {
276
+ score += weights.structural;
338
277
  }
339
- strip(document) {
340
- this.removeBySelector(document, buildNoiseSelector(config.noiseRemoval.extraSelectors),
341
- /* checkNoise */ false);
342
- this.removeBySelector(document, CANDIDATE_NOISE_SELECTOR,
343
- /* checkNoise */ true);
278
+ // Nav/Footer Scoring
279
+ if (context.flags.navFooter) {
280
+ score += scoreNavFooter(tagName, role, className, id, weights);
344
281
  }
345
- removeBySelector(document, selector, checkNoise) {
346
- const nodes = document.querySelectorAll(selector);
347
- removeNodes(nodes, (node) => {
348
- if (!isElement(node))
349
- return false;
350
- return checkNoise ? this.classifier.isNoise(node) : true;
351
- });
282
+ // Hidden
283
+ if (isHidden && !_isInteractive) {
284
+ score += weights.hidden;
352
285
  }
353
- }
354
- /* -------------------------------------------------------------------------------------------------
355
- * Relative URL resolution
356
- * ------------------------------------------------------------------------------------------------- */
357
- const SKIP_URL_PREFIXES = [
358
- '#',
359
- 'java' + 'script:',
360
- 'mailto:',
361
- 'tel:',
362
- 'data:',
363
- 'blob:',
364
- ];
365
- function shouldSkipUrlResolution(url) {
366
- const normalized = url.trim().toLowerCase();
367
- return SKIP_URL_PREFIXES.some((prefix) => normalized.startsWith(prefix));
368
- }
369
- function tryResolveUrl(relativeUrl, baseUrl) {
370
- try {
371
- return new URL(relativeUrl, baseUrl).href;
286
+ // Sticky/Fixed
287
+ if (FIXED_OR_HIGH_Z_PATTERN.test(className)) {
288
+ score += weights.stickyFixed;
372
289
  }
373
- catch {
374
- return null;
290
+ // Promo
291
+ if (context.promoEnabled) {
292
+ const aggTest = context.promoMatchers.aggressive.test(className) ||
293
+ context.promoMatchers.aggressive.test(id);
294
+ const isAggressiveMatch = aggTest && !isWithinPrimaryContent(element);
295
+ const isBaseMatch = !aggTest &&
296
+ (context.promoMatchers.base.test(className) ||
297
+ context.promoMatchers.base.test(id));
298
+ if (isAggressiveMatch || isBaseMatch) {
299
+ score += weights.promo;
300
+ }
375
301
  }
302
+ return score >= weights.threshold;
376
303
  }
377
- class RelativeUrlResolver {
378
- resolve(document, baseUrl) {
379
- let base;
380
- try {
381
- base = new URL(baseUrl);
382
- }
383
- catch {
384
- // invalid base URL - skip resolution
385
- return;
386
- }
387
- for (const element of document.querySelectorAll('a[href], img[src], source[srcset]')) {
388
- const tag = element.tagName.toLowerCase();
389
- if (tag === 'a')
390
- this.resolveAnchor(element, base);
391
- else if (tag === 'img')
392
- this.resolveImage(element, base);
393
- else if (tag === 'source')
394
- this.resolveSource(element, base);
304
+ function cleanHeadingWrapperDivs(h) {
305
+ const divs = h.querySelectorAll('div');
306
+ for (let j = divs.length - 1; j >= 0; j--) {
307
+ const d = divs[j];
308
+ if (!d?.parentNode)
309
+ continue;
310
+ const cls = d.getAttribute('class') ?? '';
311
+ const stl = d.getAttribute('style') ?? '';
312
+ if (cls.includes('absolute') ||
313
+ stl.includes('position') ||
314
+ d.getAttribute('tabindex') === '-1') {
315
+ d.remove();
395
316
  }
396
317
  }
397
- resolveAnchor(element, base) {
398
- const href = element.getAttribute('href');
399
- if (!href || shouldSkipUrlResolution(href))
400
- return;
401
- const resolved = tryResolveUrl(href, base);
402
- if (resolved)
403
- element.setAttribute('href', resolved);
318
+ }
319
+ function cleanHeadingAnchors(h) {
320
+ const anchors = h.querySelectorAll('a');
321
+ for (let j = anchors.length - 1; j >= 0; j--) {
322
+ const a = anchors[j];
323
+ if (!a?.parentNode)
324
+ continue;
325
+ const href = a.getAttribute('href') ?? '';
326
+ const txt = (a.textContent || '').replace(/[\u200B\s]/g, '');
327
+ if (href.startsWith('#') && txt.length === 0) {
328
+ a.remove();
329
+ }
404
330
  }
405
- resolveImage(element, base) {
406
- const src = element.getAttribute('src');
407
- if (!src || shouldSkipUrlResolution(src))
408
- return;
409
- const resolved = tryResolveUrl(src, base);
410
- if (resolved)
411
- element.setAttribute('src', resolved);
331
+ }
332
+ function cleanHeadingZeroWidth(h, document) {
333
+ const walker = document.createTreeWalker(h, 4); // SHOW_TEXT
334
+ let node;
335
+ while ((node = walker.nextNode())) {
336
+ if (node.textContent?.includes('\u200B')) {
337
+ node.textContent = node.textContent.replace(/\u200B/g, '');
338
+ }
412
339
  }
413
- /**
414
- * Keep original behavior: srcset entries are always attempted to be resolved (no prefix skipping).
415
- */
416
- resolveSource(element, base) {
417
- const srcset = element.getAttribute('srcset');
418
- if (!srcset)
419
- return;
420
- const resolved = srcset
421
- .split(',')
422
- .map((entry) => {
423
- const parts = entry.trim().split(/\s+/);
424
- const url = parts[0];
425
- if (url) {
426
- const resolvedUrl = tryResolveUrl(url, base);
427
- if (resolvedUrl)
428
- parts[0] = resolvedUrl;
429
- }
430
- return parts.join(' ');
431
- })
432
- .join(', ');
433
- element.setAttribute('srcset', resolved);
340
+ }
341
+ function cleanHeadings(document) {
342
+ // Clean Heading Anchors
343
+ const headings = document.querySelectorAll('h1,h2,h3,h4,h5,h6');
344
+ for (const h of headings) {
345
+ if (!h.parentNode)
346
+ continue;
347
+ cleanHeadingWrapperDivs(h);
348
+ cleanHeadingAnchors(h);
349
+ cleanHeadingZeroWidth(h, document);
434
350
  }
435
351
  }
436
- /* -------------------------------------------------------------------------------------------------
437
- * Serialization
438
- * ------------------------------------------------------------------------------------------------- */
439
- class DocumentSerializer {
440
- /**
441
- * Preserve existing behavior:
442
- * - Prefer body.innerHTML only if it has "substantial" content (> 100 chars).
443
- * - Otherwise fall back to document.toString(), then documentElement.outerHTML, then original HTML.
444
- */
445
- serialize(document, fallbackHtml) {
446
- const bodyInner = this.getBodyInnerHtml(document);
447
- if (bodyInner && bodyInner.trim().length > 100)
448
- return bodyInner;
449
- const toStringFn = this.getDocumentToString(document);
450
- if (toStringFn)
451
- return toStringFn();
452
- const outer = this.getDocumentElementOuterHtml(document);
453
- if (outer)
454
- return outer;
455
- return fallbackHtml;
352
+ function stripNoise(document, context) {
353
+ cleanHeadings(document);
354
+ // Remove Base & Extra
355
+ const { baseSelector, extraSelectors } = context;
356
+ // Base
357
+ const baseNodes = document.querySelectorAll(baseSelector);
358
+ removeNodes(baseNodes);
359
+ // Extra
360
+ if (extraSelectors.length > 0) {
361
+ const combinedExtra = extraSelectors.join(',');
362
+ const extraNodes = document.querySelectorAll(combinedExtra);
363
+ removeNodes(extraNodes);
456
364
  }
457
- getBodyInnerHtml(document) {
458
- if (!isObject(document))
459
- return undefined;
460
- const { body } = document;
461
- if (isObject(body) &&
462
- typeof body.innerHTML === 'string') {
463
- return body.innerHTML;
365
+ // Candidates
366
+ const candidates = document.querySelectorAll(context.candidateSelector);
367
+ for (let i = candidates.length - 1; i >= 0; i--) {
368
+ const node = candidates[i];
369
+ if (!node)
370
+ continue;
371
+ if (!node.parentNode)
372
+ continue;
373
+ if (shouldPreserve(node, node.tagName.toLowerCase()))
374
+ continue;
375
+ if (isNoiseElement(node, context)) {
376
+ node.remove();
464
377
  }
465
- return undefined;
466
378
  }
467
- getDocumentToString(document) {
468
- if (!isObject(document))
469
- return undefined;
470
- const fn = document.toString;
471
- if (typeof fn === 'function')
472
- return fn.bind(document);
473
- return undefined;
474
- }
475
- getDocumentElementOuterHtml(document) {
476
- if (!isObject(document))
477
- return undefined;
478
- const docEl = document.documentElement;
479
- if (isObject(docEl) &&
480
- typeof docEl.outerHTML === 'string') {
481
- return docEl.outerHTML;
379
+ }
380
+ function processUrlElement(el, attr, base, isSrcset) {
381
+ if (!el.parentNode)
382
+ return;
383
+ if (isSrcset) {
384
+ const val = el.getAttribute(attr);
385
+ if (val) {
386
+ const newVal = val
387
+ .split(',')
388
+ .map((entry) => {
389
+ const parts = entry.trim().split(/\s+/);
390
+ if (!parts[0])
391
+ return entry;
392
+ try {
393
+ parts[0] = new URL(parts[0], base).href;
394
+ }
395
+ catch {
396
+ /* ignore */
397
+ }
398
+ return parts.join(' ');
399
+ })
400
+ .join(', ');
401
+ el.setAttribute(attr, newVal);
482
402
  }
483
- return undefined;
403
+ return;
484
404
  }
485
- }
486
- /* -------------------------------------------------------------------------------------------------
487
- * Public pipeline
488
- * ------------------------------------------------------------------------------------------------- */
489
- class HtmlNoiseRemovalPipeline {
490
- promo = new PromoDetector();
491
- classifier = new NoiseClassifier(this.promo);
492
- stripper = new NoiseStripper(this.classifier);
493
- urlResolver = new RelativeUrlResolver();
494
- serializer = new DocumentSerializer();
495
- removeNoise(html, document, baseUrl) {
496
- const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
497
- if (!shouldParse)
498
- return html;
405
+ const val = el.getAttribute(attr);
406
+ if (val &&
407
+ !SKIP_URL_PREFIXES.some((p) => val.trim().toLowerCase().startsWith(p))) {
499
408
  try {
500
- const resolvedDocument = document ?? parseHTML(html).document;
501
- this.stripper.strip(resolvedDocument);
502
- if (baseUrl) {
503
- this.urlResolver.resolve(resolvedDocument, baseUrl);
504
- }
505
- return this.serializer.serialize(resolvedDocument, html);
409
+ el.setAttribute(attr, new URL(val, base).href);
506
410
  }
507
411
  catch {
508
- return html;
412
+ /* ignore */
509
413
  }
510
414
  }
511
415
  }
512
- const pipeline = new HtmlNoiseRemovalPipeline();
416
+ function resolveUrls(document, baseUrlStr) {
417
+ let base;
418
+ try {
419
+ base = new URL(baseUrlStr);
420
+ }
421
+ catch {
422
+ return;
423
+ }
424
+ const elements = document.querySelectorAll('a[href],img[src],source[srcset]');
425
+ for (const el of Array.from(elements)) {
426
+ const tag = el.tagName.toLowerCase();
427
+ if (tag === 'a')
428
+ processUrlElement(el, 'href', base, false);
429
+ else if (tag === 'img')
430
+ processUrlElement(el, 'src', base, false);
431
+ else if (tag === 'source')
432
+ processUrlElement(el, 'srcset', base, true);
433
+ }
434
+ }
435
+ function serialize(document, fallback) {
436
+ const bodyHtml = document.body.innerHTML;
437
+ if (bodyHtml.trim().length > MIN_BODY_CONTENT_LENGTH)
438
+ return bodyHtml;
439
+ const outerHtml = document.documentElement.outerHTML;
440
+ if (outerHtml.trim().length > MIN_BODY_CONTENT_LENGTH)
441
+ return outerHtml;
442
+ return fallback;
443
+ }
444
+ function isFullDocumentHtml(html) {
445
+ return HTML_DOCUMENT_MARKERS.test(html);
446
+ }
447
+ function mayContainNoise(html) {
448
+ const sample = html.length <= NOISE_SCAN_LIMIT
449
+ ? html
450
+ : `${html.substring(0, NOISE_SCAN_LIMIT)}\n${html.substring(html.length - NOISE_SCAN_LIMIT)}`;
451
+ return NOISE_PATTERNS.some((re) => re.test(sample));
452
+ }
513
453
  export function removeNoiseFromHtml(html, document, baseUrl) {
514
- return pipeline.removeNoise(html, document, baseUrl);
454
+ const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
455
+ if (!shouldParse)
456
+ return html;
457
+ try {
458
+ const context = getContext();
459
+ if (config.noiseRemoval.debug) {
460
+ logDebug('Noise removal audit enabled', {
461
+ categories: [...(context.flags.navFooter ? ['nav-footer'] : [])],
462
+ });
463
+ }
464
+ const doc = document ?? parseHTML(html).document;
465
+ stripNoise(doc, context);
466
+ if (baseUrl)
467
+ resolveUrls(doc, baseUrl);
468
+ return serialize(doc, html);
469
+ }
470
+ catch {
471
+ return html;
472
+ }
515
473
  }