@j0hanz/superfetch 2.4.2 → 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,43 +5,112 @@
5
5
  import { parseHTML } from 'linkedom';
6
6
  import { config } from './config.js';
7
7
  import { isObject } from './type-guards.js';
8
- // ─────────────────────────────────────────────────────────────────────────────
9
- // DOM Type Guards and Accessors
10
- // ─────────────────────────────────────────────────────────────────────────────
8
+ /* -------------------------------------------------------------------------------------------------
9
+ * DOM guards & small helpers
10
+ * ------------------------------------------------------------------------------------------------- */
11
11
  function isElement(node) {
12
12
  return (isObject(node) &&
13
13
  'getAttribute' in node &&
14
14
  typeof node.getAttribute === 'function');
15
15
  }
16
- function getBodyInnerHtml(document) {
17
- if (!isObject(document))
18
- return undefined;
19
- const { body } = document;
20
- if (isObject(body) && typeof body.innerHTML === 'string') {
21
- return body.innerHTML;
22
- }
23
- return undefined;
16
+ function isNodeListLike(value) {
17
+ return (isObject(value) &&
18
+ typeof value.length === 'number');
24
19
  }
25
- function getDocumentToString(document) {
26
- if (!isObject(document))
27
- return undefined;
28
- if (typeof document.toString === 'function') {
29
- return document.toString.bind(document);
20
+ function getNodeListItem(nodes, index) {
21
+ if ('item' in nodes && typeof nodes.item === 'function') {
22
+ return nodes.item(index);
30
23
  }
31
- return undefined;
24
+ return nodes[index] ?? null;
32
25
  }
33
- function getDocumentElementOuterHtml(document) {
34
- if (!isObject(document))
35
- return undefined;
36
- const docEl = document.documentElement;
37
- if (isObject(docEl) && typeof docEl.outerHTML === 'string') {
38
- return docEl.outerHTML;
26
+ /**
27
+ * Remove nodes from a list/iterable.
28
+ * - For NodeList-like collections we iterate backwards to be safe with live collections.
29
+ * - For iterables we snapshot into an array first.
30
+ */
31
+ function removeNodes(nodes, shouldRemove) {
32
+ if (isNodeListLike(nodes)) {
33
+ for (let i = nodes.length - 1; i >= 0; i -= 1) {
34
+ const node = getNodeListItem(nodes, i);
35
+ if (node && shouldRemove(node))
36
+ node.remove();
37
+ }
38
+ return;
39
+ }
40
+ for (const node of nodes) {
41
+ if (shouldRemove(node))
42
+ node.remove();
39
43
  }
40
- return undefined;
41
44
  }
42
- // ─────────────────────────────────────────────────────────────────────────────
43
- // Noise Detection Constants
44
- // ─────────────────────────────────────────────────────────────────────────────
45
+ /* -------------------------------------------------------------------------------------------------
46
+ * Fast-path parsing heuristics
47
+ * ------------------------------------------------------------------------------------------------- */
48
+ const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
49
+ function isFullDocumentHtml(html) {
50
+ return HTML_DOCUMENT_MARKERS.test(html);
51
+ }
52
+ const NOISE_SCAN_LIMIT = 50_000;
53
+ const NOISE_MARKERS = [
54
+ '<script',
55
+ '<style',
56
+ '<noscript',
57
+ '<iframe',
58
+ '<nav',
59
+ '<footer',
60
+ '<header',
61
+ '<form',
62
+ '<button',
63
+ '<input',
64
+ '<select',
65
+ '<textarea',
66
+ '<svg',
67
+ '<canvas',
68
+ ' aria-hidden="true"',
69
+ " aria-hidden='true'",
70
+ ' hidden',
71
+ ' role="navigation"',
72
+ " role='navigation'",
73
+ ' role="banner"',
74
+ " role='banner'",
75
+ ' role="complementary"',
76
+ " role='complementary'",
77
+ ' role="contentinfo"',
78
+ " role='contentinfo'",
79
+ ' role="tree"',
80
+ " role='tree'",
81
+ ' role="menubar"',
82
+ " role='menubar'",
83
+ ' role="menu"',
84
+ " role='menu'",
85
+ ' banner',
86
+ ' promo',
87
+ ' announcement',
88
+ ' cta',
89
+ ' advert',
90
+ ' newsletter',
91
+ ' subscribe',
92
+ ' cookie',
93
+ ' consent',
94
+ ' popup',
95
+ ' modal',
96
+ ' overlay',
97
+ ' toast',
98
+ ' fixed',
99
+ ' sticky',
100
+ ' z-50',
101
+ ' z-4',
102
+ ' isolate',
103
+ ' breadcrumb',
104
+ ' pagination',
105
+ ];
106
+ function mayContainNoise(html) {
107
+ const sample = html.length > NOISE_SCAN_LIMIT ? html.slice(0, NOISE_SCAN_LIMIT) : html;
108
+ const haystack = sample.toLowerCase();
109
+ return NOISE_MARKERS.some((marker) => haystack.includes(marker));
110
+ }
111
+ /* -------------------------------------------------------------------------------------------------
112
+ * Noise selectors & classification
113
+ * ------------------------------------------------------------------------------------------------- */
45
114
  const STRUCTURAL_TAGS = new Set([
46
115
  'script',
47
116
  'style',
@@ -82,10 +151,10 @@ const CANDIDATE_NOISE_SELECTOR = [
82
151
  '[style]',
83
152
  ].join(',');
84
153
  function buildNoiseSelector(extraSelectors) {
85
- const extra = extraSelectors.filter((selector) => selector.trim().length > 0);
86
- if (extra.length === 0)
87
- return BASE_NOISE_SELECTOR;
88
- return `${BASE_NOISE_SELECTOR},${extra.join(',')}`;
154
+ const extra = extraSelectors.filter((s) => s.trim().length > 0);
155
+ return extra.length === 0
156
+ ? BASE_NOISE_SELECTOR
157
+ : `${BASE_NOISE_SELECTOR},${extra.join(',')}`;
89
158
  }
90
159
  const NAVIGATION_ROLES = new Set([
91
160
  'navigation',
@@ -140,214 +209,151 @@ const BASE_PROMO_TOKENS = [
140
209
  'pager',
141
210
  'taglist',
142
211
  ];
143
- /**
144
- * Get promo tokens merged with any user-configured extra tokens.
145
- * Memoized because it is used in hot paths when scanning many nodes.
146
- */
147
- let promoTokensCache = null;
148
- function getPromoTokens() {
149
- if (promoTokensCache)
150
- return promoTokensCache;
151
- const tokens = new Set(BASE_PROMO_TOKENS);
152
- for (const token of config.noiseRemoval.extraTokens) {
153
- const normalized = token.toLowerCase().trim();
154
- if (normalized)
155
- tokens.add(normalized);
156
- }
157
- promoTokensCache = tokens;
158
- return tokens;
159
- }
160
- let promoRegexCache = null;
161
- function getPromoRegex() {
162
- if (promoRegexCache)
163
- return promoRegexCache;
164
- const tokens = Array.from(getPromoTokens());
165
- const escaped = tokens.map((t) => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
166
- const pattern = `(?:^|[^a-z0-9])(?:${escaped.join('|')})(?:$|[^a-z0-9])`;
167
- promoRegexCache = new RegExp(pattern, 'i');
168
- return promoRegexCache;
169
- }
170
212
  const HEADER_NOISE_PATTERN = /\b(site-header|masthead|topbar|navbar|nav(?:bar)?|menu|header-nav)\b/i;
171
213
  const FIXED_PATTERN = /\b(fixed|sticky)\b/;
172
214
  const HIGH_Z_PATTERN = /\bz-(?:4\d|50)\b/;
173
215
  const ISOLATE_PATTERN = /\bisolate\b/;
174
- const HTML_DOCUMENT_MARKERS = /<\s*(?:!doctype|html|head|body)\b/i;
175
- const NOISE_MARKERS = [
176
- '<script',
177
- '<style',
178
- '<noscript',
179
- '<iframe',
180
- '<nav',
181
- '<footer',
182
- '<header',
183
- '<form',
184
- '<button',
185
- '<input',
186
- '<select',
187
- '<textarea',
188
- '<svg',
189
- '<canvas',
190
- ' aria-hidden="true"',
191
- " aria-hidden='true'",
192
- ' hidden',
193
- ' role="navigation"',
194
- " role='navigation'",
195
- ' role="banner"',
196
- " role='banner'",
197
- ' role="complementary"',
198
- " role='complementary'",
199
- ' role="contentinfo"',
200
- " role='contentinfo'",
201
- ' role="tree"',
202
- " role='tree'",
203
- ' role="menubar"',
204
- " role='menubar'",
205
- ' role="menu"',
206
- " role='menu'",
207
- ' banner',
208
- ' promo',
209
- ' announcement',
210
- ' cta',
211
- ' advert',
212
- ' newsletter',
213
- ' subscribe',
214
- ' cookie',
215
- ' consent',
216
- ' popup',
217
- ' modal',
218
- ' overlay',
219
- ' toast',
220
- ' fixed',
221
- ' sticky',
222
- ' z-50',
223
- ' z-4',
224
- ' isolate',
225
- ' breadcrumb',
226
- ' pagination',
227
- ];
228
- // ─────────────────────────────────────────────────────────────────────────────
229
- // Noise Detection Functions
230
- // ─────────────────────────────────────────────────────────────────────────────
231
- const NOISE_SCAN_LIMIT = 50_000;
232
- function mayContainNoise(html) {
233
- // Fast path: only scan a bounded prefix; parsing is the expensive step anyway.
234
- // Most noise markers appear near the top of the document (nav, scripts, meta, etc.).
235
- const sample = html.length > NOISE_SCAN_LIMIT ? html.slice(0, NOISE_SCAN_LIMIT) : html;
236
- const haystack = sample.toLowerCase();
237
- return NOISE_MARKERS.some((marker) => haystack.includes(marker));
238
- }
239
- function isFullDocumentHtml(html) {
240
- return HTML_DOCUMENT_MARKERS.test(html);
241
- }
242
- function isStructuralNoiseTag(tagName) {
243
- return STRUCTURAL_TAGS.has(tagName);
244
- }
245
- function isInteractiveComponent(element) {
246
- const role = element.getAttribute('role');
247
- if (role && INTERACTIVE_CONTENT_ROLES.has(role))
248
- return true;
249
- // Check for common UI framework data attributes that indicate managed visibility
250
- const dataState = element.getAttribute('data-state');
251
- if (dataState === 'inactive' || dataState === 'closed')
252
- return true;
253
- const dataOrientation = element.getAttribute('data-orientation');
254
- if (dataOrientation === 'horizontal' || dataOrientation === 'vertical') {
255
- return true;
216
+ class PromoDetector {
217
+ tokenCache = null;
218
+ regexCache = null;
219
+ matches(className, id) {
220
+ const regex = this.getRegex();
221
+ return regex.test(className) || regex.test(id);
256
222
  }
257
- // Check for accordion/collapse patterns
258
- if (element.getAttribute('data-accordion-item') !== null)
259
- return true;
260
- if (element.getAttribute('data-radix-collection-item') !== null)
261
- return true;
262
- return false;
263
- }
264
- function isElementHidden(element) {
265
- const style = element.getAttribute('style') ?? '';
266
- return (element.getAttribute('hidden') !== null ||
267
- element.getAttribute('aria-hidden') === 'true' ||
268
- /\bdisplay\s*:\s*none\b/i.test(style) ||
269
- /\bvisibility\s*:\s*hidden\b/i.test(style));
270
- }
271
- function hasNoiseRole(role) {
272
- return role !== null && NAVIGATION_ROLES.has(role);
273
- }
274
- function matchesPromoIdOrClass(className, id) {
275
- const regex = getPromoRegex();
276
- return regex.test(className) || regex.test(id);
277
- }
278
- function matchesFixedOrHighZIsolate(className) {
279
- return (FIXED_PATTERN.test(className) ||
280
- (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
281
- }
282
- function readElementMetadata(element) {
283
- return {
284
- tagName: element.tagName.toLowerCase(),
285
- className: element.getAttribute('class') ?? '',
286
- id: element.getAttribute('id') ?? '',
287
- role: element.getAttribute('role'),
288
- isHidden: isElementHidden(element),
289
- };
290
- }
291
- function isBoilerplateHeader({ className, id, role, }) {
292
- if (hasNoiseRole(role))
293
- return true;
294
- const combined = `${className} ${id}`.toLowerCase();
295
- return HEADER_NOISE_PATTERN.test(combined);
296
- }
297
- function isNoiseElement(node) {
298
- const metadata = readElementMetadata(node);
299
- const isComplementaryAside = metadata.tagName === 'aside' && metadata.role === 'complementary';
300
- const shouldCheckHidden = metadata.isHidden && !isInteractiveComponent(node);
301
- const isInteractiveStructural = isStructuralNoiseTag(metadata.tagName) && isInteractiveComponent(node);
302
- return ((isStructuralNoiseTag(metadata.tagName) && !isInteractiveStructural) ||
303
- ALWAYS_NOISE_TAGS.has(metadata.tagName) ||
304
- (metadata.tagName === 'header' && isBoilerplateHeader(metadata)) ||
305
- shouldCheckHidden ||
306
- (!isComplementaryAside && hasNoiseRole(metadata.role)) ||
307
- matchesFixedOrHighZIsolate(metadata.className) ||
308
- matchesPromoIdOrClass(metadata.className, metadata.id));
309
- }
310
- function isNodeListLike(value) {
311
- return isObject(value) && typeof value.length === 'number';
312
- }
313
- function tryGetNodeListItem(nodes, index) {
314
- if ('item' in nodes && typeof nodes.item === 'function') {
315
- return nodes.item(index);
316
- }
317
- return nodes[index] ?? null;
318
- }
319
- function removeNoiseFromNodeListLike(nodes, shouldCheckNoise) {
320
- for (let index = nodes.length - 1; index >= 0; index -= 1) {
321
- const node = tryGetNodeListItem(nodes, index);
322
- if (!node)
323
- continue;
324
- if (isElement(node) && (!shouldCheckNoise || isNoiseElement(node))) {
325
- node.remove();
223
+ getTokens() {
224
+ if (this.tokenCache)
225
+ return this.tokenCache;
226
+ const tokens = new Set(BASE_PROMO_TOKENS);
227
+ for (const token of config.noiseRemoval.extraTokens) {
228
+ const normalized = token.toLowerCase().trim();
229
+ if (normalized)
230
+ tokens.add(normalized);
326
231
  }
232
+ this.tokenCache = tokens;
233
+ return tokens;
234
+ }
235
+ getRegex() {
236
+ if (this.regexCache)
237
+ return this.regexCache;
238
+ const tokens = [...this.getTokens()];
239
+ const escaped = tokens.map((t) => t.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
240
+ const pattern = `(?:^|[^a-z0-9])(?:${escaped.join('|')})(?:$|[^a-z0-9])`;
241
+ this.regexCache = new RegExp(pattern, 'i');
242
+ return this.regexCache;
327
243
  }
328
244
  }
329
- function removeNoiseNodes(nodes, shouldCheckNoise = true) {
330
- if (isNodeListLike(nodes)) {
331
- removeNoiseFromNodeListLike(nodes, shouldCheckNoise);
332
- return;
245
+ class NoiseClassifier {
246
+ promo;
247
+ constructor(promo) {
248
+ this.promo = promo;
333
249
  }
334
- const nodeList = Array.from(nodes);
335
- for (const node of nodeList) {
336
- if (isElement(node) && (!shouldCheckNoise || isNoiseElement(node))) {
337
- node.remove();
338
- }
250
+ isNoise(element) {
251
+ const meta = this.readMetadata(element);
252
+ if (this.isStructuralNoise(meta, element))
253
+ return true;
254
+ if (ALWAYS_NOISE_TAGS.has(meta.tagName))
255
+ return true;
256
+ if (this.isHeaderBoilerplate(meta))
257
+ return true;
258
+ if (this.isHiddenNoise(meta, element))
259
+ return true;
260
+ if (this.isRoleNoise(meta))
261
+ return true;
262
+ if (this.matchesFixedOrHighZIsolate(meta.className))
263
+ return true;
264
+ if (this.promo.matches(meta.className, meta.id))
265
+ return true;
266
+ return false;
267
+ }
268
+ readMetadata(element) {
269
+ return {
270
+ tagName: element.tagName.toLowerCase(),
271
+ className: element.getAttribute('class') ?? '',
272
+ id: element.getAttribute('id') ?? '',
273
+ role: element.getAttribute('role'),
274
+ isHidden: this.isHidden(element),
275
+ };
276
+ }
277
+ isStructuralNoise(meta, element) {
278
+ if (!STRUCTURAL_TAGS.has(meta.tagName))
279
+ return false;
280
+ // Interactive structural components (dialogs, menus) are handled elsewhere.
281
+ return !this.isInteractiveComponent(element);
282
+ }
283
+ isHeaderBoilerplate(meta) {
284
+ if (meta.tagName !== 'header')
285
+ return false;
286
+ if (this.hasNoiseRole(meta.role))
287
+ return true;
288
+ const combined = `${meta.className} ${meta.id}`.toLowerCase();
289
+ return HEADER_NOISE_PATTERN.test(combined);
290
+ }
291
+ isHiddenNoise(meta, element) {
292
+ if (!meta.isHidden)
293
+ return false;
294
+ // Don't remove hidden interactive components (they may be managed by UI framework state).
295
+ return !this.isInteractiveComponent(element);
296
+ }
297
+ isRoleNoise(meta) {
298
+ const isComplementaryAside = meta.tagName === 'aside' && meta.role === 'complementary';
299
+ if (isComplementaryAside)
300
+ return false;
301
+ return this.hasNoiseRole(meta.role);
302
+ }
303
+ hasNoiseRole(role) {
304
+ return role !== null && NAVIGATION_ROLES.has(role);
305
+ }
306
+ matchesFixedOrHighZIsolate(className) {
307
+ return (FIXED_PATTERN.test(className) ||
308
+ (HIGH_Z_PATTERN.test(className) && ISOLATE_PATTERN.test(className)));
309
+ }
310
+ isHidden(element) {
311
+ const style = element.getAttribute('style') ?? '';
312
+ return (element.getAttribute('hidden') !== null ||
313
+ element.getAttribute('aria-hidden') === 'true' ||
314
+ /\bdisplay\s*:\s*none\b/i.test(style) ||
315
+ /\bvisibility\s*:\s*hidden\b/i.test(style));
316
+ }
317
+ isInteractiveComponent(element) {
318
+ const role = element.getAttribute('role');
319
+ if (role && INTERACTIVE_CONTENT_ROLES.has(role))
320
+ return true;
321
+ const dataState = element.getAttribute('data-state');
322
+ if (dataState === 'inactive' || dataState === 'closed')
323
+ return true;
324
+ const dataOrientation = element.getAttribute('data-orientation');
325
+ if (dataOrientation === 'horizontal' || dataOrientation === 'vertical')
326
+ return true;
327
+ if (element.getAttribute('data-accordion-item') !== null)
328
+ return true;
329
+ if (element.getAttribute('data-radix-collection-item') !== null)
330
+ return true;
331
+ return false;
339
332
  }
340
333
  }
341
- function stripNoiseNodes(document) {
342
- const targetSelectors = buildNoiseSelector(config.noiseRemoval.extraSelectors);
343
- const potentialNoiseNodes = document.querySelectorAll(targetSelectors);
344
- removeNoiseNodes(potentialNoiseNodes, false);
345
- const allElements = document.querySelectorAll(CANDIDATE_NOISE_SELECTOR);
346
- removeNoiseNodes(allElements, true);
334
+ class NoiseStripper {
335
+ classifier;
336
+ constructor(classifier) {
337
+ this.classifier = classifier;
338
+ }
339
+ strip(document) {
340
+ this.removeBySelector(document, buildNoiseSelector(config.noiseRemoval.extraSelectors),
341
+ /* checkNoise */ false);
342
+ this.removeBySelector(document, CANDIDATE_NOISE_SELECTOR,
343
+ /* checkNoise */ true);
344
+ }
345
+ removeBySelector(document, selector, checkNoise) {
346
+ const nodes = document.querySelectorAll(selector);
347
+ removeNodes(nodes, (node) => {
348
+ if (!isElement(node))
349
+ return false;
350
+ return checkNoise ? this.classifier.isNoise(node) : true;
351
+ });
352
+ }
347
353
  }
348
- // ─────────────────────────────────────────────────────────────────────────────
349
- // URL Resolution
350
- // ─────────────────────────────────────────────────────────────────────────────
354
+ /* -------------------------------------------------------------------------------------------------
355
+ * Relative URL resolution
356
+ * ------------------------------------------------------------------------------------------------- */
351
357
  const SKIP_URL_PREFIXES = [
352
358
  '#',
353
359
  'java' + 'script:',
@@ -360,9 +366,6 @@ function shouldSkipUrlResolution(url) {
360
366
  const normalized = url.trim().toLowerCase();
361
367
  return SKIP_URL_PREFIXES.some((prefix) => normalized.startsWith(prefix));
362
368
  }
363
- /**
364
- * Safely resolve a relative URL to absolute using base URL.
365
- */
366
369
  function tryResolveUrl(relativeUrl, baseUrl) {
367
370
  try {
368
371
  return new URL(relativeUrl, baseUrl).href;
@@ -371,87 +374,142 @@ function tryResolveUrl(relativeUrl, baseUrl) {
371
374
  return null;
372
375
  }
373
376
  }
374
- function resolveAnchorElement(element, base) {
375
- const href = element.getAttribute('href');
376
- if (href && !shouldSkipUrlResolution(href)) {
377
+ class RelativeUrlResolver {
378
+ resolve(document, baseUrl) {
379
+ let base;
380
+ try {
381
+ base = new URL(baseUrl);
382
+ }
383
+ catch {
384
+ // invalid base URL - skip resolution
385
+ return;
386
+ }
387
+ for (const element of document.querySelectorAll('a[href], img[src], source[srcset]')) {
388
+ const tag = element.tagName.toLowerCase();
389
+ if (tag === 'a')
390
+ this.resolveAnchor(element, base);
391
+ else if (tag === 'img')
392
+ this.resolveImage(element, base);
393
+ else if (tag === 'source')
394
+ this.resolveSource(element, base);
395
+ }
396
+ }
397
+ resolveAnchor(element, base) {
398
+ const href = element.getAttribute('href');
399
+ if (!href || shouldSkipUrlResolution(href))
400
+ return;
377
401
  const resolved = tryResolveUrl(href, base);
378
402
  if (resolved)
379
403
  element.setAttribute('href', resolved);
380
404
  }
381
- }
382
- function resolveImageElement(element, base) {
383
- const src = element.getAttribute('src');
384
- if (src && !shouldSkipUrlResolution(src)) {
405
+ resolveImage(element, base) {
406
+ const src = element.getAttribute('src');
407
+ if (!src || shouldSkipUrlResolution(src))
408
+ return;
385
409
  const resolved = tryResolveUrl(src, base);
386
410
  if (resolved)
387
411
  element.setAttribute('src', resolved);
388
412
  }
389
- }
390
- function resolveSourceElement(element, base) {
391
- const srcset = element.getAttribute('srcset');
392
- if (!srcset)
393
- return;
394
- const resolved = srcset
395
- .split(',')
396
- .map((entry) => {
397
- const parts = entry.trim().split(/\s+/);
398
- const url = parts[0];
399
- if (url) {
400
- const resolvedUrl = tryResolveUrl(url, base);
401
- if (resolvedUrl)
402
- parts[0] = resolvedUrl;
403
- }
404
- return parts.join(' ');
405
- })
406
- .join(', ');
407
- element.setAttribute('srcset', resolved);
408
- }
409
- function resolveRelativeUrls(document, baseUrl) {
410
- try {
411
- const base = new URL(baseUrl);
412
- for (const element of document.querySelectorAll('a[href], img[src], source[srcset]')) {
413
- const tag = element.tagName.toLowerCase();
414
- if (tag === 'a') {
415
- resolveAnchorElement(element, base);
416
- }
417
- else if (tag === 'img') {
418
- resolveImageElement(element, base);
419
- }
420
- else if (tag === 'source') {
421
- resolveSourceElement(element, base);
413
+ /**
414
+ * Keep original behavior: srcset entries are always attempted to be resolved (no prefix skipping).
415
+ */
416
+ resolveSource(element, base) {
417
+ const srcset = element.getAttribute('srcset');
418
+ if (!srcset)
419
+ return;
420
+ const resolved = srcset
421
+ .split(',')
422
+ .map((entry) => {
423
+ const parts = entry.trim().split(/\s+/);
424
+ const url = parts[0];
425
+ if (url) {
426
+ const resolvedUrl = tryResolveUrl(url, base);
427
+ if (resolvedUrl)
428
+ parts[0] = resolvedUrl;
422
429
  }
430
+ return parts.join(' ');
431
+ })
432
+ .join(', ');
433
+ element.setAttribute('srcset', resolved);
434
+ }
435
+ }
436
+ /* -------------------------------------------------------------------------------------------------
437
+ * Serialization
438
+ * ------------------------------------------------------------------------------------------------- */
439
+ class DocumentSerializer {
440
+ /**
441
+ * Preserve existing behavior:
442
+ * - Prefer body.innerHTML only if it has "substantial" content (> 100 chars).
443
+ * - Otherwise fall back to document.toString(), then documentElement.outerHTML, then original HTML.
444
+ */
445
+ serialize(document, fallbackHtml) {
446
+ const bodyInner = this.getBodyInnerHtml(document);
447
+ if (bodyInner && bodyInner.trim().length > 100)
448
+ return bodyInner;
449
+ const toStringFn = this.getDocumentToString(document);
450
+ if (toStringFn)
451
+ return toStringFn();
452
+ const outer = this.getDocumentElementOuterHtml(document);
453
+ if (outer)
454
+ return outer;
455
+ return fallbackHtml;
456
+ }
457
+ getBodyInnerHtml(document) {
458
+ if (!isObject(document))
459
+ return undefined;
460
+ const { body } = document;
461
+ if (isObject(body) &&
462
+ typeof body.innerHTML === 'string') {
463
+ return body.innerHTML;
423
464
  }
465
+ return undefined;
424
466
  }
425
- catch {
426
- /* invalid base URL - skip resolution */
467
+ getDocumentToString(document) {
468
+ if (!isObject(document))
469
+ return undefined;
470
+ const fn = document.toString;
471
+ if (typeof fn === 'function')
472
+ return fn.bind(document);
473
+ return undefined;
474
+ }
475
+ getDocumentElementOuterHtml(document) {
476
+ if (!isObject(document))
477
+ return undefined;
478
+ const docEl = document.documentElement;
479
+ if (isObject(docEl) &&
480
+ typeof docEl.outerHTML === 'string') {
481
+ return docEl.outerHTML;
482
+ }
483
+ return undefined;
427
484
  }
428
485
  }
429
- export function removeNoiseFromHtml(html, document, baseUrl) {
430
- const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
431
- if (!shouldParse)
432
- return html;
433
- try {
434
- const resolvedDocument = document ?? parseHTML(html).document;
435
- stripNoiseNodes(resolvedDocument);
436
- // Resolve relative URLs before converting to markdown
437
- if (baseUrl) {
438
- resolveRelativeUrls(resolvedDocument, baseUrl);
486
+ /* -------------------------------------------------------------------------------------------------
487
+ * Public pipeline
488
+ * ------------------------------------------------------------------------------------------------- */
489
+ class HtmlNoiseRemovalPipeline {
490
+ promo = new PromoDetector();
491
+ classifier = new NoiseClassifier(this.promo);
492
+ stripper = new NoiseStripper(this.classifier);
493
+ urlResolver = new RelativeUrlResolver();
494
+ serializer = new DocumentSerializer();
495
+ removeNoise(html, document, baseUrl) {
496
+ const shouldParse = isFullDocumentHtml(html) || mayContainNoise(html);
497
+ if (!shouldParse)
498
+ return html;
499
+ try {
500
+ const resolvedDocument = document ?? parseHTML(html).document;
501
+ this.stripper.strip(resolvedDocument);
502
+ if (baseUrl) {
503
+ this.urlResolver.resolve(resolvedDocument, baseUrl);
504
+ }
505
+ return this.serializer.serialize(resolvedDocument, html);
439
506
  }
440
- const bodyInnerHtml = getBodyInnerHtml(resolvedDocument);
441
- // Only use body innerHTML if it has substantial content
442
- // On some sites (e.g., Framer), noise removal empties body but leaves content in documentElement
443
- if (bodyInnerHtml && bodyInnerHtml.trim().length > 100) {
444
- return bodyInnerHtml;
507
+ catch {
508
+ return html;
445
509
  }
446
- const docToString = getDocumentToString(resolvedDocument);
447
- if (docToString)
448
- return docToString();
449
- const documentElementOuterHtml = getDocumentElementOuterHtml(resolvedDocument);
450
- if (documentElementOuterHtml)
451
- return documentElementOuterHtml;
452
- return html;
453
- }
454
- catch {
455
- return html;
456
510
  }
457
511
  }
512
+ const pipeline = new HtmlNoiseRemovalPipeline();
513
+ export function removeNoiseFromHtml(html, document, baseUrl) {
514
+ return pipeline.removeNoise(html, document, baseUrl);
515
+ }