defuddle 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/README.md +95 -23
  2. package/dist/cli.js +2 -1
  3. package/dist/cli.js.map +1 -1
  4. package/dist/constants.d.ts +2 -0
  5. package/dist/constants.js +20 -1
  6. package/dist/constants.js.map +1 -1
  7. package/dist/defuddle.d.ts +59 -1
  8. package/dist/defuddle.js +703 -214
  9. package/dist/defuddle.js.map +1 -1
  10. package/dist/elements/footnotes.js +2 -1
  11. package/dist/elements/footnotes.js.map +1 -1
  12. package/dist/extractor-registry.d.ts +1 -0
  13. package/dist/extractor-registry.js +3 -0
  14. package/dist/extractor-registry.js.map +1 -1
  15. package/dist/extractors/_base.d.ts +6 -0
  16. package/dist/extractors/_base.js +8 -0
  17. package/dist/extractors/_base.js.map +1 -1
  18. package/dist/extractors/github.d.ts +10 -2
  19. package/dist/extractors/github.js +158 -71
  20. package/dist/extractors/github.js.map +1 -1
  21. package/dist/extractors/hackernews.js +24 -77
  22. package/dist/extractors/hackernews.js.map +1 -1
  23. package/dist/extractors/reddit.d.ts +1 -2
  24. package/dist/extractors/reddit.js +41 -94
  25. package/dist/extractors/reddit.js.map +1 -1
  26. package/dist/extractors/x-oembed.d.ts +0 -1
  27. package/dist/extractors/x-oembed.js +20 -27
  28. package/dist/extractors/x-oembed.js.map +1 -1
  29. package/dist/extractors/youtube.d.ts +37 -0
  30. package/dist/extractors/youtube.js +409 -9
  31. package/dist/extractors/youtube.js.map +1 -1
  32. package/dist/index.d.ts +1 -1
  33. package/dist/index.full.js +1 -1
  34. package/dist/index.js +1 -1
  35. package/dist/markdown.js +5 -15
  36. package/dist/markdown.js.map +1 -1
  37. package/dist/metadata.d.ts +5 -0
  38. package/dist/metadata.js +28 -0
  39. package/dist/metadata.js.map +1 -1
  40. package/dist/node.js +0 -5
  41. package/dist/node.js.map +1 -1
  42. package/dist/scoring.d.ts +8 -2
  43. package/dist/scoring.js +109 -26
  44. package/dist/scoring.js.map +1 -1
  45. package/dist/standardize.js +103 -69
  46. package/dist/standardize.js.map +1 -1
  47. package/dist/types.d.ts +44 -0
  48. package/dist/utils/comments.d.ts +44 -0
  49. package/dist/utils/comments.js +103 -0
  50. package/dist/utils/comments.js.map +1 -0
  51. package/dist/utils/dom.d.ts +14 -0
  52. package/dist/utils/dom.js +34 -0
  53. package/dist/utils/dom.js.map +1 -1
  54. package/dist/utils/transcript.d.ts +37 -0
  55. package/dist/utils/transcript.js +61 -0
  56. package/dist/utils/transcript.js.map +1 -0
  57. package/dist/utils.d.ts +2 -1
  58. package/dist/utils.js +6 -2
  59. package/dist/utils.js.map +1 -1
  60. package/package.json +1 -1
package/dist/defuddle.js CHANGED
@@ -9,6 +9,23 @@ const footnotes_1 = require("./elements/footnotes");
9
9
  const scoring_1 = require("./scoring");
10
10
  const utils_1 = require("./utils");
11
11
  const dom_1 = require("./utils/dom");
12
+ /** Keys from extractor variables that map to top-level DefuddleResponse fields */
13
+ const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
14
+ // Content pattern detection constants
15
+ const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
16
+ const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
17
+ const BOILERPLATE_PATTERNS = [
18
+ /^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
19
+ /^A version of this (?:article|story) (?:appeared|was published) in\b/i,
20
+ /^Originally (?:published|appeared) (?:in|on|at)\b/i,
21
+ ];
22
+ const METADATA_STRIP_PATTERNS = [
23
+ /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi,
24
+ /\b\d+(?:st|nd|rd|th)?\b/g,
25
+ /\bmin(?:ute)?s?\b/gi,
26
+ /\bread\b/gi,
27
+ /[|·•—–\-,.\s]/g,
28
+ ];
12
29
  class Defuddle {
13
30
  /**
14
31
  * Create a new Defuddle instance
@@ -16,10 +33,23 @@ class Defuddle {
16
33
  * @param options - Options for parsing
17
34
  */
18
35
  constructor(doc, options = {}) {
36
+ this._schemaOrgData = undefined;
37
+ this._schemaOrgExtracted = false;
19
38
  this.doc = doc;
20
39
  this.options = options;
21
40
  this.debug = options.debug || false;
22
41
  }
42
+ /**
43
+ * Lazily extract and cache schema.org data. Must be called before
44
+ * parse() strips script tags from the document.
45
+ */
46
+ getSchemaOrgData() {
47
+ if (!this._schemaOrgExtracted) {
48
+ this._schemaOrgData = this._extractSchemaOrgData(this.doc);
49
+ this._schemaOrgExtracted = true;
50
+ }
51
+ return this._schemaOrgData;
52
+ }
23
53
  /**
24
54
  * Parse the document and extract its main content
25
55
  */
@@ -41,6 +71,21 @@ class Defuddle {
41
71
  result = retryResult;
42
72
  }
43
73
  }
74
+ // If still very little content, the page may be an index/listing page
75
+ // where card elements were scored as non-content or removed by partial
76
+ // selectors (e.g. "post-preview"). Retry with both disabled.
77
+ if (result.wordCount < 50) {
78
+ this._log('Still very little content, retrying without scoring/partial selectors (possible index page)');
79
+ const indexRetry = this.parseInternal({
80
+ removeLowScoring: false,
81
+ removePartialSelectors: false,
82
+ removeContentPatterns: false
83
+ });
84
+ if (indexRetry.wordCount > result.wordCount) {
85
+ this._log('Index page retry produced more content');
86
+ result = indexRetry;
87
+ }
88
+ }
44
89
  // Strip dangerous elements from this.doc before any fallback paths
45
90
  // that read from it (e.g. _findContentBySchemaText).
46
91
  // This must happen after parseInternal, which needs script tags
@@ -111,8 +156,7 @@ class Defuddle {
111
156
  el.removeAttribute(attr.name);
112
157
  }
113
158
  else if (['href', 'src', 'action', 'formaction', 'xlink:href'].includes(name)) {
114
- const val = attr.value.replace(/[\s\u0000-\u001F]+/g, '').toLowerCase();
115
- if (val.startsWith('javascript:') || val.startsWith('data:text/html')) {
159
+ if ((0, dom_1.isDangerousUrl)(attr.value)) {
116
160
  el.removeAttribute(attr.name);
117
161
  }
118
162
  }
@@ -231,46 +275,61 @@ class Defuddle {
231
275
  return url;
232
276
  }
233
277
  /**
234
- * Parse the document, falling back to async extractors if sync parse yields no content
278
+ * Parse the document asynchronously. Checks for extractors that prefer
279
+ * async (e.g. YouTube transcripts) before sync, then falls back to async
280
+ * extractors if sync parse yields no content.
235
281
  */
236
282
  async parseAsync() {
283
+ if (this.options.useAsync !== false) {
284
+ const asyncResult = await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry));
285
+ if (asyncResult)
286
+ return asyncResult;
287
+ }
237
288
  const result = this.parse();
238
289
  if (result.wordCount > 0 || this.options.useAsync === false) {
239
290
  return result;
240
291
  }
292
+ return (await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry))) ?? result;
293
+ }
294
+ /**
295
+ * Fetch only async variables (e.g. transcript) without re-parsing.
296
+ * Safe to call after parse() — uses cached schema.org data since
297
+ * parse() strips script tags from the document.
298
+ */
299
+ async fetchAsyncVariables() {
300
+ if (this.options.useAsync === false)
301
+ return null;
241
302
  try {
242
303
  const url = this.options.url || this.doc.URL;
243
- const schemaOrgData = this._extractSchemaOrgData(this.doc);
244
- const extractor = extractor_registry_1.ExtractorRegistry.findAsyncExtractor(this.doc, url, schemaOrgData);
304
+ const schemaOrgData = this.getSchemaOrgData();
305
+ const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
306
+ if (extractor) {
307
+ const extracted = await extractor.extractAsync();
308
+ return this.getExtractorVariables(extracted.variables) || null;
309
+ }
310
+ }
311
+ catch (error) {
312
+ console.error('Defuddle', 'Error fetching async variables:', error);
313
+ }
314
+ return null;
315
+ }
316
+ async tryAsyncExtractor(finder) {
317
+ try {
318
+ const url = this.options.url || this.doc.URL;
319
+ const schemaOrgData = this.getSchemaOrgData();
320
+ const extractor = finder(this.doc, url, schemaOrgData);
245
321
  if (extractor) {
246
322
  const startTime = Date.now();
247
323
  const extracted = await extractor.extractAsync();
248
- const contentHtml = this.resolveContentUrls(extracted.contentHtml);
249
324
  const pageMetaTags = this._collectMetaTags();
250
325
  const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
251
- const endTime = Date.now();
252
- return {
253
- content: contentHtml,
254
- title: extracted.variables?.title || metadata.title,
255
- description: metadata.description,
256
- domain: metadata.domain,
257
- favicon: metadata.favicon,
258
- image: metadata.image,
259
- published: extracted.variables?.published || metadata.published,
260
- author: extracted.variables?.author || metadata.author,
261
- site: extracted.variables?.site || metadata.site,
262
- schemaOrgData: metadata.schemaOrgData,
263
- wordCount: this.countWords(extracted.contentHtml),
264
- parseTime: Math.round(endTime - startTime),
265
- extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
266
- metaTags: pageMetaTags
267
- };
326
+ return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
268
327
  }
269
328
  }
270
329
  catch (error) {
271
330
  console.error('Defuddle', 'Error in async extraction:', error);
272
331
  }
273
- return result;
332
+ return null;
274
333
  }
275
334
  /**
276
335
  * Internal parse method that does the actual work
@@ -280,14 +339,26 @@ class Defuddle {
280
339
  const options = {
281
340
  removeExactSelectors: true,
282
341
  removePartialSelectors: true,
342
+ removeHiddenElements: true,
343
+ removeLowScoring: true,
344
+ removeSmallImages: true,
345
+ removeContentPatterns: true,
346
+ standardize: true,
283
347
  ...this.options,
284
348
  ...overrideOptions
285
349
  };
286
- // Extract schema.org data
287
- const schemaOrgData = this._extractSchemaOrgData(this.doc);
288
- const pageMetaTags = this._collectMetaTags();
289
- // Extract metadata
290
- const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
350
+ const debugRemovals = [];
351
+ // Extract schema.org data (cached — must happen before _stripUnsafeElements removes scripts)
352
+ const schemaOrgData = this.getSchemaOrgData();
353
+ // Cache meta tags and metadata across retries
354
+ if (!this._metaTags) {
355
+ this._metaTags = this._collectMetaTags();
356
+ }
357
+ const pageMetaTags = this._metaTags;
358
+ if (!this._metadata) {
359
+ this._metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
360
+ }
361
+ const metadata = this._metadata;
291
362
  if (options.removeImages) {
292
363
  this.removeImages(this.doc);
293
364
  }
@@ -297,37 +368,36 @@ class Defuddle {
297
368
  const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
298
369
  if (extractor && extractor.canExtract()) {
299
370
  const extracted = extractor.extract();
300
- const contentHtml = this.resolveContentUrls(extracted.contentHtml);
301
- const endTime = Date.now();
302
- // console.log('Using extractor:', extractor.constructor.name.replace('Extractor', ''));
303
- return {
304
- content: contentHtml,
305
- title: extracted.variables?.title || metadata.title,
306
- description: metadata.description,
307
- domain: metadata.domain,
308
- favicon: metadata.favicon,
309
- image: metadata.image,
310
- published: extracted.variables?.published || metadata.published,
311
- author: extracted.variables?.author || metadata.author,
312
- site: extracted.variables?.site || metadata.site,
313
- schemaOrgData: metadata.schemaOrgData,
314
- wordCount: this.countWords(extracted.contentHtml),
315
- parseTime: Math.round(endTime - startTime),
316
- extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
317
- metaTags: pageMetaTags
318
- };
371
+ return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
319
372
  }
320
373
  // Continue if there is no extractor...
321
- // Evaluate mobile styles and sizes on original document
322
- const mobileStyles = this._evaluateMediaQueries(this.doc);
323
- // Find small images in original document, excluding lazy-loaded ones
324
- const smallImages = this.findSmallImages(this.doc);
374
+ // Evaluate mobile styles and sizes on original document (cached across retries)
375
+ if (!this._mobileStyles) {
376
+ this._mobileStyles = this._evaluateMediaQueries(this.doc);
377
+ }
378
+ const mobileStyles = this._mobileStyles;
379
+ // Find small images in original document (cached across retries)
380
+ if (!this._smallImages) {
381
+ this._smallImages = this.findSmallImages(this.doc);
382
+ }
383
+ const smallImages = this._smallImages;
325
384
  // Clone document
326
385
  const clone = this.doc.cloneNode(true);
386
+ // Flatten shadow DOM content into the clone
387
+ this.flattenShadowRoots(this.doc, clone);
388
+ // Resolve React streaming SSR suspense boundaries
389
+ this.resolveStreamedContent(clone);
327
390
  // Apply mobile styles to clone
328
391
  this.applyMobileStyles(clone, mobileStyles);
329
392
  // Find main content
330
- const mainContent = this.findMainContent(clone);
393
+ let mainContent = null;
394
+ if (options.contentSelector) {
395
+ mainContent = clone.querySelector(options.contentSelector);
396
+ this._log('Using contentSelector:', options.contentSelector, mainContent ? 'found' : 'not found');
397
+ }
398
+ if (!mainContent) {
399
+ mainContent = this.findMainContent(clone);
400
+ }
331
401
  if (!mainContent) {
332
402
  const fallbackContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
333
403
  const endTime = Date.now();
@@ -340,31 +410,52 @@ class Defuddle {
340
410
  };
341
411
  }
342
412
  // Standardize footnotes before cleanup (CSS sidenotes use display:none)
343
- (0, footnotes_1.standardizeFootnotes)(mainContent);
413
+ if (options.standardize) {
414
+ (0, footnotes_1.standardizeFootnotes)(mainContent);
415
+ }
344
416
  // Remove small images
345
- this.removeSmallImages(clone, smallImages);
417
+ if (options.removeSmallImages) {
418
+ this.removeSmallImages(clone, smallImages);
419
+ }
346
420
  // Remove hidden elements using computed styles
347
- this.removeHiddenElements(clone);
421
+ if (options.removeHiddenElements) {
422
+ this.removeHiddenElements(clone, debugRemovals);
423
+ }
348
424
  // Remove non-content blocks by scoring
349
425
  // Tries to find lists, navigation based on text content and link density
350
- scoring_1.ContentScorer.scoreAndRemove(clone, this.debug);
426
+ if (options.removeLowScoring) {
427
+ scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
428
+ }
351
429
  // Remove clutter using selectors
352
430
  if (options.removeExactSelectors || options.removePartialSelectors) {
353
- this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent);
431
+ this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals);
432
+ }
433
+ // Remove elements by content patterns (read time, boilerplate, article cards)
434
+ if (options.removeContentPatterns && mainContent) {
435
+ this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
354
436
  }
355
437
  // Normalize the main content
356
- (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
438
+ if (options.standardize) {
439
+ (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
440
+ }
357
441
  // Resolve relative URLs to absolute
358
442
  this.resolveRelativeUrls(mainContent);
359
443
  const content = mainContent.outerHTML;
360
444
  const endTime = Date.now();
361
- return {
445
+ const result = {
362
446
  content,
363
447
  ...metadata,
364
448
  wordCount: this.countWords(content),
365
449
  parseTime: Math.round(endTime - startTime),
366
450
  metaTags: pageMetaTags
367
451
  };
452
+ if (this.debug) {
453
+ result.debug = {
454
+ contentSelector: this.getElementSelector(mainContent),
455
+ removals: debugRemovals
456
+ };
457
+ }
458
+ return result;
368
459
  }
369
460
  catch (error) {
370
461
  console.error('Defuddle', 'Error processing document:', error);
@@ -380,17 +471,33 @@ class Defuddle {
380
471
  }
381
472
  }
382
473
  countWords(content) {
383
- // Parse HTML content to extract text
384
- const tempDiv = this.doc.createElement('div');
385
- tempDiv.appendChild((0, dom_1.parseHTML)(this.doc, content));
386
- // Get text content, removing extra whitespace
387
- const text = tempDiv.textContent || '';
388
- const words = text
389
- .trim()
390
- .replace(/\s+/g, ' ') // Replace multiple spaces with single space
391
- .split(' ')
392
- .filter(word => word.length > 0); // Filter out empty strings
393
- return words.length;
474
+ // Strip HTML tags and decode common entities without DOM parsing
475
+ const text = content
476
+ .replace(/<[^>]*>/g, ' ')
477
+ .replace(/&nbsp;/gi, ' ')
478
+ .replace(/&amp;/gi, '&')
479
+ .replace(/&lt;/gi, '<')
480
+ .replace(/&gt;/gi, '>')
481
+ .replace(/&quot;/gi, '"')
482
+ .replace(/&#\d+;/g, ' ')
483
+ .replace(/&\w+;/g, ' ');
484
+ const trimmed = text.trim();
485
+ if (!trimmed)
486
+ return 0;
487
+ // Count words by splitting on whitespace
488
+ let count = 0;
489
+ let inWord = false;
490
+ for (let i = 0; i < trimmed.length; i++) {
491
+ const isSpace = trimmed.charCodeAt(i) <= 32;
492
+ if (!isSpace && !inWord) {
493
+ count++;
494
+ inWord = true;
495
+ }
496
+ else if (isSpace) {
497
+ inWord = false;
498
+ }
499
+ }
500
+ return count;
394
501
  }
395
502
  // Make all other methods private by removing the static keyword and using private
396
503
  _log(...args) {
@@ -487,61 +594,95 @@ class Defuddle {
487
594
  image.remove();
488
595
  });
489
596
  }
490
- removeHiddenElements(doc) {
597
+ removeHiddenElements(doc, debugRemovals) {
491
598
  let count = 0;
492
- const elementsToRemove = new Set();
493
- // Get all elements and check their styles
494
- const allElements = Array.from(doc.getElementsByTagName('*'));
495
- // Process styles in batches to minimize layout thrashing
496
- const BATCH_SIZE = 100;
497
- for (let i = 0; i < allElements.length; i += BATCH_SIZE) {
498
- const batch = allElements.slice(i, i + BATCH_SIZE);
499
- // Read phase - gather all computedStyles
500
- const styles = batch.map(element => {
599
+ const elementsToRemove = new Map();
600
+ // Check inline styles and CSS class-based hidden patterns.
601
+ const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i;
602
+ // Only use getComputedStyle in browser environments where it's meaningful.
603
+ // In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable.
604
+ const defaultView = doc.defaultView;
605
+ const isBrowser = typeof window !== 'undefined' && defaultView === window;
606
+ const allElements = doc.querySelectorAll('*');
607
+ for (const element of allElements) {
608
+ // Skip elements that contain math — sites like Wikipedia wrap MathML
609
+ // in display:none spans for accessibility (the visible version is an
610
+ // image/SVG fallback). We need to preserve these for math extraction.
611
+ if (element.querySelector('math, [data-mathml], .katex-mathml') ||
612
+ element.tagName.toLowerCase() === 'math') {
613
+ continue;
614
+ }
615
+ // Check inline style for hidden patterns
616
+ const style = element.getAttribute('style');
617
+ if (style && hiddenStylePattern.test(style)) {
618
+ const reason = style.includes('display') ? 'display:none' :
619
+ style.includes('visibility') ? 'visibility:hidden' : 'opacity:0';
620
+ elementsToRemove.set(element, reason);
621
+ count++;
622
+ continue;
623
+ }
624
+ // Use getComputedStyle only in real browser environments
625
+ if (isBrowser) {
501
626
  try {
502
- return element.ownerDocument.defaultView?.getComputedStyle(element);
503
- }
504
- catch (e) {
505
- // If we can't get computed style, check inline styles
506
- const style = element.getAttribute('style');
507
- if (!style)
508
- return null;
509
- // Create a temporary style element to parse inline styles
510
- const tempStyle = doc.createElement('style');
511
- tempStyle.textContent = `* { ${style} }`;
512
- doc.head.appendChild(tempStyle);
513
- const computedStyle = element.ownerDocument.defaultView?.getComputedStyle(element);
514
- doc.head.removeChild(tempStyle);
515
- return computedStyle;
627
+ const computedStyle = defaultView.getComputedStyle(element);
628
+ let reason = '';
629
+ if (computedStyle.display === 'none')
630
+ reason = 'display:none';
631
+ else if (computedStyle.visibility === 'hidden')
632
+ reason = 'visibility:hidden';
633
+ else if (computedStyle.opacity === '0')
634
+ reason = 'opacity:0';
635
+ if (reason) {
636
+ elementsToRemove.set(element, reason);
637
+ count++;
638
+ continue;
639
+ }
516
640
  }
517
- });
518
- // Write phase - mark elements for removal
519
- batch.forEach((element, index) => {
520
- const computedStyle = styles[index];
521
- if (computedStyle && (computedStyle.display === 'none' ||
522
- computedStyle.visibility === 'hidden' ||
523
- computedStyle.opacity === '0')) {
524
- elementsToRemove.add(element);
525
- count++;
641
+ catch (e) { }
642
+ }
643
+ // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
644
+ // "sm:hidden", "not-machine:hidden")
645
+ const className = element.getAttribute('class') || '';
646
+ if (className) {
647
+ const tokens = className.split(/\s+/);
648
+ for (const token of tokens) {
649
+ if (token === 'hidden' || token.endsWith(':hidden')) {
650
+ elementsToRemove.set(element, `class:${token}`);
651
+ count++;
652
+ break;
653
+ }
526
654
  }
527
- });
655
+ }
528
656
  }
529
657
  // Batch remove all hidden elements
530
- elementsToRemove.forEach(el => el.remove());
658
+ elementsToRemove.forEach((reason, el) => {
659
+ if (this.debug && debugRemovals) {
660
+ debugRemovals.push({
661
+ step: 'removeHiddenElements',
662
+ reason,
663
+ text: (0, utils_1.textPreview)(el)
664
+ });
665
+ }
666
+ el.remove();
667
+ });
531
668
  this._log('Removed hidden elements:', count);
532
669
  }
533
- removeBySelector(doc, removeExact = true, removePartial = true, mainContent) {
670
+ removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals) {
534
671
  const startTime = Date.now();
535
672
  let exactSelectorCount = 0;
536
673
  let partialSelectorCount = 0;
537
- // Track all elements to be removed
538
- const elementsToRemove = new Set();
674
+ // Track all elements to be removed, with their match type
675
+ const elementsToRemove = new Map();
539
676
  // First collect elements matching exact selectors
540
677
  if (removeExact) {
541
678
  const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS.join(','));
542
679
  exactElements.forEach(el => {
543
680
  if (el?.parentNode) {
544
- elementsToRemove.add(el);
681
+ // Skip elements inside code blocks (e.g. syntax highlighting spans)
682
+ if (el.closest('pre, code')) {
683
+ return;
684
+ }
685
+ elementsToRemove.set(el, { type: 'exact' });
545
686
  exactSelectorCount++;
546
687
  }
547
688
  });
@@ -550,6 +691,10 @@ class Defuddle {
550
691
  // Pre-compile regexes and combine into a single regex for better performance
551
692
  const combinedPattern = constants_1.PARTIAL_SELECTORS.join('|');
552
693
  const partialRegex = new RegExp(combinedPattern, 'i');
694
+ // Pre-compile individual regexes for debug pattern identification
695
+ const individualRegexes = this.debug
696
+ ? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') }))
697
+ : null;
553
698
  // Create an efficient attribute selector for elements we care about
554
699
  const attributeSelector = constants_1.TEST_ATTRIBUTES.map(attr => `[${attr}]`).join(',');
555
700
  const allElements = doc.querySelectorAll(attributeSelector);
@@ -581,7 +726,10 @@ class Defuddle {
581
726
  }
582
727
  // Check for partial match using single regex test
583
728
  if (partialRegex.test(attrs)) {
584
- elementsToRemove.add(el);
729
+ const matchedPattern = individualRegexes
730
+ ? individualRegexes.find(r => r.regex.test(attrs))?.pattern
731
+ : undefined;
732
+ elementsToRemove.set(el, { type: 'partial', selector: matchedPattern });
585
733
  partialSelectorCount++;
586
734
  }
587
735
  });
@@ -590,7 +738,7 @@ class Defuddle {
590
738
  // Skip elements that are ancestors of mainContent to avoid disconnecting it
591
739
  // Skip footnote list containers, their parents, and immediate children
592
740
  // Skip anchor links inside headings - the heading transform handles these
593
- elementsToRemove.forEach(el => {
741
+ elementsToRemove.forEach(({ type, selector }, el) => {
594
742
  if (mainContent && el.contains(mainContent)) {
595
743
  return;
596
744
  }
@@ -608,6 +756,14 @@ class Defuddle {
608
756
  }
609
757
  }
610
758
  catch (e) { }
759
+ if (this.debug && debugRemovals) {
760
+ debugRemovals.push({
761
+ step: 'removeBySelector',
762
+ selector: type === 'exact' ? 'exact' : selector,
763
+ reason: type === 'exact' ? 'exact selector match' : `partial match: ${selector}`,
764
+ text: (0, utils_1.textPreview)(el)
765
+ });
766
+ }
611
767
  el.remove();
612
768
  });
613
769
  const endTime = Date.now();
@@ -622,106 +778,50 @@ class Defuddle {
622
778
  findSmallImages(doc) {
623
779
  const MIN_DIMENSION = 33;
624
780
  const smallImages = new Set();
625
- const transformRegex = /scale\(([\d.]+)\)/;
626
- const startTime = Date.now();
627
781
  let processedCount = 0;
628
- // 1. Read phase - Gather all elements in a single pass
629
- const elements = [
630
- ...Array.from(doc.getElementsByTagName('img')),
631
- ...Array.from(doc.getElementsByTagName('svg'))
632
- ];
633
- if (elements.length === 0) {
634
- return smallImages;
635
- }
636
- // 2. Batch process - Collect all measurements in one go
637
- const measurements = elements.map(element => ({
638
- element,
639
- // Static attributes (no reflow)
640
- naturalWidth: element.tagName.toLowerCase() === 'img' ?
641
- parseInt(element.getAttribute('width') || '0') || 0 : 0,
642
- naturalHeight: element.tagName.toLowerCase() === 'img' ?
643
- parseInt(element.getAttribute('height') || '0') || 0 : 0,
644
- attrWidth: parseInt(element.getAttribute('width') || '0'),
645
- attrHeight: parseInt(element.getAttribute('height') || '0')
646
- }));
647
- // 3. Batch compute styles - Process in chunks to avoid long tasks
648
- const BATCH_SIZE = 50;
649
- for (let i = 0; i < measurements.length; i += BATCH_SIZE) {
650
- const batch = measurements.slice(i, i + BATCH_SIZE);
651
- try {
652
- // Read phase - compute all styles at once
653
- const styles = batch.map(({ element }) => {
654
- try {
655
- return element.ownerDocument.defaultView?.getComputedStyle(element);
656
- }
657
- catch (e) {
658
- return null;
659
- }
660
- });
661
- // Get bounding rectangles if available
662
- const rects = batch.map(({ element }) => {
663
- try {
664
- return element.getBoundingClientRect();
665
- }
666
- catch (e) {
667
- return null;
668
- }
669
- });
670
- // Process phase - no DOM operations
671
- batch.forEach((measurement, index) => {
672
- try {
673
- const style = styles[index];
674
- const rect = rects[index];
675
- if (!style)
676
- return;
677
- // Get transform scale in the same batch
678
- const transform = style.transform;
679
- const scale = transform ?
680
- parseFloat(transform.match(transformRegex)?.[1] || '1') : 1;
681
- // Calculate effective dimensions
682
- const widths = [
683
- measurement.naturalWidth,
684
- measurement.attrWidth,
685
- parseInt(style.width) || 0,
686
- rect ? rect.width * scale : 0
687
- ].filter(dim => typeof dim === 'number' && dim > 0);
688
- const heights = [
689
- measurement.naturalHeight,
690
- measurement.attrHeight,
691
- parseInt(style.height) || 0,
692
- rect ? rect.height * scale : 0
693
- ].filter(dim => typeof dim === 'number' && dim > 0);
694
- // Decision phase - no DOM operations
695
- if (widths.length > 0 && heights.length > 0) {
696
- const effectiveWidth = Math.min(...widths);
697
- const effectiveHeight = Math.min(...heights);
698
- if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
699
- const identifier = this.getElementIdentifier(measurement.element);
700
- if (identifier) {
701
- smallImages.add(identifier);
702
- processedCount++;
703
- }
704
- }
705
- }
706
- }
707
- catch (e) {
708
- if (this.debug) {
709
- console.warn('Defuddle: Failed to process element dimensions:', e);
710
- }
711
- }
712
- });
782
+ const elements = doc.querySelectorAll('img, svg');
783
+ const defaultView = doc.defaultView;
784
+ const isBrowser = typeof window !== 'undefined' && defaultView === window;
785
+ for (const element of elements) {
786
+ const attrWidth = parseInt(element.getAttribute('width') || '0');
787
+ const attrHeight = parseInt(element.getAttribute('height') || '0');
788
+ // Check inline style dimensions
789
+ const style = element.getAttribute('style') || '';
790
+ const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
791
+ const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
792
+ // Use getComputedStyle and getBoundingClientRect only in browser
793
+ let computedWidth = 0, computedHeight = 0;
794
+ if (isBrowser) {
795
+ try {
796
+ const cs = defaultView.getComputedStyle(element);
797
+ computedWidth = parseInt(cs.width) || 0;
798
+ computedHeight = parseInt(cs.height) || 0;
799
+ }
800
+ catch (e) { }
801
+ try {
802
+ const rect = element.getBoundingClientRect();
803
+ if (rect.width > 0)
804
+ computedWidth = computedWidth || rect.width;
805
+ if (rect.height > 0)
806
+ computedHeight = computedHeight || rect.height;
807
+ }
808
+ catch (e) { }
713
809
  }
714
- catch (e) {
715
- if (this.debug) {
716
- console.warn('Defuddle: Failed to process batch:', e);
810
+ const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0);
811
+ const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0);
812
+ if (widths.length > 0 && heights.length > 0) {
813
+ const effectiveWidth = Math.min(...widths);
814
+ const effectiveHeight = Math.min(...heights);
815
+ if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
816
+ const identifier = this.getElementIdentifier(element);
817
+ if (identifier) {
818
+ smallImages.add(identifier);
819
+ processedCount++;
820
+ }
717
821
  }
718
822
  }
719
823
  }
720
- const endTime = Date.now();
721
- this._log('Found small elements:', {
722
- count: processedCount,
723
- processingTime: `${(endTime - startTime).toFixed(2)}ms`
724
- });
824
+ this._log('Found small elements:', processedCount);
725
825
  return smallImages;
726
826
  }
727
827
  removeSmallImages(doc, smallImages) {
@@ -805,12 +905,29 @@ class Defuddle {
805
905
  // just because sibling noise inflates the parent's content score.
806
906
  // Only prefer the child if it has meaningful content (>50 words),
807
907
  // otherwise it may be an empty card element (e.g. related article cards).
908
+ // Skip this when the parent contains multiple children matching the
909
+ // same selector — that indicates a listing/portfolio page where the
910
+ // parent is the real content container.
808
911
  const top = candidates[0];
809
912
  let best = top;
810
913
  for (let i = 1; i < candidates.length; i++) {
811
914
  const child = candidates[i];
812
915
  const childWords = (child.element.textContent || '').split(/\s+/).length;
813
916
  if (child.selectorIndex < best.selectorIndex && best.element.contains(child.element) && childWords > 50) {
917
+ // Count how many candidates share this selector index inside
918
+ // the top element. Use top (not best) as the stable reference
919
+ // so the check isn't affected by earlier iterations.
920
+ let siblingsAtIndex = 0;
921
+ for (const c of candidates) {
922
+ if (c.selectorIndex === child.selectorIndex && top.element.contains(c.element)) {
923
+ if (++siblingsAtIndex > 1)
924
+ break;
925
+ }
926
+ }
927
+ if (siblingsAtIndex > 1) {
928
+ // Multiple articles/cards inside the parent — it's a listing page
929
+ continue;
930
+ }
814
931
  best = child;
815
932
  }
816
933
  }
@@ -839,13 +956,11 @@ class Defuddle {
839
956
  }
840
957
  findContentByScoring(doc) {
841
958
  const candidates = [];
842
- constants_1.BLOCK_ELEMENTS.forEach((tag) => {
843
- Array.from(doc.getElementsByTagName(tag)).forEach((element) => {
844
- const score = scoring_1.ContentScorer.scoreElement(element);
845
- if (score > 0) {
846
- candidates.push({ score, element });
847
- }
848
- });
959
+ doc.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR).forEach((element) => {
960
+ const score = scoring_1.ContentScorer.scoreElement(element);
961
+ if (score > 0) {
962
+ candidates.push({ score, element });
963
+ }
849
964
  });
850
965
  return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null;
851
966
  }
@@ -931,6 +1046,137 @@ class Defuddle {
931
1046
  el.setAttribute('poster', resolve(poster));
932
1047
  });
933
1048
  }
1049
+ /**
1050
+ * Flatten shadow DOM content into a cloned document.
1051
+ * Walks both trees in parallel so positional correspondence is exact.
1052
+ */
1053
+ flattenShadowRoots(original, clone) {
1054
+ const origElements = Array.from(original.body.querySelectorAll('*'));
1055
+ // Find the first element with a shadow root (also serves as the hasShadowRoots check)
1056
+ const firstShadow = origElements.find(el => el.shadowRoot);
1057
+ if (!firstShadow)
1058
+ return;
1059
+ const cloneElements = Array.from(clone.body.querySelectorAll('*'));
1060
+ // Check if we can directly read shadow DOM content (main world / Node.js).
1061
+ // In content script isolated worlds, shadowRoot exists but content is empty.
1062
+ const canReadShadow = (firstShadow.shadowRoot?.childNodes?.length ?? 0) > 0;
1063
+ if (canReadShadow) {
1064
+ // Direct traversal works (main world / Node.js)
1065
+ for (let i = origElements.length - 1; i >= 0; i--) {
1066
+ const origEl = origElements[i];
1067
+ if (!origEl.shadowRoot)
1068
+ continue;
1069
+ const cloneEl = cloneElements[i];
1070
+ if (!cloneEl)
1071
+ continue;
1072
+ const shadowHtml = origEl.shadowRoot.innerHTML;
1073
+ if (shadowHtml.length > 0) {
1074
+ this.replaceShadowHost(cloneEl, shadowHtml, clone);
1075
+ }
1076
+ }
1077
+ }
1078
+ else {
1079
+ // Content script isolated world — read data-defuddle-shadow attributes
1080
+ // stamped by an external main-world script.
1081
+ const shadowData = [];
1082
+ for (let i = 0; i < origElements.length; i++) {
1083
+ const origEl = origElements[i];
1084
+ const shadowHtml = origEl.getAttribute('data-defuddle-shadow');
1085
+ if (!shadowHtml)
1086
+ continue;
1087
+ const cloneEl = cloneElements[i];
1088
+ if (!cloneEl)
1089
+ continue;
1090
+ shadowData.push({ cloneEl, html: shadowHtml });
1091
+ // Clean up temporary attributes from both original and clone
1092
+ origEl.removeAttribute('data-defuddle-shadow');
1093
+ cloneEl.removeAttribute('data-defuddle-shadow');
1094
+ }
1095
+ for (const { cloneEl, html } of shadowData) {
1096
+ this.replaceShadowHost(cloneEl, html, clone);
1097
+ }
1098
+ }
1099
+ }
1100
+ /**
1101
+ * Resolve React streaming SSR suspense boundaries.
1102
+ * React's streaming SSR places content in hidden divs (id="S:0") and
1103
+ * template placeholders (id="B:0") with $RC scripts to swap them.
1104
+ * Since we don't execute scripts, we perform the swap manually.
1105
+ */
1106
+ resolveStreamedContent(doc) {
1107
+ // Find $RC("B:X","S:X") calls in inline scripts
1108
+ const scripts = doc.querySelectorAll('script');
1109
+ const swaps = [];
1110
+ const rcPattern = /\$RC\("(B:\d+)","(S:\d+)"\)/g;
1111
+ for (const script of scripts) {
1112
+ const text = script.textContent || '';
1113
+ if (!text.includes('$RC('))
1114
+ continue;
1115
+ rcPattern.lastIndex = 0;
1116
+ let match;
1117
+ while ((match = rcPattern.exec(text)) !== null) {
1118
+ swaps.push({ templateId: match[1], contentId: match[2] });
1119
+ }
1120
+ }
1121
+ if (swaps.length === 0)
1122
+ return;
1123
+ let swapCount = 0;
1124
+ for (const { templateId, contentId } of swaps) {
1125
+ const template = doc.getElementById(templateId);
1126
+ const content = doc.getElementById(contentId);
1127
+ if (!template || !content)
1128
+ continue;
1129
+ const parent = template.parentNode;
1130
+ if (!parent)
1131
+ continue;
1132
+ // Remove the fallback/skeleton content after the template
1133
+ // until the <!--/$--> comment marker
1134
+ let next = template.nextSibling;
1135
+ let foundMarker = false;
1136
+ while (next) {
1137
+ const following = next.nextSibling;
1138
+ if (next.nodeType === 8 && next.data === '/$') {
1139
+ next.remove();
1140
+ foundMarker = true;
1141
+ break;
1142
+ }
1143
+ next.remove();
1144
+ next = following;
1145
+ }
1146
+ // Skip swap if marker wasn't found — malformed streaming output
1147
+ if (!foundMarker)
1148
+ continue;
1149
+ // Insert content children before the template position
1150
+ while (content.firstChild) {
1151
+ parent.insertBefore(content.firstChild, template);
1152
+ }
1153
+ // Clean up the template and hidden div
1154
+ template.remove();
1155
+ content.remove();
1156
+ swapCount++;
1157
+ }
1158
+ if (swapCount > 0) {
1159
+ this._log('Resolved streamed content:', swapCount, 'suspense boundaries');
1160
+ }
1161
+ }
1162
+ /**
1163
+ * Replace a shadow DOM host element with a div containing its shadow content.
1164
+ * Custom elements (tag names with hyphens) would re-initialize when inserted
1165
+ * into a live DOM, recreating their shadow roots and hiding the content.
1166
+ */
1167
+ replaceShadowHost(el, shadowHtml, doc) {
1168
+ const fragment = (0, dom_1.parseHTML)(doc, shadowHtml);
1169
+ if (el.tagName.includes('-')) {
1170
+ // Custom element — replace with a div to prevent re-initialization
1171
+ const div = doc.createElement('div');
1172
+ div.appendChild(fragment);
1173
+ el.parentNode?.replaceChild(div, el);
1174
+ }
1175
+ else {
1176
+ el.textContent = '';
1177
+ el.appendChild(fragment);
1178
+ }
1179
+ }
934
1180
  /**
935
1181
  * Resolve relative URLs in an HTML string
936
1182
  */
@@ -1004,6 +1250,249 @@ class Defuddle {
1004
1250
  _decodeHTMLEntities(text) {
1005
1251
  return (0, dom_1.decodeHTMLEntities)(this.doc, text);
1006
1252
  }
1253
+ /**
1254
+ * Build a DefuddleResponse from an extractor result with metadata
1255
+ */
1256
+ buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags) {
1257
+ const contentHtml = this.resolveContentUrls(extracted.contentHtml);
1258
+ const variables = this.getExtractorVariables(extracted.variables);
1259
+ return {
1260
+ content: contentHtml,
1261
+ title: extracted.variables?.title || metadata.title,
1262
+ description: metadata.description,
1263
+ domain: metadata.domain,
1264
+ favicon: metadata.favicon,
1265
+ image: metadata.image,
1266
+ language: extracted.variables?.language || metadata.language,
1267
+ published: extracted.variables?.published || metadata.published,
1268
+ author: extracted.variables?.author || metadata.author,
1269
+ site: extracted.variables?.site || metadata.site,
1270
+ schemaOrgData: metadata.schemaOrgData,
1271
+ wordCount: this.countWords(extracted.contentHtml),
1272
+ parseTime: Math.round(Date.now() - startTime),
1273
+ extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
1274
+ metaTags: pageMetaTags,
1275
+ ...(variables ? { variables } : {}),
1276
+ };
1277
+ }
1278
+ /**
1279
+ * Filter extractor variables to only include custom ones
1280
+ * (exclude standard fields that are already mapped to top-level properties)
1281
+ */
1282
+ getExtractorVariables(variables) {
1283
+ if (!variables)
1284
+ return undefined;
1285
+ const custom = {};
1286
+ let hasCustom = false;
1287
+ for (const [key, value] of Object.entries(variables)) {
1288
+ if (!STANDARD_VARIABLE_KEYS.has(key)) {
1289
+ custom[key] = value;
1290
+ hasCustom = true;
1291
+ }
1292
+ }
1293
+ return hasCustom ? custom : undefined;
1294
+ }
1295
+ /**
1296
+ * Content-based pattern removal for elements that can't be detected by
1297
+ * CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
1298
+ */
1299
+ removeByContentPattern(mainContent, debugRemovals) {
1300
+ // Remove read time metadata (e.g. "Mar 4th 2026 | 3 min read")
1301
+ // Only removes leaf elements whose text is PURELY date + read time,
1302
+ // not mixed with other meaningful content like tag names.
1303
+ const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time'));
1304
+ for (const el of candidates) {
1305
+ if (!el.parentNode)
1306
+ continue;
1307
+ if (el.closest('pre') || el.closest('code'))
1308
+ continue;
1309
+ const text = el.textContent?.trim() || '';
1310
+ const words = text.split(/\s+/).length;
1311
+ // Match date + read time in short elements
1312
+ if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
1313
+ // Ensure this is a leaf-ish element, not a large container
1314
+ if (el.querySelectorAll('p, div, section, article').length === 0) {
1315
+ // Verify the text is ONLY date + read time metadata
1316
+ // by stripping all date/time words and checking nothing remains
1317
+ let cleaned = text;
1318
+ for (const pattern of METADATA_STRIP_PATTERNS) {
1319
+ cleaned = cleaned.replace(pattern, '');
1320
+ }
1321
+ if (cleaned.trim().length > 0)
1322
+ continue;
1323
+ if (this.debug && debugRemovals) {
1324
+ debugRemovals.push({
1325
+ step: 'removeByContentPattern',
1326
+ reason: 'read time metadata',
1327
+ text: (0, utils_1.textPreview)(el)
1328
+ });
1329
+ }
1330
+ el.remove();
1331
+ }
1332
+ }
1333
+ }
1334
+ // Remove standalone time/date elements near the start or end of content.
1335
+ // A <time> in its own paragraph at the boundary is metadata (publish date),
1336
+ // but <time> inline within prose should be preserved (see issue #136).
1337
+ const timeElements = Array.from(mainContent.querySelectorAll('time'));
1338
+ const contentText = mainContent.textContent || '';
1339
+ for (const time of timeElements) {
1340
+ if (!time.parentNode)
1341
+ continue;
1342
+ // Walk up through inline/formatting wrappers only (i, em, span, b, strong)
1343
+ // Stop at block elements to avoid removing containers with other content.
1344
+ let target = time;
1345
+ let targetText = target.textContent?.trim() || '';
1346
+ while (target.parentElement && target.parentElement !== mainContent) {
1347
+ const parentTag = target.parentElement.tagName.toLowerCase();
1348
+ const parentText = target.parentElement.textContent?.trim() || '';
1349
+ // If parent is a <p> that only wraps this time, include it
1350
+ if (parentTag === 'p' && parentText === targetText) {
1351
+ target = target.parentElement;
1352
+ break;
1353
+ }
1354
+ // Only walk through inline formatting wrappers
1355
+ if (['i', 'em', 'span', 'b', 'strong', 'small'].includes(parentTag) &&
1356
+ parentText === targetText) {
1357
+ target = target.parentElement;
1358
+ targetText = parentText;
1359
+ continue;
1360
+ }
1361
+ break;
1362
+ }
1363
+ const text = target.textContent?.trim() || '';
1364
+ const words = text.split(/\s+/).length;
1365
+ if (words > 10)
1366
+ continue;
1367
+ // Check if this element is near the start or end of mainContent
1368
+ const pos = contentText.indexOf(text);
1369
+ const distFromEnd = contentText.length - (pos + text.length);
1370
+ if (pos > 200 && distFromEnd > 200)
1371
+ continue;
1372
+ if (this.debug && debugRemovals) {
1373
+ debugRemovals.push({
1374
+ step: 'removeByContentPattern',
1375
+ reason: 'boundary date element',
1376
+ text: (0, utils_1.textPreview)(target)
1377
+ });
1378
+ }
1379
+ target.remove();
1380
+ }
1381
+ // Remove section breadcrumbs
1382
+ // Short elements containing a link to a parent section of the current URL.
1383
+ const url = this.options.url || this.doc.URL || '';
1384
+ let urlPath = '';
1385
+ try {
1386
+ urlPath = new URL(url).pathname;
1387
+ }
1388
+ catch { }
1389
+ if (urlPath) {
1390
+ const shortElements = mainContent.querySelectorAll('div, span, p');
1391
+ for (const el of shortElements) {
1392
+ if (!el.parentNode)
1393
+ continue;
1394
+ const text = el.textContent?.trim() || '';
1395
+ const words = text.split(/\s+/).length;
1396
+ if (words > 10)
1397
+ continue;
1398
+ // Must be a leaf-ish element (no block children)
1399
+ if (el.querySelectorAll('p, div, section, article').length > 0)
1400
+ continue;
1401
+ const link = el.querySelector('a[href]');
1402
+ if (!link)
1403
+ continue;
1404
+ try {
1405
+ const linkPath = new URL(link.getAttribute('href') || '', url).pathname;
1406
+ if (linkPath !== '/' && linkPath !== urlPath && urlPath.startsWith(linkPath)) {
1407
+ if (this.debug && debugRemovals) {
1408
+ debugRemovals.push({
1409
+ step: 'removeByContentPattern',
1410
+ reason: 'section breadcrumb',
1411
+ text: (0, utils_1.textPreview)(el)
1412
+ });
1413
+ }
1414
+ el.remove();
1415
+ }
1416
+ }
1417
+ catch { }
1418
+ }
1419
+ }
1420
+ // Remove boilerplate sentences and trailing non-content.
1421
+ // Search elements for end-of-article boilerplate, then truncate
1422
+ // from the best ancestor that has siblings to remove.
1423
+ const fullText = mainContent.textContent || '';
1424
+ const boilerplateElements = mainContent.querySelectorAll('p, div, span, section');
1425
+ for (const el of boilerplateElements) {
1426
+ if (!el.parentNode)
1427
+ continue;
1428
+ const text = el.textContent?.trim() || '';
1429
+ const words = text.split(/\s+/).length;
1430
+ if (words > 50 || words < 3)
1431
+ continue;
1432
+ for (const pattern of BOILERPLATE_PATTERNS) {
1433
+ if (pattern.test(text)) {
1434
+ // Walk up to find an ancestor that has next siblings to truncate.
1435
+ // Don't walk all the way to mainContent's direct child — if there's
1436
+ // a single wrapper div, that would remove everything.
1437
+ let target = el;
1438
+ while (target.parentElement && target.parentElement !== mainContent) {
1439
+ if (target.nextElementSibling)
1440
+ break;
1441
+ target = target.parentElement;
1442
+ }
1443
+ // Only truncate if there's substantial content before the boilerplate
1444
+ const targetText = target.textContent || '';
1445
+ const targetPos = fullText.indexOf(targetText);
1446
+ if (targetPos < 200)
1447
+ continue;
1448
+ // Collect ancestors before modifying the DOM
1449
+ const ancestors = [];
1450
+ let anc = target.parentElement;
1451
+ while (anc && anc !== mainContent) {
1452
+ ancestors.push(anc);
1453
+ anc = anc.parentElement;
1454
+ }
1455
+ // Remove target element and its following siblings
1456
+ this.removeTrailingSiblings(target, true, debugRemovals);
1457
+ // Cascade upward: remove following siblings at each
1458
+ // ancestor level too. Everything after the boilerplate
1459
+ // in document order is non-content.
1460
+ for (const ancestor of ancestors) {
1461
+ this.removeTrailingSiblings(ancestor, false, debugRemovals);
1462
+ }
1463
+ return;
1464
+ }
1465
+ }
1466
+ }
1467
+ }
1468
+ /**
1469
+ * Remove an element's following siblings, and optionally the element itself.
1470
+ */
1471
+ removeTrailingSiblings(element, removeSelf, debugRemovals) {
1472
+ let sibling = element.nextElementSibling;
1473
+ while (sibling) {
1474
+ const next = sibling.nextElementSibling;
1475
+ if (this.debug && debugRemovals) {
1476
+ debugRemovals.push({
1477
+ step: 'removeByContentPattern',
1478
+ reason: 'trailing non-content',
1479
+ text: (0, utils_1.textPreview)(sibling)
1480
+ });
1481
+ }
1482
+ sibling.remove();
1483
+ sibling = next;
1484
+ }
1485
+ if (removeSelf) {
1486
+ if (this.debug && debugRemovals) {
1487
+ debugRemovals.push({
1488
+ step: 'removeByContentPattern',
1489
+ reason: 'boilerplate text',
1490
+ text: (0, utils_1.textPreview)(element)
1491
+ });
1492
+ }
1493
+ element.remove();
1494
+ }
1495
+ }
1007
1496
  }
1008
1497
  exports.Defuddle = Defuddle;
1009
1498
  //# sourceMappingURL=defuddle.js.map