defuddle 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. package/README.md +59 -21
  2. package/dist/cli.js +54 -49
  3. package/dist/cli.js.map +1 -1
  4. package/dist/constants.d.ts +9 -0
  5. package/dist/constants.js +50 -10
  6. package/dist/constants.js.map +1 -1
  7. package/dist/defuddle.d.ts +8 -1
  8. package/dist/defuddle.js +404 -86
  9. package/dist/defuddle.js.map +1 -1
  10. package/dist/elements/callouts.d.ts +6 -0
  11. package/dist/elements/callouts.js +74 -0
  12. package/dist/elements/callouts.js.map +1 -0
  13. package/dist/elements/code.js +31 -9
  14. package/dist/elements/code.js.map +1 -1
  15. package/dist/elements/headings.d.ts +6 -0
  16. package/dist/elements/headings.js +55 -50
  17. package/dist/elements/headings.js.map +1 -1
  18. package/dist/elements/images.js +10 -1
  19. package/dist/elements/images.js.map +1 -1
  20. package/dist/elements/math.base.js +1 -4
  21. package/dist/elements/math.base.js.map +1 -1
  22. package/dist/extractor-registry.d.ts +5 -5
  23. package/dist/extractor-registry.js +8 -8
  24. package/dist/extractor-registry.js.map +1 -1
  25. package/dist/extractors/_base.d.ts +6 -1
  26. package/dist/extractors/_base.js +2 -1
  27. package/dist/extractors/_base.js.map +1 -1
  28. package/dist/extractors/github.js +3 -3
  29. package/dist/extractors/github.js.map +1 -1
  30. package/dist/extractors/hackernews.js +1 -1
  31. package/dist/extractors/hackernews.js.map +1 -1
  32. package/dist/extractors/reddit.js +7 -4
  33. package/dist/extractors/reddit.js.map +1 -1
  34. package/dist/extractors/twitter.js +3 -1
  35. package/dist/extractors/twitter.js.map +1 -1
  36. package/dist/extractors/youtube.d.ts +35 -2
  37. package/dist/extractors/youtube.js +359 -30
  38. package/dist/extractors/youtube.js.map +1 -1
  39. package/dist/fetch.d.ts +13 -0
  40. package/dist/fetch.js +181 -0
  41. package/dist/fetch.js.map +1 -0
  42. package/dist/index.full.js +1 -1
  43. package/dist/index.js +1 -1
  44. package/dist/markdown.js +81 -33
  45. package/dist/markdown.js.map +1 -1
  46. package/dist/metadata.js +1 -1
  47. package/dist/metadata.js.map +1 -1
  48. package/dist/node.d.ts +12 -5
  49. package/dist/node.js +53 -17
  50. package/dist/node.js.map +1 -1
  51. package/dist/scoring.js +15 -10
  52. package/dist/scoring.js.map +1 -1
  53. package/dist/standardize.js +112 -60
  54. package/dist/standardize.js.map +1 -1
  55. package/dist/types.d.ts +14 -0
  56. package/dist/utils/dom.d.ts +5 -0
  57. package/dist/utils/dom.js +8 -0
  58. package/dist/utils/dom.js.map +1 -1
  59. package/dist/utils/linkedom-compat.d.ts +5 -0
  60. package/dist/utils/linkedom-compat.js +23 -0
  61. package/dist/utils/linkedom-compat.js.map +1 -0
  62. package/dist/utils.d.ts +6 -0
  63. package/dist/utils.js +36 -0
  64. package/dist/utils.js.map +1 -1
  65. package/package.json +3 -4
package/dist/defuddle.js CHANGED
@@ -2,16 +2,20 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.Defuddle = void 0;
4
4
  const metadata_1 = require("./metadata");
5
+ const headings_1 = require("./elements/headings");
5
6
  const extractor_registry_1 = require("./extractor-registry");
6
7
  const constants_1 = require("./constants");
7
8
  const standardize_1 = require("./standardize");
8
9
  const footnotes_1 = require("./elements/footnotes");
10
+ const callouts_1 = require("./elements/callouts");
9
11
  const scoring_1 = require("./scoring");
10
12
  const utils_1 = require("./utils");
11
13
  const dom_1 = require("./utils/dom");
12
14
  /** Keys from extractor variables that map to top-level DefuddleResponse fields */
13
15
  const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
14
16
  // Content pattern detection constants
17
+ const STYLE_WIDTH_PATTERN = /width\s*:\s*(\d+)/;
18
+ const STYLE_HEIGHT_PATTERN = /height\s*:\s*(\d+)/;
15
19
  const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
16
20
  const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
17
21
  const BOILERPLATE_PATTERNS = [
@@ -72,6 +76,36 @@ class Defuddle {
72
76
  }
73
77
  }
74
78
  // If still very little content, the page may be an index/listing page
79
+ // or a page that reveals content at runtime from a hidden wrapper.
80
+ // Retry once with hidden-element removal disabled.
81
+ if (result.wordCount < 50) {
82
+ this._log('Still very little content, retrying without hidden-element removal');
83
+ const hiddenRetry = this.parseInternal({
84
+ removeHiddenElements: false
85
+ });
86
+ if (hiddenRetry.wordCount > result.wordCount * 2) {
87
+ this._log('Hidden-element retry produced more content');
88
+ result = hiddenRetry;
89
+ }
90
+ // Try targeting the largest hidden subtree directly to avoid body-level
91
+ // leftovers (e.g. FPS counters) when hidden content is the real article.
92
+ const hiddenSelector = this.findLargestHiddenContentSelector();
93
+ if (hiddenSelector) {
94
+ this._log('Retrying with hidden content selector:', hiddenSelector);
95
+ const hiddenSelectorRetry = this.parseInternal({
96
+ removeHiddenElements: false,
97
+ removePartialSelectors: false,
98
+ contentSelector: hiddenSelector
99
+ });
100
+ if (hiddenSelectorRetry.wordCount > result.wordCount ||
101
+ (hiddenSelectorRetry.wordCount > Math.max(20, result.wordCount * 0.7) &&
102
+ hiddenSelectorRetry.content.length < result.content.length)) {
103
+ this._log('Hidden-selector retry produced better focused content');
104
+ result = hiddenSelectorRetry;
105
+ }
106
+ }
107
+ }
108
+ // If still very little content, the page may be an index/listing page
75
109
  // where card elements were scored as non-content or removed by partial
76
110
  // selectors (e.g. "post-preview"). Retry with both disabled.
77
111
  if (result.wordCount < 50) {
@@ -95,17 +129,17 @@ class Defuddle {
95
129
  // longer than what we extracted, the scorer likely picked the wrong
96
130
  // element from a feed. Find the correct element in the DOM.
97
131
  const schemaText = this._getSchemaText(result.schemaOrgData);
98
- if (schemaText && this.countWords(schemaText) > result.wordCount) {
132
+ if (schemaText && this.countHtmlWords(schemaText) > result.wordCount) {
99
133
  const contentHtml = this._findContentBySchemaText(schemaText);
100
134
  if (contentHtml) {
101
135
  this._log('Found DOM content matching schema.org text');
102
136
  result.content = contentHtml;
103
- result.wordCount = this.countWords(contentHtml);
137
+ result.wordCount = this.countHtmlWords(contentHtml);
104
138
  }
105
139
  else {
106
140
  this._log('Using schema.org text as content (DOM element not found)');
107
141
  result.content = schemaText;
108
- result.wordCount = this.countWords(schemaText);
142
+ result.wordCount = this.countHtmlWords(schemaText);
109
143
  }
110
144
  }
111
145
  return result;
@@ -113,17 +147,30 @@ class Defuddle {
113
147
  /**
114
148
  * Extract text content from schema.org data (e.g. SocialMediaPosting, Article)
115
149
  */
116
- _getSchemaText(schemaOrgData) {
117
- if (!schemaOrgData)
150
+ _getSchemaText(schemaOrgData, depth = 0) {
151
+ if (!schemaOrgData || depth > 10)
118
152
  return '';
119
153
  const items = Array.isArray(schemaOrgData) ? schemaOrgData : [schemaOrgData];
120
154
  for (const item of items) {
155
+ // Recurse into nested arrays
156
+ if (Array.isArray(item)) {
157
+ const found = this._getSchemaText(item, depth + 1);
158
+ if (found)
159
+ return found;
160
+ continue;
161
+ }
121
162
  if (item?.text && typeof item.text === 'string') {
122
163
  return item.text;
123
164
  }
124
165
  if (item?.articleBody && typeof item.articleBody === 'string') {
125
166
  return item.articleBody;
126
167
  }
168
+ // Traverse @graph arrays (common in JSON-LD with multiple entities)
169
+ if (item?.['@graph'] && Array.isArray(item['@graph'])) {
170
+ const found = this._getSchemaText(item['@graph'], depth + 1);
171
+ if (found)
172
+ return found;
173
+ }
127
174
  }
128
175
  return '';
129
176
  }
@@ -164,39 +211,43 @@ class Defuddle {
164
211
  }
165
212
  }
166
213
  /**
167
- * Find a DOM element whose text matches the schema.org text content.
168
- * Used when the content scorer picked the wrong element from a feed page.
169
- * Returns the element's inner HTML including sibling media (images, etc.)
214
+ * Find the smallest DOM element whose text contains the search phrase
215
+ * and whose word count is at least 80% of the expected count.
216
+ * Shared by _findSchemaContentElement and _findContentBySchemaText.
170
217
  */
171
- _findContentBySchemaText(schemaText) {
172
- const body = this.doc.body;
173
- if (!body)
174
- return '';
175
- // Use the first paragraph as the search phrase.
176
- // DOM textContent concatenates <p> elements without separators,
177
- // so we can't cross paragraph boundaries when matching.
218
+ _findElementBySchemaText(root, schemaText) {
178
219
  const firstPara = schemaText.split(/\n\s*\n/)[0]?.trim() || '';
179
220
  const searchPhrase = firstPara.substring(0, 100).trim();
180
221
  if (!searchPhrase)
181
- return '';
182
- const schemaWordCount = this.countWords(schemaText);
183
- // Find the smallest element whose text contains the search phrase
184
- // and whose word count is close to the schema text's word count
222
+ return null;
223
+ const schemaWordCount = (0, utils_1.countWords)(schemaText);
185
224
  let bestMatch = null;
186
225
  let bestSize = Infinity;
187
- const allElements = body.querySelectorAll('*');
226
+ const allElements = root.querySelectorAll('*');
188
227
  for (const el of allElements) {
189
- const elText = (el.textContent || '');
228
+ if (el === root)
229
+ continue;
230
+ const elText = el.textContent || '';
190
231
  if (!elText.includes(searchPhrase))
191
232
  continue;
192
- const elWords = elText.trim().split(/\s+/).length;
193
- // Element should contain roughly the same amount of text
194
- // (allow some slack for surrounding whitespace / minor extras)
233
+ const elWords = (0, utils_1.countWords)(elText);
195
234
  if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
196
235
  bestSize = elWords;
197
236
  bestMatch = el;
198
237
  }
199
238
  }
239
+ return bestMatch;
240
+ }
241
+ /**
242
+ * Find a DOM element whose text matches the schema.org text content.
243
+ * Used when the content scorer picked the wrong element from a feed page.
244
+ * Returns the element's inner HTML including sibling media (images, etc.)
245
+ */
246
+ _findContentBySchemaText(schemaText) {
247
+ const body = this.doc.body;
248
+ if (!body)
249
+ return '';
250
+ const bestMatch = this._findElementBySchemaText(body, schemaText);
200
251
  if (!bestMatch)
201
252
  return '';
202
253
  // Read the largest sibling image src BEFORE resolveRelativeUrls
@@ -230,6 +281,8 @@ class Defuddle {
230
281
  catch { }
231
282
  }
232
283
  }
284
+ // Remove heading anchor links before serialization (e.g. <h2>Title<a href="#foo">#</a></h2>)
285
+ (0, headings_1.removeHeadingAnchors)(bestMatch);
233
286
  // Now resolve URLs in the text content
234
287
  this.resolveRelativeUrls(bestMatch);
235
288
  let html = (0, dom_1.serializeHTML)(bestMatch);
@@ -241,6 +294,27 @@ class Defuddle {
241
294
  }
242
295
  return html;
243
296
  }
297
+ findLargestHiddenContentSelector() {
298
+ const body = this.doc.body;
299
+ if (!body)
300
+ return undefined;
301
+ const candidates = Array.from(body.querySelectorAll(constants_1.HIDDEN_EXACT_SKIP_SELECTOR)).filter(el => {
302
+ const className = el.getAttribute('class') || '';
303
+ return !className.includes('math');
304
+ });
305
+ let best = null;
306
+ let bestWords = 0;
307
+ for (const el of candidates) {
308
+ const words = (0, utils_1.countWords)(el.textContent || '');
309
+ if (words > bestWords) {
310
+ best = el;
311
+ bestWords = words;
312
+ }
313
+ }
314
+ if (!best || bestWords < 30)
315
+ return undefined;
316
+ return this.getElementSelector(best);
317
+ }
244
318
  /**
245
319
  * Get the largest available src from an img element,
246
320
  * checking srcset for higher-resolution versions.
@@ -302,7 +376,8 @@ class Defuddle {
302
376
  try {
303
377
  const url = this.options.url || this.doc.URL;
304
378
  const schemaOrgData = this.getSchemaOrgData();
305
- const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
379
+ const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
380
+ const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData, extractorOpts);
306
381
  if (extractor) {
307
382
  const extracted = await extractor.extractAsync();
308
383
  return this.getExtractorVariables(extracted.variables) || null;
@@ -317,7 +392,8 @@ class Defuddle {
317
392
  try {
318
393
  const url = this.options.url || this.doc.URL;
319
394
  const schemaOrgData = this.getSchemaOrgData();
320
- const extractor = finder(this.doc, url, schemaOrgData);
395
+ const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
396
+ const extractor = finder(this.doc, url, schemaOrgData, extractorOpts);
321
397
  if (extractor) {
322
398
  const startTime = Date.now();
323
399
  const extracted = await extractor.extractAsync();
@@ -336,6 +412,25 @@ class Defuddle {
336
412
  */
337
413
  parseInternal(overrideOptions = {}) {
338
414
  const startTime = Date.now();
415
+ // Guard against empty/broken documents (e.g. empty HTML, bot-blocked pages)
416
+ if (!this.doc.documentElement) {
417
+ const url = this.options.url || '';
418
+ return {
419
+ content: '',
420
+ title: '',
421
+ description: '',
422
+ domain: url ? new URL(url).hostname : '',
423
+ favicon: '',
424
+ image: '',
425
+ language: '',
426
+ parseTime: Date.now() - startTime,
427
+ published: '',
428
+ author: '',
429
+ site: '',
430
+ schemaOrgData: null,
431
+ wordCount: 0,
432
+ };
433
+ }
339
434
  const options = {
340
435
  removeExactSelectors: true,
341
436
  removePartialSelectors: true,
@@ -344,6 +439,7 @@ class Defuddle {
344
439
  removeSmallImages: true,
345
440
  removeContentPatterns: true,
346
441
  standardize: true,
442
+ includeReplies: 'extractors',
347
443
  ...this.options,
348
444
  ...overrideOptions
349
445
  };
@@ -365,7 +461,11 @@ class Defuddle {
365
461
  try {
366
462
  // Use site-specific extractor first, if there is one
367
463
  const url = options.url || this.doc.URL;
368
- const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
464
+ const extractorOpts = {
465
+ includeReplies: options.includeReplies,
466
+ language: options.language,
467
+ };
468
+ const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData, extractorOpts);
369
469
  if (extractor && extractor.canExtract()) {
370
470
  const extracted = extractor.extract();
371
471
  return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
@@ -383,6 +483,9 @@ class Defuddle {
383
483
  const smallImages = this._smallImages;
384
484
  // Clone document
385
485
  const clone = this.doc.cloneNode(true);
486
+ // Merge adjacent text nodes that some DOM implementations (e.g. linkedom)
487
+ // create when parsing HTML entities like &#39;
488
+ clone.body?.normalize();
386
489
  // Flatten shadow DOM content into the clone
387
490
  this.flattenShadowRoots(this.doc, clone);
388
491
  // Resolve React streaming SSR suspense boundaries
@@ -398,20 +501,36 @@ class Defuddle {
398
501
  if (!mainContent) {
399
502
  mainContent = this.findMainContent(clone);
400
503
  }
504
+ // If we fell back to <body>, try using schema.org articleBody/text
505
+ // to find a more specific content element within the DOM.
506
+ if (mainContent && mainContent.tagName.toLowerCase() === 'body') {
507
+ const schemaText = this._getSchemaText(schemaOrgData);
508
+ if (schemaText) {
509
+ const schemaContent = this._findElementBySchemaText(clone.body, schemaText);
510
+ if (schemaContent) {
511
+ this._log('Found content element via schema.org text');
512
+ mainContent = schemaContent;
513
+ }
514
+ }
515
+ }
401
516
  if (!mainContent) {
402
- const fallbackContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
517
+ const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
403
518
  const endTime = Date.now();
404
519
  return {
405
520
  content: fallbackContent,
406
521
  ...metadata,
407
- wordCount: this.countWords(fallbackContent),
522
+ wordCount: this.countHtmlWords(fallbackContent),
408
523
  parseTime: Math.round(endTime - startTime),
409
524
  metaTags: pageMetaTags
410
525
  };
411
526
  }
527
+ // Remove <wbr> elements — word break opportunity hints that carry no
528
+ // content but cause unwanted whitespace during standardization.
529
+ mainContent.querySelectorAll('wbr').forEach(el => el.remove());
412
530
  // Standardize footnotes before cleanup (CSS sidenotes use display:none)
413
531
  if (options.standardize) {
414
532
  (0, footnotes_1.standardizeFootnotes)(mainContent);
533
+ (0, callouts_1.standardizeCallouts)(mainContent);
415
534
  }
416
535
  // Remove small images
417
536
  if (options.removeSmallImages) {
@@ -421,15 +540,17 @@ class Defuddle {
421
540
  if (options.removeHiddenElements) {
422
541
  this.removeHiddenElements(clone, debugRemovals);
423
542
  }
424
- // Remove non-content blocks by scoring
425
- // Tries to find lists, navigation based on text content and link density
543
+ // Remove clutter using selectors — deterministic removal of known
544
+ // non-content elements (nav, footer, .sidebar, etc.) by class/id.
545
+ // Runs before scoring so the heuristic scorer sees a cleaner DOM.
546
+ if (options.removeExactSelectors || options.removePartialSelectors) {
547
+ this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
548
+ }
549
+ // Remove non-content blocks by scoring — heuristic removal based
550
+ // on link density, text ratios, and navigation indicators.
426
551
  if (options.removeLowScoring) {
427
552
  scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
428
553
  }
429
- // Remove clutter using selectors
430
- if (options.removeExactSelectors || options.removePartialSelectors) {
431
- this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals);
432
- }
433
554
  // Remove elements by content patterns (read time, boilerplate, article cards)
434
555
  if (options.removeContentPatterns && mainContent) {
435
556
  this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
@@ -445,7 +566,7 @@ class Defuddle {
445
566
  const result = {
446
567
  content,
447
568
  ...metadata,
448
- wordCount: this.countWords(content),
569
+ wordCount: this.countHtmlWords(content),
449
570
  parseTime: Math.round(endTime - startTime),
450
571
  metaTags: pageMetaTags
451
572
  };
@@ -459,18 +580,18 @@ class Defuddle {
459
580
  }
460
581
  catch (error) {
461
582
  console.error('Defuddle', 'Error processing document:', error);
462
- const errorContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
583
+ const errorContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
463
584
  const endTime = Date.now();
464
585
  return {
465
586
  content: errorContent,
466
587
  ...metadata,
467
- wordCount: this.countWords(errorContent),
588
+ wordCount: this.countHtmlWords(errorContent),
468
589
  parseTime: Math.round(endTime - startTime),
469
590
  metaTags: pageMetaTags
470
591
  };
471
592
  }
472
593
  }
473
- countWords(content) {
594
+ countHtmlWords(content) {
474
595
  // Strip HTML tags and decode common entities without DOM parsing
475
596
  const text = content
476
597
  .replace(/<[^>]*>/g, ' ')
@@ -481,25 +602,8 @@ class Defuddle {
481
602
  .replace(/&quot;/gi, '"')
482
603
  .replace(/&#\d+;/g, ' ')
483
604
  .replace(/&\w+;/g, ' ');
484
- const trimmed = text.trim();
485
- if (!trimmed)
486
- return 0;
487
- // Count words by splitting on whitespace
488
- let count = 0;
489
- let inWord = false;
490
- for (let i = 0; i < trimmed.length; i++) {
491
- const isSpace = trimmed.charCodeAt(i) <= 32;
492
- if (!isSpace && !inWord) {
493
- count++;
494
- inWord = true;
495
- }
496
- else if (isSpace) {
497
- inWord = false;
498
- }
499
- }
500
- return count;
605
+ return (0, utils_1.countWords)(text);
501
606
  }
502
- // Make all other methods private by removing the static keyword and using private
503
607
  _log(...args) {
504
608
  if (this.debug) {
505
609
  console.log('Defuddle:', ...args);
@@ -509,6 +613,8 @@ class Defuddle {
509
613
  const mobileStyles = [];
510
614
  const maxWidthRegex = /max-width[^:]*:\s*(\d+)/;
511
615
  try {
616
+ if (!doc.styleSheets)
617
+ return mobileStyles;
512
618
  // Get all styles, including inline styles
513
619
  const sheets = Array.from(doc.styleSheets).filter(sheet => {
514
620
  try {
@@ -646,7 +752,7 @@ class Defuddle {
646
752
  if (className) {
647
753
  const tokens = className.split(/\s+/);
648
754
  for (const token of tokens) {
649
- if (token === 'hidden' || token.endsWith(':hidden')) {
755
+ if (token === 'hidden' || token.endsWith(':hidden') || token === 'invisible' || token.endsWith(':invisible')) {
650
756
  elementsToRemove.set(element, `class:${token}`);
651
757
  count++;
652
758
  break;
@@ -667,7 +773,7 @@ class Defuddle {
667
773
  });
668
774
  this._log('Removed hidden elements:', count);
669
775
  }
670
- removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals) {
776
+ removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals, skipHiddenExactSelectors = false) {
671
777
  const startTime = Date.now();
672
778
  let exactSelectorCount = 0;
673
779
  let partialSelectorCount = 0;
@@ -675,9 +781,17 @@ class Defuddle {
675
781
  const elementsToRemove = new Map();
676
782
  // First collect elements matching exact selectors
677
783
  if (removeExact) {
678
- const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS.join(','));
784
+ const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS_JOINED);
679
785
  exactElements.forEach(el => {
680
786
  if (el?.parentNode) {
787
+ if (skipHiddenExactSelectors) {
788
+ const hiddenAncestor = el.closest(constants_1.HIDDEN_EXACT_SKIP_SELECTOR);
789
+ const role = (el.getAttribute('role') || '').toLowerCase();
790
+ if (el.matches(constants_1.HIDDEN_EXACT_SELECTOR) ||
791
+ (hiddenAncestor && role === 'dialog')) {
792
+ return;
793
+ }
794
+ }
681
795
  // Skip elements inside code blocks (e.g. syntax highlighting spans)
682
796
  if (el.closest('pre, code')) {
683
797
  return;
@@ -688,16 +802,12 @@ class Defuddle {
688
802
  });
689
803
  }
690
804
  if (removePartial) {
691
- // Pre-compile regexes and combine into a single regex for better performance
692
- const combinedPattern = constants_1.PARTIAL_SELECTORS.join('|');
693
- const partialRegex = new RegExp(combinedPattern, 'i');
694
- // Pre-compile individual regexes for debug pattern identification
805
+ // Pre-compile individual regexes for debug pattern identification only
695
806
  const individualRegexes = this.debug
696
807
  ? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') }))
697
808
  : null;
698
- // Create an efficient attribute selector for elements we care about
699
- const attributeSelector = constants_1.TEST_ATTRIBUTES.map(attr => `[${attr}]`).join(',');
700
- const allElements = doc.querySelectorAll(attributeSelector);
809
+ // Use pre-built attribute selector for elements we care about
810
+ const allElements = doc.querySelectorAll(constants_1.TEST_ATTRIBUTES_SELECTOR);
701
811
  // Process elements for partial matches
702
812
  allElements.forEach(el => {
703
813
  // Skip if already marked for removal
@@ -707,13 +817,13 @@ class Defuddle {
707
817
  // Skip code elements and elements containing code blocks
708
818
  // where class names indicate language/syntax, not page structure
709
819
  const tag = el.tagName;
710
- if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre')) {
820
+ if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre') || el.closest('code, pre')) {
711
821
  return;
712
822
  }
713
823
  // Get all relevant attributes and combine into a single string
714
824
  const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
715
825
  if (attr === 'class') {
716
- return el.className && typeof el.className === 'string' ? el.className : '';
826
+ return (0, dom_1.getClassName)(el);
717
827
  }
718
828
  if (attr === 'id') {
719
829
  return el.id || '';
@@ -725,7 +835,7 @@ class Defuddle {
725
835
  return;
726
836
  }
727
837
  // Check for partial match using single regex test
728
- if (partialRegex.test(attrs)) {
838
+ if (constants_1.PARTIAL_SELECTORS_REGEX.test(attrs)) {
729
839
  const matchedPattern = individualRegexes
730
840
  ? individualRegexes.find(r => r.regex.test(attrs))?.pattern
731
841
  : undefined;
@@ -787,8 +897,8 @@ class Defuddle {
787
897
  const attrHeight = parseInt(element.getAttribute('height') || '0');
788
898
  // Check inline style dimensions
789
899
  const style = element.getAttribute('style') || '';
790
- const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
791
- const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
900
+ const styleWidth = parseInt(style.match(STYLE_WIDTH_PATTERN)?.[1] || '0');
901
+ const styleHeight = parseInt(style.match(STYLE_HEIGHT_PATTERN)?.[1] || '0');
792
902
  // Use getComputedStyle and getBoundingClientRect only in browser
793
903
  let computedWidth = 0, computedHeight = 0;
794
904
  if (isBrowser) {
@@ -856,7 +966,7 @@ class Defuddle {
856
966
  return `srcset:${dataSrcset}`;
857
967
  }
858
968
  const id = element.id || '';
859
- const className = element.className || '';
969
+ const className = (0, dom_1.getClassName)(element);
860
970
  const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
861
971
  if (id)
862
972
  return `id:${id}`;
@@ -912,7 +1022,7 @@ class Defuddle {
912
1022
  let best = top;
913
1023
  for (let i = 1; i < candidates.length; i++) {
914
1024
  const child = candidates[i];
915
- const childWords = (child.element.textContent || '').split(/\s+/).length;
1025
+ const childWords = (0, utils_1.countWords)(child.element.textContent || '');
916
1026
  if (child.selectorIndex < best.selectorIndex && best.element.contains(child.element) && childWords > 50) {
917
1027
  // Count how many candidates share this selector index inside
918
1028
  // the top element. Use top (not best) as the stable reference
@@ -972,8 +1082,8 @@ class Defuddle {
972
1082
  if (current.id) {
973
1083
  selector += '#' + current.id;
974
1084
  }
975
- else if (current.className && typeof current.className === 'string') {
976
- selector += '.' + current.className.trim().split(/\s+/).join('.');
1085
+ else if ((0, dom_1.getClassName)(current)) {
1086
+ selector += '.' + (0, dom_1.getClassName)(current).trim().split(/\s+/).join('.');
977
1087
  }
978
1088
  parts.unshift(selector);
979
1089
  current = current.parentElement;
@@ -987,15 +1097,35 @@ class Defuddle {
987
1097
  * Resolve relative URLs to absolute within a DOM element
988
1098
  */
989
1099
  resolveRelativeUrls(element) {
990
- const baseUrl = this.options.url || this.doc.URL;
991
- if (!baseUrl)
1100
+ const docUrl = this.options.url || this.doc.URL;
1101
+ if (!docUrl)
992
1102
  return;
1103
+ // Respect <base href> for relative URL resolution, matching browser behavior
1104
+ let baseUrl = docUrl;
1105
+ const baseEl = this.doc.querySelector('base[href]');
1106
+ if (baseEl) {
1107
+ const baseHref = baseEl.getAttribute('href');
1108
+ if (baseHref) {
1109
+ try {
1110
+ baseUrl = new URL(baseHref, docUrl).href;
1111
+ }
1112
+ catch {
1113
+ // Invalid base href, fall back to document URL
1114
+ }
1115
+ }
1116
+ }
993
1117
  const resolve = (url) => {
1118
+ // Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
1119
+ // Normalize these before URL resolution.
1120
+ const normalized = url
1121
+ .trim()
1122
+ .replace(/^\\?["']+/, '')
1123
+ .replace(/\\?["']+$/, '');
994
1124
  try {
995
- return new URL(url, baseUrl).href;
1125
+ return new URL(normalized, baseUrl).href;
996
1126
  }
997
1127
  catch {
998
- return url;
1128
+ return normalized || url;
999
1129
  }
1000
1130
  };
1001
1131
  element.querySelectorAll('[href]').forEach(el => {
@@ -1051,6 +1181,8 @@ class Defuddle {
1051
1181
  * Walks both trees in parallel so positional correspondence is exact.
1052
1182
  */
1053
1183
  flattenShadowRoots(original, clone) {
1184
+ if (!original.body || !clone.body)
1185
+ return;
1054
1186
  const origElements = Array.from(original.body.querySelectorAll('*'));
1055
1187
  // Find the first element with a shadow root (also serves as the hasShadowRoots check)
1056
1188
  const firstShadow = origElements.find(el => el.shadowRoot);
@@ -1268,7 +1400,7 @@ class Defuddle {
1268
1400
  author: extracted.variables?.author || metadata.author,
1269
1401
  site: extracted.variables?.site || metadata.site,
1270
1402
  schemaOrgData: metadata.schemaOrgData,
1271
- wordCount: this.countWords(extracted.contentHtml),
1403
+ wordCount: this.countHtmlWords(extracted.contentHtml),
1272
1404
  parseTime: Math.round(Date.now() - startTime),
1273
1405
  extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
1274
1406
  metaTags: pageMetaTags,
@@ -1307,7 +1439,7 @@ class Defuddle {
1307
1439
  if (el.closest('pre') || el.closest('code'))
1308
1440
  continue;
1309
1441
  const text = el.textContent?.trim() || '';
1310
- const words = text.split(/\s+/).length;
1442
+ const words = (0, utils_1.countWords)(text);
1311
1443
  // Match date + read time in short elements
1312
1444
  if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
1313
1445
  // Ensure this is a leaf-ish element, not a large container
@@ -1361,7 +1493,7 @@ class Defuddle {
1361
1493
  break;
1362
1494
  }
1363
1495
  const text = target.textContent?.trim() || '';
1364
- const words = text.split(/\s+/).length;
1496
+ const words = (0, utils_1.countWords)(text);
1365
1497
  if (words > 10)
1366
1498
  continue;
1367
1499
  // Check if this element is near the start or end of mainContent
@@ -1378,12 +1510,78 @@ class Defuddle {
1378
1510
  }
1379
1511
  target.remove();
1380
1512
  }
1513
+ // Remove blog post metadata lists near content boundaries.
1514
+ // These are short <ul>/<ol> elements where every item is a brief
1515
+ // label + value pair (date, reading time, share, etc.) with no
1516
+ // prose sentences. Detected structurally: all items are very short,
1517
+ // none contain sentence-ending punctuation, and the total text is minimal.
1518
+ const metadataLists = mainContent.querySelectorAll('ul, ol');
1519
+ for (const list of metadataLists) {
1520
+ if (!list.parentNode)
1521
+ continue;
1522
+ const items = Array.from(list.children).filter(el => el.tagName === 'LI');
1523
+ if (items.length < 2 || items.length > 8)
1524
+ continue;
1525
+ // Must be near the start or end of content
1526
+ const listText = list.textContent?.trim() || '';
1527
+ const listPos = contentText.indexOf(listText);
1528
+ const distFromEnd = contentText.length - (listPos + listText.length);
1529
+ if (listPos > 500 && distFromEnd > 500)
1530
+ continue;
1531
+ // Skip lists introduced by a preceding paragraph (e.g. "Features include:")
1532
+ // — those are content lists, not standalone metadata
1533
+ const prevSibling = list.previousElementSibling;
1534
+ if (prevSibling) {
1535
+ const prevText = prevSibling.textContent?.trim() || '';
1536
+ if (prevText.endsWith(':'))
1537
+ continue;
1538
+ }
1539
+ // Every item must be very short (label + value) with no prose
1540
+ let isMetadata = true;
1541
+ for (const item of items) {
1542
+ const text = item.textContent?.trim() || '';
1543
+ const words = (0, utils_1.countWords)(text);
1544
+ if (words > 8) {
1545
+ isMetadata = false;
1546
+ break;
1547
+ }
1548
+ // Prose has sentence-ending punctuation; metadata doesn't
1549
+ if (/[.!?]$/.test(text)) {
1550
+ isMetadata = false;
1551
+ break;
1552
+ }
1553
+ }
1554
+ if (!isMetadata)
1555
+ continue;
1556
+ // Total text should be very short — this is metadata, not content
1557
+ if ((0, utils_1.countWords)(listText) > 30)
1558
+ continue;
1559
+ // Walk up to find the container to remove (e.g. a wrapper div)
1560
+ let target = list;
1561
+ while (target.parentElement && target.parentElement !== mainContent) {
1562
+ const parentText = target.parentElement.textContent?.trim() || '';
1563
+ if (parentText !== listText)
1564
+ break;
1565
+ target = target.parentElement;
1566
+ }
1567
+ if (this.debug && debugRemovals) {
1568
+ debugRemovals.push({
1569
+ step: 'removeByContentPattern',
1570
+ reason: 'blog metadata list',
1571
+ text: (0, utils_1.textPreview)(target)
1572
+ });
1573
+ }
1574
+ target.remove();
1575
+ }
1381
1576
  // Remove section breadcrumbs
1382
1577
  // Short elements containing a link to a parent section of the current URL.
1383
1578
  const url = this.options.url || this.doc.URL || '';
1384
1579
  let urlPath = '';
1580
+ let pageHost = '';
1385
1581
  try {
1386
- urlPath = new URL(url).pathname;
1582
+ const parsedUrl = new URL(url);
1583
+ urlPath = parsedUrl.pathname;
1584
+ pageHost = parsedUrl.hostname.replace(/^www\./, '');
1387
1585
  }
1388
1586
  catch { }
1389
1587
  if (urlPath) {
@@ -1392,7 +1590,7 @@ class Defuddle {
1392
1590
  if (!el.parentNode)
1393
1591
  continue;
1394
1592
  const text = el.textContent?.trim() || '';
1395
- const words = text.split(/\s+/).length;
1593
+ const words = (0, utils_1.countWords)(text);
1396
1594
  if (words > 10)
1397
1595
  continue;
1398
1596
  // Must be a leaf-ish element (no block children)
@@ -1417,6 +1615,126 @@ class Defuddle {
1417
1615
  catch { }
1418
1616
  }
1419
1617
  }
1618
+ // Remove trailing external link lists — a heading + list of purely
1619
+ // off-site links as the last content block (affiliate picks, product
1620
+ // roundups, etc.). Only removed when nothing meaningful follows.
1621
+ if (pageHost) {
1622
+ const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6');
1623
+ for (const heading of headings) {
1624
+ if (!heading.parentNode)
1625
+ continue;
1626
+ const list = heading.nextElementSibling;
1627
+ if (!list || (list.tagName !== 'UL' && list.tagName !== 'OL'))
1628
+ continue;
1629
+ const items = Array.from(list.children).filter(el => el.tagName === 'LI');
1630
+ if (items.length < 2)
1631
+ continue;
1632
+ // The list must be the last meaningful block — nothing after it
1633
+ // except whitespace or empty elements. Walk up through ancestors
1634
+ // to check siblings at each level up to mainContent.
1635
+ let trailingContent = false;
1636
+ let checkEl = list;
1637
+ while (checkEl && checkEl !== mainContent) {
1638
+ let sibling = checkEl.nextElementSibling;
1639
+ while (sibling) {
1640
+ if ((sibling.textContent?.trim() || '').length > 0) {
1641
+ trailingContent = true;
1642
+ break;
1643
+ }
1644
+ sibling = sibling.nextElementSibling;
1645
+ }
1646
+ if (trailingContent)
1647
+ break;
1648
+ checkEl = checkEl.parentElement;
1649
+ }
1650
+ if (trailingContent)
1651
+ continue;
1652
+ // Every list item must be primarily a link pointing off-site
1653
+ let allExternalLinks = true;
1654
+ for (const item of items) {
1655
+ const links = item.querySelectorAll('a[href]');
1656
+ if (links.length === 0) {
1657
+ allExternalLinks = false;
1658
+ break;
1659
+ }
1660
+ const itemText = item.textContent?.trim() || '';
1661
+ let linkTextLen = 0;
1662
+ for (const link of links) {
1663
+ linkTextLen += (link.textContent?.trim() || '').length;
1664
+ try {
1665
+ const linkHost = new URL(link.getAttribute('href') || '', url).hostname.replace(/^www\./, '');
1666
+ if (linkHost === pageHost) {
1667
+ allExternalLinks = false;
1668
+ break;
1669
+ }
1670
+ }
1671
+ catch { }
1672
+ }
1673
+ if (!allExternalLinks)
1674
+ break;
1675
+ if (linkTextLen < itemText.length * 0.6) {
1676
+ allExternalLinks = false;
1677
+ break;
1678
+ }
1679
+ }
1680
+ if (!allExternalLinks)
1681
+ continue;
1682
+ if (this.debug && debugRemovals) {
1683
+ debugRemovals.push({
1684
+ step: 'removeByContentPattern',
1685
+ reason: 'trailing external link list',
1686
+ text: (0, utils_1.textPreview)(heading)
1687
+ });
1688
+ debugRemovals.push({
1689
+ step: 'removeByContentPattern',
1690
+ reason: 'trailing external link list',
1691
+ text: (0, utils_1.textPreview)(list)
1692
+ });
1693
+ }
1694
+ list.remove();
1695
+ heading.remove();
1696
+ }
1697
+ }
1698
+ // Remove trailing thin sections — the last few direct children of
1699
+ // mainContent that contain a heading but very little prose. These are
1700
+ // typically CTAs, newsletter prompts, or promotional sections that
1701
+ // have been partially stripped by prior removal steps.
1702
+ const totalWords = (0, utils_1.countWords)(mainContent.textContent || '');
1703
+ if (totalWords > 300) {
1704
+ // Walk backwards from the last direct child of mainContent,
1705
+ // collecting trailing elements that are thin (empty or very short prose).
1706
+ // Exclude SVG text (path data) from word counts — it's not prose.
1707
+ const trailingEls = [];
1708
+ let trailingWords = 0;
1709
+ let child = mainContent.lastElementChild;
1710
+ while (child) {
1711
+ // Count prose words, excluding SVG path data which inflates word counts
1712
+ let svgWords = 0;
1713
+ for (const svg of child.querySelectorAll('svg')) {
1714
+ svgWords += (0, utils_1.countWords)(svg.textContent || '');
1715
+ }
1716
+ const words = (0, utils_1.countWords)(child.textContent?.trim() || '') - svgWords;
1717
+ if (words > 25)
1718
+ break;
1719
+ trailingWords += words;
1720
+ trailingEls.push(child);
1721
+ child = child.previousElementSibling;
1722
+ }
1723
+ // Must have a heading in the trailing elements and total < 15% of content.
1724
+ // Skip if trailing elements contain content indicators (math, code, tables, images).
1725
+ if (trailingEls.length >= 1 && trailingWords < totalWords * 0.15) {
1726
+ const hasHeading = trailingEls.some(el => /^H[1-6]$/.test(el.tagName) || el.querySelector('h1, h2, h3, h4, h5, h6'));
1727
+ const hasContent = trailingEls.some(el => el.querySelector(constants_1.CONTENT_ELEMENT_SELECTOR));
1728
+ if (hasHeading && !hasContent) {
1729
+ for (const el of trailingEls) {
1730
+ if (this.debug && debugRemovals) {
1731
+ debugRemovals.push({ step: 'removeByContentPattern', reason: 'trailing thin section', text: (0, utils_1.textPreview)(el) });
1732
+ }
1733
+ el.remove();
1734
+ }
1735
+ }
1736
+ }
1737
+ }
1420
1738
  // Remove boilerplate sentences and trailing non-content.
1421
1739
  // Search elements for end-of-article boilerplate, then truncate
1422
1740
  // from the best ancestor that has siblings to remove.
@@ -1426,7 +1744,7 @@ class Defuddle {
1426
1744
  if (!el.parentNode)
1427
1745
  continue;
1428
1746
  const text = el.textContent?.trim() || '';
1429
- const words = text.split(/\s+/).length;
1747
+ const words = (0, utils_1.countWords)(text);
1430
1748
  if (words > 50 || words < 3)
1431
1749
  continue;
1432
1750
  for (const pattern of BOILERPLATE_PATTERNS) {