defuddle 0.13.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. package/README.md +33 -0
  2. package/dist/cli.js +47 -10
  3. package/dist/cli.js.map +1 -1
  4. package/dist/constants.d.ts +2 -2
  5. package/dist/constants.js +67 -11
  6. package/dist/constants.js.map +1 -1
  7. package/dist/defuddle.d.ts +5 -18
  8. package/dist/defuddle.js +212 -621
  9. package/dist/defuddle.js.map +1 -1
  10. package/dist/elements/callouts.d.ts +6 -0
  11. package/dist/elements/callouts.js +74 -0
  12. package/dist/elements/callouts.js.map +1 -0
  13. package/dist/elements/code.js +76 -11
  14. package/dist/elements/code.js.map +1 -1
  15. package/dist/elements/footnotes.js +420 -45
  16. package/dist/elements/footnotes.js.map +1 -1
  17. package/dist/elements/headings.d.ts +6 -0
  18. package/dist/elements/headings.js +18 -0
  19. package/dist/elements/headings.js.map +1 -1
  20. package/dist/elements/images.js +10 -1
  21. package/dist/elements/images.js.map +1 -1
  22. package/dist/elements/math.base.d.ts +1 -0
  23. package/dist/elements/math.base.js +5 -5
  24. package/dist/elements/math.base.js.map +1 -1
  25. package/dist/elements/math.core.d.ts +1 -0
  26. package/dist/elements/math.d.ts +1 -1
  27. package/dist/elements/math.full.d.ts +1 -0
  28. package/dist/elements/math.full.js +90 -0
  29. package/dist/elements/math.full.js.map +1 -0
  30. package/dist/elements/math.js +3 -3
  31. package/dist/extractor-registry.d.ts +5 -5
  32. package/dist/extractor-registry.js +28 -8
  33. package/dist/extractor-registry.js.map +1 -1
  34. package/dist/extractors/_base.d.ts +6 -1
  35. package/dist/extractors/_base.js +2 -1
  36. package/dist/extractors/_base.js.map +1 -1
  37. package/dist/extractors/bbcode-data.d.ts +10 -0
  38. package/dist/extractors/bbcode-data.js +59 -0
  39. package/dist/extractors/bbcode-data.js.map +1 -0
  40. package/dist/extractors/c2-wiki.d.ts +15 -0
  41. package/dist/extractors/c2-wiki.js +143 -0
  42. package/dist/extractors/c2-wiki.js.map +1 -0
  43. package/dist/extractors/github.js +3 -3
  44. package/dist/extractors/github.js.map +1 -1
  45. package/dist/extractors/hackernews.js +1 -1
  46. package/dist/extractors/hackernews.js.map +1 -1
  47. package/dist/extractors/reddit.d.ts +1 -0
  48. package/dist/extractors/reddit.js +20 -17
  49. package/dist/extractors/reddit.js.map +1 -1
  50. package/dist/extractors/substack.d.ts +17 -0
  51. package/dist/extractors/substack.js +188 -0
  52. package/dist/extractors/substack.js.map +1 -0
  53. package/dist/extractors/twitter.js +3 -1
  54. package/dist/extractors/twitter.js.map +1 -1
  55. package/dist/extractors/x-article.d.ts +1 -0
  56. package/dist/extractors/x-article.js +27 -2
  57. package/dist/extractors/x-article.js.map +1 -1
  58. package/dist/extractors/x-oembed.js +1 -1
  59. package/dist/extractors/x-oembed.js.map +1 -1
  60. package/dist/extractors/youtube.d.ts +22 -2
  61. package/dist/extractors/youtube.js +299 -47
  62. package/dist/extractors/youtube.js.map +1 -1
  63. package/dist/fetch.d.ts +13 -0
  64. package/dist/fetch.js +350 -0
  65. package/dist/fetch.js.map +1 -0
  66. package/dist/index.full.js +1 -1
  67. package/dist/index.js +1 -1
  68. package/dist/markdown.js +103 -35
  69. package/dist/markdown.js.map +1 -1
  70. package/dist/metadata.d.ts +4 -3
  71. package/dist/metadata.js +194 -40
  72. package/dist/metadata.js.map +1 -1
  73. package/dist/node.d.ts +1 -1
  74. package/dist/node.js +3 -6
  75. package/dist/node.js.map +1 -1
  76. package/dist/removals/content-patterns.d.ts +2 -0
  77. package/dist/removals/content-patterns.js +835 -0
  78. package/dist/removals/content-patterns.js.map +1 -0
  79. package/dist/removals/hidden.d.ts +2 -0
  80. package/dist/removals/hidden.js +78 -0
  81. package/dist/removals/hidden.js.map +1 -0
  82. package/dist/removals/metadata-block.d.ts +8 -0
  83. package/dist/removals/metadata-block.js +40 -0
  84. package/dist/removals/metadata-block.js.map +1 -0
  85. package/dist/{scoring.d.ts → removals/scoring.d.ts} +1 -1
  86. package/dist/{scoring.js → removals/scoring.js} +17 -14
  87. package/dist/removals/scoring.js.map +1 -0
  88. package/dist/removals/selectors.d.ts +2 -0
  89. package/dist/removals/selectors.js +118 -0
  90. package/dist/removals/selectors.js.map +1 -0
  91. package/dist/removals/small-images.d.ts +3 -0
  92. package/dist/removals/small-images.js +116 -0
  93. package/dist/removals/small-images.js.map +1 -0
  94. package/dist/standardize.d.ts +2 -1
  95. package/dist/standardize.js +122 -111
  96. package/dist/standardize.js.map +1 -1
  97. package/dist/types/extractors.d.ts +1 -0
  98. package/dist/types.d.ts +19 -0
  99. package/dist/utils/bbcode.d.ts +6 -0
  100. package/dist/utils/bbcode.js +57 -0
  101. package/dist/utils/bbcode.js.map +1 -0
  102. package/dist/utils/dom.d.ts +5 -0
  103. package/dist/utils/dom.js +8 -0
  104. package/dist/utils/dom.js.map +1 -1
  105. package/dist/utils.js +1 -1
  106. package/dist/utils.js.map +1 -1
  107. package/package.json +1 -1
  108. package/dist/elements/math.core.js +0 -52
  109. package/dist/elements/math.core.js.map +0 -1
  110. package/dist/index.js.map +0 -1
  111. package/dist/scoring.js.map +0 -1
package/dist/defuddle.js CHANGED
@@ -6,26 +6,17 @@ const extractor_registry_1 = require("./extractor-registry");
6
6
  const constants_1 = require("./constants");
7
7
  const standardize_1 = require("./standardize");
8
8
  const footnotes_1 = require("./elements/footnotes");
9
- const scoring_1 = require("./scoring");
9
+ const callouts_1 = require("./elements/callouts");
10
+ const scoring_1 = require("./removals/scoring");
11
+ const small_images_1 = require("./removals/small-images");
12
+ const hidden_1 = require("./removals/hidden");
13
+ const selectors_1 = require("./removals/selectors");
14
+ const content_patterns_1 = require("./removals/content-patterns");
15
+ const metadata_block_1 = require("./removals/metadata-block");
10
16
  const utils_1 = require("./utils");
11
17
  const dom_1 = require("./utils/dom");
12
18
  /** Keys from extractor variables that map to top-level DefuddleResponse fields */
13
19
  const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
14
- // Content pattern detection constants
15
- const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
16
- const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
17
- const BOILERPLATE_PATTERNS = [
18
- /^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
19
- /^A version of this (?:article|story) (?:appeared|was published) in\b/i,
20
- /^Originally (?:published|appeared) (?:in|on|at)\b/i,
21
- ];
22
- const METADATA_STRIP_PATTERNS = [
23
- /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi,
24
- /\b\d+(?:st|nd|rd|th)?\b/g,
25
- /\bmin(?:ute)?s?\b/gi,
26
- /\bread\b/gi,
27
- /[|·•—–\-,.\s]/g,
28
- ];
29
20
  class Defuddle {
30
21
  /**
31
22
  * Create a new Defuddle instance
@@ -35,6 +26,7 @@ class Defuddle {
35
26
  constructor(doc, options = {}) {
36
27
  this._schemaOrgData = undefined;
37
28
  this._schemaOrgExtracted = false;
29
+ this._inExtractorPipelineRun = false;
38
30
  this.doc = doc;
39
31
  this.options = options;
40
32
  this.debug = options.debug || false;
@@ -117,20 +109,23 @@ class Defuddle {
117
109
  }
118
110
  }
119
111
  // Strip dangerous elements from this.doc before any fallback paths
120
- // that read from it (e.g. _findContentBySchemaText).
121
- // This must happen after parseInternal, which needs script tags
122
- // for schema.org extraction, site-specific extractors, and math.
112
+ // that read from it. This must happen after parseInternal, which needs
113
+ // script tags for schema.org extraction, site-specific extractors, and math.
123
114
  this._stripUnsafeElements();
124
- // If schema.org has a SocialMediaPosting with text content that is
125
- // longer than what we extracted, the scorer likely picked the wrong
126
- // element from a feed. Find the correct element in the DOM.
115
+ // If schema.org has text content that is significantly longer than what we
116
+ // extracted, the scorer likely picked the wrong element from a feed page.
117
+ // Use a 1.5x threshold to avoid triggering when the difference is small
118
+ // (e.g. just related-content link text removed).
127
119
  const schemaText = this._getSchemaText(result.schemaOrgData);
128
- if (schemaText && this.countHtmlWords(schemaText) > result.wordCount) {
129
- const contentHtml = this._findContentBySchemaText(schemaText);
130
- if (contentHtml) {
131
- this._log('Found DOM content matching schema.org text');
132
- result.content = contentHtml;
133
- result.wordCount = this.countHtmlWords(contentHtml);
120
+ if (schemaText && this.countHtmlWords(schemaText) > result.wordCount * 1.5) {
121
+ const bestMatch = this._findElementBySchemaText(this.doc.body, schemaText);
122
+ if (bestMatch) {
123
+ // Re-run the full pipeline with the schema-identified element as the
124
+ // content root so it benefits from the same cleanup as normal extraction.
125
+ const selector = this.getElementSelector(bestMatch);
126
+ this._log('Schema.org suggests a better content element, retrying with selector:', selector);
127
+ const schemaRetry = this.parseInternal({ contentSelector: selector });
128
+ result = schemaRetry;
134
129
  }
135
130
  else {
136
131
  this._log('Using schema.org text as content (DOM element not found)');
@@ -143,17 +138,30 @@ class Defuddle {
143
138
  /**
144
139
  * Extract text content from schema.org data (e.g. SocialMediaPosting, Article)
145
140
  */
146
- _getSchemaText(schemaOrgData) {
147
- if (!schemaOrgData)
141
+ _getSchemaText(schemaOrgData, depth = 0) {
142
+ if (!schemaOrgData || depth > 10)
148
143
  return '';
149
144
  const items = Array.isArray(schemaOrgData) ? schemaOrgData : [schemaOrgData];
150
145
  for (const item of items) {
146
+ // Recurse into nested arrays
147
+ if (Array.isArray(item)) {
148
+ const found = this._getSchemaText(item, depth + 1);
149
+ if (found)
150
+ return found;
151
+ continue;
152
+ }
151
153
  if (item?.text && typeof item.text === 'string') {
152
154
  return item.text;
153
155
  }
154
156
  if (item?.articleBody && typeof item.articleBody === 'string') {
155
157
  return item.articleBody;
156
158
  }
159
+ // Traverse @graph arrays (common in JSON-LD with multiple entities)
160
+ if (item?.['@graph'] && Array.isArray(item['@graph'])) {
161
+ const found = this._getSchemaText(item['@graph'], depth + 1);
162
+ if (found)
163
+ return found;
164
+ }
157
165
  }
158
166
  return '';
159
167
  }
@@ -194,82 +202,32 @@ class Defuddle {
194
202
  }
195
203
  }
196
204
  /**
197
- * Find a DOM element whose text matches the schema.org text content.
198
- * Used when the content scorer picked the wrong element from a feed page.
199
- * Returns the element's inner HTML including sibling media (images, etc.)
205
+ * Find the smallest DOM element whose text contains the search phrase
206
+ * and whose word count is at least 80% of the expected count.
207
+ * Shared by _findSchemaContentElement and _findContentBySchemaText.
200
208
  */
201
- _findContentBySchemaText(schemaText) {
202
- const body = this.doc.body;
203
- if (!body)
204
- return '';
205
- // Use the first paragraph as the search phrase.
206
- // DOM textContent concatenates <p> elements without separators,
207
- // so we can't cross paragraph boundaries when matching.
209
+ _findElementBySchemaText(root, schemaText) {
208
210
  const firstPara = schemaText.split(/\n\s*\n/)[0]?.trim() || '';
209
211
  const searchPhrase = firstPara.substring(0, 100).trim();
210
212
  if (!searchPhrase)
211
- return '';
212
- const schemaWordCount = this.countHtmlWords(schemaText);
213
- // Find the smallest element whose text contains the search phrase
214
- // and whose word count is close to the schema text's word count
213
+ return null;
214
+ const schemaWordCount = (0, utils_1.countWords)(schemaText);
215
215
  let bestMatch = null;
216
216
  let bestSize = Infinity;
217
- const allElements = body.querySelectorAll('*');
217
+ const allElements = root.querySelectorAll('*');
218
218
  for (const el of allElements) {
219
- const elText = (el.textContent || '');
219
+ if (el === root)
220
+ continue;
221
+ const elText = el.textContent || '';
220
222
  if (!elText.includes(searchPhrase))
221
223
  continue;
222
224
  const elWords = (0, utils_1.countWords)(elText);
223
- // Element should contain roughly the same amount of text
224
- // (allow some slack for surrounding whitespace / minor extras)
225
225
  if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
226
226
  bestSize = elWords;
227
227
  bestMatch = el;
228
228
  }
229
229
  }
230
- if (!bestMatch)
231
- return '';
232
- // Read the largest sibling image src BEFORE resolveRelativeUrls
233
- // can mangle comma-containing CDN URLs in srcset attributes
234
- let imageSrc = '';
235
- let imageAlt = '';
236
- const parent = bestMatch.parentElement;
237
- if (parent && parent !== body) {
238
- const images = parent.querySelectorAll('img');
239
- let largestImg = null;
240
- let largestArea = 0;
241
- for (const img of images) {
242
- if (bestMatch.contains(img))
243
- continue;
244
- const w = parseInt(img.getAttribute('width') || '0', 10);
245
- const h = parseInt(img.getAttribute('height') || '0', 10);
246
- const area = w * h;
247
- if (area > largestArea) {
248
- largestArea = area;
249
- largestImg = img;
250
- }
251
- }
252
- if (largestImg) {
253
- imageSrc = this._getLargestImageSrc(largestImg);
254
- imageAlt = largestImg.getAttribute('alt') || '';
255
- try {
256
- const baseUrl = this.options.url || this.doc.URL;
257
- if (baseUrl)
258
- imageSrc = new URL(imageSrc, baseUrl).href;
259
- }
260
- catch { }
261
- }
262
- }
263
- // Now resolve URLs in the text content
264
- this.resolveRelativeUrls(bestMatch);
265
- let html = (0, dom_1.serializeHTML)(bestMatch);
266
- if (imageSrc) {
267
- const img = this.doc.createElement('img');
268
- img.setAttribute('src', imageSrc);
269
- img.setAttribute('alt', imageAlt);
270
- html += img.outerHTML;
271
- }
272
- return html;
230
+ return bestMatch;
273
231
  }
274
232
  findLargestHiddenContentSelector() {
275
233
  const body = this.doc.body;
@@ -353,7 +311,8 @@ class Defuddle {
353
311
  try {
354
312
  const url = this.options.url || this.doc.URL;
355
313
  const schemaOrgData = this.getSchemaOrgData();
356
- const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
314
+ const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
315
+ const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData, extractorOpts);
357
316
  if (extractor) {
358
317
  const extracted = await extractor.extractAsync();
359
318
  return this.getExtractorVariables(extracted.variables) || null;
@@ -368,7 +327,8 @@ class Defuddle {
368
327
  try {
369
328
  const url = this.options.url || this.doc.URL;
370
329
  const schemaOrgData = this.getSchemaOrgData();
371
- const extractor = finder(this.doc, url, schemaOrgData);
330
+ const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
331
+ const extractor = finder(this.doc, url, schemaOrgData, extractorOpts);
372
332
  if (extractor) {
373
333
  const startTime = Date.now();
374
334
  const extracted = await extractor.extractAsync();
@@ -387,6 +347,16 @@ class Defuddle {
387
347
  */
388
348
  parseInternal(overrideOptions = {}) {
389
349
  const startTime = Date.now();
350
+ const profile = {};
351
+ const doProfile = this.options.profile ?? false;
352
+ const profileStep = (name, fn) => {
353
+ if (!doProfile)
354
+ return fn();
355
+ const t = performance.now();
356
+ const result = fn();
357
+ profile[name] = Math.round(performance.now() - t);
358
+ return result;
359
+ };
390
360
  // Guard against empty/broken documents (e.g. empty HTML, bot-blocked pages)
391
361
  if (!this.doc.documentElement) {
392
362
  const url = this.options.url || '';
@@ -414,6 +384,7 @@ class Defuddle {
414
384
  removeSmallImages: true,
415
385
  removeContentPatterns: true,
416
386
  standardize: true,
387
+ includeReplies: 'extractors',
417
388
  ...this.options,
418
389
  ...overrideOptions
419
390
  };
@@ -435,10 +406,41 @@ class Defuddle {
435
406
  try {
436
407
  // Use site-specific extractor first, if there is one
437
408
  const url = options.url || this.doc.URL;
438
- const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
439
- if (extractor && extractor.canExtract()) {
440
- const extracted = extractor.extract();
441
- return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
409
+ const extractorOpts = {
410
+ includeReplies: options.includeReplies,
411
+ language: options.language,
412
+ };
413
+ if (!this._inExtractorPipelineRun) {
414
+ const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData, extractorOpts);
415
+ if (extractor && extractor.canExtract()) {
416
+ const extracted = extractor.extract();
417
+ if (extracted.contentSelector) {
418
+ this._inExtractorPipelineRun = true;
419
+ try {
420
+ const pipelineResult = this.parseInternal({
421
+ contentSelector: extracted.contentSelector,
422
+ removeLowScoring: false,
423
+ removeHiddenElements: false,
424
+ });
425
+ const variables = this.getExtractorVariables(extracted.variables);
426
+ return {
427
+ ...pipelineResult,
428
+ title: extracted.variables?.title || pipelineResult.title,
429
+ description: extracted.variables?.description || pipelineResult.description,
430
+ author: extracted.variables?.author || pipelineResult.author,
431
+ published: extracted.variables?.published || pipelineResult.published,
432
+ site: extracted.variables?.site || pipelineResult.site,
433
+ language: extracted.variables?.language || pipelineResult.language,
434
+ extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
435
+ ...(variables ? { variables } : {}),
436
+ };
437
+ }
438
+ finally {
439
+ this._inExtractorPipelineRun = false;
440
+ }
441
+ }
442
+ return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
443
+ }
442
444
  }
443
445
  // Continue if there is no extractor...
444
446
  // Evaluate mobile styles and sizes on original document (cached across retries)
@@ -448,29 +450,47 @@ class Defuddle {
448
450
  const mobileStyles = this._mobileStyles;
449
451
  // Find small images in original document (cached across retries)
450
452
  if (!this._smallImages) {
451
- this._smallImages = this.findSmallImages(this.doc);
453
+ this._smallImages = (0, small_images_1.findSmallImages)(this.doc, this.debug);
452
454
  }
453
455
  const smallImages = this._smallImages;
454
456
  // Clone document
455
- const clone = this.doc.cloneNode(true);
456
- // Merge adjacent text nodes that some DOM implementations (e.g. linkedom)
457
- // create when parsing HTML entities like &#39;
458
- clone.body?.normalize();
457
+ let clone;
458
+ profileStep('cloneDocument', () => {
459
+ clone = this.doc.cloneNode(true);
460
+ // Merge adjacent text nodes that some DOM implementations (e.g. linkedom)
461
+ // create when parsing HTML entities like &#39;
462
+ clone.body?.normalize();
463
+ });
459
464
  // Flatten shadow DOM content into the clone
460
- this.flattenShadowRoots(this.doc, clone);
465
+ profileStep('flattenShadowRoots', () => this.flattenShadowRoots(this.doc, clone));
461
466
  // Resolve React streaming SSR suspense boundaries
462
- this.resolveStreamedContent(clone);
467
+ profileStep('resolveStreamedContent', () => this.resolveStreamedContent(clone));
463
468
  // Apply mobile styles to clone
464
- this.applyMobileStyles(clone, mobileStyles);
469
+ profileStep('applyMobileStyles', () => this.applyMobileStyles(clone, mobileStyles));
465
470
  // Find main content
466
- let mainContent = null;
467
- if (options.contentSelector) {
468
- mainContent = clone.querySelector(options.contentSelector);
469
- this._log('Using contentSelector:', options.contentSelector, mainContent ? 'found' : 'not found');
470
- }
471
- if (!mainContent) {
472
- mainContent = this.findMainContent(clone);
473
- }
471
+ const mainContent = profileStep('findMainContent', () => {
472
+ let found = null;
473
+ if (options.contentSelector) {
474
+ found = clone.querySelector(options.contentSelector);
475
+ this._log('Using contentSelector:', options.contentSelector, found ? 'found' : 'not found');
476
+ }
477
+ if (!found) {
478
+ found = this.findMainContent(clone);
479
+ }
480
+ // If we fell back to <body>, try using schema.org articleBody/text
481
+ // to find a more specific content element within the DOM.
482
+ if (found && found.tagName.toLowerCase() === 'body') {
483
+ const schemaText = this._getSchemaText(schemaOrgData);
484
+ if (schemaText) {
485
+ const schemaContent = this._findElementBySchemaText(clone.body, schemaText);
486
+ if (schemaContent) {
487
+ this._log('Found content element via schema.org text');
488
+ found = schemaContent;
489
+ }
490
+ }
491
+ }
492
+ return found;
493
+ });
474
494
  if (!mainContent) {
475
495
  const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
476
496
  const endTime = Date.now();
@@ -482,37 +502,66 @@ class Defuddle {
482
502
  metaTags: pageMetaTags
483
503
  };
484
504
  }
505
+ // Remove h1-adjacent date/author metadata blocks from the content.
506
+ // These are extracted as frontmatter but also appear in the body when a
507
+ // wide container (e.g. <main>) is selected as the content element.
508
+ profileStep('removeMetadataBlock', () => {
509
+ if (metadata.published || metadata.author) {
510
+ (0, metadata_block_1.removeMetadataBlock)(mainContent);
511
+ }
512
+ // Remove <wbr> elements — word break opportunity hints that carry no
513
+ // content but cause unwanted whitespace during standardization.
514
+ mainContent.querySelectorAll('wbr').forEach(el => el.remove());
515
+ });
485
516
  // Standardize footnotes before cleanup (CSS sidenotes use display:none)
486
- if (options.standardize) {
487
- (0, footnotes_1.standardizeFootnotes)(mainContent);
488
- }
517
+ profileStep('standardizeFootnotesCallouts', () => {
518
+ if (options.standardize) {
519
+ (0, footnotes_1.standardizeFootnotes)(mainContent);
520
+ (0, callouts_1.standardizeCallouts)(mainContent);
521
+ }
522
+ });
489
523
  // Remove small images
490
- if (options.removeSmallImages) {
491
- this.removeSmallImages(clone, smallImages);
492
- }
524
+ profileStep('removeSmallImages', () => {
525
+ if (options.removeSmallImages) {
526
+ (0, small_images_1.removeSmallImages)(clone, smallImages, this.debug);
527
+ }
528
+ });
493
529
  // Remove hidden elements using computed styles
494
- if (options.removeHiddenElements) {
495
- this.removeHiddenElements(clone, debugRemovals);
496
- }
497
- // Remove non-content blocks by scoring
498
- // Tries to find lists, navigation based on text content and link density
499
- if (options.removeLowScoring) {
500
- scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
501
- }
502
- // Remove clutter using selectors
503
- if (options.removeExactSelectors || options.removePartialSelectors) {
504
- this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
505
- }
530
+ profileStep('removeHiddenElements', () => {
531
+ if (options.removeHiddenElements) {
532
+ (0, hidden_1.removeHiddenElements)(clone, this.debug, debugRemovals);
533
+ }
534
+ });
535
+ // Remove clutter using selectors — deterministic removal of known
536
+ // non-content elements (nav, footer, .sidebar, etc.) by class/id.
537
+ // Runs before scoring so the heuristic scorer sees a cleaner DOM.
538
+ profileStep('removeBySelector', () => {
539
+ if (options.removeExactSelectors || options.removePartialSelectors) {
540
+ (0, selectors_1.removeBySelector)(clone, this.debug, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
541
+ }
542
+ });
543
+ // Remove non-content blocks by scoring — heuristic removal based
544
+ // on link density, text ratios, and navigation indicators.
545
+ profileStep('removeLowScoring', () => {
546
+ if (options.removeLowScoring) {
547
+ scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
548
+ }
549
+ });
506
550
  // Remove elements by content patterns (read time, boilerplate, article cards)
507
- if (options.removeContentPatterns && mainContent) {
508
- this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
509
- }
551
+ profileStep('removeByContentPattern', () => {
552
+ if (options.removeContentPatterns && mainContent) {
553
+ const url = this.options.url || this.doc.URL || '';
554
+ (0, content_patterns_1.removeByContentPattern)(mainContent, this.debug, url, debugRemovals);
555
+ }
556
+ });
510
557
  // Normalize the main content
511
- if (options.standardize) {
512
- (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
513
- }
558
+ profileStep('standardizeContent', () => {
559
+ if (options.standardize) {
560
+ (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug, doProfile ? profile : undefined);
561
+ }
562
+ });
514
563
  // Resolve relative URLs to absolute
515
- this.resolveRelativeUrls(mainContent);
564
+ profileStep('resolveRelativeUrls', () => this.resolveRelativeUrls(mainContent));
516
565
  const content = mainContent.outerHTML;
517
566
  const endTime = Date.now();
518
567
  const result = {
@@ -528,6 +577,9 @@ class Defuddle {
528
577
  removals: debugRemovals
529
578
  };
530
579
  }
580
+ if (this.options.profile) {
581
+ result.profile = profile;
582
+ }
531
583
  return result;
532
584
  }
533
585
  catch (error) {
@@ -556,7 +608,6 @@ class Defuddle {
556
608
  .replace(/&\w+;/g, ' ');
557
609
  return (0, utils_1.countWords)(text);
558
610
  }
559
- // Make all other methods private by removing the static keyword and using private
560
611
  _log(...args) {
561
612
  if (this.debug) {
562
613
  console.log('Defuddle:', ...args);
@@ -653,282 +704,6 @@ class Defuddle {
653
704
  image.remove();
654
705
  });
655
706
  }
656
- removeHiddenElements(doc, debugRemovals) {
657
- let count = 0;
658
- const elementsToRemove = new Map();
659
- // Check inline styles and CSS class-based hidden patterns.
660
- const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i;
661
- // Only use getComputedStyle in browser environments where it's meaningful.
662
- // In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable.
663
- const defaultView = doc.defaultView;
664
- const isBrowser = typeof window !== 'undefined' && defaultView === window;
665
- const allElements = doc.querySelectorAll('*');
666
- for (const element of allElements) {
667
- // Skip elements that contain math — sites like Wikipedia wrap MathML
668
- // in display:none spans for accessibility (the visible version is an
669
- // image/SVG fallback). We need to preserve these for math extraction.
670
- if (element.querySelector('math, [data-mathml], .katex-mathml') ||
671
- element.tagName.toLowerCase() === 'math') {
672
- continue;
673
- }
674
- // Check inline style for hidden patterns
675
- const style = element.getAttribute('style');
676
- if (style && hiddenStylePattern.test(style)) {
677
- const reason = style.includes('display') ? 'display:none' :
678
- style.includes('visibility') ? 'visibility:hidden' : 'opacity:0';
679
- elementsToRemove.set(element, reason);
680
- count++;
681
- continue;
682
- }
683
- // Use getComputedStyle only in real browser environments
684
- if (isBrowser) {
685
- try {
686
- const computedStyle = defaultView.getComputedStyle(element);
687
- let reason = '';
688
- if (computedStyle.display === 'none')
689
- reason = 'display:none';
690
- else if (computedStyle.visibility === 'hidden')
691
- reason = 'visibility:hidden';
692
- else if (computedStyle.opacity === '0')
693
- reason = 'opacity:0';
694
- if (reason) {
695
- elementsToRemove.set(element, reason);
696
- count++;
697
- continue;
698
- }
699
- }
700
- catch (e) { }
701
- }
702
- // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
703
- // "sm:hidden", "not-machine:hidden")
704
- const className = element.getAttribute('class') || '';
705
- if (className) {
706
- const tokens = className.split(/\s+/);
707
- for (const token of tokens) {
708
- if (token === 'hidden' || token.endsWith(':hidden') || token === 'invisible' || token.endsWith(':invisible')) {
709
- elementsToRemove.set(element, `class:${token}`);
710
- count++;
711
- break;
712
- }
713
- }
714
- }
715
- }
716
- // Batch remove all hidden elements
717
- elementsToRemove.forEach((reason, el) => {
718
- if (this.debug && debugRemovals) {
719
- debugRemovals.push({
720
- step: 'removeHiddenElements',
721
- reason,
722
- text: (0, utils_1.textPreview)(el)
723
- });
724
- }
725
- el.remove();
726
- });
727
- this._log('Removed hidden elements:', count);
728
- }
729
- removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals, skipHiddenExactSelectors = false) {
730
- const startTime = Date.now();
731
- let exactSelectorCount = 0;
732
- let partialSelectorCount = 0;
733
- // Track all elements to be removed, with their match type
734
- const elementsToRemove = new Map();
735
- // First collect elements matching exact selectors
736
- if (removeExact) {
737
- const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS_JOINED);
738
- exactElements.forEach(el => {
739
- if (el?.parentNode) {
740
- if (skipHiddenExactSelectors) {
741
- const hiddenAncestor = el.closest(constants_1.HIDDEN_EXACT_SKIP_SELECTOR);
742
- const role = (el.getAttribute('role') || '').toLowerCase();
743
- if (el.matches(constants_1.HIDDEN_EXACT_SELECTOR) ||
744
- (hiddenAncestor && role === 'dialog')) {
745
- return;
746
- }
747
- }
748
- // Skip elements inside code blocks (e.g. syntax highlighting spans)
749
- if (el.closest('pre, code')) {
750
- return;
751
- }
752
- elementsToRemove.set(el, { type: 'exact' });
753
- exactSelectorCount++;
754
- }
755
- });
756
- }
757
- if (removePartial) {
758
- // Pre-compile individual regexes for debug pattern identification only
759
- const individualRegexes = this.debug
760
- ? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') }))
761
- : null;
762
- // Use pre-built attribute selector for elements we care about
763
- const allElements = doc.querySelectorAll(constants_1.TEST_ATTRIBUTES_SELECTOR);
764
- // Process elements for partial matches
765
- allElements.forEach(el => {
766
- // Skip if already marked for removal
767
- if (elementsToRemove.has(el)) {
768
- return;
769
- }
770
- // Skip code elements and elements containing code blocks
771
- // where class names indicate language/syntax, not page structure
772
- const tag = el.tagName;
773
- if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre')) {
774
- return;
775
- }
776
- // Get all relevant attributes and combine into a single string
777
- const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
778
- if (attr === 'class') {
779
- return el.className && typeof el.className === 'string' ? el.className : '';
780
- }
781
- if (attr === 'id') {
782
- return el.id || '';
783
- }
784
- return el.getAttribute(attr) || '';
785
- }).join(' ').toLowerCase();
786
- // Skip if no attributes to check
787
- if (!attrs.trim()) {
788
- return;
789
- }
790
- // Check for partial match using single regex test
791
- if (constants_1.PARTIAL_SELECTORS_REGEX.test(attrs)) {
792
- const matchedPattern = individualRegexes
793
- ? individualRegexes.find(r => r.regex.test(attrs))?.pattern
794
- : undefined;
795
- elementsToRemove.set(el, { type: 'partial', selector: matchedPattern });
796
- partialSelectorCount++;
797
- }
798
- });
799
- }
800
- // Remove all collected elements in a single pass
801
- // Skip elements that are ancestors of mainContent to avoid disconnecting it
802
- // Skip footnote list containers, their parents, and immediate children
803
- // Skip anchor links inside headings - the heading transform handles these
804
- elementsToRemove.forEach(({ type, selector }, el) => {
805
- if (mainContent && el.contains(mainContent)) {
806
- return;
807
- }
808
- if (el.tagName === 'A' && el.closest('h1, h2, h3, h4, h5, h6')) {
809
- return;
810
- }
811
- try {
812
- if (el.matches(constants_1.FOOTNOTE_LIST_SELECTORS) || el.querySelector(constants_1.FOOTNOTE_LIST_SELECTORS)) {
813
- return;
814
- }
815
- // Protect immediate children of footnote containers (e.g. wikidot div.footnote-footer)
816
- const parent = el.parentElement;
817
- if (parent && parent.matches(constants_1.FOOTNOTE_LIST_SELECTORS)) {
818
- return;
819
- }
820
- }
821
- catch (e) { }
822
- if (this.debug && debugRemovals) {
823
- debugRemovals.push({
824
- step: 'removeBySelector',
825
- selector: type === 'exact' ? 'exact' : selector,
826
- reason: type === 'exact' ? 'exact selector match' : `partial match: ${selector}`,
827
- text: (0, utils_1.textPreview)(el)
828
- });
829
- }
830
- el.remove();
831
- });
832
- const endTime = Date.now();
833
- this._log('Removed clutter elements:', {
834
- exactSelectors: exactSelectorCount,
835
- partialSelectors: partialSelectorCount,
836
- total: elementsToRemove.size,
837
- processingTime: `${(endTime - startTime).toFixed(2)}ms`
838
- });
839
- }
840
- // Find small IMG and SVG elements
841
- findSmallImages(doc) {
842
- const MIN_DIMENSION = 33;
843
- const smallImages = new Set();
844
- let processedCount = 0;
845
- const elements = doc.querySelectorAll('img, svg');
846
- const defaultView = doc.defaultView;
847
- const isBrowser = typeof window !== 'undefined' && defaultView === window;
848
- for (const element of elements) {
849
- const attrWidth = parseInt(element.getAttribute('width') || '0');
850
- const attrHeight = parseInt(element.getAttribute('height') || '0');
851
- // Check inline style dimensions
852
- const style = element.getAttribute('style') || '';
853
- const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
854
- const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
855
- // Use getComputedStyle and getBoundingClientRect only in browser
856
- let computedWidth = 0, computedHeight = 0;
857
- if (isBrowser) {
858
- try {
859
- const cs = defaultView.getComputedStyle(element);
860
- computedWidth = parseInt(cs.width) || 0;
861
- computedHeight = parseInt(cs.height) || 0;
862
- }
863
- catch (e) { }
864
- try {
865
- const rect = element.getBoundingClientRect();
866
- if (rect.width > 0)
867
- computedWidth = computedWidth || rect.width;
868
- if (rect.height > 0)
869
- computedHeight = computedHeight || rect.height;
870
- }
871
- catch (e) { }
872
- }
873
- const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0);
874
- const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0);
875
- if (widths.length > 0 && heights.length > 0) {
876
- const effectiveWidth = Math.min(...widths);
877
- const effectiveHeight = Math.min(...heights);
878
- if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
879
- const identifier = this.getElementIdentifier(element);
880
- if (identifier) {
881
- smallImages.add(identifier);
882
- processedCount++;
883
- }
884
- }
885
- }
886
- }
887
- this._log('Found small elements:', processedCount);
888
- return smallImages;
889
- }
890
- removeSmallImages(doc, smallImages) {
891
- let removedCount = 0;
892
- ['img', 'svg'].forEach(tag => {
893
- const elements = doc.getElementsByTagName(tag);
894
- Array.from(elements).forEach(element => {
895
- const identifier = this.getElementIdentifier(element);
896
- if (identifier && smallImages.has(identifier)) {
897
- element.remove();
898
- removedCount++;
899
- }
900
- });
901
- });
902
- this._log('Removed small elements:', removedCount);
903
- }
904
- getElementIdentifier(element) {
905
- // Try to create a unique identifier using various attributes
906
- if (element.tagName.toLowerCase() === 'img') {
907
- // For lazy-loaded images, use data-src as identifier if available
908
- const dataSrc = element.getAttribute('data-src');
909
- if (dataSrc)
910
- return `src:${dataSrc}`;
911
- const src = element.getAttribute('src') || '';
912
- const srcset = element.getAttribute('srcset') || '';
913
- const dataSrcset = element.getAttribute('data-srcset');
914
- if (src)
915
- return `src:${src}`;
916
- if (srcset)
917
- return `srcset:${srcset}`;
918
- if (dataSrcset)
919
- return `srcset:${dataSrcset}`;
920
- }
921
- const id = element.id || '';
922
- const className = element.className || '';
923
- const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
924
- if (id)
925
- return `id:${id}`;
926
- if (viewBox)
927
- return `viewBox:${viewBox}`;
928
- if (className)
929
- return `class:${className}`;
930
- return null;
931
- }
932
707
  findMainContent(doc) {
933
708
  // Find all potential content containers
934
709
  const candidates = [];
@@ -1035,8 +810,8 @@ class Defuddle {
1035
810
  if (current.id) {
1036
811
  selector += '#' + current.id;
1037
812
  }
1038
- else if (current.className && typeof current.className === 'string') {
1039
- selector += '.' + current.className.trim().split(/\s+/).join('.');
813
+ else if ((0, dom_1.getClassName)(current)) {
814
+ selector += '.' + (0, dom_1.getClassName)(current).trim().split(/\s+/).join('.');
1040
815
  }
1041
816
  parts.unshift(selector);
1042
817
  current = current.parentElement;
@@ -1050,9 +825,23 @@ class Defuddle {
1050
825
  * Resolve relative URLs to absolute within a DOM element
1051
826
  */
1052
827
  resolveRelativeUrls(element) {
1053
- const baseUrl = this.options.url || this.doc.URL;
1054
- if (!baseUrl)
828
+ const docUrl = this.options.url || this.doc.URL;
829
+ if (!docUrl)
1055
830
  return;
831
+ // Respect <base href> for relative URL resolution, matching browser behavior
832
+ let baseUrl = docUrl;
833
+ const baseEl = this.doc.querySelector('base[href]');
834
+ if (baseEl) {
835
+ const baseHref = baseEl.getAttribute('href');
836
+ if (baseHref) {
837
+ try {
838
+ baseUrl = new URL(baseHref, docUrl).href;
839
+ }
840
+ catch {
841
+ // Invalid base href, fall back to document URL
842
+ }
843
+ }
844
+ }
1056
845
  const resolve = (url) => {
1057
846
  // Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
1058
847
  // Normalize these before URL resolution.
@@ -1060,6 +849,9 @@ class Defuddle {
1060
849
  .trim()
1061
850
  .replace(/^\\?["']+/, '')
1062
851
  .replace(/\\?["']+$/, '');
852
+ // Fragment-only hrefs reference anchors within the same document — keep them relative.
853
+ if (normalized.startsWith('#'))
854
+ return normalized;
1063
855
  try {
1064
856
  return new URL(normalized, baseUrl).href;
1065
857
  }
@@ -1330,7 +1122,7 @@ class Defuddle {
1330
1122
  return {
1331
1123
  content: contentHtml,
1332
1124
  title: extracted.variables?.title || metadata.title,
1333
- description: metadata.description,
1125
+ description: extracted.variables?.description || metadata.description,
1334
1126
  domain: metadata.domain,
1335
1127
  favicon: metadata.favicon,
1336
1128
  image: metadata.image,
@@ -1363,207 +1155,6 @@ class Defuddle {
1363
1155
  }
1364
1156
  return hasCustom ? custom : undefined;
1365
1157
  }
1366
- /**
1367
- * Content-based pattern removal for elements that can't be detected by
1368
- * CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
1369
- */
1370
- removeByContentPattern(mainContent, debugRemovals) {
1371
- // Remove read time metadata (e.g. "Mar 4th 2026 | 3 min read")
1372
- // Only removes leaf elements whose text is PURELY date + read time,
1373
- // not mixed with other meaningful content like tag names.
1374
- const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time'));
1375
- for (const el of candidates) {
1376
- if (!el.parentNode)
1377
- continue;
1378
- if (el.closest('pre') || el.closest('code'))
1379
- continue;
1380
- const text = el.textContent?.trim() || '';
1381
- const words = (0, utils_1.countWords)(text);
1382
- // Match date + read time in short elements
1383
- if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
1384
- // Ensure this is a leaf-ish element, not a large container
1385
- if (el.querySelectorAll('p, div, section, article').length === 0) {
1386
- // Verify the text is ONLY date + read time metadata
1387
- // by stripping all date/time words and checking nothing remains
1388
- let cleaned = text;
1389
- for (const pattern of METADATA_STRIP_PATTERNS) {
1390
- cleaned = cleaned.replace(pattern, '');
1391
- }
1392
- if (cleaned.trim().length > 0)
1393
- continue;
1394
- if (this.debug && debugRemovals) {
1395
- debugRemovals.push({
1396
- step: 'removeByContentPattern',
1397
- reason: 'read time metadata',
1398
- text: (0, utils_1.textPreview)(el)
1399
- });
1400
- }
1401
- el.remove();
1402
- }
1403
- }
1404
- }
1405
- // Remove standalone time/date elements near the start or end of content.
1406
- // A <time> in its own paragraph at the boundary is metadata (publish date),
1407
- // but <time> inline within prose should be preserved (see issue #136).
1408
- const timeElements = Array.from(mainContent.querySelectorAll('time'));
1409
- const contentText = mainContent.textContent || '';
1410
- for (const time of timeElements) {
1411
- if (!time.parentNode)
1412
- continue;
1413
- // Walk up through inline/formatting wrappers only (i, em, span, b, strong)
1414
- // Stop at block elements to avoid removing containers with other content.
1415
- let target = time;
1416
- let targetText = target.textContent?.trim() || '';
1417
- while (target.parentElement && target.parentElement !== mainContent) {
1418
- const parentTag = target.parentElement.tagName.toLowerCase();
1419
- const parentText = target.parentElement.textContent?.trim() || '';
1420
- // If parent is a <p> that only wraps this time, include it
1421
- if (parentTag === 'p' && parentText === targetText) {
1422
- target = target.parentElement;
1423
- break;
1424
- }
1425
- // Only walk through inline formatting wrappers
1426
- if (['i', 'em', 'span', 'b', 'strong', 'small'].includes(parentTag) &&
1427
- parentText === targetText) {
1428
- target = target.parentElement;
1429
- targetText = parentText;
1430
- continue;
1431
- }
1432
- break;
1433
- }
1434
- const text = target.textContent?.trim() || '';
1435
- const words = (0, utils_1.countWords)(text);
1436
- if (words > 10)
1437
- continue;
1438
- // Check if this element is near the start or end of mainContent
1439
- const pos = contentText.indexOf(text);
1440
- const distFromEnd = contentText.length - (pos + text.length);
1441
- if (pos > 200 && distFromEnd > 200)
1442
- continue;
1443
- if (this.debug && debugRemovals) {
1444
- debugRemovals.push({
1445
- step: 'removeByContentPattern',
1446
- reason: 'boundary date element',
1447
- text: (0, utils_1.textPreview)(target)
1448
- });
1449
- }
1450
- target.remove();
1451
- }
1452
- // Remove section breadcrumbs
1453
- // Short elements containing a link to a parent section of the current URL.
1454
- const url = this.options.url || this.doc.URL || '';
1455
- let urlPath = '';
1456
- try {
1457
- urlPath = new URL(url).pathname;
1458
- }
1459
- catch { }
1460
- if (urlPath) {
1461
- const shortElements = mainContent.querySelectorAll('div, span, p');
1462
- for (const el of shortElements) {
1463
- if (!el.parentNode)
1464
- continue;
1465
- const text = el.textContent?.trim() || '';
1466
- const words = (0, utils_1.countWords)(text);
1467
- if (words > 10)
1468
- continue;
1469
- // Must be a leaf-ish element (no block children)
1470
- if (el.querySelectorAll('p, div, section, article').length > 0)
1471
- continue;
1472
- const link = el.querySelector('a[href]');
1473
- if (!link)
1474
- continue;
1475
- try {
1476
- const linkPath = new URL(link.getAttribute('href') || '', url).pathname;
1477
- if (linkPath !== '/' && linkPath !== urlPath && urlPath.startsWith(linkPath)) {
1478
- if (this.debug && debugRemovals) {
1479
- debugRemovals.push({
1480
- step: 'removeByContentPattern',
1481
- reason: 'section breadcrumb',
1482
- text: (0, utils_1.textPreview)(el)
1483
- });
1484
- }
1485
- el.remove();
1486
- }
1487
- }
1488
- catch { }
1489
- }
1490
- }
1491
- // Remove boilerplate sentences and trailing non-content.
1492
- // Search elements for end-of-article boilerplate, then truncate
1493
- // from the best ancestor that has siblings to remove.
1494
- const fullText = mainContent.textContent || '';
1495
- const boilerplateElements = mainContent.querySelectorAll('p, div, span, section');
1496
- for (const el of boilerplateElements) {
1497
- if (!el.parentNode)
1498
- continue;
1499
- const text = el.textContent?.trim() || '';
1500
- const words = (0, utils_1.countWords)(text);
1501
- if (words > 50 || words < 3)
1502
- continue;
1503
- for (const pattern of BOILERPLATE_PATTERNS) {
1504
- if (pattern.test(text)) {
1505
- // Walk up to find an ancestor that has next siblings to truncate.
1506
- // Don't walk all the way to mainContent's direct child — if there's
1507
- // a single wrapper div, that would remove everything.
1508
- let target = el;
1509
- while (target.parentElement && target.parentElement !== mainContent) {
1510
- if (target.nextElementSibling)
1511
- break;
1512
- target = target.parentElement;
1513
- }
1514
- // Only truncate if there's substantial content before the boilerplate
1515
- const targetText = target.textContent || '';
1516
- const targetPos = fullText.indexOf(targetText);
1517
- if (targetPos < 200)
1518
- continue;
1519
- // Collect ancestors before modifying the DOM
1520
- const ancestors = [];
1521
- let anc = target.parentElement;
1522
- while (anc && anc !== mainContent) {
1523
- ancestors.push(anc);
1524
- anc = anc.parentElement;
1525
- }
1526
- // Remove target element and its following siblings
1527
- this.removeTrailingSiblings(target, true, debugRemovals);
1528
- // Cascade upward: remove following siblings at each
1529
- // ancestor level too. Everything after the boilerplate
1530
- // in document order is non-content.
1531
- for (const ancestor of ancestors) {
1532
- this.removeTrailingSiblings(ancestor, false, debugRemovals);
1533
- }
1534
- return;
1535
- }
1536
- }
1537
- }
1538
- }
1539
- /**
1540
- * Remove an element's following siblings, and optionally the element itself.
1541
- */
1542
- removeTrailingSiblings(element, removeSelf, debugRemovals) {
1543
- let sibling = element.nextElementSibling;
1544
- while (sibling) {
1545
- const next = sibling.nextElementSibling;
1546
- if (this.debug && debugRemovals) {
1547
- debugRemovals.push({
1548
- step: 'removeByContentPattern',
1549
- reason: 'trailing non-content',
1550
- text: (0, utils_1.textPreview)(sibling)
1551
- });
1552
- }
1553
- sibling.remove();
1554
- sibling = next;
1555
- }
1556
- if (removeSelf) {
1557
- if (this.debug && debugRemovals) {
1558
- debugRemovals.push({
1559
- step: 'removeByContentPattern',
1560
- reason: 'boilerplate text',
1561
- text: (0, utils_1.textPreview)(element)
1562
- });
1563
- }
1564
- element.remove();
1565
- }
1566
- }
1567
1158
  }
1568
1159
  exports.Defuddle = Defuddle;
1569
1160
  //# sourceMappingURL=defuddle.js.map