defuddle 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/README.md +33 -0
  2. package/dist/cli.js +47 -10
  3. package/dist/cli.js.map +1 -1
  4. package/dist/constants.d.ts +2 -0
  5. package/dist/constants.js +29 -2
  6. package/dist/constants.js.map +1 -1
  7. package/dist/defuddle.d.ts +6 -0
  8. package/dist/defuddle.js +287 -40
  9. package/dist/defuddle.js.map +1 -1
  10. package/dist/elements/callouts.d.ts +6 -0
  11. package/dist/elements/callouts.js +74 -0
  12. package/dist/elements/callouts.js.map +1 -0
  13. package/dist/elements/headings.d.ts +6 -0
  14. package/dist/elements/headings.js +13 -0
  15. package/dist/elements/headings.js.map +1 -1
  16. package/dist/elements/images.js +10 -1
  17. package/dist/elements/images.js.map +1 -1
  18. package/dist/elements/math.base.js +1 -4
  19. package/dist/elements/math.base.js.map +1 -1
  20. package/dist/extractor-registry.d.ts +5 -5
  21. package/dist/extractor-registry.js +8 -8
  22. package/dist/extractor-registry.js.map +1 -1
  23. package/dist/extractors/_base.d.ts +6 -1
  24. package/dist/extractors/_base.js +2 -1
  25. package/dist/extractors/_base.js.map +1 -1
  26. package/dist/extractors/github.js +3 -3
  27. package/dist/extractors/github.js.map +1 -1
  28. package/dist/extractors/hackernews.js +1 -1
  29. package/dist/extractors/hackernews.js.map +1 -1
  30. package/dist/extractors/reddit.js +7 -4
  31. package/dist/extractors/reddit.js.map +1 -1
  32. package/dist/extractors/twitter.js +3 -1
  33. package/dist/extractors/twitter.js.map +1 -1
  34. package/dist/extractors/youtube.d.ts +13 -0
  35. package/dist/extractors/youtube.js +140 -20
  36. package/dist/extractors/youtube.js.map +1 -1
  37. package/dist/fetch.d.ts +13 -0
  38. package/dist/fetch.js +181 -0
  39. package/dist/fetch.js.map +1 -0
  40. package/dist/index.full.js +1 -1
  41. package/dist/index.js +1 -1
  42. package/dist/markdown.js +76 -33
  43. package/dist/markdown.js.map +1 -1
  44. package/dist/metadata.js +1 -1
  45. package/dist/metadata.js.map +1 -1
  46. package/dist/scoring.js +11 -6
  47. package/dist/scoring.js.map +1 -1
  48. package/dist/standardize.js +24 -57
  49. package/dist/standardize.js.map +1 -1
  50. package/dist/types.d.ts +14 -0
  51. package/dist/utils/dom.d.ts +5 -0
  52. package/dist/utils/dom.js +8 -0
  53. package/dist/utils/dom.js.map +1 -1
  54. package/package.json +1 -1
package/dist/defuddle.js CHANGED
@@ -2,16 +2,20 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.Defuddle = void 0;
4
4
  const metadata_1 = require("./metadata");
5
+ const headings_1 = require("./elements/headings");
5
6
  const extractor_registry_1 = require("./extractor-registry");
6
7
  const constants_1 = require("./constants");
7
8
  const standardize_1 = require("./standardize");
8
9
  const footnotes_1 = require("./elements/footnotes");
10
+ const callouts_1 = require("./elements/callouts");
9
11
  const scoring_1 = require("./scoring");
10
12
  const utils_1 = require("./utils");
11
13
  const dom_1 = require("./utils/dom");
12
14
  /** Keys from extractor variables that map to top-level DefuddleResponse fields */
13
15
  const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
14
16
  // Content pattern detection constants
17
+ const STYLE_WIDTH_PATTERN = /width\s*:\s*(\d+)/;
18
+ const STYLE_HEIGHT_PATTERN = /height\s*:\s*(\d+)/;
15
19
  const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
16
20
  const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
17
21
  const BOILERPLATE_PATTERNS = [
@@ -143,17 +147,30 @@ class Defuddle {
143
147
  /**
144
148
  * Extract text content from schema.org data (e.g. SocialMediaPosting, Article)
145
149
  */
146
- _getSchemaText(schemaOrgData) {
147
- if (!schemaOrgData)
150
+ _getSchemaText(schemaOrgData, depth = 0) {
151
+ if (!schemaOrgData || depth > 10)
148
152
  return '';
149
153
  const items = Array.isArray(schemaOrgData) ? schemaOrgData : [schemaOrgData];
150
154
  for (const item of items) {
155
+ // Recurse into nested arrays
156
+ if (Array.isArray(item)) {
157
+ const found = this._getSchemaText(item, depth + 1);
158
+ if (found)
159
+ return found;
160
+ continue;
161
+ }
151
162
  if (item?.text && typeof item.text === 'string') {
152
163
  return item.text;
153
164
  }
154
165
  if (item?.articleBody && typeof item.articleBody === 'string') {
155
166
  return item.articleBody;
156
167
  }
168
+ // Traverse @graph arrays (common in JSON-LD with multiple entities)
169
+ if (item?.['@graph'] && Array.isArray(item['@graph'])) {
170
+ const found = this._getSchemaText(item['@graph'], depth + 1);
171
+ if (found)
172
+ return found;
173
+ }
157
174
  }
158
175
  return '';
159
176
  }
@@ -194,39 +211,43 @@ class Defuddle {
194
211
  }
195
212
  }
196
213
  /**
197
- * Find a DOM element whose text matches the schema.org text content.
198
- * Used when the content scorer picked the wrong element from a feed page.
199
- * Returns the element's inner HTML including sibling media (images, etc.)
214
+ * Find the smallest DOM element whose text contains the search phrase
215
+ * and whose word count is at least 80% of the expected count.
216
+ * Shared by _findSchemaContentElement and _findContentBySchemaText.
200
217
  */
201
- _findContentBySchemaText(schemaText) {
202
- const body = this.doc.body;
203
- if (!body)
204
- return '';
205
- // Use the first paragraph as the search phrase.
206
- // DOM textContent concatenates <p> elements without separators,
207
- // so we can't cross paragraph boundaries when matching.
218
+ _findElementBySchemaText(root, schemaText) {
208
219
  const firstPara = schemaText.split(/\n\s*\n/)[0]?.trim() || '';
209
220
  const searchPhrase = firstPara.substring(0, 100).trim();
210
221
  if (!searchPhrase)
211
- return '';
212
- const schemaWordCount = this.countHtmlWords(schemaText);
213
- // Find the smallest element whose text contains the search phrase
214
- // and whose word count is close to the schema text's word count
222
+ return null;
223
+ const schemaWordCount = (0, utils_1.countWords)(schemaText);
215
224
  let bestMatch = null;
216
225
  let bestSize = Infinity;
217
- const allElements = body.querySelectorAll('*');
226
+ const allElements = root.querySelectorAll('*');
218
227
  for (const el of allElements) {
219
- const elText = (el.textContent || '');
228
+ if (el === root)
229
+ continue;
230
+ const elText = el.textContent || '';
220
231
  if (!elText.includes(searchPhrase))
221
232
  continue;
222
233
  const elWords = (0, utils_1.countWords)(elText);
223
- // Element should contain roughly the same amount of text
224
- // (allow some slack for surrounding whitespace / minor extras)
225
234
  if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
226
235
  bestSize = elWords;
227
236
  bestMatch = el;
228
237
  }
229
238
  }
239
+ return bestMatch;
240
+ }
241
+ /**
242
+ * Find a DOM element whose text matches the schema.org text content.
243
+ * Used when the content scorer picked the wrong element from a feed page.
244
+ * Returns the element's inner HTML including sibling media (images, etc.)
245
+ */
246
+ _findContentBySchemaText(schemaText) {
247
+ const body = this.doc.body;
248
+ if (!body)
249
+ return '';
250
+ const bestMatch = this._findElementBySchemaText(body, schemaText);
230
251
  if (!bestMatch)
231
252
  return '';
232
253
  // Read the largest sibling image src BEFORE resolveRelativeUrls
@@ -260,6 +281,8 @@ class Defuddle {
260
281
  catch { }
261
282
  }
262
283
  }
284
+ // Remove heading anchor links before serialization (e.g. <h2>Title<a href="#foo">#</a></h2>)
285
+ (0, headings_1.removeHeadingAnchors)(bestMatch);
263
286
  // Now resolve URLs in the text content
264
287
  this.resolveRelativeUrls(bestMatch);
265
288
  let html = (0, dom_1.serializeHTML)(bestMatch);
@@ -353,7 +376,8 @@ class Defuddle {
353
376
  try {
354
377
  const url = this.options.url || this.doc.URL;
355
378
  const schemaOrgData = this.getSchemaOrgData();
356
- const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
379
+ const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
380
+ const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData, extractorOpts);
357
381
  if (extractor) {
358
382
  const extracted = await extractor.extractAsync();
359
383
  return this.getExtractorVariables(extracted.variables) || null;
@@ -368,7 +392,8 @@ class Defuddle {
368
392
  try {
369
393
  const url = this.options.url || this.doc.URL;
370
394
  const schemaOrgData = this.getSchemaOrgData();
371
- const extractor = finder(this.doc, url, schemaOrgData);
395
+ const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
396
+ const extractor = finder(this.doc, url, schemaOrgData, extractorOpts);
372
397
  if (extractor) {
373
398
  const startTime = Date.now();
374
399
  const extracted = await extractor.extractAsync();
@@ -414,6 +439,7 @@ class Defuddle {
414
439
  removeSmallImages: true,
415
440
  removeContentPatterns: true,
416
441
  standardize: true,
442
+ includeReplies: 'extractors',
417
443
  ...this.options,
418
444
  ...overrideOptions
419
445
  };
@@ -435,7 +461,11 @@ class Defuddle {
435
461
  try {
436
462
  // Use site-specific extractor first, if there is one
437
463
  const url = options.url || this.doc.URL;
438
- const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
464
+ const extractorOpts = {
465
+ includeReplies: options.includeReplies,
466
+ language: options.language,
467
+ };
468
+ const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData, extractorOpts);
439
469
  if (extractor && extractor.canExtract()) {
440
470
  const extracted = extractor.extract();
441
471
  return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
@@ -471,6 +501,18 @@ class Defuddle {
471
501
  if (!mainContent) {
472
502
  mainContent = this.findMainContent(clone);
473
503
  }
504
+ // If we fell back to <body>, try using schema.org articleBody/text
505
+ // to find a more specific content element within the DOM.
506
+ if (mainContent && mainContent.tagName.toLowerCase() === 'body') {
507
+ const schemaText = this._getSchemaText(schemaOrgData);
508
+ if (schemaText) {
509
+ const schemaContent = this._findElementBySchemaText(clone.body, schemaText);
510
+ if (schemaContent) {
511
+ this._log('Found content element via schema.org text');
512
+ mainContent = schemaContent;
513
+ }
514
+ }
515
+ }
474
516
  if (!mainContent) {
475
517
  const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
476
518
  const endTime = Date.now();
@@ -482,9 +524,13 @@ class Defuddle {
482
524
  metaTags: pageMetaTags
483
525
  };
484
526
  }
527
+ // Remove <wbr> elements — word break opportunity hints that carry no
528
+ // content but cause unwanted whitespace during standardization.
529
+ mainContent.querySelectorAll('wbr').forEach(el => el.remove());
485
530
  // Standardize footnotes before cleanup (CSS sidenotes use display:none)
486
531
  if (options.standardize) {
487
532
  (0, footnotes_1.standardizeFootnotes)(mainContent);
533
+ (0, callouts_1.standardizeCallouts)(mainContent);
488
534
  }
489
535
  // Remove small images
490
536
  if (options.removeSmallImages) {
@@ -494,15 +540,17 @@ class Defuddle {
494
540
  if (options.removeHiddenElements) {
495
541
  this.removeHiddenElements(clone, debugRemovals);
496
542
  }
497
- // Remove non-content blocks by scoring
498
- // Tries to find lists, navigation based on text content and link density
499
- if (options.removeLowScoring) {
500
- scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
501
- }
502
- // Remove clutter using selectors
543
+ // Remove clutter using selectors — deterministic removal of known
544
+ // non-content elements (nav, footer, .sidebar, etc.) by class/id.
545
+ // Runs before scoring so the heuristic scorer sees a cleaner DOM.
503
546
  if (options.removeExactSelectors || options.removePartialSelectors) {
504
547
  this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
505
548
  }
549
+ // Remove non-content blocks by scoring — heuristic removal based
550
+ // on link density, text ratios, and navigation indicators.
551
+ if (options.removeLowScoring) {
552
+ scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
553
+ }
506
554
  // Remove elements by content patterns (read time, boilerplate, article cards)
507
555
  if (options.removeContentPatterns && mainContent) {
508
556
  this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
@@ -556,7 +604,6 @@ class Defuddle {
556
604
  .replace(/&\w+;/g, ' ');
557
605
  return (0, utils_1.countWords)(text);
558
606
  }
559
- // Make all other methods private by removing the static keyword and using private
560
607
  _log(...args) {
561
608
  if (this.debug) {
562
609
  console.log('Defuddle:', ...args);
@@ -770,13 +817,13 @@ class Defuddle {
770
817
  // Skip code elements and elements containing code blocks
771
818
  // where class names indicate language/syntax, not page structure
772
819
  const tag = el.tagName;
773
- if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre')) {
820
+ if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre') || el.closest('code, pre')) {
774
821
  return;
775
822
  }
776
823
  // Get all relevant attributes and combine into a single string
777
824
  const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
778
825
  if (attr === 'class') {
779
- return el.className && typeof el.className === 'string' ? el.className : '';
826
+ return (0, dom_1.getClassName)(el);
780
827
  }
781
828
  if (attr === 'id') {
782
829
  return el.id || '';
@@ -850,8 +897,8 @@ class Defuddle {
850
897
  const attrHeight = parseInt(element.getAttribute('height') || '0');
851
898
  // Check inline style dimensions
852
899
  const style = element.getAttribute('style') || '';
853
- const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
854
- const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
900
+ const styleWidth = parseInt(style.match(STYLE_WIDTH_PATTERN)?.[1] || '0');
901
+ const styleHeight = parseInt(style.match(STYLE_HEIGHT_PATTERN)?.[1] || '0');
855
902
  // Use getComputedStyle and getBoundingClientRect only in browser
856
903
  let computedWidth = 0, computedHeight = 0;
857
904
  if (isBrowser) {
@@ -919,7 +966,7 @@ class Defuddle {
919
966
  return `srcset:${dataSrcset}`;
920
967
  }
921
968
  const id = element.id || '';
922
- const className = element.className || '';
969
+ const className = (0, dom_1.getClassName)(element);
923
970
  const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
924
971
  if (id)
925
972
  return `id:${id}`;
@@ -1035,8 +1082,8 @@ class Defuddle {
1035
1082
  if (current.id) {
1036
1083
  selector += '#' + current.id;
1037
1084
  }
1038
- else if (current.className && typeof current.className === 'string') {
1039
- selector += '.' + current.className.trim().split(/\s+/).join('.');
1085
+ else if ((0, dom_1.getClassName)(current)) {
1086
+ selector += '.' + (0, dom_1.getClassName)(current).trim().split(/\s+/).join('.');
1040
1087
  }
1041
1088
  parts.unshift(selector);
1042
1089
  current = current.parentElement;
@@ -1050,9 +1097,23 @@ class Defuddle {
1050
1097
  * Resolve relative URLs to absolute within a DOM element
1051
1098
  */
1052
1099
  resolveRelativeUrls(element) {
1053
- const baseUrl = this.options.url || this.doc.URL;
1054
- if (!baseUrl)
1100
+ const docUrl = this.options.url || this.doc.URL;
1101
+ if (!docUrl)
1055
1102
  return;
1103
+ // Respect <base href> for relative URL resolution, matching browser behavior
1104
+ let baseUrl = docUrl;
1105
+ const baseEl = this.doc.querySelector('base[href]');
1106
+ if (baseEl) {
1107
+ const baseHref = baseEl.getAttribute('href');
1108
+ if (baseHref) {
1109
+ try {
1110
+ baseUrl = new URL(baseHref, docUrl).href;
1111
+ }
1112
+ catch {
1113
+ // Invalid base href, fall back to document URL
1114
+ }
1115
+ }
1116
+ }
1056
1117
  const resolve = (url) => {
1057
1118
  // Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
1058
1119
  // Normalize these before URL resolution.
@@ -1449,12 +1510,78 @@ class Defuddle {
1449
1510
  }
1450
1511
  target.remove();
1451
1512
  }
1513
+ // Remove blog post metadata lists near content boundaries.
1514
+ // These are short <ul>/<ol> elements where every item is a brief
1515
+ // label + value pair (date, reading time, share, etc.) with no
1516
+ // prose sentences. Detected structurally: all items are very short,
1517
+ // none contain sentence-ending punctuation, and the total text is minimal.
1518
+ const metadataLists = mainContent.querySelectorAll('ul, ol');
1519
+ for (const list of metadataLists) {
1520
+ if (!list.parentNode)
1521
+ continue;
1522
+ const items = Array.from(list.children).filter(el => el.tagName === 'LI');
1523
+ if (items.length < 2 || items.length > 8)
1524
+ continue;
1525
+ // Must be near the start or end of content
1526
+ const listText = list.textContent?.trim() || '';
1527
+ const listPos = contentText.indexOf(listText);
1528
+ const distFromEnd = contentText.length - (listPos + listText.length);
1529
+ if (listPos > 500 && distFromEnd > 500)
1530
+ continue;
1531
+ // Skip lists introduced by a preceding paragraph (e.g. "Features include:")
1532
+ // — those are content lists, not standalone metadata
1533
+ const prevSibling = list.previousElementSibling;
1534
+ if (prevSibling) {
1535
+ const prevText = prevSibling.textContent?.trim() || '';
1536
+ if (prevText.endsWith(':'))
1537
+ continue;
1538
+ }
1539
+ // Every item must be very short (label + value) with no prose
1540
+ let isMetadata = true;
1541
+ for (const item of items) {
1542
+ const text = item.textContent?.trim() || '';
1543
+ const words = (0, utils_1.countWords)(text);
1544
+ if (words > 8) {
1545
+ isMetadata = false;
1546
+ break;
1547
+ }
1548
+ // Prose has sentence-ending punctuation; metadata doesn't
1549
+ if (/[.!?]$/.test(text)) {
1550
+ isMetadata = false;
1551
+ break;
1552
+ }
1553
+ }
1554
+ if (!isMetadata)
1555
+ continue;
1556
+ // Total text should be very short — this is metadata, not content
1557
+ if ((0, utils_1.countWords)(listText) > 30)
1558
+ continue;
1559
+ // Walk up to find the container to remove (e.g. a wrapper div)
1560
+ let target = list;
1561
+ while (target.parentElement && target.parentElement !== mainContent) {
1562
+ const parentText = target.parentElement.textContent?.trim() || '';
1563
+ if (parentText !== listText)
1564
+ break;
1565
+ target = target.parentElement;
1566
+ }
1567
+ if (this.debug && debugRemovals) {
1568
+ debugRemovals.push({
1569
+ step: 'removeByContentPattern',
1570
+ reason: 'blog metadata list',
1571
+ text: (0, utils_1.textPreview)(target)
1572
+ });
1573
+ }
1574
+ target.remove();
1575
+ }
1452
1576
  // Remove section breadcrumbs
1453
1577
  // Short elements containing a link to a parent section of the current URL.
1454
1578
  const url = this.options.url || this.doc.URL || '';
1455
1579
  let urlPath = '';
1580
+ let pageHost = '';
1456
1581
  try {
1457
- urlPath = new URL(url).pathname;
1582
+ const parsedUrl = new URL(url);
1583
+ urlPath = parsedUrl.pathname;
1584
+ pageHost = parsedUrl.hostname.replace(/^www\./, '');
1458
1585
  }
1459
1586
  catch { }
1460
1587
  if (urlPath) {
@@ -1488,6 +1615,126 @@ class Defuddle {
1488
1615
  catch { }
1489
1616
  }
1490
1617
  }
1618
+ // Remove trailing external link lists — a heading + list of purely
1619
+ // off-site links as the last content block (affiliate picks, product
1620
+ // roundups, etc.). Only removed when nothing meaningful follows.
1621
+ if (pageHost) {
1622
+ const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6');
1623
+ for (const heading of headings) {
1624
+ if (!heading.parentNode)
1625
+ continue;
1626
+ const list = heading.nextElementSibling;
1627
+ if (!list || (list.tagName !== 'UL' && list.tagName !== 'OL'))
1628
+ continue;
1629
+ const items = Array.from(list.children).filter(el => el.tagName === 'LI');
1630
+ if (items.length < 2)
1631
+ continue;
1632
+ // The list must be the last meaningful block — nothing after it
1633
+ // except whitespace or empty elements. Walk up through ancestors
1634
+ // to check siblings at each level up to mainContent.
1635
+ let trailingContent = false;
1636
+ let checkEl = list;
1637
+ while (checkEl && checkEl !== mainContent) {
1638
+ let sibling = checkEl.nextElementSibling;
1639
+ while (sibling) {
1640
+ if ((sibling.textContent?.trim() || '').length > 0) {
1641
+ trailingContent = true;
1642
+ break;
1643
+ }
1644
+ sibling = sibling.nextElementSibling;
1645
+ }
1646
+ if (trailingContent)
1647
+ break;
1648
+ checkEl = checkEl.parentElement;
1649
+ }
1650
+ if (trailingContent)
1651
+ continue;
1652
+ // Every list item must be primarily a link pointing off-site
1653
+ let allExternalLinks = true;
1654
+ for (const item of items) {
1655
+ const links = item.querySelectorAll('a[href]');
1656
+ if (links.length === 0) {
1657
+ allExternalLinks = false;
1658
+ break;
1659
+ }
1660
+ const itemText = item.textContent?.trim() || '';
1661
+ let linkTextLen = 0;
1662
+ for (const link of links) {
1663
+ linkTextLen += (link.textContent?.trim() || '').length;
1664
+ try {
1665
+ const linkHost = new URL(link.getAttribute('href') || '', url).hostname.replace(/^www\./, '');
1666
+ if (linkHost === pageHost) {
1667
+ allExternalLinks = false;
1668
+ break;
1669
+ }
1670
+ }
1671
+ catch { }
1672
+ }
1673
+ if (!allExternalLinks)
1674
+ break;
1675
+ if (linkTextLen < itemText.length * 0.6) {
1676
+ allExternalLinks = false;
1677
+ break;
1678
+ }
1679
+ }
1680
+ if (!allExternalLinks)
1681
+ continue;
1682
+ if (this.debug && debugRemovals) {
1683
+ debugRemovals.push({
1684
+ step: 'removeByContentPattern',
1685
+ reason: 'trailing external link list',
1686
+ text: (0, utils_1.textPreview)(heading)
1687
+ });
1688
+ debugRemovals.push({
1689
+ step: 'removeByContentPattern',
1690
+ reason: 'trailing external link list',
1691
+ text: (0, utils_1.textPreview)(list)
1692
+ });
1693
+ }
1694
+ list.remove();
1695
+ heading.remove();
1696
+ }
1697
+ }
1698
+ // Remove trailing thin sections — the last few direct children of
1699
+ // mainContent that contain a heading but very little prose. These are
1700
+ // typically CTAs, newsletter prompts, or promotional sections that
1701
+ // have been partially stripped by prior removal steps.
1702
+ const totalWords = (0, utils_1.countWords)(mainContent.textContent || '');
1703
+ if (totalWords > 300) {
1704
+ // Walk backwards from the last direct child of mainContent,
1705
+ // collecting trailing elements that are thin (empty or very short prose).
1706
+ // Exclude SVG text (path data) from word counts — it's not prose.
1707
+ const trailingEls = [];
1708
+ let trailingWords = 0;
1709
+ let child = mainContent.lastElementChild;
1710
+ while (child) {
1711
+ // Count prose words, excluding SVG path data which inflates word counts
1712
+ let svgWords = 0;
1713
+ for (const svg of child.querySelectorAll('svg')) {
1714
+ svgWords += (0, utils_1.countWords)(svg.textContent || '');
1715
+ }
1716
+ const words = (0, utils_1.countWords)(child.textContent?.trim() || '') - svgWords;
1717
+ if (words > 25)
1718
+ break;
1719
+ trailingWords += words;
1720
+ trailingEls.push(child);
1721
+ child = child.previousElementSibling;
1722
+ }
1723
+ // Must have a heading in the trailing elements and total < 15% of content.
1724
+ // Skip if trailing elements contain content indicators (math, code, tables, images).
1725
+ if (trailingEls.length >= 1 && trailingWords < totalWords * 0.15) {
1726
+ const hasHeading = trailingEls.some(el => /^H[1-6]$/.test(el.tagName) || el.querySelector('h1, h2, h3, h4, h5, h6'));
1727
+ const hasContent = trailingEls.some(el => el.querySelector(constants_1.CONTENT_ELEMENT_SELECTOR));
1728
+ if (hasHeading && !hasContent) {
1729
+ for (const el of trailingEls) {
1730
+ if (this.debug && debugRemovals) {
1731
+ debugRemovals.push({ step: 'removeByContentPattern', reason: 'trailing thin section', text: (0, utils_1.textPreview)(el) });
1732
+ }
1733
+ el.remove();
1734
+ }
1735
+ }
1736
+ }
1737
+ }
1491
1738
  // Remove boilerplate sentences and trailing non-content.
1492
1739
  // Search elements for end-of-article boilerplate, then truncate
1493
1740
  // from the best ancestor that has siblings to remove.