defuddle 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -0
- package/dist/cli.js +47 -10
- package/dist/cli.js.map +1 -1
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +29 -2
- package/dist/constants.js.map +1 -1
- package/dist/defuddle.d.ts +6 -0
- package/dist/defuddle.js +287 -40
- package/dist/defuddle.js.map +1 -1
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/headings.d.ts +6 -0
- package/dist/elements/headings.js +13 -0
- package/dist/elements/headings.js.map +1 -1
- package/dist/elements/images.js +10 -1
- package/dist/elements/images.js.map +1 -1
- package/dist/elements/math.base.js +1 -4
- package/dist/elements/math.base.js.map +1 -1
- package/dist/extractor-registry.d.ts +5 -5
- package/dist/extractor-registry.js +8 -8
- package/dist/extractor-registry.js.map +1 -1
- package/dist/extractors/_base.d.ts +6 -1
- package/dist/extractors/_base.js +2 -1
- package/dist/extractors/_base.js.map +1 -1
- package/dist/extractors/github.js +3 -3
- package/dist/extractors/github.js.map +1 -1
- package/dist/extractors/hackernews.js +1 -1
- package/dist/extractors/hackernews.js.map +1 -1
- package/dist/extractors/reddit.js +7 -4
- package/dist/extractors/reddit.js.map +1 -1
- package/dist/extractors/twitter.js +3 -1
- package/dist/extractors/twitter.js.map +1 -1
- package/dist/extractors/youtube.d.ts +13 -0
- package/dist/extractors/youtube.js +140 -20
- package/dist/extractors/youtube.js.map +1 -1
- package/dist/fetch.d.ts +13 -0
- package/dist/fetch.js +181 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/markdown.js +76 -33
- package/dist/markdown.js.map +1 -1
- package/dist/metadata.js +1 -1
- package/dist/metadata.js.map +1 -1
- package/dist/scoring.js +11 -6
- package/dist/scoring.js.map +1 -1
- package/dist/standardize.js +24 -57
- package/dist/standardize.js.map +1 -1
- package/dist/types.d.ts +14 -0
- package/dist/utils/dom.d.ts +5 -0
- package/dist/utils/dom.js +8 -0
- package/dist/utils/dom.js.map +1 -1
- package/package.json +1 -1
package/dist/defuddle.js
CHANGED
|
@@ -2,16 +2,20 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.Defuddle = void 0;
|
|
4
4
|
const metadata_1 = require("./metadata");
|
|
5
|
+
const headings_1 = require("./elements/headings");
|
|
5
6
|
const extractor_registry_1 = require("./extractor-registry");
|
|
6
7
|
const constants_1 = require("./constants");
|
|
7
8
|
const standardize_1 = require("./standardize");
|
|
8
9
|
const footnotes_1 = require("./elements/footnotes");
|
|
10
|
+
const callouts_1 = require("./elements/callouts");
|
|
9
11
|
const scoring_1 = require("./scoring");
|
|
10
12
|
const utils_1 = require("./utils");
|
|
11
13
|
const dom_1 = require("./utils/dom");
|
|
12
14
|
/** Keys from extractor variables that map to top-level DefuddleResponse fields */
|
|
13
15
|
const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
|
|
14
16
|
// Content pattern detection constants
|
|
17
|
+
const STYLE_WIDTH_PATTERN = /width\s*:\s*(\d+)/;
|
|
18
|
+
const STYLE_HEIGHT_PATTERN = /height\s*:\s*(\d+)/;
|
|
15
19
|
const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
|
|
16
20
|
const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
|
|
17
21
|
const BOILERPLATE_PATTERNS = [
|
|
@@ -143,17 +147,30 @@ class Defuddle {
|
|
|
143
147
|
/**
|
|
144
148
|
* Extract text content from schema.org data (e.g. SocialMediaPosting, Article)
|
|
145
149
|
*/
|
|
146
|
-
_getSchemaText(schemaOrgData) {
|
|
147
|
-
if (!schemaOrgData)
|
|
150
|
+
_getSchemaText(schemaOrgData, depth = 0) {
|
|
151
|
+
if (!schemaOrgData || depth > 10)
|
|
148
152
|
return '';
|
|
149
153
|
const items = Array.isArray(schemaOrgData) ? schemaOrgData : [schemaOrgData];
|
|
150
154
|
for (const item of items) {
|
|
155
|
+
// Recurse into nested arrays
|
|
156
|
+
if (Array.isArray(item)) {
|
|
157
|
+
const found = this._getSchemaText(item, depth + 1);
|
|
158
|
+
if (found)
|
|
159
|
+
return found;
|
|
160
|
+
continue;
|
|
161
|
+
}
|
|
151
162
|
if (item?.text && typeof item.text === 'string') {
|
|
152
163
|
return item.text;
|
|
153
164
|
}
|
|
154
165
|
if (item?.articleBody && typeof item.articleBody === 'string') {
|
|
155
166
|
return item.articleBody;
|
|
156
167
|
}
|
|
168
|
+
// Traverse @graph arrays (common in JSON-LD with multiple entities)
|
|
169
|
+
if (item?.['@graph'] && Array.isArray(item['@graph'])) {
|
|
170
|
+
const found = this._getSchemaText(item['@graph'], depth + 1);
|
|
171
|
+
if (found)
|
|
172
|
+
return found;
|
|
173
|
+
}
|
|
157
174
|
}
|
|
158
175
|
return '';
|
|
159
176
|
}
|
|
@@ -194,39 +211,43 @@ class Defuddle {
|
|
|
194
211
|
}
|
|
195
212
|
}
|
|
196
213
|
/**
|
|
197
|
-
* Find
|
|
198
|
-
*
|
|
199
|
-
*
|
|
214
|
+
* Find the smallest DOM element whose text contains the search phrase
|
|
215
|
+
* and whose word count is at least 80% of the expected count.
|
|
216
|
+
* Shared by _findSchemaContentElement and _findContentBySchemaText.
|
|
200
217
|
*/
|
|
201
|
-
|
|
202
|
-
const body = this.doc.body;
|
|
203
|
-
if (!body)
|
|
204
|
-
return '';
|
|
205
|
-
// Use the first paragraph as the search phrase.
|
|
206
|
-
// DOM textContent concatenates <p> elements without separators,
|
|
207
|
-
// so we can't cross paragraph boundaries when matching.
|
|
218
|
+
_findElementBySchemaText(root, schemaText) {
|
|
208
219
|
const firstPara = schemaText.split(/\n\s*\n/)[0]?.trim() || '';
|
|
209
220
|
const searchPhrase = firstPara.substring(0, 100).trim();
|
|
210
221
|
if (!searchPhrase)
|
|
211
|
-
return
|
|
212
|
-
const schemaWordCount =
|
|
213
|
-
// Find the smallest element whose text contains the search phrase
|
|
214
|
-
// and whose word count is close to the schema text's word count
|
|
222
|
+
return null;
|
|
223
|
+
const schemaWordCount = (0, utils_1.countWords)(schemaText);
|
|
215
224
|
let bestMatch = null;
|
|
216
225
|
let bestSize = Infinity;
|
|
217
|
-
const allElements =
|
|
226
|
+
const allElements = root.querySelectorAll('*');
|
|
218
227
|
for (const el of allElements) {
|
|
219
|
-
|
|
228
|
+
if (el === root)
|
|
229
|
+
continue;
|
|
230
|
+
const elText = el.textContent || '';
|
|
220
231
|
if (!elText.includes(searchPhrase))
|
|
221
232
|
continue;
|
|
222
233
|
const elWords = (0, utils_1.countWords)(elText);
|
|
223
|
-
// Element should contain roughly the same amount of text
|
|
224
|
-
// (allow some slack for surrounding whitespace / minor extras)
|
|
225
234
|
if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
|
|
226
235
|
bestSize = elWords;
|
|
227
236
|
bestMatch = el;
|
|
228
237
|
}
|
|
229
238
|
}
|
|
239
|
+
return bestMatch;
|
|
240
|
+
}
|
|
241
|
+
/**
|
|
242
|
+
* Find a DOM element whose text matches the schema.org text content.
|
|
243
|
+
* Used when the content scorer picked the wrong element from a feed page.
|
|
244
|
+
* Returns the element's inner HTML including sibling media (images, etc.)
|
|
245
|
+
*/
|
|
246
|
+
_findContentBySchemaText(schemaText) {
|
|
247
|
+
const body = this.doc.body;
|
|
248
|
+
if (!body)
|
|
249
|
+
return '';
|
|
250
|
+
const bestMatch = this._findElementBySchemaText(body, schemaText);
|
|
230
251
|
if (!bestMatch)
|
|
231
252
|
return '';
|
|
232
253
|
// Read the largest sibling image src BEFORE resolveRelativeUrls
|
|
@@ -260,6 +281,8 @@ class Defuddle {
|
|
|
260
281
|
catch { }
|
|
261
282
|
}
|
|
262
283
|
}
|
|
284
|
+
// Remove heading anchor links before serialization (e.g. <h2>Title<a href="#foo">#</a></h2>)
|
|
285
|
+
(0, headings_1.removeHeadingAnchors)(bestMatch);
|
|
263
286
|
// Now resolve URLs in the text content
|
|
264
287
|
this.resolveRelativeUrls(bestMatch);
|
|
265
288
|
let html = (0, dom_1.serializeHTML)(bestMatch);
|
|
@@ -353,7 +376,8 @@ class Defuddle {
|
|
|
353
376
|
try {
|
|
354
377
|
const url = this.options.url || this.doc.URL;
|
|
355
378
|
const schemaOrgData = this.getSchemaOrgData();
|
|
356
|
-
const
|
|
379
|
+
const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
|
|
380
|
+
const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData, extractorOpts);
|
|
357
381
|
if (extractor) {
|
|
358
382
|
const extracted = await extractor.extractAsync();
|
|
359
383
|
return this.getExtractorVariables(extracted.variables) || null;
|
|
@@ -368,7 +392,8 @@ class Defuddle {
|
|
|
368
392
|
try {
|
|
369
393
|
const url = this.options.url || this.doc.URL;
|
|
370
394
|
const schemaOrgData = this.getSchemaOrgData();
|
|
371
|
-
const
|
|
395
|
+
const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
|
|
396
|
+
const extractor = finder(this.doc, url, schemaOrgData, extractorOpts);
|
|
372
397
|
if (extractor) {
|
|
373
398
|
const startTime = Date.now();
|
|
374
399
|
const extracted = await extractor.extractAsync();
|
|
@@ -414,6 +439,7 @@ class Defuddle {
|
|
|
414
439
|
removeSmallImages: true,
|
|
415
440
|
removeContentPatterns: true,
|
|
416
441
|
standardize: true,
|
|
442
|
+
includeReplies: 'extractors',
|
|
417
443
|
...this.options,
|
|
418
444
|
...overrideOptions
|
|
419
445
|
};
|
|
@@ -435,7 +461,11 @@ class Defuddle {
|
|
|
435
461
|
try {
|
|
436
462
|
// Use site-specific extractor first, if there is one
|
|
437
463
|
const url = options.url || this.doc.URL;
|
|
438
|
-
const
|
|
464
|
+
const extractorOpts = {
|
|
465
|
+
includeReplies: options.includeReplies,
|
|
466
|
+
language: options.language,
|
|
467
|
+
};
|
|
468
|
+
const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData, extractorOpts);
|
|
439
469
|
if (extractor && extractor.canExtract()) {
|
|
440
470
|
const extracted = extractor.extract();
|
|
441
471
|
return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
|
|
@@ -471,6 +501,18 @@ class Defuddle {
|
|
|
471
501
|
if (!mainContent) {
|
|
472
502
|
mainContent = this.findMainContent(clone);
|
|
473
503
|
}
|
|
504
|
+
// If we fell back to <body>, try using schema.org articleBody/text
|
|
505
|
+
// to find a more specific content element within the DOM.
|
|
506
|
+
if (mainContent && mainContent.tagName.toLowerCase() === 'body') {
|
|
507
|
+
const schemaText = this._getSchemaText(schemaOrgData);
|
|
508
|
+
if (schemaText) {
|
|
509
|
+
const schemaContent = this._findElementBySchemaText(clone.body, schemaText);
|
|
510
|
+
if (schemaContent) {
|
|
511
|
+
this._log('Found content element via schema.org text');
|
|
512
|
+
mainContent = schemaContent;
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
}
|
|
474
516
|
if (!mainContent) {
|
|
475
517
|
const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
|
|
476
518
|
const endTime = Date.now();
|
|
@@ -482,9 +524,13 @@ class Defuddle {
|
|
|
482
524
|
metaTags: pageMetaTags
|
|
483
525
|
};
|
|
484
526
|
}
|
|
527
|
+
// Remove <wbr> elements — word break opportunity hints that carry no
|
|
528
|
+
// content but cause unwanted whitespace during standardization.
|
|
529
|
+
mainContent.querySelectorAll('wbr').forEach(el => el.remove());
|
|
485
530
|
// Standardize footnotes before cleanup (CSS sidenotes use display:none)
|
|
486
531
|
if (options.standardize) {
|
|
487
532
|
(0, footnotes_1.standardizeFootnotes)(mainContent);
|
|
533
|
+
(0, callouts_1.standardizeCallouts)(mainContent);
|
|
488
534
|
}
|
|
489
535
|
// Remove small images
|
|
490
536
|
if (options.removeSmallImages) {
|
|
@@ -494,15 +540,17 @@ class Defuddle {
|
|
|
494
540
|
if (options.removeHiddenElements) {
|
|
495
541
|
this.removeHiddenElements(clone, debugRemovals);
|
|
496
542
|
}
|
|
497
|
-
// Remove
|
|
498
|
-
//
|
|
499
|
-
|
|
500
|
-
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
|
|
501
|
-
}
|
|
502
|
-
// Remove clutter using selectors
|
|
543
|
+
// Remove clutter using selectors — deterministic removal of known
|
|
544
|
+
// non-content elements (nav, footer, .sidebar, etc.) by class/id.
|
|
545
|
+
// Runs before scoring so the heuristic scorer sees a cleaner DOM.
|
|
503
546
|
if (options.removeExactSelectors || options.removePartialSelectors) {
|
|
504
547
|
this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
|
|
505
548
|
}
|
|
549
|
+
// Remove non-content blocks by scoring — heuristic removal based
|
|
550
|
+
// on link density, text ratios, and navigation indicators.
|
|
551
|
+
if (options.removeLowScoring) {
|
|
552
|
+
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
|
|
553
|
+
}
|
|
506
554
|
// Remove elements by content patterns (read time, boilerplate, article cards)
|
|
507
555
|
if (options.removeContentPatterns && mainContent) {
|
|
508
556
|
this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
|
|
@@ -556,7 +604,6 @@ class Defuddle {
|
|
|
556
604
|
.replace(/&\w+;/g, ' ');
|
|
557
605
|
return (0, utils_1.countWords)(text);
|
|
558
606
|
}
|
|
559
|
-
// Make all other methods private by removing the static keyword and using private
|
|
560
607
|
_log(...args) {
|
|
561
608
|
if (this.debug) {
|
|
562
609
|
console.log('Defuddle:', ...args);
|
|
@@ -770,13 +817,13 @@ class Defuddle {
|
|
|
770
817
|
// Skip code elements and elements containing code blocks
|
|
771
818
|
// where class names indicate language/syntax, not page structure
|
|
772
819
|
const tag = el.tagName;
|
|
773
|
-
if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre')) {
|
|
820
|
+
if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre') || el.closest('code, pre')) {
|
|
774
821
|
return;
|
|
775
822
|
}
|
|
776
823
|
// Get all relevant attributes and combine into a single string
|
|
777
824
|
const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
|
|
778
825
|
if (attr === 'class') {
|
|
779
|
-
return
|
|
826
|
+
return (0, dom_1.getClassName)(el);
|
|
780
827
|
}
|
|
781
828
|
if (attr === 'id') {
|
|
782
829
|
return el.id || '';
|
|
@@ -850,8 +897,8 @@ class Defuddle {
|
|
|
850
897
|
const attrHeight = parseInt(element.getAttribute('height') || '0');
|
|
851
898
|
// Check inline style dimensions
|
|
852
899
|
const style = element.getAttribute('style') || '';
|
|
853
|
-
const styleWidth = parseInt(style.match(
|
|
854
|
-
const styleHeight = parseInt(style.match(
|
|
900
|
+
const styleWidth = parseInt(style.match(STYLE_WIDTH_PATTERN)?.[1] || '0');
|
|
901
|
+
const styleHeight = parseInt(style.match(STYLE_HEIGHT_PATTERN)?.[1] || '0');
|
|
855
902
|
// Use getComputedStyle and getBoundingClientRect only in browser
|
|
856
903
|
let computedWidth = 0, computedHeight = 0;
|
|
857
904
|
if (isBrowser) {
|
|
@@ -919,7 +966,7 @@ class Defuddle {
|
|
|
919
966
|
return `srcset:${dataSrcset}`;
|
|
920
967
|
}
|
|
921
968
|
const id = element.id || '';
|
|
922
|
-
const className = element
|
|
969
|
+
const className = (0, dom_1.getClassName)(element);
|
|
923
970
|
const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
|
|
924
971
|
if (id)
|
|
925
972
|
return `id:${id}`;
|
|
@@ -1035,8 +1082,8 @@ class Defuddle {
|
|
|
1035
1082
|
if (current.id) {
|
|
1036
1083
|
selector += '#' + current.id;
|
|
1037
1084
|
}
|
|
1038
|
-
else if (
|
|
1039
|
-
selector += '.' + current.
|
|
1085
|
+
else if ((0, dom_1.getClassName)(current)) {
|
|
1086
|
+
selector += '.' + (0, dom_1.getClassName)(current).trim().split(/\s+/).join('.');
|
|
1040
1087
|
}
|
|
1041
1088
|
parts.unshift(selector);
|
|
1042
1089
|
current = current.parentElement;
|
|
@@ -1050,9 +1097,23 @@ class Defuddle {
|
|
|
1050
1097
|
* Resolve relative URLs to absolute within a DOM element
|
|
1051
1098
|
*/
|
|
1052
1099
|
resolveRelativeUrls(element) {
|
|
1053
|
-
const
|
|
1054
|
-
if (!
|
|
1100
|
+
const docUrl = this.options.url || this.doc.URL;
|
|
1101
|
+
if (!docUrl)
|
|
1055
1102
|
return;
|
|
1103
|
+
// Respect <base href> for relative URL resolution, matching browser behavior
|
|
1104
|
+
let baseUrl = docUrl;
|
|
1105
|
+
const baseEl = this.doc.querySelector('base[href]');
|
|
1106
|
+
if (baseEl) {
|
|
1107
|
+
const baseHref = baseEl.getAttribute('href');
|
|
1108
|
+
if (baseHref) {
|
|
1109
|
+
try {
|
|
1110
|
+
baseUrl = new URL(baseHref, docUrl).href;
|
|
1111
|
+
}
|
|
1112
|
+
catch {
|
|
1113
|
+
// Invalid base href, fall back to document URL
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
}
|
|
1056
1117
|
const resolve = (url) => {
|
|
1057
1118
|
// Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
|
|
1058
1119
|
// Normalize these before URL resolution.
|
|
@@ -1449,12 +1510,78 @@ class Defuddle {
|
|
|
1449
1510
|
}
|
|
1450
1511
|
target.remove();
|
|
1451
1512
|
}
|
|
1513
|
+
// Remove blog post metadata lists near content boundaries.
|
|
1514
|
+
// These are short <ul>/<ol> elements where every item is a brief
|
|
1515
|
+
// label + value pair (date, reading time, share, etc.) with no
|
|
1516
|
+
// prose sentences. Detected structurally: all items are very short,
|
|
1517
|
+
// none contain sentence-ending punctuation, and the total text is minimal.
|
|
1518
|
+
const metadataLists = mainContent.querySelectorAll('ul, ol');
|
|
1519
|
+
for (const list of metadataLists) {
|
|
1520
|
+
if (!list.parentNode)
|
|
1521
|
+
continue;
|
|
1522
|
+
const items = Array.from(list.children).filter(el => el.tagName === 'LI');
|
|
1523
|
+
if (items.length < 2 || items.length > 8)
|
|
1524
|
+
continue;
|
|
1525
|
+
// Must be near the start or end of content
|
|
1526
|
+
const listText = list.textContent?.trim() || '';
|
|
1527
|
+
const listPos = contentText.indexOf(listText);
|
|
1528
|
+
const distFromEnd = contentText.length - (listPos + listText.length);
|
|
1529
|
+
if (listPos > 500 && distFromEnd > 500)
|
|
1530
|
+
continue;
|
|
1531
|
+
// Skip lists introduced by a preceding paragraph (e.g. "Features include:")
|
|
1532
|
+
// — those are content lists, not standalone metadata
|
|
1533
|
+
const prevSibling = list.previousElementSibling;
|
|
1534
|
+
if (prevSibling) {
|
|
1535
|
+
const prevText = prevSibling.textContent?.trim() || '';
|
|
1536
|
+
if (prevText.endsWith(':'))
|
|
1537
|
+
continue;
|
|
1538
|
+
}
|
|
1539
|
+
// Every item must be very short (label + value) with no prose
|
|
1540
|
+
let isMetadata = true;
|
|
1541
|
+
for (const item of items) {
|
|
1542
|
+
const text = item.textContent?.trim() || '';
|
|
1543
|
+
const words = (0, utils_1.countWords)(text);
|
|
1544
|
+
if (words > 8) {
|
|
1545
|
+
isMetadata = false;
|
|
1546
|
+
break;
|
|
1547
|
+
}
|
|
1548
|
+
// Prose has sentence-ending punctuation; metadata doesn't
|
|
1549
|
+
if (/[.!?]$/.test(text)) {
|
|
1550
|
+
isMetadata = false;
|
|
1551
|
+
break;
|
|
1552
|
+
}
|
|
1553
|
+
}
|
|
1554
|
+
if (!isMetadata)
|
|
1555
|
+
continue;
|
|
1556
|
+
// Total text should be very short — this is metadata, not content
|
|
1557
|
+
if ((0, utils_1.countWords)(listText) > 30)
|
|
1558
|
+
continue;
|
|
1559
|
+
// Walk up to find the container to remove (e.g. a wrapper div)
|
|
1560
|
+
let target = list;
|
|
1561
|
+
while (target.parentElement && target.parentElement !== mainContent) {
|
|
1562
|
+
const parentText = target.parentElement.textContent?.trim() || '';
|
|
1563
|
+
if (parentText !== listText)
|
|
1564
|
+
break;
|
|
1565
|
+
target = target.parentElement;
|
|
1566
|
+
}
|
|
1567
|
+
if (this.debug && debugRemovals) {
|
|
1568
|
+
debugRemovals.push({
|
|
1569
|
+
step: 'removeByContentPattern',
|
|
1570
|
+
reason: 'blog metadata list',
|
|
1571
|
+
text: (0, utils_1.textPreview)(target)
|
|
1572
|
+
});
|
|
1573
|
+
}
|
|
1574
|
+
target.remove();
|
|
1575
|
+
}
|
|
1452
1576
|
// Remove section breadcrumbs
|
|
1453
1577
|
// Short elements containing a link to a parent section of the current URL.
|
|
1454
1578
|
const url = this.options.url || this.doc.URL || '';
|
|
1455
1579
|
let urlPath = '';
|
|
1580
|
+
let pageHost = '';
|
|
1456
1581
|
try {
|
|
1457
|
-
|
|
1582
|
+
const parsedUrl = new URL(url);
|
|
1583
|
+
urlPath = parsedUrl.pathname;
|
|
1584
|
+
pageHost = parsedUrl.hostname.replace(/^www\./, '');
|
|
1458
1585
|
}
|
|
1459
1586
|
catch { }
|
|
1460
1587
|
if (urlPath) {
|
|
@@ -1488,6 +1615,126 @@ class Defuddle {
|
|
|
1488
1615
|
catch { }
|
|
1489
1616
|
}
|
|
1490
1617
|
}
|
|
1618
|
+
// Remove trailing external link lists — a heading + list of purely
|
|
1619
|
+
// off-site links as the last content block (affiliate picks, product
|
|
1620
|
+
// roundups, etc.). Only removed when nothing meaningful follows.
|
|
1621
|
+
if (pageHost) {
|
|
1622
|
+
const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6');
|
|
1623
|
+
for (const heading of headings) {
|
|
1624
|
+
if (!heading.parentNode)
|
|
1625
|
+
continue;
|
|
1626
|
+
const list = heading.nextElementSibling;
|
|
1627
|
+
if (!list || (list.tagName !== 'UL' && list.tagName !== 'OL'))
|
|
1628
|
+
continue;
|
|
1629
|
+
const items = Array.from(list.children).filter(el => el.tagName === 'LI');
|
|
1630
|
+
if (items.length < 2)
|
|
1631
|
+
continue;
|
|
1632
|
+
// The list must be the last meaningful block — nothing after it
|
|
1633
|
+
// except whitespace or empty elements. Walk up through ancestors
|
|
1634
|
+
// to check siblings at each level up to mainContent.
|
|
1635
|
+
let trailingContent = false;
|
|
1636
|
+
let checkEl = list;
|
|
1637
|
+
while (checkEl && checkEl !== mainContent) {
|
|
1638
|
+
let sibling = checkEl.nextElementSibling;
|
|
1639
|
+
while (sibling) {
|
|
1640
|
+
if ((sibling.textContent?.trim() || '').length > 0) {
|
|
1641
|
+
trailingContent = true;
|
|
1642
|
+
break;
|
|
1643
|
+
}
|
|
1644
|
+
sibling = sibling.nextElementSibling;
|
|
1645
|
+
}
|
|
1646
|
+
if (trailingContent)
|
|
1647
|
+
break;
|
|
1648
|
+
checkEl = checkEl.parentElement;
|
|
1649
|
+
}
|
|
1650
|
+
if (trailingContent)
|
|
1651
|
+
continue;
|
|
1652
|
+
// Every list item must be primarily a link pointing off-site
|
|
1653
|
+
let allExternalLinks = true;
|
|
1654
|
+
for (const item of items) {
|
|
1655
|
+
const links = item.querySelectorAll('a[href]');
|
|
1656
|
+
if (links.length === 0) {
|
|
1657
|
+
allExternalLinks = false;
|
|
1658
|
+
break;
|
|
1659
|
+
}
|
|
1660
|
+
const itemText = item.textContent?.trim() || '';
|
|
1661
|
+
let linkTextLen = 0;
|
|
1662
|
+
for (const link of links) {
|
|
1663
|
+
linkTextLen += (link.textContent?.trim() || '').length;
|
|
1664
|
+
try {
|
|
1665
|
+
const linkHost = new URL(link.getAttribute('href') || '', url).hostname.replace(/^www\./, '');
|
|
1666
|
+
if (linkHost === pageHost) {
|
|
1667
|
+
allExternalLinks = false;
|
|
1668
|
+
break;
|
|
1669
|
+
}
|
|
1670
|
+
}
|
|
1671
|
+
catch { }
|
|
1672
|
+
}
|
|
1673
|
+
if (!allExternalLinks)
|
|
1674
|
+
break;
|
|
1675
|
+
if (linkTextLen < itemText.length * 0.6) {
|
|
1676
|
+
allExternalLinks = false;
|
|
1677
|
+
break;
|
|
1678
|
+
}
|
|
1679
|
+
}
|
|
1680
|
+
if (!allExternalLinks)
|
|
1681
|
+
continue;
|
|
1682
|
+
if (this.debug && debugRemovals) {
|
|
1683
|
+
debugRemovals.push({
|
|
1684
|
+
step: 'removeByContentPattern',
|
|
1685
|
+
reason: 'trailing external link list',
|
|
1686
|
+
text: (0, utils_1.textPreview)(heading)
|
|
1687
|
+
});
|
|
1688
|
+
debugRemovals.push({
|
|
1689
|
+
step: 'removeByContentPattern',
|
|
1690
|
+
reason: 'trailing external link list',
|
|
1691
|
+
text: (0, utils_1.textPreview)(list)
|
|
1692
|
+
});
|
|
1693
|
+
}
|
|
1694
|
+
list.remove();
|
|
1695
|
+
heading.remove();
|
|
1696
|
+
}
|
|
1697
|
+
}
|
|
1698
|
+
// Remove trailing thin sections — the last few direct children of
|
|
1699
|
+
// mainContent that contain a heading but very little prose. These are
|
|
1700
|
+
// typically CTAs, newsletter prompts, or promotional sections that
|
|
1701
|
+
// have been partially stripped by prior removal steps.
|
|
1702
|
+
const totalWords = (0, utils_1.countWords)(mainContent.textContent || '');
|
|
1703
|
+
if (totalWords > 300) {
|
|
1704
|
+
// Walk backwards from the last direct child of mainContent,
|
|
1705
|
+
// collecting trailing elements that are thin (empty or very short prose).
|
|
1706
|
+
// Exclude SVG text (path data) from word counts — it's not prose.
|
|
1707
|
+
const trailingEls = [];
|
|
1708
|
+
let trailingWords = 0;
|
|
1709
|
+
let child = mainContent.lastElementChild;
|
|
1710
|
+
while (child) {
|
|
1711
|
+
// Count prose words, excluding SVG path data which inflates word counts
|
|
1712
|
+
let svgWords = 0;
|
|
1713
|
+
for (const svg of child.querySelectorAll('svg')) {
|
|
1714
|
+
svgWords += (0, utils_1.countWords)(svg.textContent || '');
|
|
1715
|
+
}
|
|
1716
|
+
const words = (0, utils_1.countWords)(child.textContent?.trim() || '') - svgWords;
|
|
1717
|
+
if (words > 25)
|
|
1718
|
+
break;
|
|
1719
|
+
trailingWords += words;
|
|
1720
|
+
trailingEls.push(child);
|
|
1721
|
+
child = child.previousElementSibling;
|
|
1722
|
+
}
|
|
1723
|
+
// Must have a heading in the trailing elements and total < 15% of content.
|
|
1724
|
+
// Skip if trailing elements contain content indicators (math, code, tables, images).
|
|
1725
|
+
if (trailingEls.length >= 1 && trailingWords < totalWords * 0.15) {
|
|
1726
|
+
const hasHeading = trailingEls.some(el => /^H[1-6]$/.test(el.tagName) || el.querySelector('h1, h2, h3, h4, h5, h6'));
|
|
1727
|
+
const hasContent = trailingEls.some(el => el.querySelector(constants_1.CONTENT_ELEMENT_SELECTOR));
|
|
1728
|
+
if (hasHeading && !hasContent) {
|
|
1729
|
+
for (const el of trailingEls) {
|
|
1730
|
+
if (this.debug && debugRemovals) {
|
|
1731
|
+
debugRemovals.push({ step: 'removeByContentPattern', reason: 'trailing thin section', text: (0, utils_1.textPreview)(el) });
|
|
1732
|
+
}
|
|
1733
|
+
el.remove();
|
|
1734
|
+
}
|
|
1735
|
+
}
|
|
1736
|
+
}
|
|
1737
|
+
}
|
|
1491
1738
|
// Remove boilerplate sentences and trailing non-content.
|
|
1492
1739
|
// Search elements for end-of-article boilerplate, then truncate
|
|
1493
1740
|
// from the best ancestor that has siblings to remove.
|