defuddle 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -21
- package/dist/cli.js +54 -49
- package/dist/cli.js.map +1 -1
- package/dist/constants.d.ts +9 -0
- package/dist/constants.js +50 -10
- package/dist/constants.js.map +1 -1
- package/dist/defuddle.d.ts +8 -1
- package/dist/defuddle.js +404 -86
- package/dist/defuddle.js.map +1 -1
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/code.js +31 -9
- package/dist/elements/code.js.map +1 -1
- package/dist/elements/headings.d.ts +6 -0
- package/dist/elements/headings.js +55 -50
- package/dist/elements/headings.js.map +1 -1
- package/dist/elements/images.js +10 -1
- package/dist/elements/images.js.map +1 -1
- package/dist/elements/math.base.js +1 -4
- package/dist/elements/math.base.js.map +1 -1
- package/dist/extractor-registry.d.ts +5 -5
- package/dist/extractor-registry.js +8 -8
- package/dist/extractor-registry.js.map +1 -1
- package/dist/extractors/_base.d.ts +6 -1
- package/dist/extractors/_base.js +2 -1
- package/dist/extractors/_base.js.map +1 -1
- package/dist/extractors/github.js +3 -3
- package/dist/extractors/github.js.map +1 -1
- package/dist/extractors/hackernews.js +1 -1
- package/dist/extractors/hackernews.js.map +1 -1
- package/dist/extractors/reddit.js +7 -4
- package/dist/extractors/reddit.js.map +1 -1
- package/dist/extractors/twitter.js +3 -1
- package/dist/extractors/twitter.js.map +1 -1
- package/dist/extractors/youtube.d.ts +35 -2
- package/dist/extractors/youtube.js +359 -30
- package/dist/extractors/youtube.js.map +1 -1
- package/dist/fetch.d.ts +13 -0
- package/dist/fetch.js +181 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/markdown.js +81 -33
- package/dist/markdown.js.map +1 -1
- package/dist/metadata.js +1 -1
- package/dist/metadata.js.map +1 -1
- package/dist/node.d.ts +12 -5
- package/dist/node.js +53 -17
- package/dist/node.js.map +1 -1
- package/dist/scoring.js +15 -10
- package/dist/scoring.js.map +1 -1
- package/dist/standardize.js +112 -60
- package/dist/standardize.js.map +1 -1
- package/dist/types.d.ts +14 -0
- package/dist/utils/dom.d.ts +5 -0
- package/dist/utils/dom.js +8 -0
- package/dist/utils/dom.js.map +1 -1
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils.d.ts +6 -0
- package/dist/utils.js +36 -0
- package/dist/utils.js.map +1 -1
- package/package.json +3 -4
package/dist/defuddle.js
CHANGED
|
@@ -2,16 +2,20 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.Defuddle = void 0;
|
|
4
4
|
const metadata_1 = require("./metadata");
|
|
5
|
+
const headings_1 = require("./elements/headings");
|
|
5
6
|
const extractor_registry_1 = require("./extractor-registry");
|
|
6
7
|
const constants_1 = require("./constants");
|
|
7
8
|
const standardize_1 = require("./standardize");
|
|
8
9
|
const footnotes_1 = require("./elements/footnotes");
|
|
10
|
+
const callouts_1 = require("./elements/callouts");
|
|
9
11
|
const scoring_1 = require("./scoring");
|
|
10
12
|
const utils_1 = require("./utils");
|
|
11
13
|
const dom_1 = require("./utils/dom");
|
|
12
14
|
/** Keys from extractor variables that map to top-level DefuddleResponse fields */
|
|
13
15
|
const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
|
|
14
16
|
// Content pattern detection constants
|
|
17
|
+
const STYLE_WIDTH_PATTERN = /width\s*:\s*(\d+)/;
|
|
18
|
+
const STYLE_HEIGHT_PATTERN = /height\s*:\s*(\d+)/;
|
|
15
19
|
const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
|
|
16
20
|
const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
|
|
17
21
|
const BOILERPLATE_PATTERNS = [
|
|
@@ -72,6 +76,36 @@ class Defuddle {
|
|
|
72
76
|
}
|
|
73
77
|
}
|
|
74
78
|
// If still very little content, the page may be an index/listing page
|
|
79
|
+
// or a page that reveals content at runtime from a hidden wrapper.
|
|
80
|
+
// Retry once with hidden-element removal disabled.
|
|
81
|
+
if (result.wordCount < 50) {
|
|
82
|
+
this._log('Still very little content, retrying without hidden-element removal');
|
|
83
|
+
const hiddenRetry = this.parseInternal({
|
|
84
|
+
removeHiddenElements: false
|
|
85
|
+
});
|
|
86
|
+
if (hiddenRetry.wordCount > result.wordCount * 2) {
|
|
87
|
+
this._log('Hidden-element retry produced more content');
|
|
88
|
+
result = hiddenRetry;
|
|
89
|
+
}
|
|
90
|
+
// Try targeting the largest hidden subtree directly to avoid body-level
|
|
91
|
+
// leftovers (e.g. FPS counters) when hidden content is the real article.
|
|
92
|
+
const hiddenSelector = this.findLargestHiddenContentSelector();
|
|
93
|
+
if (hiddenSelector) {
|
|
94
|
+
this._log('Retrying with hidden content selector:', hiddenSelector);
|
|
95
|
+
const hiddenSelectorRetry = this.parseInternal({
|
|
96
|
+
removeHiddenElements: false,
|
|
97
|
+
removePartialSelectors: false,
|
|
98
|
+
contentSelector: hiddenSelector
|
|
99
|
+
});
|
|
100
|
+
if (hiddenSelectorRetry.wordCount > result.wordCount ||
|
|
101
|
+
(hiddenSelectorRetry.wordCount > Math.max(20, result.wordCount * 0.7) &&
|
|
102
|
+
hiddenSelectorRetry.content.length < result.content.length)) {
|
|
103
|
+
this._log('Hidden-selector retry produced better focused content');
|
|
104
|
+
result = hiddenSelectorRetry;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
// If still very little content, the page may be an index/listing page
|
|
75
109
|
// where card elements were scored as non-content or removed by partial
|
|
76
110
|
// selectors (e.g. "post-preview"). Retry with both disabled.
|
|
77
111
|
if (result.wordCount < 50) {
|
|
@@ -95,17 +129,17 @@ class Defuddle {
|
|
|
95
129
|
// longer than what we extracted, the scorer likely picked the wrong
|
|
96
130
|
// element from a feed. Find the correct element in the DOM.
|
|
97
131
|
const schemaText = this._getSchemaText(result.schemaOrgData);
|
|
98
|
-
if (schemaText && this.
|
|
132
|
+
if (schemaText && this.countHtmlWords(schemaText) > result.wordCount) {
|
|
99
133
|
const contentHtml = this._findContentBySchemaText(schemaText);
|
|
100
134
|
if (contentHtml) {
|
|
101
135
|
this._log('Found DOM content matching schema.org text');
|
|
102
136
|
result.content = contentHtml;
|
|
103
|
-
result.wordCount = this.
|
|
137
|
+
result.wordCount = this.countHtmlWords(contentHtml);
|
|
104
138
|
}
|
|
105
139
|
else {
|
|
106
140
|
this._log('Using schema.org text as content (DOM element not found)');
|
|
107
141
|
result.content = schemaText;
|
|
108
|
-
result.wordCount = this.
|
|
142
|
+
result.wordCount = this.countHtmlWords(schemaText);
|
|
109
143
|
}
|
|
110
144
|
}
|
|
111
145
|
return result;
|
|
@@ -113,17 +147,30 @@ class Defuddle {
|
|
|
113
147
|
/**
|
|
114
148
|
* Extract text content from schema.org data (e.g. SocialMediaPosting, Article)
|
|
115
149
|
*/
|
|
116
|
-
_getSchemaText(schemaOrgData) {
|
|
117
|
-
if (!schemaOrgData)
|
|
150
|
+
_getSchemaText(schemaOrgData, depth = 0) {
|
|
151
|
+
if (!schemaOrgData || depth > 10)
|
|
118
152
|
return '';
|
|
119
153
|
const items = Array.isArray(schemaOrgData) ? schemaOrgData : [schemaOrgData];
|
|
120
154
|
for (const item of items) {
|
|
155
|
+
// Recurse into nested arrays
|
|
156
|
+
if (Array.isArray(item)) {
|
|
157
|
+
const found = this._getSchemaText(item, depth + 1);
|
|
158
|
+
if (found)
|
|
159
|
+
return found;
|
|
160
|
+
continue;
|
|
161
|
+
}
|
|
121
162
|
if (item?.text && typeof item.text === 'string') {
|
|
122
163
|
return item.text;
|
|
123
164
|
}
|
|
124
165
|
if (item?.articleBody && typeof item.articleBody === 'string') {
|
|
125
166
|
return item.articleBody;
|
|
126
167
|
}
|
|
168
|
+
// Traverse @graph arrays (common in JSON-LD with multiple entities)
|
|
169
|
+
if (item?.['@graph'] && Array.isArray(item['@graph'])) {
|
|
170
|
+
const found = this._getSchemaText(item['@graph'], depth + 1);
|
|
171
|
+
if (found)
|
|
172
|
+
return found;
|
|
173
|
+
}
|
|
127
174
|
}
|
|
128
175
|
return '';
|
|
129
176
|
}
|
|
@@ -164,39 +211,43 @@ class Defuddle {
|
|
|
164
211
|
}
|
|
165
212
|
}
|
|
166
213
|
/**
|
|
167
|
-
* Find
|
|
168
|
-
*
|
|
169
|
-
*
|
|
214
|
+
* Find the smallest DOM element whose text contains the search phrase
|
|
215
|
+
* and whose word count is at least 80% of the expected count.
|
|
216
|
+
* Shared by _findSchemaContentElement and _findContentBySchemaText.
|
|
170
217
|
*/
|
|
171
|
-
|
|
172
|
-
const body = this.doc.body;
|
|
173
|
-
if (!body)
|
|
174
|
-
return '';
|
|
175
|
-
// Use the first paragraph as the search phrase.
|
|
176
|
-
// DOM textContent concatenates <p> elements without separators,
|
|
177
|
-
// so we can't cross paragraph boundaries when matching.
|
|
218
|
+
_findElementBySchemaText(root, schemaText) {
|
|
178
219
|
const firstPara = schemaText.split(/\n\s*\n/)[0]?.trim() || '';
|
|
179
220
|
const searchPhrase = firstPara.substring(0, 100).trim();
|
|
180
221
|
if (!searchPhrase)
|
|
181
|
-
return
|
|
182
|
-
const schemaWordCount =
|
|
183
|
-
// Find the smallest element whose text contains the search phrase
|
|
184
|
-
// and whose word count is close to the schema text's word count
|
|
222
|
+
return null;
|
|
223
|
+
const schemaWordCount = (0, utils_1.countWords)(schemaText);
|
|
185
224
|
let bestMatch = null;
|
|
186
225
|
let bestSize = Infinity;
|
|
187
|
-
const allElements =
|
|
226
|
+
const allElements = root.querySelectorAll('*');
|
|
188
227
|
for (const el of allElements) {
|
|
189
|
-
|
|
228
|
+
if (el === root)
|
|
229
|
+
continue;
|
|
230
|
+
const elText = el.textContent || '';
|
|
190
231
|
if (!elText.includes(searchPhrase))
|
|
191
232
|
continue;
|
|
192
|
-
const elWords =
|
|
193
|
-
// Element should contain roughly the same amount of text
|
|
194
|
-
// (allow some slack for surrounding whitespace / minor extras)
|
|
233
|
+
const elWords = (0, utils_1.countWords)(elText);
|
|
195
234
|
if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
|
|
196
235
|
bestSize = elWords;
|
|
197
236
|
bestMatch = el;
|
|
198
237
|
}
|
|
199
238
|
}
|
|
239
|
+
return bestMatch;
|
|
240
|
+
}
|
|
241
|
+
/**
|
|
242
|
+
* Find a DOM element whose text matches the schema.org text content.
|
|
243
|
+
* Used when the content scorer picked the wrong element from a feed page.
|
|
244
|
+
* Returns the element's inner HTML including sibling media (images, etc.)
|
|
245
|
+
*/
|
|
246
|
+
_findContentBySchemaText(schemaText) {
|
|
247
|
+
const body = this.doc.body;
|
|
248
|
+
if (!body)
|
|
249
|
+
return '';
|
|
250
|
+
const bestMatch = this._findElementBySchemaText(body, schemaText);
|
|
200
251
|
if (!bestMatch)
|
|
201
252
|
return '';
|
|
202
253
|
// Read the largest sibling image src BEFORE resolveRelativeUrls
|
|
@@ -230,6 +281,8 @@ class Defuddle {
|
|
|
230
281
|
catch { }
|
|
231
282
|
}
|
|
232
283
|
}
|
|
284
|
+
// Remove heading anchor links before serialization (e.g. <h2>Title<a href="#foo">#</a></h2>)
|
|
285
|
+
(0, headings_1.removeHeadingAnchors)(bestMatch);
|
|
233
286
|
// Now resolve URLs in the text content
|
|
234
287
|
this.resolveRelativeUrls(bestMatch);
|
|
235
288
|
let html = (0, dom_1.serializeHTML)(bestMatch);
|
|
@@ -241,6 +294,27 @@ class Defuddle {
|
|
|
241
294
|
}
|
|
242
295
|
return html;
|
|
243
296
|
}
|
|
297
|
+
findLargestHiddenContentSelector() {
|
|
298
|
+
const body = this.doc.body;
|
|
299
|
+
if (!body)
|
|
300
|
+
return undefined;
|
|
301
|
+
const candidates = Array.from(body.querySelectorAll(constants_1.HIDDEN_EXACT_SKIP_SELECTOR)).filter(el => {
|
|
302
|
+
const className = el.getAttribute('class') || '';
|
|
303
|
+
return !className.includes('math');
|
|
304
|
+
});
|
|
305
|
+
let best = null;
|
|
306
|
+
let bestWords = 0;
|
|
307
|
+
for (const el of candidates) {
|
|
308
|
+
const words = (0, utils_1.countWords)(el.textContent || '');
|
|
309
|
+
if (words > bestWords) {
|
|
310
|
+
best = el;
|
|
311
|
+
bestWords = words;
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
if (!best || bestWords < 30)
|
|
315
|
+
return undefined;
|
|
316
|
+
return this.getElementSelector(best);
|
|
317
|
+
}
|
|
244
318
|
/**
|
|
245
319
|
* Get the largest available src from an img element,
|
|
246
320
|
* checking srcset for higher-resolution versions.
|
|
@@ -302,7 +376,8 @@ class Defuddle {
|
|
|
302
376
|
try {
|
|
303
377
|
const url = this.options.url || this.doc.URL;
|
|
304
378
|
const schemaOrgData = this.getSchemaOrgData();
|
|
305
|
-
const
|
|
379
|
+
const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
|
|
380
|
+
const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData, extractorOpts);
|
|
306
381
|
if (extractor) {
|
|
307
382
|
const extracted = await extractor.extractAsync();
|
|
308
383
|
return this.getExtractorVariables(extracted.variables) || null;
|
|
@@ -317,7 +392,8 @@ class Defuddle {
|
|
|
317
392
|
try {
|
|
318
393
|
const url = this.options.url || this.doc.URL;
|
|
319
394
|
const schemaOrgData = this.getSchemaOrgData();
|
|
320
|
-
const
|
|
395
|
+
const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
|
|
396
|
+
const extractor = finder(this.doc, url, schemaOrgData, extractorOpts);
|
|
321
397
|
if (extractor) {
|
|
322
398
|
const startTime = Date.now();
|
|
323
399
|
const extracted = await extractor.extractAsync();
|
|
@@ -336,6 +412,25 @@ class Defuddle {
|
|
|
336
412
|
*/
|
|
337
413
|
parseInternal(overrideOptions = {}) {
|
|
338
414
|
const startTime = Date.now();
|
|
415
|
+
// Guard against empty/broken documents (e.g. empty HTML, bot-blocked pages)
|
|
416
|
+
if (!this.doc.documentElement) {
|
|
417
|
+
const url = this.options.url || '';
|
|
418
|
+
return {
|
|
419
|
+
content: '',
|
|
420
|
+
title: '',
|
|
421
|
+
description: '',
|
|
422
|
+
domain: url ? new URL(url).hostname : '',
|
|
423
|
+
favicon: '',
|
|
424
|
+
image: '',
|
|
425
|
+
language: '',
|
|
426
|
+
parseTime: Date.now() - startTime,
|
|
427
|
+
published: '',
|
|
428
|
+
author: '',
|
|
429
|
+
site: '',
|
|
430
|
+
schemaOrgData: null,
|
|
431
|
+
wordCount: 0,
|
|
432
|
+
};
|
|
433
|
+
}
|
|
339
434
|
const options = {
|
|
340
435
|
removeExactSelectors: true,
|
|
341
436
|
removePartialSelectors: true,
|
|
@@ -344,6 +439,7 @@ class Defuddle {
|
|
|
344
439
|
removeSmallImages: true,
|
|
345
440
|
removeContentPatterns: true,
|
|
346
441
|
standardize: true,
|
|
442
|
+
includeReplies: 'extractors',
|
|
347
443
|
...this.options,
|
|
348
444
|
...overrideOptions
|
|
349
445
|
};
|
|
@@ -365,7 +461,11 @@ class Defuddle {
|
|
|
365
461
|
try {
|
|
366
462
|
// Use site-specific extractor first, if there is one
|
|
367
463
|
const url = options.url || this.doc.URL;
|
|
368
|
-
const
|
|
464
|
+
const extractorOpts = {
|
|
465
|
+
includeReplies: options.includeReplies,
|
|
466
|
+
language: options.language,
|
|
467
|
+
};
|
|
468
|
+
const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData, extractorOpts);
|
|
369
469
|
if (extractor && extractor.canExtract()) {
|
|
370
470
|
const extracted = extractor.extract();
|
|
371
471
|
return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
|
|
@@ -383,6 +483,9 @@ class Defuddle {
|
|
|
383
483
|
const smallImages = this._smallImages;
|
|
384
484
|
// Clone document
|
|
385
485
|
const clone = this.doc.cloneNode(true);
|
|
486
|
+
// Merge adjacent text nodes that some DOM implementations (e.g. linkedom)
|
|
487
|
+
// create when parsing HTML entities like &#39;
|
|
488
|
+
clone.body?.normalize();
|
|
386
489
|
// Flatten shadow DOM content into the clone
|
|
387
490
|
this.flattenShadowRoots(this.doc, clone);
|
|
388
491
|
// Resolve React streaming SSR suspense boundaries
|
|
@@ -398,20 +501,36 @@ class Defuddle {
|
|
|
398
501
|
if (!mainContent) {
|
|
399
502
|
mainContent = this.findMainContent(clone);
|
|
400
503
|
}
|
|
504
|
+
// If we fell back to <body>, try using schema.org articleBody/text
|
|
505
|
+
// to find a more specific content element within the DOM.
|
|
506
|
+
if (mainContent && mainContent.tagName.toLowerCase() === 'body') {
|
|
507
|
+
const schemaText = this._getSchemaText(schemaOrgData);
|
|
508
|
+
if (schemaText) {
|
|
509
|
+
const schemaContent = this._findElementBySchemaText(clone.body, schemaText);
|
|
510
|
+
if (schemaContent) {
|
|
511
|
+
this._log('Found content element via schema.org text');
|
|
512
|
+
mainContent = schemaContent;
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
}
|
|
401
516
|
if (!mainContent) {
|
|
402
|
-
const fallbackContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
|
|
517
|
+
const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
|
|
403
518
|
const endTime = Date.now();
|
|
404
519
|
return {
|
|
405
520
|
content: fallbackContent,
|
|
406
521
|
...metadata,
|
|
407
|
-
wordCount: this.
|
|
522
|
+
wordCount: this.countHtmlWords(fallbackContent),
|
|
408
523
|
parseTime: Math.round(endTime - startTime),
|
|
409
524
|
metaTags: pageMetaTags
|
|
410
525
|
};
|
|
411
526
|
}
|
|
527
|
+
// Remove <wbr> elements — word break opportunity hints that carry no
|
|
528
|
+
// content but cause unwanted whitespace during standardization.
|
|
529
|
+
mainContent.querySelectorAll('wbr').forEach(el => el.remove());
|
|
412
530
|
// Standardize footnotes before cleanup (CSS sidenotes use display:none)
|
|
413
531
|
if (options.standardize) {
|
|
414
532
|
(0, footnotes_1.standardizeFootnotes)(mainContent);
|
|
533
|
+
(0, callouts_1.standardizeCallouts)(mainContent);
|
|
415
534
|
}
|
|
416
535
|
// Remove small images
|
|
417
536
|
if (options.removeSmallImages) {
|
|
@@ -421,15 +540,17 @@ class Defuddle {
|
|
|
421
540
|
if (options.removeHiddenElements) {
|
|
422
541
|
this.removeHiddenElements(clone, debugRemovals);
|
|
423
542
|
}
|
|
424
|
-
// Remove
|
|
425
|
-
//
|
|
543
|
+
// Remove clutter using selectors — deterministic removal of known
|
|
544
|
+
// non-content elements (nav, footer, .sidebar, etc.) by class/id.
|
|
545
|
+
// Runs before scoring so the heuristic scorer sees a cleaner DOM.
|
|
546
|
+
if (options.removeExactSelectors || options.removePartialSelectors) {
|
|
547
|
+
this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
|
|
548
|
+
}
|
|
549
|
+
// Remove non-content blocks by scoring — heuristic removal based
|
|
550
|
+
// on link density, text ratios, and navigation indicators.
|
|
426
551
|
if (options.removeLowScoring) {
|
|
427
552
|
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
|
|
428
553
|
}
|
|
429
|
-
// Remove clutter using selectors
|
|
430
|
-
if (options.removeExactSelectors || options.removePartialSelectors) {
|
|
431
|
-
this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals);
|
|
432
|
-
}
|
|
433
554
|
// Remove elements by content patterns (read time, boilerplate, article cards)
|
|
434
555
|
if (options.removeContentPatterns && mainContent) {
|
|
435
556
|
this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
|
|
@@ -445,7 +566,7 @@ class Defuddle {
|
|
|
445
566
|
const result = {
|
|
446
567
|
content,
|
|
447
568
|
...metadata,
|
|
448
|
-
wordCount: this.
|
|
569
|
+
wordCount: this.countHtmlWords(content),
|
|
449
570
|
parseTime: Math.round(endTime - startTime),
|
|
450
571
|
metaTags: pageMetaTags
|
|
451
572
|
};
|
|
@@ -459,18 +580,18 @@ class Defuddle {
|
|
|
459
580
|
}
|
|
460
581
|
catch (error) {
|
|
461
582
|
console.error('Defuddle', 'Error processing document:', error);
|
|
462
|
-
const errorContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
|
|
583
|
+
const errorContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
|
|
463
584
|
const endTime = Date.now();
|
|
464
585
|
return {
|
|
465
586
|
content: errorContent,
|
|
466
587
|
...metadata,
|
|
467
|
-
wordCount: this.
|
|
588
|
+
wordCount: this.countHtmlWords(errorContent),
|
|
468
589
|
parseTime: Math.round(endTime - startTime),
|
|
469
590
|
metaTags: pageMetaTags
|
|
470
591
|
};
|
|
471
592
|
}
|
|
472
593
|
}
|
|
473
|
-
|
|
594
|
+
countHtmlWords(content) {
|
|
474
595
|
// Strip HTML tags and decode common entities without DOM parsing
|
|
475
596
|
const text = content
|
|
476
597
|
.replace(/<[^>]*>/g, ' ')
|
|
@@ -481,25 +602,8 @@ class Defuddle {
|
|
|
481
602
|
.replace(/"/gi, '"')
|
|
482
603
|
.replace(/&#\d+;/g, ' ')
|
|
483
604
|
.replace(/&\w+;/g, ' ');
|
|
484
|
-
|
|
485
|
-
if (!trimmed)
|
|
486
|
-
return 0;
|
|
487
|
-
// Count words by splitting on whitespace
|
|
488
|
-
let count = 0;
|
|
489
|
-
let inWord = false;
|
|
490
|
-
for (let i = 0; i < trimmed.length; i++) {
|
|
491
|
-
const isSpace = trimmed.charCodeAt(i) <= 32;
|
|
492
|
-
if (!isSpace && !inWord) {
|
|
493
|
-
count++;
|
|
494
|
-
inWord = true;
|
|
495
|
-
}
|
|
496
|
-
else if (isSpace) {
|
|
497
|
-
inWord = false;
|
|
498
|
-
}
|
|
499
|
-
}
|
|
500
|
-
return count;
|
|
605
|
+
return (0, utils_1.countWords)(text);
|
|
501
606
|
}
|
|
502
|
-
// Make all other methods private by removing the static keyword and using private
|
|
503
607
|
_log(...args) {
|
|
504
608
|
if (this.debug) {
|
|
505
609
|
console.log('Defuddle:', ...args);
|
|
@@ -509,6 +613,8 @@ class Defuddle {
|
|
|
509
613
|
const mobileStyles = [];
|
|
510
614
|
const maxWidthRegex = /max-width[^:]*:\s*(\d+)/;
|
|
511
615
|
try {
|
|
616
|
+
if (!doc.styleSheets)
|
|
617
|
+
return mobileStyles;
|
|
512
618
|
// Get all styles, including inline styles
|
|
513
619
|
const sheets = Array.from(doc.styleSheets).filter(sheet => {
|
|
514
620
|
try {
|
|
@@ -646,7 +752,7 @@ class Defuddle {
|
|
|
646
752
|
if (className) {
|
|
647
753
|
const tokens = className.split(/\s+/);
|
|
648
754
|
for (const token of tokens) {
|
|
649
|
-
if (token === 'hidden' || token.endsWith(':hidden')) {
|
|
755
|
+
if (token === 'hidden' || token.endsWith(':hidden') || token === 'invisible' || token.endsWith(':invisible')) {
|
|
650
756
|
elementsToRemove.set(element, `class:${token}`);
|
|
651
757
|
count++;
|
|
652
758
|
break;
|
|
@@ -667,7 +773,7 @@ class Defuddle {
|
|
|
667
773
|
});
|
|
668
774
|
this._log('Removed hidden elements:', count);
|
|
669
775
|
}
|
|
670
|
-
removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals) {
|
|
776
|
+
removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals, skipHiddenExactSelectors = false) {
|
|
671
777
|
const startTime = Date.now();
|
|
672
778
|
let exactSelectorCount = 0;
|
|
673
779
|
let partialSelectorCount = 0;
|
|
@@ -675,9 +781,17 @@ class Defuddle {
|
|
|
675
781
|
const elementsToRemove = new Map();
|
|
676
782
|
// First collect elements matching exact selectors
|
|
677
783
|
if (removeExact) {
|
|
678
|
-
const exactElements = doc.querySelectorAll(constants_1.
|
|
784
|
+
const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS_JOINED);
|
|
679
785
|
exactElements.forEach(el => {
|
|
680
786
|
if (el?.parentNode) {
|
|
787
|
+
if (skipHiddenExactSelectors) {
|
|
788
|
+
const hiddenAncestor = el.closest(constants_1.HIDDEN_EXACT_SKIP_SELECTOR);
|
|
789
|
+
const role = (el.getAttribute('role') || '').toLowerCase();
|
|
790
|
+
if (el.matches(constants_1.HIDDEN_EXACT_SELECTOR) ||
|
|
791
|
+
(hiddenAncestor && role === 'dialog')) {
|
|
792
|
+
return;
|
|
793
|
+
}
|
|
794
|
+
}
|
|
681
795
|
// Skip elements inside code blocks (e.g. syntax highlighting spans)
|
|
682
796
|
if (el.closest('pre, code')) {
|
|
683
797
|
return;
|
|
@@ -688,16 +802,12 @@ class Defuddle {
|
|
|
688
802
|
});
|
|
689
803
|
}
|
|
690
804
|
if (removePartial) {
|
|
691
|
-
// Pre-compile regexes
|
|
692
|
-
const combinedPattern = constants_1.PARTIAL_SELECTORS.join('|');
|
|
693
|
-
const partialRegex = new RegExp(combinedPattern, 'i');
|
|
694
|
-
// Pre-compile individual regexes for debug pattern identification
|
|
805
|
+
// Pre-compile individual regexes for debug pattern identification only
|
|
695
806
|
const individualRegexes = this.debug
|
|
696
807
|
? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') }))
|
|
697
808
|
: null;
|
|
698
|
-
//
|
|
699
|
-
const
|
|
700
|
-
const allElements = doc.querySelectorAll(attributeSelector);
|
|
809
|
+
// Use pre-built attribute selector for elements we care about
|
|
810
|
+
const allElements = doc.querySelectorAll(constants_1.TEST_ATTRIBUTES_SELECTOR);
|
|
701
811
|
// Process elements for partial matches
|
|
702
812
|
allElements.forEach(el => {
|
|
703
813
|
// Skip if already marked for removal
|
|
@@ -707,13 +817,13 @@ class Defuddle {
|
|
|
707
817
|
// Skip code elements and elements containing code blocks
|
|
708
818
|
// where class names indicate language/syntax, not page structure
|
|
709
819
|
const tag = el.tagName;
|
|
710
|
-
if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre')) {
|
|
820
|
+
if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre') || el.closest('code, pre')) {
|
|
711
821
|
return;
|
|
712
822
|
}
|
|
713
823
|
// Get all relevant attributes and combine into a single string
|
|
714
824
|
const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
|
|
715
825
|
if (attr === 'class') {
|
|
716
|
-
return
|
|
826
|
+
return (0, dom_1.getClassName)(el);
|
|
717
827
|
}
|
|
718
828
|
if (attr === 'id') {
|
|
719
829
|
return el.id || '';
|
|
@@ -725,7 +835,7 @@ class Defuddle {
|
|
|
725
835
|
return;
|
|
726
836
|
}
|
|
727
837
|
// Check for partial match using single regex test
|
|
728
|
-
if (
|
|
838
|
+
if (constants_1.PARTIAL_SELECTORS_REGEX.test(attrs)) {
|
|
729
839
|
const matchedPattern = individualRegexes
|
|
730
840
|
? individualRegexes.find(r => r.regex.test(attrs))?.pattern
|
|
731
841
|
: undefined;
|
|
@@ -787,8 +897,8 @@ class Defuddle {
|
|
|
787
897
|
const attrHeight = parseInt(element.getAttribute('height') || '0');
|
|
788
898
|
// Check inline style dimensions
|
|
789
899
|
const style = element.getAttribute('style') || '';
|
|
790
|
-
const styleWidth = parseInt(style.match(
|
|
791
|
-
const styleHeight = parseInt(style.match(
|
|
900
|
+
const styleWidth = parseInt(style.match(STYLE_WIDTH_PATTERN)?.[1] || '0');
|
|
901
|
+
const styleHeight = parseInt(style.match(STYLE_HEIGHT_PATTERN)?.[1] || '0');
|
|
792
902
|
// Use getComputedStyle and getBoundingClientRect only in browser
|
|
793
903
|
let computedWidth = 0, computedHeight = 0;
|
|
794
904
|
if (isBrowser) {
|
|
@@ -856,7 +966,7 @@ class Defuddle {
|
|
|
856
966
|
return `srcset:${dataSrcset}`;
|
|
857
967
|
}
|
|
858
968
|
const id = element.id || '';
|
|
859
|
-
const className = element
|
|
969
|
+
const className = (0, dom_1.getClassName)(element);
|
|
860
970
|
const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
|
|
861
971
|
if (id)
|
|
862
972
|
return `id:${id}`;
|
|
@@ -912,7 +1022,7 @@ class Defuddle {
|
|
|
912
1022
|
let best = top;
|
|
913
1023
|
for (let i = 1; i < candidates.length; i++) {
|
|
914
1024
|
const child = candidates[i];
|
|
915
|
-
const childWords = (child.element.textContent || '')
|
|
1025
|
+
const childWords = (0, utils_1.countWords)(child.element.textContent || '');
|
|
916
1026
|
if (child.selectorIndex < best.selectorIndex && best.element.contains(child.element) && childWords > 50) {
|
|
917
1027
|
// Count how many candidates share this selector index inside
|
|
918
1028
|
// the top element. Use top (not best) as the stable reference
|
|
@@ -972,8 +1082,8 @@ class Defuddle {
|
|
|
972
1082
|
if (current.id) {
|
|
973
1083
|
selector += '#' + current.id;
|
|
974
1084
|
}
|
|
975
|
-
else if (
|
|
976
|
-
selector += '.' + current.
|
|
1085
|
+
else if ((0, dom_1.getClassName)(current)) {
|
|
1086
|
+
selector += '.' + (0, dom_1.getClassName)(current).trim().split(/\s+/).join('.');
|
|
977
1087
|
}
|
|
978
1088
|
parts.unshift(selector);
|
|
979
1089
|
current = current.parentElement;
|
|
@@ -987,15 +1097,35 @@ class Defuddle {
|
|
|
987
1097
|
* Resolve relative URLs to absolute within a DOM element
|
|
988
1098
|
*/
|
|
989
1099
|
resolveRelativeUrls(element) {
|
|
990
|
-
const
|
|
991
|
-
if (!
|
|
1100
|
+
const docUrl = this.options.url || this.doc.URL;
|
|
1101
|
+
if (!docUrl)
|
|
992
1102
|
return;
|
|
1103
|
+
// Respect <base href> for relative URL resolution, matching browser behavior
|
|
1104
|
+
let baseUrl = docUrl;
|
|
1105
|
+
const baseEl = this.doc.querySelector('base[href]');
|
|
1106
|
+
if (baseEl) {
|
|
1107
|
+
const baseHref = baseEl.getAttribute('href');
|
|
1108
|
+
if (baseHref) {
|
|
1109
|
+
try {
|
|
1110
|
+
baseUrl = new URL(baseHref, docUrl).href;
|
|
1111
|
+
}
|
|
1112
|
+
catch {
|
|
1113
|
+
// Invalid base href, fall back to document URL
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
}
|
|
993
1117
|
const resolve = (url) => {
|
|
1118
|
+
// Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
|
|
1119
|
+
// Normalize these before URL resolution.
|
|
1120
|
+
const normalized = url
|
|
1121
|
+
.trim()
|
|
1122
|
+
.replace(/^\\?["']+/, '')
|
|
1123
|
+
.replace(/\\?["']+$/, '');
|
|
994
1124
|
try {
|
|
995
|
-
return new URL(
|
|
1125
|
+
return new URL(normalized, baseUrl).href;
|
|
996
1126
|
}
|
|
997
1127
|
catch {
|
|
998
|
-
return url;
|
|
1128
|
+
return normalized || url;
|
|
999
1129
|
}
|
|
1000
1130
|
};
|
|
1001
1131
|
element.querySelectorAll('[href]').forEach(el => {
|
|
@@ -1051,6 +1181,8 @@ class Defuddle {
|
|
|
1051
1181
|
* Walks both trees in parallel so positional correspondence is exact.
|
|
1052
1182
|
*/
|
|
1053
1183
|
flattenShadowRoots(original, clone) {
|
|
1184
|
+
if (!original.body || !clone.body)
|
|
1185
|
+
return;
|
|
1054
1186
|
const origElements = Array.from(original.body.querySelectorAll('*'));
|
|
1055
1187
|
// Find the first element with a shadow root (also serves as the hasShadowRoots check)
|
|
1056
1188
|
const firstShadow = origElements.find(el => el.shadowRoot);
|
|
@@ -1268,7 +1400,7 @@ class Defuddle {
|
|
|
1268
1400
|
author: extracted.variables?.author || metadata.author,
|
|
1269
1401
|
site: extracted.variables?.site || metadata.site,
|
|
1270
1402
|
schemaOrgData: metadata.schemaOrgData,
|
|
1271
|
-
wordCount: this.
|
|
1403
|
+
wordCount: this.countHtmlWords(extracted.contentHtml),
|
|
1272
1404
|
parseTime: Math.round(Date.now() - startTime),
|
|
1273
1405
|
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
|
|
1274
1406
|
metaTags: pageMetaTags,
|
|
@@ -1307,7 +1439,7 @@ class Defuddle {
|
|
|
1307
1439
|
if (el.closest('pre') || el.closest('code'))
|
|
1308
1440
|
continue;
|
|
1309
1441
|
const text = el.textContent?.trim() || '';
|
|
1310
|
-
const words =
|
|
1442
|
+
const words = (0, utils_1.countWords)(text);
|
|
1311
1443
|
// Match date + read time in short elements
|
|
1312
1444
|
if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
|
|
1313
1445
|
// Ensure this is a leaf-ish element, not a large container
|
|
@@ -1361,7 +1493,7 @@ class Defuddle {
|
|
|
1361
1493
|
break;
|
|
1362
1494
|
}
|
|
1363
1495
|
const text = target.textContent?.trim() || '';
|
|
1364
|
-
const words =
|
|
1496
|
+
const words = (0, utils_1.countWords)(text);
|
|
1365
1497
|
if (words > 10)
|
|
1366
1498
|
continue;
|
|
1367
1499
|
// Check if this element is near the start or end of mainContent
|
|
@@ -1378,12 +1510,78 @@ class Defuddle {
|
|
|
1378
1510
|
}
|
|
1379
1511
|
target.remove();
|
|
1380
1512
|
}
|
|
1513
|
+
// Remove blog post metadata lists near content boundaries.
|
|
1514
|
+
// These are short <ul>/<ol> elements where every item is a brief
|
|
1515
|
+
// label + value pair (date, reading time, share, etc.) with no
|
|
1516
|
+
// prose sentences. Detected structurally: all items are very short,
|
|
1517
|
+
// none contain sentence-ending punctuation, and the total text is minimal.
|
|
1518
|
+
const metadataLists = mainContent.querySelectorAll('ul, ol');
|
|
1519
|
+
for (const list of metadataLists) {
|
|
1520
|
+
if (!list.parentNode)
|
|
1521
|
+
continue;
|
|
1522
|
+
const items = Array.from(list.children).filter(el => el.tagName === 'LI');
|
|
1523
|
+
if (items.length < 2 || items.length > 8)
|
|
1524
|
+
continue;
|
|
1525
|
+
// Must be near the start or end of content
|
|
1526
|
+
const listText = list.textContent?.trim() || '';
|
|
1527
|
+
const listPos = contentText.indexOf(listText);
|
|
1528
|
+
const distFromEnd = contentText.length - (listPos + listText.length);
|
|
1529
|
+
if (listPos > 500 && distFromEnd > 500)
|
|
1530
|
+
continue;
|
|
1531
|
+
// Skip lists introduced by a preceding paragraph (e.g. "Features include:")
|
|
1532
|
+
// — those are content lists, not standalone metadata
|
|
1533
|
+
const prevSibling = list.previousElementSibling;
|
|
1534
|
+
if (prevSibling) {
|
|
1535
|
+
const prevText = prevSibling.textContent?.trim() || '';
|
|
1536
|
+
if (prevText.endsWith(':'))
|
|
1537
|
+
continue;
|
|
1538
|
+
}
|
|
1539
|
+
// Every item must be very short (label + value) with no prose
|
|
1540
|
+
let isMetadata = true;
|
|
1541
|
+
for (const item of items) {
|
|
1542
|
+
const text = item.textContent?.trim() || '';
|
|
1543
|
+
const words = (0, utils_1.countWords)(text);
|
|
1544
|
+
if (words > 8) {
|
|
1545
|
+
isMetadata = false;
|
|
1546
|
+
break;
|
|
1547
|
+
}
|
|
1548
|
+
// Prose has sentence-ending punctuation; metadata doesn't
|
|
1549
|
+
if (/[.!?]$/.test(text)) {
|
|
1550
|
+
isMetadata = false;
|
|
1551
|
+
break;
|
|
1552
|
+
}
|
|
1553
|
+
}
|
|
1554
|
+
if (!isMetadata)
|
|
1555
|
+
continue;
|
|
1556
|
+
// Total text should be very short — this is metadata, not content
|
|
1557
|
+
if ((0, utils_1.countWords)(listText) > 30)
|
|
1558
|
+
continue;
|
|
1559
|
+
// Walk up to find the container to remove (e.g. a wrapper div)
|
|
1560
|
+
let target = list;
|
|
1561
|
+
while (target.parentElement && target.parentElement !== mainContent) {
|
|
1562
|
+
const parentText = target.parentElement.textContent?.trim() || '';
|
|
1563
|
+
if (parentText !== listText)
|
|
1564
|
+
break;
|
|
1565
|
+
target = target.parentElement;
|
|
1566
|
+
}
|
|
1567
|
+
if (this.debug && debugRemovals) {
|
|
1568
|
+
debugRemovals.push({
|
|
1569
|
+
step: 'removeByContentPattern',
|
|
1570
|
+
reason: 'blog metadata list',
|
|
1571
|
+
text: (0, utils_1.textPreview)(target)
|
|
1572
|
+
});
|
|
1573
|
+
}
|
|
1574
|
+
target.remove();
|
|
1575
|
+
}
|
|
1381
1576
|
// Remove section breadcrumbs
|
|
1382
1577
|
// Short elements containing a link to a parent section of the current URL.
|
|
1383
1578
|
const url = this.options.url || this.doc.URL || '';
|
|
1384
1579
|
let urlPath = '';
|
|
1580
|
+
let pageHost = '';
|
|
1385
1581
|
try {
|
|
1386
|
-
|
|
1582
|
+
const parsedUrl = new URL(url);
|
|
1583
|
+
urlPath = parsedUrl.pathname;
|
|
1584
|
+
pageHost = parsedUrl.hostname.replace(/^www\./, '');
|
|
1387
1585
|
}
|
|
1388
1586
|
catch { }
|
|
1389
1587
|
if (urlPath) {
|
|
@@ -1392,7 +1590,7 @@ class Defuddle {
|
|
|
1392
1590
|
if (!el.parentNode)
|
|
1393
1591
|
continue;
|
|
1394
1592
|
const text = el.textContent?.trim() || '';
|
|
1395
|
-
const words =
|
|
1593
|
+
const words = (0, utils_1.countWords)(text);
|
|
1396
1594
|
if (words > 10)
|
|
1397
1595
|
continue;
|
|
1398
1596
|
// Must be a leaf-ish element (no block children)
|
|
@@ -1417,6 +1615,126 @@ class Defuddle {
|
|
|
1417
1615
|
catch { }
|
|
1418
1616
|
}
|
|
1419
1617
|
}
|
|
1618
|
+
// Remove trailing external link lists — a heading + list of purely
|
|
1619
|
+
// off-site links as the last content block (affiliate picks, product
|
|
1620
|
+
// roundups, etc.). Only removed when nothing meaningful follows.
|
|
1621
|
+
if (pageHost) {
|
|
1622
|
+
const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6');
|
|
1623
|
+
for (const heading of headings) {
|
|
1624
|
+
if (!heading.parentNode)
|
|
1625
|
+
continue;
|
|
1626
|
+
const list = heading.nextElementSibling;
|
|
1627
|
+
if (!list || (list.tagName !== 'UL' && list.tagName !== 'OL'))
|
|
1628
|
+
continue;
|
|
1629
|
+
const items = Array.from(list.children).filter(el => el.tagName === 'LI');
|
|
1630
|
+
if (items.length < 2)
|
|
1631
|
+
continue;
|
|
1632
|
+
// The list must be the last meaningful block — nothing after it
|
|
1633
|
+
// except whitespace or empty elements. Walk up through ancestors
|
|
1634
|
+
// to check siblings at each level up to mainContent.
|
|
1635
|
+
let trailingContent = false;
|
|
1636
|
+
let checkEl = list;
|
|
1637
|
+
while (checkEl && checkEl !== mainContent) {
|
|
1638
|
+
let sibling = checkEl.nextElementSibling;
|
|
1639
|
+
while (sibling) {
|
|
1640
|
+
if ((sibling.textContent?.trim() || '').length > 0) {
|
|
1641
|
+
trailingContent = true;
|
|
1642
|
+
break;
|
|
1643
|
+
}
|
|
1644
|
+
sibling = sibling.nextElementSibling;
|
|
1645
|
+
}
|
|
1646
|
+
if (trailingContent)
|
|
1647
|
+
break;
|
|
1648
|
+
checkEl = checkEl.parentElement;
|
|
1649
|
+
}
|
|
1650
|
+
if (trailingContent)
|
|
1651
|
+
continue;
|
|
1652
|
+
// Every list item must be primarily a link pointing off-site
|
|
1653
|
+
let allExternalLinks = true;
|
|
1654
|
+
for (const item of items) {
|
|
1655
|
+
const links = item.querySelectorAll('a[href]');
|
|
1656
|
+
if (links.length === 0) {
|
|
1657
|
+
allExternalLinks = false;
|
|
1658
|
+
break;
|
|
1659
|
+
}
|
|
1660
|
+
const itemText = item.textContent?.trim() || '';
|
|
1661
|
+
let linkTextLen = 0;
|
|
1662
|
+
for (const link of links) {
|
|
1663
|
+
linkTextLen += (link.textContent?.trim() || '').length;
|
|
1664
|
+
try {
|
|
1665
|
+
const linkHost = new URL(link.getAttribute('href') || '', url).hostname.replace(/^www\./, '');
|
|
1666
|
+
if (linkHost === pageHost) {
|
|
1667
|
+
allExternalLinks = false;
|
|
1668
|
+
break;
|
|
1669
|
+
}
|
|
1670
|
+
}
|
|
1671
|
+
catch { }
|
|
1672
|
+
}
|
|
1673
|
+
if (!allExternalLinks)
|
|
1674
|
+
break;
|
|
1675
|
+
if (linkTextLen < itemText.length * 0.6) {
|
|
1676
|
+
allExternalLinks = false;
|
|
1677
|
+
break;
|
|
1678
|
+
}
|
|
1679
|
+
}
|
|
1680
|
+
if (!allExternalLinks)
|
|
1681
|
+
continue;
|
|
1682
|
+
if (this.debug && debugRemovals) {
|
|
1683
|
+
debugRemovals.push({
|
|
1684
|
+
step: 'removeByContentPattern',
|
|
1685
|
+
reason: 'trailing external link list',
|
|
1686
|
+
text: (0, utils_1.textPreview)(heading)
|
|
1687
|
+
});
|
|
1688
|
+
debugRemovals.push({
|
|
1689
|
+
step: 'removeByContentPattern',
|
|
1690
|
+
reason: 'trailing external link list',
|
|
1691
|
+
text: (0, utils_1.textPreview)(list)
|
|
1692
|
+
});
|
|
1693
|
+
}
|
|
1694
|
+
list.remove();
|
|
1695
|
+
heading.remove();
|
|
1696
|
+
}
|
|
1697
|
+
}
|
|
1698
|
+
// Remove trailing thin sections — the last few direct children of
|
|
1699
|
+
// mainContent that contain a heading but very little prose. These are
|
|
1700
|
+
// typically CTAs, newsletter prompts, or promotional sections that
|
|
1701
|
+
// have been partially stripped by prior removal steps.
|
|
1702
|
+
const totalWords = (0, utils_1.countWords)(mainContent.textContent || '');
|
|
1703
|
+
if (totalWords > 300) {
|
|
1704
|
+
// Walk backwards from the last direct child of mainContent,
|
|
1705
|
+
// collecting trailing elements that are thin (empty or very short prose).
|
|
1706
|
+
// Exclude SVG text (path data) from word counts — it's not prose.
|
|
1707
|
+
const trailingEls = [];
|
|
1708
|
+
let trailingWords = 0;
|
|
1709
|
+
let child = mainContent.lastElementChild;
|
|
1710
|
+
while (child) {
|
|
1711
|
+
// Count prose words, excluding SVG path data which inflates word counts
|
|
1712
|
+
let svgWords = 0;
|
|
1713
|
+
for (const svg of child.querySelectorAll('svg')) {
|
|
1714
|
+
svgWords += (0, utils_1.countWords)(svg.textContent || '');
|
|
1715
|
+
}
|
|
1716
|
+
const words = (0, utils_1.countWords)(child.textContent?.trim() || '') - svgWords;
|
|
1717
|
+
if (words > 25)
|
|
1718
|
+
break;
|
|
1719
|
+
trailingWords += words;
|
|
1720
|
+
trailingEls.push(child);
|
|
1721
|
+
child = child.previousElementSibling;
|
|
1722
|
+
}
|
|
1723
|
+
// Must have a heading in the trailing elements and total < 15% of content.
|
|
1724
|
+
// Skip if trailing elements contain content indicators (math, code, tables, images).
|
|
1725
|
+
if (trailingEls.length >= 1 && trailingWords < totalWords * 0.15) {
|
|
1726
|
+
const hasHeading = trailingEls.some(el => /^H[1-6]$/.test(el.tagName) || el.querySelector('h1, h2, h3, h4, h5, h6'));
|
|
1727
|
+
const hasContent = trailingEls.some(el => el.querySelector(constants_1.CONTENT_ELEMENT_SELECTOR));
|
|
1728
|
+
if (hasHeading && !hasContent) {
|
|
1729
|
+
for (const el of trailingEls) {
|
|
1730
|
+
if (this.debug && debugRemovals) {
|
|
1731
|
+
debugRemovals.push({ step: 'removeByContentPattern', reason: 'trailing thin section', text: (0, utils_1.textPreview)(el) });
|
|
1732
|
+
}
|
|
1733
|
+
el.remove();
|
|
1734
|
+
}
|
|
1735
|
+
}
|
|
1736
|
+
}
|
|
1737
|
+
}
|
|
1420
1738
|
// Remove boilerplate sentences and trailing non-content.
|
|
1421
1739
|
// Search elements for end-of-article boilerplate, then truncate
|
|
1422
1740
|
// from the best ancestor that has siblings to remove.
|
|
@@ -1426,7 +1744,7 @@ class Defuddle {
|
|
|
1426
1744
|
if (!el.parentNode)
|
|
1427
1745
|
continue;
|
|
1428
1746
|
const text = el.textContent?.trim() || '';
|
|
1429
|
-
const words =
|
|
1747
|
+
const words = (0, utils_1.countWords)(text);
|
|
1430
1748
|
if (words > 50 || words < 3)
|
|
1431
1749
|
continue;
|
|
1432
1750
|
for (const pattern of BOILERPLATE_PATTERNS) {
|