defuddle 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.d.ts +0 -2
- package/dist/constants.js +39 -10
- package/dist/constants.js.map +1 -1
- package/dist/defuddle.d.ts +1 -20
- package/dist/defuddle.js +151 -807
- package/dist/defuddle.js.map +1 -1
- package/dist/elements/code.js +76 -11
- package/dist/elements/code.js.map +1 -1
- package/dist/elements/footnotes.js +420 -45
- package/dist/elements/footnotes.js.map +1 -1
- package/dist/elements/headings.js +5 -0
- package/dist/elements/headings.js.map +1 -1
- package/dist/elements/math.base.d.ts +1 -0
- package/dist/elements/math.base.js +4 -1
- package/dist/elements/math.base.js.map +1 -1
- package/dist/elements/math.core.d.ts +1 -0
- package/dist/elements/math.d.ts +1 -1
- package/dist/elements/math.full.d.ts +1 -0
- package/dist/elements/math.full.js +90 -0
- package/dist/elements/math.full.js.map +1 -0
- package/dist/elements/math.js +3 -3
- package/dist/extractor-registry.js +20 -0
- package/dist/extractor-registry.js.map +1 -1
- package/dist/extractors/bbcode-data.d.ts +10 -0
- package/dist/extractors/bbcode-data.js +59 -0
- package/dist/extractors/bbcode-data.js.map +1 -0
- package/dist/extractors/c2-wiki.d.ts +15 -0
- package/dist/extractors/c2-wiki.js +143 -0
- package/dist/extractors/c2-wiki.js.map +1 -0
- package/dist/extractors/reddit.d.ts +1 -0
- package/dist/extractors/reddit.js +14 -14
- package/dist/extractors/reddit.js.map +1 -1
- package/dist/extractors/substack.d.ts +17 -0
- package/dist/extractors/substack.js +188 -0
- package/dist/extractors/substack.js.map +1 -0
- package/dist/extractors/x-article.d.ts +1 -0
- package/dist/extractors/x-article.js +27 -2
- package/dist/extractors/x-article.js.map +1 -1
- package/dist/extractors/x-oembed.js +1 -1
- package/dist/extractors/x-oembed.js.map +1 -1
- package/dist/extractors/youtube.d.ts +9 -2
- package/dist/extractors/youtube.js +161 -29
- package/dist/extractors/youtube.js.map +1 -1
- package/dist/fetch.js +183 -14
- package/dist/fetch.js.map +1 -1
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/markdown.js +27 -2
- package/dist/markdown.js.map +1 -1
- package/dist/metadata.d.ts +4 -3
- package/dist/metadata.js +195 -41
- package/dist/metadata.js.map +1 -1
- package/dist/node.d.ts +1 -1
- package/dist/node.js +3 -6
- package/dist/node.js.map +1 -1
- package/dist/removals/content-patterns.d.ts +2 -0
- package/dist/removals/content-patterns.js +835 -0
- package/dist/removals/content-patterns.js.map +1 -0
- package/dist/removals/hidden.d.ts +2 -0
- package/dist/removals/hidden.js +78 -0
- package/dist/removals/hidden.js.map +1 -0
- package/dist/removals/metadata-block.d.ts +8 -0
- package/dist/removals/metadata-block.js +40 -0
- package/dist/removals/metadata-block.js.map +1 -0
- package/dist/{scoring.d.ts → removals/scoring.d.ts} +1 -1
- package/dist/{scoring.js → removals/scoring.js} +7 -9
- package/dist/removals/scoring.js.map +1 -0
- package/dist/removals/selectors.d.ts +2 -0
- package/dist/removals/selectors.js +118 -0
- package/dist/removals/selectors.js.map +1 -0
- package/dist/removals/small-images.d.ts +3 -0
- package/dist/removals/small-images.js +116 -0
- package/dist/removals/small-images.js.map +1 -0
- package/dist/standardize.d.ts +2 -1
- package/dist/standardize.js +106 -62
- package/dist/standardize.js.map +1 -1
- package/dist/types/extractors.d.ts +1 -0
- package/dist/types.d.ts +5 -0
- package/dist/utils/bbcode.d.ts +6 -0
- package/dist/utils/bbcode.js +57 -0
- package/dist/utils/bbcode.js.map +1 -0
- package/dist/utils.js +1 -1
- package/dist/utils.js.map +1 -1
- package/package.json +1 -1
- package/dist/elements/math.core.js +0 -52
- package/dist/elements/math.core.js.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/scoring.js.map +0 -1
package/dist/defuddle.js
CHANGED
|
@@ -2,34 +2,21 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.Defuddle = void 0;
|
|
4
4
|
const metadata_1 = require("./metadata");
|
|
5
|
-
const headings_1 = require("./elements/headings");
|
|
6
5
|
const extractor_registry_1 = require("./extractor-registry");
|
|
7
6
|
const constants_1 = require("./constants");
|
|
8
7
|
const standardize_1 = require("./standardize");
|
|
9
8
|
const footnotes_1 = require("./elements/footnotes");
|
|
10
9
|
const callouts_1 = require("./elements/callouts");
|
|
11
|
-
const scoring_1 = require("./scoring");
|
|
10
|
+
const scoring_1 = require("./removals/scoring");
|
|
11
|
+
const small_images_1 = require("./removals/small-images");
|
|
12
|
+
const hidden_1 = require("./removals/hidden");
|
|
13
|
+
const selectors_1 = require("./removals/selectors");
|
|
14
|
+
const content_patterns_1 = require("./removals/content-patterns");
|
|
15
|
+
const metadata_block_1 = require("./removals/metadata-block");
|
|
12
16
|
const utils_1 = require("./utils");
|
|
13
17
|
const dom_1 = require("./utils/dom");
|
|
14
18
|
/** Keys from extractor variables that map to top-level DefuddleResponse fields */
|
|
15
19
|
const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
|
|
16
|
-
// Content pattern detection constants
|
|
17
|
-
const STYLE_WIDTH_PATTERN = /width\s*:\s*(\d+)/;
|
|
18
|
-
const STYLE_HEIGHT_PATTERN = /height\s*:\s*(\d+)/;
|
|
19
|
-
const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
|
|
20
|
-
const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
|
|
21
|
-
const BOILERPLATE_PATTERNS = [
|
|
22
|
-
/^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
|
|
23
|
-
/^A version of this (?:article|story) (?:appeared|was published) in\b/i,
|
|
24
|
-
/^Originally (?:published|appeared) (?:in|on|at)\b/i,
|
|
25
|
-
];
|
|
26
|
-
const METADATA_STRIP_PATTERNS = [
|
|
27
|
-
/\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi,
|
|
28
|
-
/\b\d+(?:st|nd|rd|th)?\b/g,
|
|
29
|
-
/\bmin(?:ute)?s?\b/gi,
|
|
30
|
-
/\bread\b/gi,
|
|
31
|
-
/[|·•—–\-,.\s]/g,
|
|
32
|
-
];
|
|
33
20
|
class Defuddle {
|
|
34
21
|
/**
|
|
35
22
|
* Create a new Defuddle instance
|
|
@@ -39,6 +26,7 @@ class Defuddle {
|
|
|
39
26
|
constructor(doc, options = {}) {
|
|
40
27
|
this._schemaOrgData = undefined;
|
|
41
28
|
this._schemaOrgExtracted = false;
|
|
29
|
+
this._inExtractorPipelineRun = false;
|
|
42
30
|
this.doc = doc;
|
|
43
31
|
this.options = options;
|
|
44
32
|
this.debug = options.debug || false;
|
|
@@ -121,20 +109,23 @@ class Defuddle {
|
|
|
121
109
|
}
|
|
122
110
|
}
|
|
123
111
|
// Strip dangerous elements from this.doc before any fallback paths
|
|
124
|
-
// that read from it
|
|
125
|
-
//
|
|
126
|
-
// for schema.org extraction, site-specific extractors, and math.
|
|
112
|
+
// that read from it. This must happen after parseInternal, which needs
|
|
113
|
+
// script tags for schema.org extraction, site-specific extractors, and math.
|
|
127
114
|
this._stripUnsafeElements();
|
|
128
|
-
// If schema.org has
|
|
129
|
-
//
|
|
130
|
-
//
|
|
115
|
+
// If schema.org has text content that is significantly longer than what we
|
|
116
|
+
// extracted, the scorer likely picked the wrong element from a feed page.
|
|
117
|
+
// Use a 1.5x threshold to avoid triggering when the difference is small
|
|
118
|
+
// (e.g. just related-content link text removed).
|
|
131
119
|
const schemaText = this._getSchemaText(result.schemaOrgData);
|
|
132
|
-
if (schemaText && this.countHtmlWords(schemaText) > result.wordCount) {
|
|
133
|
-
const
|
|
134
|
-
if (
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
120
|
+
if (schemaText && this.countHtmlWords(schemaText) > result.wordCount * 1.5) {
|
|
121
|
+
const bestMatch = this._findElementBySchemaText(this.doc.body, schemaText);
|
|
122
|
+
if (bestMatch) {
|
|
123
|
+
// Re-run the full pipeline with the schema-identified element as the
|
|
124
|
+
// content root so it benefits from the same cleanup as normal extraction.
|
|
125
|
+
const selector = this.getElementSelector(bestMatch);
|
|
126
|
+
this._log('Schema.org suggests a better content element, retrying with selector:', selector);
|
|
127
|
+
const schemaRetry = this.parseInternal({ contentSelector: selector });
|
|
128
|
+
result = schemaRetry;
|
|
138
129
|
}
|
|
139
130
|
else {
|
|
140
131
|
this._log('Using schema.org text as content (DOM element not found)');
|
|
@@ -238,62 +229,6 @@ class Defuddle {
|
|
|
238
229
|
}
|
|
239
230
|
return bestMatch;
|
|
240
231
|
}
|
|
241
|
-
/**
|
|
242
|
-
* Find a DOM element whose text matches the schema.org text content.
|
|
243
|
-
* Used when the content scorer picked the wrong element from a feed page.
|
|
244
|
-
* Returns the element's inner HTML including sibling media (images, etc.)
|
|
245
|
-
*/
|
|
246
|
-
_findContentBySchemaText(schemaText) {
|
|
247
|
-
const body = this.doc.body;
|
|
248
|
-
if (!body)
|
|
249
|
-
return '';
|
|
250
|
-
const bestMatch = this._findElementBySchemaText(body, schemaText);
|
|
251
|
-
if (!bestMatch)
|
|
252
|
-
return '';
|
|
253
|
-
// Read the largest sibling image src BEFORE resolveRelativeUrls
|
|
254
|
-
// can mangle comma-containing CDN URLs in srcset attributes
|
|
255
|
-
let imageSrc = '';
|
|
256
|
-
let imageAlt = '';
|
|
257
|
-
const parent = bestMatch.parentElement;
|
|
258
|
-
if (parent && parent !== body) {
|
|
259
|
-
const images = parent.querySelectorAll('img');
|
|
260
|
-
let largestImg = null;
|
|
261
|
-
let largestArea = 0;
|
|
262
|
-
for (const img of images) {
|
|
263
|
-
if (bestMatch.contains(img))
|
|
264
|
-
continue;
|
|
265
|
-
const w = parseInt(img.getAttribute('width') || '0', 10);
|
|
266
|
-
const h = parseInt(img.getAttribute('height') || '0', 10);
|
|
267
|
-
const area = w * h;
|
|
268
|
-
if (area > largestArea) {
|
|
269
|
-
largestArea = area;
|
|
270
|
-
largestImg = img;
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
if (largestImg) {
|
|
274
|
-
imageSrc = this._getLargestImageSrc(largestImg);
|
|
275
|
-
imageAlt = largestImg.getAttribute('alt') || '';
|
|
276
|
-
try {
|
|
277
|
-
const baseUrl = this.options.url || this.doc.URL;
|
|
278
|
-
if (baseUrl)
|
|
279
|
-
imageSrc = new URL(imageSrc, baseUrl).href;
|
|
280
|
-
}
|
|
281
|
-
catch { }
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
// Remove heading anchor links before serialization (e.g. <h2>Title<a href="#foo">#</a></h2>)
|
|
285
|
-
(0, headings_1.removeHeadingAnchors)(bestMatch);
|
|
286
|
-
// Now resolve URLs in the text content
|
|
287
|
-
this.resolveRelativeUrls(bestMatch);
|
|
288
|
-
let html = (0, dom_1.serializeHTML)(bestMatch);
|
|
289
|
-
if (imageSrc) {
|
|
290
|
-
const img = this.doc.createElement('img');
|
|
291
|
-
img.setAttribute('src', imageSrc);
|
|
292
|
-
img.setAttribute('alt', imageAlt);
|
|
293
|
-
html += img.outerHTML;
|
|
294
|
-
}
|
|
295
|
-
return html;
|
|
296
|
-
}
|
|
297
232
|
findLargestHiddenContentSelector() {
|
|
298
233
|
const body = this.doc.body;
|
|
299
234
|
if (!body)
|
|
@@ -412,6 +347,16 @@ class Defuddle {
|
|
|
412
347
|
*/
|
|
413
348
|
parseInternal(overrideOptions = {}) {
|
|
414
349
|
const startTime = Date.now();
|
|
350
|
+
const profile = {};
|
|
351
|
+
const doProfile = this.options.profile ?? false;
|
|
352
|
+
const profileStep = (name, fn) => {
|
|
353
|
+
if (!doProfile)
|
|
354
|
+
return fn();
|
|
355
|
+
const t = performance.now();
|
|
356
|
+
const result = fn();
|
|
357
|
+
profile[name] = Math.round(performance.now() - t);
|
|
358
|
+
return result;
|
|
359
|
+
};
|
|
415
360
|
// Guard against empty/broken documents (e.g. empty HTML, bot-blocked pages)
|
|
416
361
|
if (!this.doc.documentElement) {
|
|
417
362
|
const url = this.options.url || '';
|
|
@@ -465,10 +410,37 @@ class Defuddle {
|
|
|
465
410
|
includeReplies: options.includeReplies,
|
|
466
411
|
language: options.language,
|
|
467
412
|
};
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
413
|
+
if (!this._inExtractorPipelineRun) {
|
|
414
|
+
const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData, extractorOpts);
|
|
415
|
+
if (extractor && extractor.canExtract()) {
|
|
416
|
+
const extracted = extractor.extract();
|
|
417
|
+
if (extracted.contentSelector) {
|
|
418
|
+
this._inExtractorPipelineRun = true;
|
|
419
|
+
try {
|
|
420
|
+
const pipelineResult = this.parseInternal({
|
|
421
|
+
contentSelector: extracted.contentSelector,
|
|
422
|
+
removeLowScoring: false,
|
|
423
|
+
removeHiddenElements: false,
|
|
424
|
+
});
|
|
425
|
+
const variables = this.getExtractorVariables(extracted.variables);
|
|
426
|
+
return {
|
|
427
|
+
...pipelineResult,
|
|
428
|
+
title: extracted.variables?.title || pipelineResult.title,
|
|
429
|
+
description: extracted.variables?.description || pipelineResult.description,
|
|
430
|
+
author: extracted.variables?.author || pipelineResult.author,
|
|
431
|
+
published: extracted.variables?.published || pipelineResult.published,
|
|
432
|
+
site: extracted.variables?.site || pipelineResult.site,
|
|
433
|
+
language: extracted.variables?.language || pipelineResult.language,
|
|
434
|
+
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
|
|
435
|
+
...(variables ? { variables } : {}),
|
|
436
|
+
};
|
|
437
|
+
}
|
|
438
|
+
finally {
|
|
439
|
+
this._inExtractorPipelineRun = false;
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
|
|
443
|
+
}
|
|
472
444
|
}
|
|
473
445
|
// Continue if there is no extractor...
|
|
474
446
|
// Evaluate mobile styles and sizes on original document (cached across retries)
|
|
@@ -478,41 +450,47 @@ class Defuddle {
|
|
|
478
450
|
const mobileStyles = this._mobileStyles;
|
|
479
451
|
// Find small images in original document (cached across retries)
|
|
480
452
|
if (!this._smallImages) {
|
|
481
|
-
this._smallImages =
|
|
453
|
+
this._smallImages = (0, small_images_1.findSmallImages)(this.doc, this.debug);
|
|
482
454
|
}
|
|
483
455
|
const smallImages = this._smallImages;
|
|
484
456
|
// Clone document
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
457
|
+
let clone;
|
|
458
|
+
profileStep('cloneDocument', () => {
|
|
459
|
+
clone = this.doc.cloneNode(true);
|
|
460
|
+
// Merge adjacent text nodes that some DOM implementations (e.g. linkedom)
|
|
461
|
+
// create when parsing HTML entities like '
|
|
462
|
+
clone.body?.normalize();
|
|
463
|
+
});
|
|
489
464
|
// Flatten shadow DOM content into the clone
|
|
490
|
-
this.flattenShadowRoots(this.doc, clone);
|
|
465
|
+
profileStep('flattenShadowRoots', () => this.flattenShadowRoots(this.doc, clone));
|
|
491
466
|
// Resolve React streaming SSR suspense boundaries
|
|
492
|
-
this.resolveStreamedContent(clone);
|
|
467
|
+
profileStep('resolveStreamedContent', () => this.resolveStreamedContent(clone));
|
|
493
468
|
// Apply mobile styles to clone
|
|
494
|
-
this.applyMobileStyles(clone, mobileStyles);
|
|
469
|
+
profileStep('applyMobileStyles', () => this.applyMobileStyles(clone, mobileStyles));
|
|
495
470
|
// Find main content
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
471
|
+
const mainContent = profileStep('findMainContent', () => {
|
|
472
|
+
let found = null;
|
|
473
|
+
if (options.contentSelector) {
|
|
474
|
+
found = clone.querySelector(options.contentSelector);
|
|
475
|
+
this._log('Using contentSelector:', options.contentSelector, found ? 'found' : 'not found');
|
|
476
|
+
}
|
|
477
|
+
if (!found) {
|
|
478
|
+
found = this.findMainContent(clone);
|
|
479
|
+
}
|
|
480
|
+
// If we fell back to <body>, try using schema.org articleBody/text
|
|
481
|
+
// to find a more specific content element within the DOM.
|
|
482
|
+
if (found && found.tagName.toLowerCase() === 'body') {
|
|
483
|
+
const schemaText = this._getSchemaText(schemaOrgData);
|
|
484
|
+
if (schemaText) {
|
|
485
|
+
const schemaContent = this._findElementBySchemaText(clone.body, schemaText);
|
|
486
|
+
if (schemaContent) {
|
|
487
|
+
this._log('Found content element via schema.org text');
|
|
488
|
+
found = schemaContent;
|
|
489
|
+
}
|
|
513
490
|
}
|
|
514
491
|
}
|
|
515
|
-
|
|
492
|
+
return found;
|
|
493
|
+
});
|
|
516
494
|
if (!mainContent) {
|
|
517
495
|
const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
|
|
518
496
|
const endTime = Date.now();
|
|
@@ -524,43 +502,66 @@ class Defuddle {
|
|
|
524
502
|
metaTags: pageMetaTags
|
|
525
503
|
};
|
|
526
504
|
}
|
|
527
|
-
// Remove
|
|
528
|
-
//
|
|
529
|
-
|
|
505
|
+
// Remove h1-adjacent date/author metadata blocks from the content.
|
|
506
|
+
// These are extracted as frontmatter but also appear in the body when a
|
|
507
|
+
// wide container (e.g. <main>) is selected as the content element.
|
|
508
|
+
profileStep('removeMetadataBlock', () => {
|
|
509
|
+
if (metadata.published || metadata.author) {
|
|
510
|
+
(0, metadata_block_1.removeMetadataBlock)(mainContent);
|
|
511
|
+
}
|
|
512
|
+
// Remove <wbr> elements — word break opportunity hints that carry no
|
|
513
|
+
// content but cause unwanted whitespace during standardization.
|
|
514
|
+
mainContent.querySelectorAll('wbr').forEach(el => el.remove());
|
|
515
|
+
});
|
|
530
516
|
// Standardize footnotes before cleanup (CSS sidenotes use display:none)
|
|
531
|
-
|
|
532
|
-
(
|
|
533
|
-
|
|
534
|
-
|
|
517
|
+
profileStep('standardizeFootnotesCallouts', () => {
|
|
518
|
+
if (options.standardize) {
|
|
519
|
+
(0, footnotes_1.standardizeFootnotes)(mainContent);
|
|
520
|
+
(0, callouts_1.standardizeCallouts)(mainContent);
|
|
521
|
+
}
|
|
522
|
+
});
|
|
535
523
|
// Remove small images
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
524
|
+
profileStep('removeSmallImages', () => {
|
|
525
|
+
if (options.removeSmallImages) {
|
|
526
|
+
(0, small_images_1.removeSmallImages)(clone, smallImages, this.debug);
|
|
527
|
+
}
|
|
528
|
+
});
|
|
539
529
|
// Remove hidden elements using computed styles
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
530
|
+
profileStep('removeHiddenElements', () => {
|
|
531
|
+
if (options.removeHiddenElements) {
|
|
532
|
+
(0, hidden_1.removeHiddenElements)(clone, this.debug, debugRemovals);
|
|
533
|
+
}
|
|
534
|
+
});
|
|
543
535
|
// Remove clutter using selectors — deterministic removal of known
|
|
544
536
|
// non-content elements (nav, footer, .sidebar, etc.) by class/id.
|
|
545
537
|
// Runs before scoring so the heuristic scorer sees a cleaner DOM.
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
538
|
+
profileStep('removeBySelector', () => {
|
|
539
|
+
if (options.removeExactSelectors || options.removePartialSelectors) {
|
|
540
|
+
(0, selectors_1.removeBySelector)(clone, this.debug, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
|
|
541
|
+
}
|
|
542
|
+
});
|
|
549
543
|
// Remove non-content blocks by scoring — heuristic removal based
|
|
550
544
|
// on link density, text ratios, and navigation indicators.
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
545
|
+
profileStep('removeLowScoring', () => {
|
|
546
|
+
if (options.removeLowScoring) {
|
|
547
|
+
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
|
|
548
|
+
}
|
|
549
|
+
});
|
|
554
550
|
// Remove elements by content patterns (read time, boilerplate, article cards)
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
551
|
+
profileStep('removeByContentPattern', () => {
|
|
552
|
+
if (options.removeContentPatterns && mainContent) {
|
|
553
|
+
const url = this.options.url || this.doc.URL || '';
|
|
554
|
+
(0, content_patterns_1.removeByContentPattern)(mainContent, this.debug, url, debugRemovals);
|
|
555
|
+
}
|
|
556
|
+
});
|
|
558
557
|
// Normalize the main content
|
|
559
|
-
|
|
560
|
-
(
|
|
561
|
-
|
|
558
|
+
profileStep('standardizeContent', () => {
|
|
559
|
+
if (options.standardize) {
|
|
560
|
+
(0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug, doProfile ? profile : undefined);
|
|
561
|
+
}
|
|
562
|
+
});
|
|
562
563
|
// Resolve relative URLs to absolute
|
|
563
|
-
this.resolveRelativeUrls(mainContent);
|
|
564
|
+
profileStep('resolveRelativeUrls', () => this.resolveRelativeUrls(mainContent));
|
|
564
565
|
const content = mainContent.outerHTML;
|
|
565
566
|
const endTime = Date.now();
|
|
566
567
|
const result = {
|
|
@@ -576,6 +577,9 @@ class Defuddle {
|
|
|
576
577
|
removals: debugRemovals
|
|
577
578
|
};
|
|
578
579
|
}
|
|
580
|
+
if (this.options.profile) {
|
|
581
|
+
result.profile = profile;
|
|
582
|
+
}
|
|
579
583
|
return result;
|
|
580
584
|
}
|
|
581
585
|
catch (error) {
|
|
@@ -700,282 +704,6 @@ class Defuddle {
|
|
|
700
704
|
image.remove();
|
|
701
705
|
});
|
|
702
706
|
}
|
|
703
|
-
removeHiddenElements(doc, debugRemovals) {
|
|
704
|
-
let count = 0;
|
|
705
|
-
const elementsToRemove = new Map();
|
|
706
|
-
// Check inline styles and CSS class-based hidden patterns.
|
|
707
|
-
const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i;
|
|
708
|
-
// Only use getComputedStyle in browser environments where it's meaningful.
|
|
709
|
-
// In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable.
|
|
710
|
-
const defaultView = doc.defaultView;
|
|
711
|
-
const isBrowser = typeof window !== 'undefined' && defaultView === window;
|
|
712
|
-
const allElements = doc.querySelectorAll('*');
|
|
713
|
-
for (const element of allElements) {
|
|
714
|
-
// Skip elements that contain math — sites like Wikipedia wrap MathML
|
|
715
|
-
// in display:none spans for accessibility (the visible version is an
|
|
716
|
-
// image/SVG fallback). We need to preserve these for math extraction.
|
|
717
|
-
if (element.querySelector('math, [data-mathml], .katex-mathml') ||
|
|
718
|
-
element.tagName.toLowerCase() === 'math') {
|
|
719
|
-
continue;
|
|
720
|
-
}
|
|
721
|
-
// Check inline style for hidden patterns
|
|
722
|
-
const style = element.getAttribute('style');
|
|
723
|
-
if (style && hiddenStylePattern.test(style)) {
|
|
724
|
-
const reason = style.includes('display') ? 'display:none' :
|
|
725
|
-
style.includes('visibility') ? 'visibility:hidden' : 'opacity:0';
|
|
726
|
-
elementsToRemove.set(element, reason);
|
|
727
|
-
count++;
|
|
728
|
-
continue;
|
|
729
|
-
}
|
|
730
|
-
// Use getComputedStyle only in real browser environments
|
|
731
|
-
if (isBrowser) {
|
|
732
|
-
try {
|
|
733
|
-
const computedStyle = defaultView.getComputedStyle(element);
|
|
734
|
-
let reason = '';
|
|
735
|
-
if (computedStyle.display === 'none')
|
|
736
|
-
reason = 'display:none';
|
|
737
|
-
else if (computedStyle.visibility === 'hidden')
|
|
738
|
-
reason = 'visibility:hidden';
|
|
739
|
-
else if (computedStyle.opacity === '0')
|
|
740
|
-
reason = 'opacity:0';
|
|
741
|
-
if (reason) {
|
|
742
|
-
elementsToRemove.set(element, reason);
|
|
743
|
-
count++;
|
|
744
|
-
continue;
|
|
745
|
-
}
|
|
746
|
-
}
|
|
747
|
-
catch (e) { }
|
|
748
|
-
}
|
|
749
|
-
// Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
|
|
750
|
-
// "sm:hidden", "not-machine:hidden")
|
|
751
|
-
const className = element.getAttribute('class') || '';
|
|
752
|
-
if (className) {
|
|
753
|
-
const tokens = className.split(/\s+/);
|
|
754
|
-
for (const token of tokens) {
|
|
755
|
-
if (token === 'hidden' || token.endsWith(':hidden') || token === 'invisible' || token.endsWith(':invisible')) {
|
|
756
|
-
elementsToRemove.set(element, `class:${token}`);
|
|
757
|
-
count++;
|
|
758
|
-
break;
|
|
759
|
-
}
|
|
760
|
-
}
|
|
761
|
-
}
|
|
762
|
-
}
|
|
763
|
-
// Batch remove all hidden elements
|
|
764
|
-
elementsToRemove.forEach((reason, el) => {
|
|
765
|
-
if (this.debug && debugRemovals) {
|
|
766
|
-
debugRemovals.push({
|
|
767
|
-
step: 'removeHiddenElements',
|
|
768
|
-
reason,
|
|
769
|
-
text: (0, utils_1.textPreview)(el)
|
|
770
|
-
});
|
|
771
|
-
}
|
|
772
|
-
el.remove();
|
|
773
|
-
});
|
|
774
|
-
this._log('Removed hidden elements:', count);
|
|
775
|
-
}
|
|
776
|
-
removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals, skipHiddenExactSelectors = false) {
|
|
777
|
-
const startTime = Date.now();
|
|
778
|
-
let exactSelectorCount = 0;
|
|
779
|
-
let partialSelectorCount = 0;
|
|
780
|
-
// Track all elements to be removed, with their match type
|
|
781
|
-
const elementsToRemove = new Map();
|
|
782
|
-
// First collect elements matching exact selectors
|
|
783
|
-
if (removeExact) {
|
|
784
|
-
const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS_JOINED);
|
|
785
|
-
exactElements.forEach(el => {
|
|
786
|
-
if (el?.parentNode) {
|
|
787
|
-
if (skipHiddenExactSelectors) {
|
|
788
|
-
const hiddenAncestor = el.closest(constants_1.HIDDEN_EXACT_SKIP_SELECTOR);
|
|
789
|
-
const role = (el.getAttribute('role') || '').toLowerCase();
|
|
790
|
-
if (el.matches(constants_1.HIDDEN_EXACT_SELECTOR) ||
|
|
791
|
-
(hiddenAncestor && role === 'dialog')) {
|
|
792
|
-
return;
|
|
793
|
-
}
|
|
794
|
-
}
|
|
795
|
-
// Skip elements inside code blocks (e.g. syntax highlighting spans)
|
|
796
|
-
if (el.closest('pre, code')) {
|
|
797
|
-
return;
|
|
798
|
-
}
|
|
799
|
-
elementsToRemove.set(el, { type: 'exact' });
|
|
800
|
-
exactSelectorCount++;
|
|
801
|
-
}
|
|
802
|
-
});
|
|
803
|
-
}
|
|
804
|
-
if (removePartial) {
|
|
805
|
-
// Pre-compile individual regexes for debug pattern identification only
|
|
806
|
-
const individualRegexes = this.debug
|
|
807
|
-
? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') }))
|
|
808
|
-
: null;
|
|
809
|
-
// Use pre-built attribute selector for elements we care about
|
|
810
|
-
const allElements = doc.querySelectorAll(constants_1.TEST_ATTRIBUTES_SELECTOR);
|
|
811
|
-
// Process elements for partial matches
|
|
812
|
-
allElements.forEach(el => {
|
|
813
|
-
// Skip if already marked for removal
|
|
814
|
-
if (elementsToRemove.has(el)) {
|
|
815
|
-
return;
|
|
816
|
-
}
|
|
817
|
-
// Skip code elements and elements containing code blocks
|
|
818
|
-
// where class names indicate language/syntax, not page structure
|
|
819
|
-
const tag = el.tagName;
|
|
820
|
-
if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre') || el.closest('code, pre')) {
|
|
821
|
-
return;
|
|
822
|
-
}
|
|
823
|
-
// Get all relevant attributes and combine into a single string
|
|
824
|
-
const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
|
|
825
|
-
if (attr === 'class') {
|
|
826
|
-
return (0, dom_1.getClassName)(el);
|
|
827
|
-
}
|
|
828
|
-
if (attr === 'id') {
|
|
829
|
-
return el.id || '';
|
|
830
|
-
}
|
|
831
|
-
return el.getAttribute(attr) || '';
|
|
832
|
-
}).join(' ').toLowerCase();
|
|
833
|
-
// Skip if no attributes to check
|
|
834
|
-
if (!attrs.trim()) {
|
|
835
|
-
return;
|
|
836
|
-
}
|
|
837
|
-
// Check for partial match using single regex test
|
|
838
|
-
if (constants_1.PARTIAL_SELECTORS_REGEX.test(attrs)) {
|
|
839
|
-
const matchedPattern = individualRegexes
|
|
840
|
-
? individualRegexes.find(r => r.regex.test(attrs))?.pattern
|
|
841
|
-
: undefined;
|
|
842
|
-
elementsToRemove.set(el, { type: 'partial', selector: matchedPattern });
|
|
843
|
-
partialSelectorCount++;
|
|
844
|
-
}
|
|
845
|
-
});
|
|
846
|
-
}
|
|
847
|
-
// Remove all collected elements in a single pass
|
|
848
|
-
// Skip elements that are ancestors of mainContent to avoid disconnecting it
|
|
849
|
-
// Skip footnote list containers, their parents, and immediate children
|
|
850
|
-
// Skip anchor links inside headings - the heading transform handles these
|
|
851
|
-
elementsToRemove.forEach(({ type, selector }, el) => {
|
|
852
|
-
if (mainContent && el.contains(mainContent)) {
|
|
853
|
-
return;
|
|
854
|
-
}
|
|
855
|
-
if (el.tagName === 'A' && el.closest('h1, h2, h3, h4, h5, h6')) {
|
|
856
|
-
return;
|
|
857
|
-
}
|
|
858
|
-
try {
|
|
859
|
-
if (el.matches(constants_1.FOOTNOTE_LIST_SELECTORS) || el.querySelector(constants_1.FOOTNOTE_LIST_SELECTORS)) {
|
|
860
|
-
return;
|
|
861
|
-
}
|
|
862
|
-
// Protect immediate children of footnote containers (e.g. wikidot div.footnote-footer)
|
|
863
|
-
const parent = el.parentElement;
|
|
864
|
-
if (parent && parent.matches(constants_1.FOOTNOTE_LIST_SELECTORS)) {
|
|
865
|
-
return;
|
|
866
|
-
}
|
|
867
|
-
}
|
|
868
|
-
catch (e) { }
|
|
869
|
-
if (this.debug && debugRemovals) {
|
|
870
|
-
debugRemovals.push({
|
|
871
|
-
step: 'removeBySelector',
|
|
872
|
-
selector: type === 'exact' ? 'exact' : selector,
|
|
873
|
-
reason: type === 'exact' ? 'exact selector match' : `partial match: ${selector}`,
|
|
874
|
-
text: (0, utils_1.textPreview)(el)
|
|
875
|
-
});
|
|
876
|
-
}
|
|
877
|
-
el.remove();
|
|
878
|
-
});
|
|
879
|
-
const endTime = Date.now();
|
|
880
|
-
this._log('Removed clutter elements:', {
|
|
881
|
-
exactSelectors: exactSelectorCount,
|
|
882
|
-
partialSelectors: partialSelectorCount,
|
|
883
|
-
total: elementsToRemove.size,
|
|
884
|
-
processingTime: `${(endTime - startTime).toFixed(2)}ms`
|
|
885
|
-
});
|
|
886
|
-
}
|
|
887
|
-
// Find small IMG and SVG elements
|
|
888
|
-
findSmallImages(doc) {
|
|
889
|
-
const MIN_DIMENSION = 33;
|
|
890
|
-
const smallImages = new Set();
|
|
891
|
-
let processedCount = 0;
|
|
892
|
-
const elements = doc.querySelectorAll('img, svg');
|
|
893
|
-
const defaultView = doc.defaultView;
|
|
894
|
-
const isBrowser = typeof window !== 'undefined' && defaultView === window;
|
|
895
|
-
for (const element of elements) {
|
|
896
|
-
const attrWidth = parseInt(element.getAttribute('width') || '0');
|
|
897
|
-
const attrHeight = parseInt(element.getAttribute('height') || '0');
|
|
898
|
-
// Check inline style dimensions
|
|
899
|
-
const style = element.getAttribute('style') || '';
|
|
900
|
-
const styleWidth = parseInt(style.match(STYLE_WIDTH_PATTERN)?.[1] || '0');
|
|
901
|
-
const styleHeight = parseInt(style.match(STYLE_HEIGHT_PATTERN)?.[1] || '0');
|
|
902
|
-
// Use getComputedStyle and getBoundingClientRect only in browser
|
|
903
|
-
let computedWidth = 0, computedHeight = 0;
|
|
904
|
-
if (isBrowser) {
|
|
905
|
-
try {
|
|
906
|
-
const cs = defaultView.getComputedStyle(element);
|
|
907
|
-
computedWidth = parseInt(cs.width) || 0;
|
|
908
|
-
computedHeight = parseInt(cs.height) || 0;
|
|
909
|
-
}
|
|
910
|
-
catch (e) { }
|
|
911
|
-
try {
|
|
912
|
-
const rect = element.getBoundingClientRect();
|
|
913
|
-
if (rect.width > 0)
|
|
914
|
-
computedWidth = computedWidth || rect.width;
|
|
915
|
-
if (rect.height > 0)
|
|
916
|
-
computedHeight = computedHeight || rect.height;
|
|
917
|
-
}
|
|
918
|
-
catch (e) { }
|
|
919
|
-
}
|
|
920
|
-
const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0);
|
|
921
|
-
const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0);
|
|
922
|
-
if (widths.length > 0 && heights.length > 0) {
|
|
923
|
-
const effectiveWidth = Math.min(...widths);
|
|
924
|
-
const effectiveHeight = Math.min(...heights);
|
|
925
|
-
if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
|
|
926
|
-
const identifier = this.getElementIdentifier(element);
|
|
927
|
-
if (identifier) {
|
|
928
|
-
smallImages.add(identifier);
|
|
929
|
-
processedCount++;
|
|
930
|
-
}
|
|
931
|
-
}
|
|
932
|
-
}
|
|
933
|
-
}
|
|
934
|
-
this._log('Found small elements:', processedCount);
|
|
935
|
-
return smallImages;
|
|
936
|
-
}
|
|
937
|
-
removeSmallImages(doc, smallImages) {
|
|
938
|
-
let removedCount = 0;
|
|
939
|
-
['img', 'svg'].forEach(tag => {
|
|
940
|
-
const elements = doc.getElementsByTagName(tag);
|
|
941
|
-
Array.from(elements).forEach(element => {
|
|
942
|
-
const identifier = this.getElementIdentifier(element);
|
|
943
|
-
if (identifier && smallImages.has(identifier)) {
|
|
944
|
-
element.remove();
|
|
945
|
-
removedCount++;
|
|
946
|
-
}
|
|
947
|
-
});
|
|
948
|
-
});
|
|
949
|
-
this._log('Removed small elements:', removedCount);
|
|
950
|
-
}
|
|
951
|
-
getElementIdentifier(element) {
|
|
952
|
-
// Try to create a unique identifier using various attributes
|
|
953
|
-
if (element.tagName.toLowerCase() === 'img') {
|
|
954
|
-
// For lazy-loaded images, use data-src as identifier if available
|
|
955
|
-
const dataSrc = element.getAttribute('data-src');
|
|
956
|
-
if (dataSrc)
|
|
957
|
-
return `src:${dataSrc}`;
|
|
958
|
-
const src = element.getAttribute('src') || '';
|
|
959
|
-
const srcset = element.getAttribute('srcset') || '';
|
|
960
|
-
const dataSrcset = element.getAttribute('data-srcset');
|
|
961
|
-
if (src)
|
|
962
|
-
return `src:${src}`;
|
|
963
|
-
if (srcset)
|
|
964
|
-
return `srcset:${srcset}`;
|
|
965
|
-
if (dataSrcset)
|
|
966
|
-
return `srcset:${dataSrcset}`;
|
|
967
|
-
}
|
|
968
|
-
const id = element.id || '';
|
|
969
|
-
const className = (0, dom_1.getClassName)(element);
|
|
970
|
-
const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
|
|
971
|
-
if (id)
|
|
972
|
-
return `id:${id}`;
|
|
973
|
-
if (viewBox)
|
|
974
|
-
return `viewBox:${viewBox}`;
|
|
975
|
-
if (className)
|
|
976
|
-
return `class:${className}`;
|
|
977
|
-
return null;
|
|
978
|
-
}
|
|
979
707
|
findMainContent(doc) {
|
|
980
708
|
// Find all potential content containers
|
|
981
709
|
const candidates = [];
|
|
@@ -1121,6 +849,9 @@ class Defuddle {
|
|
|
1121
849
|
.trim()
|
|
1122
850
|
.replace(/^\\?["']+/, '')
|
|
1123
851
|
.replace(/\\?["']+$/, '');
|
|
852
|
+
// Fragment-only hrefs reference anchors within the same document — keep them relative.
|
|
853
|
+
if (normalized.startsWith('#'))
|
|
854
|
+
return normalized;
|
|
1124
855
|
try {
|
|
1125
856
|
return new URL(normalized, baseUrl).href;
|
|
1126
857
|
}
|
|
@@ -1391,7 +1122,7 @@ class Defuddle {
|
|
|
1391
1122
|
return {
|
|
1392
1123
|
content: contentHtml,
|
|
1393
1124
|
title: extracted.variables?.title || metadata.title,
|
|
1394
|
-
description: metadata.description,
|
|
1125
|
+
description: extracted.variables?.description || metadata.description,
|
|
1395
1126
|
domain: metadata.domain,
|
|
1396
1127
|
favicon: metadata.favicon,
|
|
1397
1128
|
image: metadata.image,
|
|
@@ -1424,393 +1155,6 @@ class Defuddle {
|
|
|
1424
1155
|
}
|
|
1425
1156
|
return hasCustom ? custom : undefined;
|
|
1426
1157
|
}
|
|
1427
|
-
/**
|
|
1428
|
-
* Content-based pattern removal for elements that can't be detected by
|
|
1429
|
-
* CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
|
|
1430
|
-
*/
|
|
1431
|
-
removeByContentPattern(mainContent, debugRemovals) {
|
|
1432
|
-
// Remove read time metadata (e.g. "Mar 4th 2026 | 3 min read")
|
|
1433
|
-
// Only removes leaf elements whose text is PURELY date + read time,
|
|
1434
|
-
// not mixed with other meaningful content like tag names.
|
|
1435
|
-
const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time'));
|
|
1436
|
-
for (const el of candidates) {
|
|
1437
|
-
if (!el.parentNode)
|
|
1438
|
-
continue;
|
|
1439
|
-
if (el.closest('pre') || el.closest('code'))
|
|
1440
|
-
continue;
|
|
1441
|
-
const text = el.textContent?.trim() || '';
|
|
1442
|
-
const words = (0, utils_1.countWords)(text);
|
|
1443
|
-
// Match date + read time in short elements
|
|
1444
|
-
if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
|
|
1445
|
-
// Ensure this is a leaf-ish element, not a large container
|
|
1446
|
-
if (el.querySelectorAll('p, div, section, article').length === 0) {
|
|
1447
|
-
// Verify the text is ONLY date + read time metadata
|
|
1448
|
-
// by stripping all date/time words and checking nothing remains
|
|
1449
|
-
let cleaned = text;
|
|
1450
|
-
for (const pattern of METADATA_STRIP_PATTERNS) {
|
|
1451
|
-
cleaned = cleaned.replace(pattern, '');
|
|
1452
|
-
}
|
|
1453
|
-
if (cleaned.trim().length > 0)
|
|
1454
|
-
continue;
|
|
1455
|
-
if (this.debug && debugRemovals) {
|
|
1456
|
-
debugRemovals.push({
|
|
1457
|
-
step: 'removeByContentPattern',
|
|
1458
|
-
reason: 'read time metadata',
|
|
1459
|
-
text: (0, utils_1.textPreview)(el)
|
|
1460
|
-
});
|
|
1461
|
-
}
|
|
1462
|
-
el.remove();
|
|
1463
|
-
}
|
|
1464
|
-
}
|
|
1465
|
-
}
|
|
1466
|
-
// Remove standalone time/date elements near the start or end of content.
|
|
1467
|
-
// A <time> in its own paragraph at the boundary is metadata (publish date),
|
|
1468
|
-
// but <time> inline within prose should be preserved (see issue #136).
|
|
1469
|
-
const timeElements = Array.from(mainContent.querySelectorAll('time'));
|
|
1470
|
-
const contentText = mainContent.textContent || '';
|
|
1471
|
-
for (const time of timeElements) {
|
|
1472
|
-
if (!time.parentNode)
|
|
1473
|
-
continue;
|
|
1474
|
-
// Walk up through inline/formatting wrappers only (i, em, span, b, strong)
|
|
1475
|
-
// Stop at block elements to avoid removing containers with other content.
|
|
1476
|
-
let target = time;
|
|
1477
|
-
let targetText = target.textContent?.trim() || '';
|
|
1478
|
-
while (target.parentElement && target.parentElement !== mainContent) {
|
|
1479
|
-
const parentTag = target.parentElement.tagName.toLowerCase();
|
|
1480
|
-
const parentText = target.parentElement.textContent?.trim() || '';
|
|
1481
|
-
// If parent is a <p> that only wraps this time, include it
|
|
1482
|
-
if (parentTag === 'p' && parentText === targetText) {
|
|
1483
|
-
target = target.parentElement;
|
|
1484
|
-
break;
|
|
1485
|
-
}
|
|
1486
|
-
// Only walk through inline formatting wrappers
|
|
1487
|
-
if (['i', 'em', 'span', 'b', 'strong', 'small'].includes(parentTag) &&
|
|
1488
|
-
parentText === targetText) {
|
|
1489
|
-
target = target.parentElement;
|
|
1490
|
-
targetText = parentText;
|
|
1491
|
-
continue;
|
|
1492
|
-
}
|
|
1493
|
-
break;
|
|
1494
|
-
}
|
|
1495
|
-
const text = target.textContent?.trim() || '';
|
|
1496
|
-
const words = (0, utils_1.countWords)(text);
|
|
1497
|
-
if (words > 10)
|
|
1498
|
-
continue;
|
|
1499
|
-
// Check if this element is near the start or end of mainContent
|
|
1500
|
-
const pos = contentText.indexOf(text);
|
|
1501
|
-
const distFromEnd = contentText.length - (pos + text.length);
|
|
1502
|
-
if (pos > 200 && distFromEnd > 200)
|
|
1503
|
-
continue;
|
|
1504
|
-
if (this.debug && debugRemovals) {
|
|
1505
|
-
debugRemovals.push({
|
|
1506
|
-
step: 'removeByContentPattern',
|
|
1507
|
-
reason: 'boundary date element',
|
|
1508
|
-
text: (0, utils_1.textPreview)(target)
|
|
1509
|
-
});
|
|
1510
|
-
}
|
|
1511
|
-
target.remove();
|
|
1512
|
-
}
|
|
1513
|
-
// Remove blog post metadata lists near content boundaries.
|
|
1514
|
-
// These are short <ul>/<ol> elements where every item is a brief
|
|
1515
|
-
// label + value pair (date, reading time, share, etc.) with no
|
|
1516
|
-
// prose sentences. Detected structurally: all items are very short,
|
|
1517
|
-
// none contain sentence-ending punctuation, and the total text is minimal.
|
|
1518
|
-
const metadataLists = mainContent.querySelectorAll('ul, ol');
|
|
1519
|
-
for (const list of metadataLists) {
|
|
1520
|
-
if (!list.parentNode)
|
|
1521
|
-
continue;
|
|
1522
|
-
const items = Array.from(list.children).filter(el => el.tagName === 'LI');
|
|
1523
|
-
if (items.length < 2 || items.length > 8)
|
|
1524
|
-
continue;
|
|
1525
|
-
// Must be near the start or end of content
|
|
1526
|
-
const listText = list.textContent?.trim() || '';
|
|
1527
|
-
const listPos = contentText.indexOf(listText);
|
|
1528
|
-
const distFromEnd = contentText.length - (listPos + listText.length);
|
|
1529
|
-
if (listPos > 500 && distFromEnd > 500)
|
|
1530
|
-
continue;
|
|
1531
|
-
// Skip lists introduced by a preceding paragraph (e.g. "Features include:")
|
|
1532
|
-
// — those are content lists, not standalone metadata
|
|
1533
|
-
const prevSibling = list.previousElementSibling;
|
|
1534
|
-
if (prevSibling) {
|
|
1535
|
-
const prevText = prevSibling.textContent?.trim() || '';
|
|
1536
|
-
if (prevText.endsWith(':'))
|
|
1537
|
-
continue;
|
|
1538
|
-
}
|
|
1539
|
-
// Every item must be very short (label + value) with no prose
|
|
1540
|
-
let isMetadata = true;
|
|
1541
|
-
for (const item of items) {
|
|
1542
|
-
const text = item.textContent?.trim() || '';
|
|
1543
|
-
const words = (0, utils_1.countWords)(text);
|
|
1544
|
-
if (words > 8) {
|
|
1545
|
-
isMetadata = false;
|
|
1546
|
-
break;
|
|
1547
|
-
}
|
|
1548
|
-
// Prose has sentence-ending punctuation; metadata doesn't
|
|
1549
|
-
if (/[.!?]$/.test(text)) {
|
|
1550
|
-
isMetadata = false;
|
|
1551
|
-
break;
|
|
1552
|
-
}
|
|
1553
|
-
}
|
|
1554
|
-
if (!isMetadata)
|
|
1555
|
-
continue;
|
|
1556
|
-
// Total text should be very short — this is metadata, not content
|
|
1557
|
-
if ((0, utils_1.countWords)(listText) > 30)
|
|
1558
|
-
continue;
|
|
1559
|
-
// Walk up to find the container to remove (e.g. a wrapper div)
|
|
1560
|
-
let target = list;
|
|
1561
|
-
while (target.parentElement && target.parentElement !== mainContent) {
|
|
1562
|
-
const parentText = target.parentElement.textContent?.trim() || '';
|
|
1563
|
-
if (parentText !== listText)
|
|
1564
|
-
break;
|
|
1565
|
-
target = target.parentElement;
|
|
1566
|
-
}
|
|
1567
|
-
if (this.debug && debugRemovals) {
|
|
1568
|
-
debugRemovals.push({
|
|
1569
|
-
step: 'removeByContentPattern',
|
|
1570
|
-
reason: 'blog metadata list',
|
|
1571
|
-
text: (0, utils_1.textPreview)(target)
|
|
1572
|
-
});
|
|
1573
|
-
}
|
|
1574
|
-
target.remove();
|
|
1575
|
-
}
|
|
1576
|
-
// Remove section breadcrumbs
|
|
1577
|
-
// Short elements containing a link to a parent section of the current URL.
|
|
1578
|
-
const url = this.options.url || this.doc.URL || '';
|
|
1579
|
-
let urlPath = '';
|
|
1580
|
-
let pageHost = '';
|
|
1581
|
-
try {
|
|
1582
|
-
const parsedUrl = new URL(url);
|
|
1583
|
-
urlPath = parsedUrl.pathname;
|
|
1584
|
-
pageHost = parsedUrl.hostname.replace(/^www\./, '');
|
|
1585
|
-
}
|
|
1586
|
-
catch { }
|
|
1587
|
-
if (urlPath) {
|
|
1588
|
-
const shortElements = mainContent.querySelectorAll('div, span, p');
|
|
1589
|
-
for (const el of shortElements) {
|
|
1590
|
-
if (!el.parentNode)
|
|
1591
|
-
continue;
|
|
1592
|
-
const text = el.textContent?.trim() || '';
|
|
1593
|
-
const words = (0, utils_1.countWords)(text);
|
|
1594
|
-
if (words > 10)
|
|
1595
|
-
continue;
|
|
1596
|
-
// Must be a leaf-ish element (no block children)
|
|
1597
|
-
if (el.querySelectorAll('p, div, section, article').length > 0)
|
|
1598
|
-
continue;
|
|
1599
|
-
const link = el.querySelector('a[href]');
|
|
1600
|
-
if (!link)
|
|
1601
|
-
continue;
|
|
1602
|
-
try {
|
|
1603
|
-
const linkPath = new URL(link.getAttribute('href') || '', url).pathname;
|
|
1604
|
-
if (linkPath !== '/' && linkPath !== urlPath && urlPath.startsWith(linkPath)) {
|
|
1605
|
-
if (this.debug && debugRemovals) {
|
|
1606
|
-
debugRemovals.push({
|
|
1607
|
-
step: 'removeByContentPattern',
|
|
1608
|
-
reason: 'section breadcrumb',
|
|
1609
|
-
text: (0, utils_1.textPreview)(el)
|
|
1610
|
-
});
|
|
1611
|
-
}
|
|
1612
|
-
el.remove();
|
|
1613
|
-
}
|
|
1614
|
-
}
|
|
1615
|
-
catch { }
|
|
1616
|
-
}
|
|
1617
|
-
}
|
|
1618
|
-
// Remove trailing external link lists — a heading + list of purely
|
|
1619
|
-
// off-site links as the last content block (affiliate picks, product
|
|
1620
|
-
// roundups, etc.). Only removed when nothing meaningful follows.
|
|
1621
|
-
if (pageHost) {
|
|
1622
|
-
const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6');
|
|
1623
|
-
for (const heading of headings) {
|
|
1624
|
-
if (!heading.parentNode)
|
|
1625
|
-
continue;
|
|
1626
|
-
const list = heading.nextElementSibling;
|
|
1627
|
-
if (!list || (list.tagName !== 'UL' && list.tagName !== 'OL'))
|
|
1628
|
-
continue;
|
|
1629
|
-
const items = Array.from(list.children).filter(el => el.tagName === 'LI');
|
|
1630
|
-
if (items.length < 2)
|
|
1631
|
-
continue;
|
|
1632
|
-
// The list must be the last meaningful block — nothing after it
|
|
1633
|
-
// except whitespace or empty elements. Walk up through ancestors
|
|
1634
|
-
// to check siblings at each level up to mainContent.
|
|
1635
|
-
let trailingContent = false;
|
|
1636
|
-
let checkEl = list;
|
|
1637
|
-
while (checkEl && checkEl !== mainContent) {
|
|
1638
|
-
let sibling = checkEl.nextElementSibling;
|
|
1639
|
-
while (sibling) {
|
|
1640
|
-
if ((sibling.textContent?.trim() || '').length > 0) {
|
|
1641
|
-
trailingContent = true;
|
|
1642
|
-
break;
|
|
1643
|
-
}
|
|
1644
|
-
sibling = sibling.nextElementSibling;
|
|
1645
|
-
}
|
|
1646
|
-
if (trailingContent)
|
|
1647
|
-
break;
|
|
1648
|
-
checkEl = checkEl.parentElement;
|
|
1649
|
-
}
|
|
1650
|
-
if (trailingContent)
|
|
1651
|
-
continue;
|
|
1652
|
-
// Every list item must be primarily a link pointing off-site
|
|
1653
|
-
let allExternalLinks = true;
|
|
1654
|
-
for (const item of items) {
|
|
1655
|
-
const links = item.querySelectorAll('a[href]');
|
|
1656
|
-
if (links.length === 0) {
|
|
1657
|
-
allExternalLinks = false;
|
|
1658
|
-
break;
|
|
1659
|
-
}
|
|
1660
|
-
const itemText = item.textContent?.trim() || '';
|
|
1661
|
-
let linkTextLen = 0;
|
|
1662
|
-
for (const link of links) {
|
|
1663
|
-
linkTextLen += (link.textContent?.trim() || '').length;
|
|
1664
|
-
try {
|
|
1665
|
-
const linkHost = new URL(link.getAttribute('href') || '', url).hostname.replace(/^www\./, '');
|
|
1666
|
-
if (linkHost === pageHost) {
|
|
1667
|
-
allExternalLinks = false;
|
|
1668
|
-
break;
|
|
1669
|
-
}
|
|
1670
|
-
}
|
|
1671
|
-
catch { }
|
|
1672
|
-
}
|
|
1673
|
-
if (!allExternalLinks)
|
|
1674
|
-
break;
|
|
1675
|
-
if (linkTextLen < itemText.length * 0.6) {
|
|
1676
|
-
allExternalLinks = false;
|
|
1677
|
-
break;
|
|
1678
|
-
}
|
|
1679
|
-
}
|
|
1680
|
-
if (!allExternalLinks)
|
|
1681
|
-
continue;
|
|
1682
|
-
if (this.debug && debugRemovals) {
|
|
1683
|
-
debugRemovals.push({
|
|
1684
|
-
step: 'removeByContentPattern',
|
|
1685
|
-
reason: 'trailing external link list',
|
|
1686
|
-
text: (0, utils_1.textPreview)(heading)
|
|
1687
|
-
});
|
|
1688
|
-
debugRemovals.push({
|
|
1689
|
-
step: 'removeByContentPattern',
|
|
1690
|
-
reason: 'trailing external link list',
|
|
1691
|
-
text: (0, utils_1.textPreview)(list)
|
|
1692
|
-
});
|
|
1693
|
-
}
|
|
1694
|
-
list.remove();
|
|
1695
|
-
heading.remove();
|
|
1696
|
-
}
|
|
1697
|
-
}
|
|
1698
|
-
// Remove trailing thin sections — the last few direct children of
|
|
1699
|
-
// mainContent that contain a heading but very little prose. These are
|
|
1700
|
-
// typically CTAs, newsletter prompts, or promotional sections that
|
|
1701
|
-
// have been partially stripped by prior removal steps.
|
|
1702
|
-
const totalWords = (0, utils_1.countWords)(mainContent.textContent || '');
|
|
1703
|
-
if (totalWords > 300) {
|
|
1704
|
-
// Walk backwards from the last direct child of mainContent,
|
|
1705
|
-
// collecting trailing elements that are thin (empty or very short prose).
|
|
1706
|
-
// Exclude SVG text (path data) from word counts — it's not prose.
|
|
1707
|
-
const trailingEls = [];
|
|
1708
|
-
let trailingWords = 0;
|
|
1709
|
-
let child = mainContent.lastElementChild;
|
|
1710
|
-
while (child) {
|
|
1711
|
-
// Count prose words, excluding SVG path data which inflates word counts
|
|
1712
|
-
let svgWords = 0;
|
|
1713
|
-
for (const svg of child.querySelectorAll('svg')) {
|
|
1714
|
-
svgWords += (0, utils_1.countWords)(svg.textContent || '');
|
|
1715
|
-
}
|
|
1716
|
-
const words = (0, utils_1.countWords)(child.textContent?.trim() || '') - svgWords;
|
|
1717
|
-
if (words > 25)
|
|
1718
|
-
break;
|
|
1719
|
-
trailingWords += words;
|
|
1720
|
-
trailingEls.push(child);
|
|
1721
|
-
child = child.previousElementSibling;
|
|
1722
|
-
}
|
|
1723
|
-
// Must have a heading in the trailing elements and total < 15% of content.
|
|
1724
|
-
// Skip if trailing elements contain content indicators (math, code, tables, images).
|
|
1725
|
-
if (trailingEls.length >= 1 && trailingWords < totalWords * 0.15) {
|
|
1726
|
-
const hasHeading = trailingEls.some(el => /^H[1-6]$/.test(el.tagName) || el.querySelector('h1, h2, h3, h4, h5, h6'));
|
|
1727
|
-
const hasContent = trailingEls.some(el => el.querySelector(constants_1.CONTENT_ELEMENT_SELECTOR));
|
|
1728
|
-
if (hasHeading && !hasContent) {
|
|
1729
|
-
for (const el of trailingEls) {
|
|
1730
|
-
if (this.debug && debugRemovals) {
|
|
1731
|
-
debugRemovals.push({ step: 'removeByContentPattern', reason: 'trailing thin section', text: (0, utils_1.textPreview)(el) });
|
|
1732
|
-
}
|
|
1733
|
-
el.remove();
|
|
1734
|
-
}
|
|
1735
|
-
}
|
|
1736
|
-
}
|
|
1737
|
-
}
|
|
1738
|
-
// Remove boilerplate sentences and trailing non-content.
|
|
1739
|
-
// Search elements for end-of-article boilerplate, then truncate
|
|
1740
|
-
// from the best ancestor that has siblings to remove.
|
|
1741
|
-
const fullText = mainContent.textContent || '';
|
|
1742
|
-
const boilerplateElements = mainContent.querySelectorAll('p, div, span, section');
|
|
1743
|
-
for (const el of boilerplateElements) {
|
|
1744
|
-
if (!el.parentNode)
|
|
1745
|
-
continue;
|
|
1746
|
-
const text = el.textContent?.trim() || '';
|
|
1747
|
-
const words = (0, utils_1.countWords)(text);
|
|
1748
|
-
if (words > 50 || words < 3)
|
|
1749
|
-
continue;
|
|
1750
|
-
for (const pattern of BOILERPLATE_PATTERNS) {
|
|
1751
|
-
if (pattern.test(text)) {
|
|
1752
|
-
// Walk up to find an ancestor that has next siblings to truncate.
|
|
1753
|
-
// Don't walk all the way to mainContent's direct child — if there's
|
|
1754
|
-
// a single wrapper div, that would remove everything.
|
|
1755
|
-
let target = el;
|
|
1756
|
-
while (target.parentElement && target.parentElement !== mainContent) {
|
|
1757
|
-
if (target.nextElementSibling)
|
|
1758
|
-
break;
|
|
1759
|
-
target = target.parentElement;
|
|
1760
|
-
}
|
|
1761
|
-
// Only truncate if there's substantial content before the boilerplate
|
|
1762
|
-
const targetText = target.textContent || '';
|
|
1763
|
-
const targetPos = fullText.indexOf(targetText);
|
|
1764
|
-
if (targetPos < 200)
|
|
1765
|
-
continue;
|
|
1766
|
-
// Collect ancestors before modifying the DOM
|
|
1767
|
-
const ancestors = [];
|
|
1768
|
-
let anc = target.parentElement;
|
|
1769
|
-
while (anc && anc !== mainContent) {
|
|
1770
|
-
ancestors.push(anc);
|
|
1771
|
-
anc = anc.parentElement;
|
|
1772
|
-
}
|
|
1773
|
-
// Remove target element and its following siblings
|
|
1774
|
-
this.removeTrailingSiblings(target, true, debugRemovals);
|
|
1775
|
-
// Cascade upward: remove following siblings at each
|
|
1776
|
-
// ancestor level too. Everything after the boilerplate
|
|
1777
|
-
// in document order is non-content.
|
|
1778
|
-
for (const ancestor of ancestors) {
|
|
1779
|
-
this.removeTrailingSiblings(ancestor, false, debugRemovals);
|
|
1780
|
-
}
|
|
1781
|
-
return;
|
|
1782
|
-
}
|
|
1783
|
-
}
|
|
1784
|
-
}
|
|
1785
|
-
}
|
|
1786
|
-
/**
|
|
1787
|
-
* Remove an element's following siblings, and optionally the element itself.
|
|
1788
|
-
*/
|
|
1789
|
-
removeTrailingSiblings(element, removeSelf, debugRemovals) {
|
|
1790
|
-
let sibling = element.nextElementSibling;
|
|
1791
|
-
while (sibling) {
|
|
1792
|
-
const next = sibling.nextElementSibling;
|
|
1793
|
-
if (this.debug && debugRemovals) {
|
|
1794
|
-
debugRemovals.push({
|
|
1795
|
-
step: 'removeByContentPattern',
|
|
1796
|
-
reason: 'trailing non-content',
|
|
1797
|
-
text: (0, utils_1.textPreview)(sibling)
|
|
1798
|
-
});
|
|
1799
|
-
}
|
|
1800
|
-
sibling.remove();
|
|
1801
|
-
sibling = next;
|
|
1802
|
-
}
|
|
1803
|
-
if (removeSelf) {
|
|
1804
|
-
if (this.debug && debugRemovals) {
|
|
1805
|
-
debugRemovals.push({
|
|
1806
|
-
step: 'removeByContentPattern',
|
|
1807
|
-
reason: 'boilerplate text',
|
|
1808
|
-
text: (0, utils_1.textPreview)(element)
|
|
1809
|
-
});
|
|
1810
|
-
}
|
|
1811
|
-
element.remove();
|
|
1812
|
-
}
|
|
1813
|
-
}
|
|
1814
1158
|
}
|
|
1815
1159
|
exports.Defuddle = Defuddle;
|
|
1816
1160
|
//# sourceMappingURL=defuddle.js.map
|