defuddle 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +23 -8
- package/dist/cli.js +2 -1
- package/dist/cli.js.map +1 -1
- package/dist/constants.d.ts +2 -0
- package/dist/constants.js +12 -1
- package/dist/constants.js.map +1 -1
- package/dist/defuddle.d.ts +48 -1
- package/dist/defuddle.js +519 -213
- package/dist/defuddle.js.map +1 -1
- package/dist/elements/footnotes.js +2 -1
- package/dist/elements/footnotes.js.map +1 -1
- package/dist/extractor-registry.d.ts +1 -0
- package/dist/extractor-registry.js +3 -0
- package/dist/extractor-registry.js.map +1 -1
- package/dist/extractors/_base.d.ts +6 -0
- package/dist/extractors/_base.js +8 -0
- package/dist/extractors/_base.js.map +1 -1
- package/dist/extractors/github.d.ts +10 -2
- package/dist/extractors/github.js +158 -71
- package/dist/extractors/github.js.map +1 -1
- package/dist/extractors/hackernews.js +18 -72
- package/dist/extractors/hackernews.js.map +1 -1
- package/dist/extractors/reddit.d.ts +1 -2
- package/dist/extractors/reddit.js +41 -94
- package/dist/extractors/reddit.js.map +1 -1
- package/dist/extractors/x-oembed.d.ts +0 -1
- package/dist/extractors/x-oembed.js +20 -27
- package/dist/extractors/x-oembed.js.map +1 -1
- package/dist/extractors/youtube.d.ts +37 -0
- package/dist/extractors/youtube.js +409 -9
- package/dist/extractors/youtube.js.map +1 -1
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/metadata.d.ts +5 -0
- package/dist/metadata.js +28 -0
- package/dist/metadata.js.map +1 -1
- package/dist/node.js +0 -5
- package/dist/node.js.map +1 -1
- package/dist/scoring.d.ts +6 -1
- package/dist/scoring.js +66 -19
- package/dist/scoring.js.map +1 -1
- package/dist/standardize.js +64 -60
- package/dist/standardize.js.map +1 -1
- package/dist/types.d.ts +9 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +9 -0
- package/dist/utils/dom.js +20 -0
- package/dist/utils/dom.js.map +1 -1
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/package.json +1 -1
package/dist/defuddle.js
CHANGED
|
@@ -9,6 +9,23 @@ const footnotes_1 = require("./elements/footnotes");
|
|
|
9
9
|
const scoring_1 = require("./scoring");
|
|
10
10
|
const utils_1 = require("./utils");
|
|
11
11
|
const dom_1 = require("./utils/dom");
|
|
12
|
+
/** Keys from extractor variables that map to top-level DefuddleResponse fields */
|
|
13
|
+
const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
|
|
14
|
+
// Content pattern detection constants
|
|
15
|
+
const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
|
|
16
|
+
const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
|
|
17
|
+
const BOILERPLATE_PATTERNS = [
|
|
18
|
+
/^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
|
|
19
|
+
/^A version of this (?:article|story) (?:appeared|was published) in\b/i,
|
|
20
|
+
/^Originally (?:published|appeared) (?:in|on|at)\b/i,
|
|
21
|
+
];
|
|
22
|
+
const METADATA_STRIP_PATTERNS = [
|
|
23
|
+
/\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi,
|
|
24
|
+
/\b\d+(?:st|nd|rd|th)?\b/g,
|
|
25
|
+
/\bmin(?:ute)?s?\b/gi,
|
|
26
|
+
/\bread\b/gi,
|
|
27
|
+
/[|·•—–\-,.\s]/g,
|
|
28
|
+
];
|
|
12
29
|
class Defuddle {
|
|
13
30
|
/**
|
|
14
31
|
* Create a new Defuddle instance
|
|
@@ -16,10 +33,23 @@ class Defuddle {
|
|
|
16
33
|
* @param options - Options for parsing
|
|
17
34
|
*/
|
|
18
35
|
constructor(doc, options = {}) {
|
|
36
|
+
this._schemaOrgData = undefined;
|
|
37
|
+
this._schemaOrgExtracted = false;
|
|
19
38
|
this.doc = doc;
|
|
20
39
|
this.options = options;
|
|
21
40
|
this.debug = options.debug || false;
|
|
22
41
|
}
|
|
42
|
+
/**
|
|
43
|
+
* Lazily extract and cache schema.org data. Must be called before
|
|
44
|
+
* parse() strips script tags from the document.
|
|
45
|
+
*/
|
|
46
|
+
getSchemaOrgData() {
|
|
47
|
+
if (!this._schemaOrgExtracted) {
|
|
48
|
+
this._schemaOrgData = this._extractSchemaOrgData(this.doc);
|
|
49
|
+
this._schemaOrgExtracted = true;
|
|
50
|
+
}
|
|
51
|
+
return this._schemaOrgData;
|
|
52
|
+
}
|
|
23
53
|
/**
|
|
24
54
|
* Parse the document and extract its main content
|
|
25
55
|
*/
|
|
@@ -48,7 +78,8 @@ class Defuddle {
|
|
|
48
78
|
this._log('Still very little content, retrying without scoring/partial selectors (possible index page)');
|
|
49
79
|
const indexRetry = this.parseInternal({
|
|
50
80
|
removeLowScoring: false,
|
|
51
|
-
removePartialSelectors: false
|
|
81
|
+
removePartialSelectors: false,
|
|
82
|
+
removeContentPatterns: false
|
|
52
83
|
});
|
|
53
84
|
if (indexRetry.wordCount > result.wordCount) {
|
|
54
85
|
this._log('Index page retry produced more content');
|
|
@@ -125,8 +156,7 @@ class Defuddle {
|
|
|
125
156
|
el.removeAttribute(attr.name);
|
|
126
157
|
}
|
|
127
158
|
else if (['href', 'src', 'action', 'formaction', 'xlink:href'].includes(name)) {
|
|
128
|
-
|
|
129
|
-
if (val.startsWith('javascript:') || val.startsWith('data:text/html')) {
|
|
159
|
+
if ((0, dom_1.isDangerousUrl)(attr.value)) {
|
|
130
160
|
el.removeAttribute(attr.name);
|
|
131
161
|
}
|
|
132
162
|
}
|
|
@@ -245,46 +275,61 @@ class Defuddle {
|
|
|
245
275
|
return url;
|
|
246
276
|
}
|
|
247
277
|
/**
|
|
248
|
-
* Parse the document
|
|
278
|
+
* Parse the document asynchronously. Checks for extractors that prefer
|
|
279
|
+
* async (e.g. YouTube transcripts) before sync, then falls back to async
|
|
280
|
+
* extractors if sync parse yields no content.
|
|
249
281
|
*/
|
|
250
282
|
async parseAsync() {
|
|
283
|
+
if (this.options.useAsync !== false) {
|
|
284
|
+
const asyncResult = await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry));
|
|
285
|
+
if (asyncResult)
|
|
286
|
+
return asyncResult;
|
|
287
|
+
}
|
|
251
288
|
const result = this.parse();
|
|
252
289
|
if (result.wordCount > 0 || this.options.useAsync === false) {
|
|
253
290
|
return result;
|
|
254
291
|
}
|
|
292
|
+
return (await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry))) ?? result;
|
|
293
|
+
}
|
|
294
|
+
/**
|
|
295
|
+
* Fetch only async variables (e.g. transcript) without re-parsing.
|
|
296
|
+
* Safe to call after parse() — uses cached schema.org data since
|
|
297
|
+
* parse() strips script tags from the document.
|
|
298
|
+
*/
|
|
299
|
+
async fetchAsyncVariables() {
|
|
300
|
+
if (this.options.useAsync === false)
|
|
301
|
+
return null;
|
|
255
302
|
try {
|
|
256
303
|
const url = this.options.url || this.doc.URL;
|
|
257
|
-
const schemaOrgData = this.
|
|
258
|
-
const extractor = extractor_registry_1.ExtractorRegistry.
|
|
304
|
+
const schemaOrgData = this.getSchemaOrgData();
|
|
305
|
+
const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
|
|
306
|
+
if (extractor) {
|
|
307
|
+
const extracted = await extractor.extractAsync();
|
|
308
|
+
return this.getExtractorVariables(extracted.variables) || null;
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
catch (error) {
|
|
312
|
+
console.error('Defuddle', 'Error fetching async variables:', error);
|
|
313
|
+
}
|
|
314
|
+
return null;
|
|
315
|
+
}
|
|
316
|
+
async tryAsyncExtractor(finder) {
|
|
317
|
+
try {
|
|
318
|
+
const url = this.options.url || this.doc.URL;
|
|
319
|
+
const schemaOrgData = this.getSchemaOrgData();
|
|
320
|
+
const extractor = finder(this.doc, url, schemaOrgData);
|
|
259
321
|
if (extractor) {
|
|
260
322
|
const startTime = Date.now();
|
|
261
323
|
const extracted = await extractor.extractAsync();
|
|
262
|
-
const contentHtml = this.resolveContentUrls(extracted.contentHtml);
|
|
263
324
|
const pageMetaTags = this._collectMetaTags();
|
|
264
325
|
const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
|
|
265
|
-
|
|
266
|
-
return {
|
|
267
|
-
content: contentHtml,
|
|
268
|
-
title: extracted.variables?.title || metadata.title,
|
|
269
|
-
description: metadata.description,
|
|
270
|
-
domain: metadata.domain,
|
|
271
|
-
favicon: metadata.favicon,
|
|
272
|
-
image: metadata.image,
|
|
273
|
-
published: extracted.variables?.published || metadata.published,
|
|
274
|
-
author: extracted.variables?.author || metadata.author,
|
|
275
|
-
site: extracted.variables?.site || metadata.site,
|
|
276
|
-
schemaOrgData: metadata.schemaOrgData,
|
|
277
|
-
wordCount: this.countWords(extracted.contentHtml),
|
|
278
|
-
parseTime: Math.round(endTime - startTime),
|
|
279
|
-
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
|
|
280
|
-
metaTags: pageMetaTags
|
|
281
|
-
};
|
|
326
|
+
return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
|
|
282
327
|
}
|
|
283
328
|
}
|
|
284
329
|
catch (error) {
|
|
285
330
|
console.error('Defuddle', 'Error in async extraction:', error);
|
|
286
331
|
}
|
|
287
|
-
return
|
|
332
|
+
return null;
|
|
288
333
|
}
|
|
289
334
|
/**
|
|
290
335
|
* Internal parse method that does the actual work
|
|
@@ -297,16 +342,23 @@ class Defuddle {
|
|
|
297
342
|
removeHiddenElements: true,
|
|
298
343
|
removeLowScoring: true,
|
|
299
344
|
removeSmallImages: true,
|
|
345
|
+
removeContentPatterns: true,
|
|
300
346
|
standardize: true,
|
|
301
347
|
...this.options,
|
|
302
348
|
...overrideOptions
|
|
303
349
|
};
|
|
304
350
|
const debugRemovals = [];
|
|
305
|
-
// Extract schema.org data
|
|
306
|
-
const schemaOrgData = this.
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
351
|
+
// Extract schema.org data (cached — must happen before _stripUnsafeElements removes scripts)
|
|
352
|
+
const schemaOrgData = this.getSchemaOrgData();
|
|
353
|
+
// Cache meta tags and metadata across retries
|
|
354
|
+
if (!this._metaTags) {
|
|
355
|
+
this._metaTags = this._collectMetaTags();
|
|
356
|
+
}
|
|
357
|
+
const pageMetaTags = this._metaTags;
|
|
358
|
+
if (!this._metadata) {
|
|
359
|
+
this._metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
|
|
360
|
+
}
|
|
361
|
+
const metadata = this._metadata;
|
|
310
362
|
if (options.removeImages) {
|
|
311
363
|
this.removeImages(this.doc);
|
|
312
364
|
}
|
|
@@ -316,35 +368,25 @@ class Defuddle {
|
|
|
316
368
|
const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
|
|
317
369
|
if (extractor && extractor.canExtract()) {
|
|
318
370
|
const extracted = extractor.extract();
|
|
319
|
-
|
|
320
|
-
const endTime = Date.now();
|
|
321
|
-
// console.log('Using extractor:', extractor.constructor.name.replace('Extractor', ''));
|
|
322
|
-
return {
|
|
323
|
-
content: contentHtml,
|
|
324
|
-
title: extracted.variables?.title || metadata.title,
|
|
325
|
-
description: metadata.description,
|
|
326
|
-
domain: metadata.domain,
|
|
327
|
-
favicon: metadata.favicon,
|
|
328
|
-
image: metadata.image,
|
|
329
|
-
published: extracted.variables?.published || metadata.published,
|
|
330
|
-
author: extracted.variables?.author || metadata.author,
|
|
331
|
-
site: extracted.variables?.site || metadata.site,
|
|
332
|
-
schemaOrgData: metadata.schemaOrgData,
|
|
333
|
-
wordCount: this.countWords(extracted.contentHtml),
|
|
334
|
-
parseTime: Math.round(endTime - startTime),
|
|
335
|
-
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
|
|
336
|
-
metaTags: pageMetaTags
|
|
337
|
-
};
|
|
371
|
+
return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
|
|
338
372
|
}
|
|
339
373
|
// Continue if there is no extractor...
|
|
340
|
-
// Evaluate mobile styles and sizes on original document
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
374
|
+
// Evaluate mobile styles and sizes on original document (cached across retries)
|
|
375
|
+
if (!this._mobileStyles) {
|
|
376
|
+
this._mobileStyles = this._evaluateMediaQueries(this.doc);
|
|
377
|
+
}
|
|
378
|
+
const mobileStyles = this._mobileStyles;
|
|
379
|
+
// Find small images in original document (cached across retries)
|
|
380
|
+
if (!this._smallImages) {
|
|
381
|
+
this._smallImages = this.findSmallImages(this.doc);
|
|
382
|
+
}
|
|
383
|
+
const smallImages = this._smallImages;
|
|
344
384
|
// Clone document
|
|
345
385
|
const clone = this.doc.cloneNode(true);
|
|
346
386
|
// Flatten shadow DOM content into the clone
|
|
347
387
|
this.flattenShadowRoots(this.doc, clone);
|
|
388
|
+
// Resolve React streaming SSR suspense boundaries
|
|
389
|
+
this.resolveStreamedContent(clone);
|
|
348
390
|
// Apply mobile styles to clone
|
|
349
391
|
this.applyMobileStyles(clone, mobileStyles);
|
|
350
392
|
// Find main content
|
|
@@ -382,12 +424,16 @@ class Defuddle {
|
|
|
382
424
|
// Remove non-content blocks by scoring
|
|
383
425
|
// Tries to find lists, navigation based on text content and link density
|
|
384
426
|
if (options.removeLowScoring) {
|
|
385
|
-
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals);
|
|
427
|
+
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
|
|
386
428
|
}
|
|
387
429
|
// Remove clutter using selectors
|
|
388
430
|
if (options.removeExactSelectors || options.removePartialSelectors) {
|
|
389
431
|
this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals);
|
|
390
432
|
}
|
|
433
|
+
// Remove elements by content patterns (read time, boilerplate, article cards)
|
|
434
|
+
if (options.removeContentPatterns && mainContent) {
|
|
435
|
+
this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
|
|
436
|
+
}
|
|
391
437
|
// Normalize the main content
|
|
392
438
|
if (options.standardize) {
|
|
393
439
|
(0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
|
|
@@ -425,17 +471,33 @@ class Defuddle {
|
|
|
425
471
|
}
|
|
426
472
|
}
|
|
427
473
|
countWords(content) {
|
|
428
|
-
//
|
|
429
|
-
const
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
.
|
|
435
|
-
.replace(
|
|
436
|
-
.
|
|
437
|
-
.
|
|
438
|
-
|
|
474
|
+
// Strip HTML tags and decode common entities without DOM parsing
|
|
475
|
+
const text = content
|
|
476
|
+
.replace(/<[^>]*>/g, ' ')
|
|
477
|
+
.replace(/&nbsp;/gi, ' ')
|
|
478
|
+
.replace(/&amp;/gi, '&')
|
|
479
|
+
.replace(/&lt;/gi, '<')
|
|
480
|
+
.replace(/&gt;/gi, '>')
|
|
481
|
+
.replace(/&quot;/gi, '"')
|
|
482
|
+
.replace(/&#\d+;/g, ' ')
|
|
483
|
+
.replace(/&\w+;/g, ' ');
|
|
484
|
+
const trimmed = text.trim();
|
|
485
|
+
if (!trimmed)
|
|
486
|
+
return 0;
|
|
487
|
+
// Count words by splitting on whitespace
|
|
488
|
+
let count = 0;
|
|
489
|
+
let inWord = false;
|
|
490
|
+
for (let i = 0; i < trimmed.length; i++) {
|
|
491
|
+
const isSpace = trimmed.charCodeAt(i) <= 32;
|
|
492
|
+
if (!isSpace && !inWord) {
|
|
493
|
+
count++;
|
|
494
|
+
inWord = true;
|
|
495
|
+
}
|
|
496
|
+
else if (isSpace) {
|
|
497
|
+
inWord = false;
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
return count;
|
|
439
501
|
}
|
|
440
502
|
// Make all other methods private by removing the static keyword and using private
|
|
441
503
|
_log(...args) {
|
|
@@ -535,36 +597,34 @@ class Defuddle {
|
|
|
535
597
|
removeHiddenElements(doc, debugRemovals) {
|
|
536
598
|
let count = 0;
|
|
537
599
|
const elementsToRemove = new Map();
|
|
538
|
-
//
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
//
|
|
542
|
-
const
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
600
|
+
// Check inline styles and CSS class-based hidden patterns.
|
|
601
|
+
const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i;
|
|
602
|
+
// Only use getComputedStyle in browser environments where it's meaningful.
|
|
603
|
+
// In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable.
|
|
604
|
+
const defaultView = doc.defaultView;
|
|
605
|
+
const isBrowser = typeof window !== 'undefined' && defaultView === window;
|
|
606
|
+
const allElements = doc.querySelectorAll('*');
|
|
607
|
+
for (const element of allElements) {
|
|
608
|
+
// Skip elements that contain math — sites like Wikipedia wrap MathML
|
|
609
|
+
// in display:none spans for accessibility (the visible version is an
|
|
610
|
+
// image/SVG fallback). We need to preserve these for math extraction.
|
|
611
|
+
if (element.querySelector('math, [data-mathml], .katex-mathml') ||
|
|
612
|
+
element.tagName.toLowerCase() === 'math') {
|
|
613
|
+
continue;
|
|
614
|
+
}
|
|
615
|
+
// Check inline style for hidden patterns
|
|
616
|
+
const style = element.getAttribute('style');
|
|
617
|
+
if (style && hiddenStylePattern.test(style)) {
|
|
618
|
+
const reason = style.includes('display') ? 'display:none' :
|
|
619
|
+
style.includes('visibility') ? 'visibility:hidden' : 'opacity:0';
|
|
620
|
+
elementsToRemove.set(element, reason);
|
|
621
|
+
count++;
|
|
622
|
+
continue;
|
|
623
|
+
}
|
|
624
|
+
// Use getComputedStyle only in real browser environments
|
|
625
|
+
if (isBrowser) {
|
|
547
626
|
try {
|
|
548
|
-
|
|
549
|
-
}
|
|
550
|
-
catch (e) {
|
|
551
|
-
// If we can't get computed style, check inline styles
|
|
552
|
-
const style = element.getAttribute('style');
|
|
553
|
-
if (!style)
|
|
554
|
-
return null;
|
|
555
|
-
// Create a temporary style element to parse inline styles
|
|
556
|
-
const tempStyle = doc.createElement('style');
|
|
557
|
-
tempStyle.textContent = `* { ${style} }`;
|
|
558
|
-
doc.head.appendChild(tempStyle);
|
|
559
|
-
const computedStyle = element.ownerDocument.defaultView?.getComputedStyle(element);
|
|
560
|
-
doc.head.removeChild(tempStyle);
|
|
561
|
-
return computedStyle;
|
|
562
|
-
}
|
|
563
|
-
});
|
|
564
|
-
// Write phase - mark elements for removal
|
|
565
|
-
batch.forEach((element, index) => {
|
|
566
|
-
const computedStyle = styles[index];
|
|
567
|
-
if (computedStyle) {
|
|
627
|
+
const computedStyle = defaultView.getComputedStyle(element);
|
|
568
628
|
let reason = '';
|
|
569
629
|
if (computedStyle.display === 'none')
|
|
570
630
|
reason = 'display:none';
|
|
@@ -575,25 +635,24 @@ class Defuddle {
|
|
|
575
635
|
if (reason) {
|
|
576
636
|
elementsToRemove.set(element, reason);
|
|
577
637
|
count++;
|
|
638
|
+
continue;
|
|
578
639
|
}
|
|
579
640
|
}
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
}
|
|
593
|
-
}
|
|
641
|
+
catch (e) { }
|
|
642
|
+
}
|
|
643
|
+
// Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
|
|
644
|
+
// "sm:hidden", "not-machine:hidden")
|
|
645
|
+
const className = element.getAttribute('class') || '';
|
|
646
|
+
if (className) {
|
|
647
|
+
const tokens = className.split(/\s+/);
|
|
648
|
+
for (const token of tokens) {
|
|
649
|
+
if (token === 'hidden' || token.endsWith(':hidden')) {
|
|
650
|
+
elementsToRemove.set(element, `class:${token}`);
|
|
651
|
+
count++;
|
|
652
|
+
break;
|
|
594
653
|
}
|
|
595
654
|
}
|
|
596
|
-
}
|
|
655
|
+
}
|
|
597
656
|
}
|
|
598
657
|
// Batch remove all hidden elements
|
|
599
658
|
elementsToRemove.forEach((reason, el) => {
|
|
@@ -719,106 +778,50 @@ class Defuddle {
|
|
|
719
778
|
findSmallImages(doc) {
|
|
720
779
|
const MIN_DIMENSION = 33;
|
|
721
780
|
const smallImages = new Set();
|
|
722
|
-
const transformRegex = /scale\(([\d.]+)\)/;
|
|
723
|
-
const startTime = Date.now();
|
|
724
781
|
let processedCount = 0;
|
|
725
|
-
|
|
726
|
-
const
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
return element.ownerDocument.defaultView?.getComputedStyle(element);
|
|
753
|
-
}
|
|
754
|
-
catch (e) {
|
|
755
|
-
return null;
|
|
756
|
-
}
|
|
757
|
-
});
|
|
758
|
-
// Get bounding rectangles if available
|
|
759
|
-
const rects = batch.map(({ element }) => {
|
|
760
|
-
try {
|
|
761
|
-
return element.getBoundingClientRect();
|
|
762
|
-
}
|
|
763
|
-
catch (e) {
|
|
764
|
-
return null;
|
|
765
|
-
}
|
|
766
|
-
});
|
|
767
|
-
// Process phase - no DOM operations
|
|
768
|
-
batch.forEach((measurement, index) => {
|
|
769
|
-
try {
|
|
770
|
-
const style = styles[index];
|
|
771
|
-
const rect = rects[index];
|
|
772
|
-
if (!style)
|
|
773
|
-
return;
|
|
774
|
-
// Get transform scale in the same batch
|
|
775
|
-
const transform = style.transform;
|
|
776
|
-
const scale = transform ?
|
|
777
|
-
parseFloat(transform.match(transformRegex)?.[1] || '1') : 1;
|
|
778
|
-
// Calculate effective dimensions
|
|
779
|
-
const widths = [
|
|
780
|
-
measurement.naturalWidth,
|
|
781
|
-
measurement.attrWidth,
|
|
782
|
-
parseInt(style.width) || 0,
|
|
783
|
-
rect ? rect.width * scale : 0
|
|
784
|
-
].filter(dim => typeof dim === 'number' && dim > 0);
|
|
785
|
-
const heights = [
|
|
786
|
-
measurement.naturalHeight,
|
|
787
|
-
measurement.attrHeight,
|
|
788
|
-
parseInt(style.height) || 0,
|
|
789
|
-
rect ? rect.height * scale : 0
|
|
790
|
-
].filter(dim => typeof dim === 'number' && dim > 0);
|
|
791
|
-
// Decision phase - no DOM operations
|
|
792
|
-
if (widths.length > 0 && heights.length > 0) {
|
|
793
|
-
const effectiveWidth = Math.min(...widths);
|
|
794
|
-
const effectiveHeight = Math.min(...heights);
|
|
795
|
-
if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
|
|
796
|
-
const identifier = this.getElementIdentifier(measurement.element);
|
|
797
|
-
if (identifier) {
|
|
798
|
-
smallImages.add(identifier);
|
|
799
|
-
processedCount++;
|
|
800
|
-
}
|
|
801
|
-
}
|
|
802
|
-
}
|
|
803
|
-
}
|
|
804
|
-
catch (e) {
|
|
805
|
-
if (this.debug) {
|
|
806
|
-
console.warn('Defuddle: Failed to process element dimensions:', e);
|
|
807
|
-
}
|
|
808
|
-
}
|
|
809
|
-
});
|
|
782
|
+
const elements = doc.querySelectorAll('img, svg');
|
|
783
|
+
const defaultView = doc.defaultView;
|
|
784
|
+
const isBrowser = typeof window !== 'undefined' && defaultView === window;
|
|
785
|
+
for (const element of elements) {
|
|
786
|
+
const attrWidth = parseInt(element.getAttribute('width') || '0');
|
|
787
|
+
const attrHeight = parseInt(element.getAttribute('height') || '0');
|
|
788
|
+
// Check inline style dimensions
|
|
789
|
+
const style = element.getAttribute('style') || '';
|
|
790
|
+
const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
|
|
791
|
+
const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
|
|
792
|
+
// Use getComputedStyle and getBoundingClientRect only in browser
|
|
793
|
+
let computedWidth = 0, computedHeight = 0;
|
|
794
|
+
if (isBrowser) {
|
|
795
|
+
try {
|
|
796
|
+
const cs = defaultView.getComputedStyle(element);
|
|
797
|
+
computedWidth = parseInt(cs.width) || 0;
|
|
798
|
+
computedHeight = parseInt(cs.height) || 0;
|
|
799
|
+
}
|
|
800
|
+
catch (e) { }
|
|
801
|
+
try {
|
|
802
|
+
const rect = element.getBoundingClientRect();
|
|
803
|
+
if (rect.width > 0)
|
|
804
|
+
computedWidth = computedWidth || rect.width;
|
|
805
|
+
if (rect.height > 0)
|
|
806
|
+
computedHeight = computedHeight || rect.height;
|
|
807
|
+
}
|
|
808
|
+
catch (e) { }
|
|
810
809
|
}
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
810
|
+
const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0);
|
|
811
|
+
const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0);
|
|
812
|
+
if (widths.length > 0 && heights.length > 0) {
|
|
813
|
+
const effectiveWidth = Math.min(...widths);
|
|
814
|
+
const effectiveHeight = Math.min(...heights);
|
|
815
|
+
if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
|
|
816
|
+
const identifier = this.getElementIdentifier(element);
|
|
817
|
+
if (identifier) {
|
|
818
|
+
smallImages.add(identifier);
|
|
819
|
+
processedCount++;
|
|
820
|
+
}
|
|
814
821
|
}
|
|
815
822
|
}
|
|
816
823
|
}
|
|
817
|
-
|
|
818
|
-
this._log('Found small elements:', {
|
|
819
|
-
count: processedCount,
|
|
820
|
-
processingTime: `${(endTime - startTime).toFixed(2)}ms`
|
|
821
|
-
});
|
|
824
|
+
this._log('Found small elements:', processedCount);
|
|
822
825
|
return smallImages;
|
|
823
826
|
}
|
|
824
827
|
removeSmallImages(doc, smallImages) {
|
|
@@ -953,13 +956,11 @@ class Defuddle {
|
|
|
953
956
|
}
|
|
954
957
|
findContentByScoring(doc) {
|
|
955
958
|
const candidates = [];
|
|
956
|
-
constants_1.
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
}
|
|
962
|
-
});
|
|
959
|
+
doc.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR).forEach((element) => {
|
|
960
|
+
const score = scoring_1.ContentScorer.scoreElement(element);
|
|
961
|
+
if (score > 0) {
|
|
962
|
+
candidates.push({ score, element });
|
|
963
|
+
}
|
|
963
964
|
});
|
|
964
965
|
return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null;
|
|
965
966
|
}
|
|
@@ -1050,12 +1051,12 @@ class Defuddle {
|
|
|
1050
1051
|
* Walks both trees in parallel so positional correspondence is exact.
|
|
1051
1052
|
*/
|
|
1052
1053
|
flattenShadowRoots(original, clone) {
|
|
1053
|
-
const origElements = Array.from(original.body.
|
|
1054
|
+
const origElements = Array.from(original.body.querySelectorAll('*'));
|
|
1054
1055
|
// Find the first element with a shadow root (also serves as the hasShadowRoots check)
|
|
1055
1056
|
const firstShadow = origElements.find(el => el.shadowRoot);
|
|
1056
1057
|
if (!firstShadow)
|
|
1057
1058
|
return;
|
|
1058
|
-
const cloneElements = Array.from(clone.body.
|
|
1059
|
+
const cloneElements = Array.from(clone.body.querySelectorAll('*'));
|
|
1059
1060
|
// Check if we can directly read shadow DOM content (main world / Node.js).
|
|
1060
1061
|
// In content script isolated worlds, shadowRoot exists but content is empty.
|
|
1061
1062
|
const canReadShadow = (firstShadow.shadowRoot?.childNodes?.length ?? 0) > 0;
|
|
@@ -1096,6 +1097,68 @@ class Defuddle {
|
|
|
1096
1097
|
}
|
|
1097
1098
|
}
|
|
1098
1099
|
}
|
|
1100
|
+
/**
|
|
1101
|
+
* Resolve React streaming SSR suspense boundaries.
|
|
1102
|
+
* React's streaming SSR places content in hidden divs (id="S:0") and
|
|
1103
|
+
* template placeholders (id="B:0") with $RC scripts to swap them.
|
|
1104
|
+
* Since we don't execute scripts, we perform the swap manually.
|
|
1105
|
+
*/
|
|
1106
|
+
resolveStreamedContent(doc) {
|
|
1107
|
+
// Find $RC("B:X","S:X") calls in inline scripts
|
|
1108
|
+
const scripts = doc.querySelectorAll('script');
|
|
1109
|
+
const swaps = [];
|
|
1110
|
+
const rcPattern = /\$RC\("(B:\d+)","(S:\d+)"\)/g;
|
|
1111
|
+
for (const script of scripts) {
|
|
1112
|
+
const text = script.textContent || '';
|
|
1113
|
+
if (!text.includes('$RC('))
|
|
1114
|
+
continue;
|
|
1115
|
+
rcPattern.lastIndex = 0;
|
|
1116
|
+
let match;
|
|
1117
|
+
while ((match = rcPattern.exec(text)) !== null) {
|
|
1118
|
+
swaps.push({ templateId: match[1], contentId: match[2] });
|
|
1119
|
+
}
|
|
1120
|
+
}
|
|
1121
|
+
if (swaps.length === 0)
|
|
1122
|
+
return;
|
|
1123
|
+
let swapCount = 0;
|
|
1124
|
+
for (const { templateId, contentId } of swaps) {
|
|
1125
|
+
const template = doc.getElementById(templateId);
|
|
1126
|
+
const content = doc.getElementById(contentId);
|
|
1127
|
+
if (!template || !content)
|
|
1128
|
+
continue;
|
|
1129
|
+
const parent = template.parentNode;
|
|
1130
|
+
if (!parent)
|
|
1131
|
+
continue;
|
|
1132
|
+
// Remove the fallback/skeleton content after the template
|
|
1133
|
+
// until the <!--/$--> comment marker
|
|
1134
|
+
let next = template.nextSibling;
|
|
1135
|
+
let foundMarker = false;
|
|
1136
|
+
while (next) {
|
|
1137
|
+
const following = next.nextSibling;
|
|
1138
|
+
if (next.nodeType === 8 && next.data === '/$') {
|
|
1139
|
+
next.remove();
|
|
1140
|
+
foundMarker = true;
|
|
1141
|
+
break;
|
|
1142
|
+
}
|
|
1143
|
+
next.remove();
|
|
1144
|
+
next = following;
|
|
1145
|
+
}
|
|
1146
|
+
// Skip swap if marker wasn't found — malformed streaming output
|
|
1147
|
+
if (!foundMarker)
|
|
1148
|
+
continue;
|
|
1149
|
+
// Insert content children before the template position
|
|
1150
|
+
while (content.firstChild) {
|
|
1151
|
+
parent.insertBefore(content.firstChild, template);
|
|
1152
|
+
}
|
|
1153
|
+
// Clean up the template and hidden div
|
|
1154
|
+
template.remove();
|
|
1155
|
+
content.remove();
|
|
1156
|
+
swapCount++;
|
|
1157
|
+
}
|
|
1158
|
+
if (swapCount > 0) {
|
|
1159
|
+
this._log('Resolved streamed content:', swapCount, 'suspense boundaries');
|
|
1160
|
+
}
|
|
1161
|
+
}
|
|
1099
1162
|
/**
|
|
1100
1163
|
* Replace a shadow DOM host element with a div containing its shadow content.
|
|
1101
1164
|
* Custom elements (tag names with hyphens) would re-initialize when inserted
|
|
@@ -1187,6 +1250,249 @@ class Defuddle {
|
|
|
1187
1250
|
_decodeHTMLEntities(text) {
|
|
1188
1251
|
return (0, dom_1.decodeHTMLEntities)(this.doc, text);
|
|
1189
1252
|
}
|
|
1253
|
+
/**
|
|
1254
|
+
* Build a DefuddleResponse from an extractor result with metadata
|
|
1255
|
+
*/
|
|
1256
|
+
buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags) {
|
|
1257
|
+
const contentHtml = this.resolveContentUrls(extracted.contentHtml);
|
|
1258
|
+
const variables = this.getExtractorVariables(extracted.variables);
|
|
1259
|
+
return {
|
|
1260
|
+
content: contentHtml,
|
|
1261
|
+
title: extracted.variables?.title || metadata.title,
|
|
1262
|
+
description: metadata.description,
|
|
1263
|
+
domain: metadata.domain,
|
|
1264
|
+
favicon: metadata.favicon,
|
|
1265
|
+
image: metadata.image,
|
|
1266
|
+
language: extracted.variables?.language || metadata.language,
|
|
1267
|
+
published: extracted.variables?.published || metadata.published,
|
|
1268
|
+
author: extracted.variables?.author || metadata.author,
|
|
1269
|
+
site: extracted.variables?.site || metadata.site,
|
|
1270
|
+
schemaOrgData: metadata.schemaOrgData,
|
|
1271
|
+
wordCount: this.countWords(extracted.contentHtml),
|
|
1272
|
+
parseTime: Math.round(Date.now() - startTime),
|
|
1273
|
+
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
|
|
1274
|
+
metaTags: pageMetaTags,
|
|
1275
|
+
...(variables ? { variables } : {}),
|
|
1276
|
+
};
|
|
1277
|
+
}
|
|
1278
|
+
/**
|
|
1279
|
+
* Filter extractor variables to only include custom ones
|
|
1280
|
+
* (exclude standard fields that are already mapped to top-level properties)
|
|
1281
|
+
*/
|
|
1282
|
+
getExtractorVariables(variables) {
|
|
1283
|
+
if (!variables)
|
|
1284
|
+
return undefined;
|
|
1285
|
+
const custom = {};
|
|
1286
|
+
let hasCustom = false;
|
|
1287
|
+
for (const [key, value] of Object.entries(variables)) {
|
|
1288
|
+
if (!STANDARD_VARIABLE_KEYS.has(key)) {
|
|
1289
|
+
custom[key] = value;
|
|
1290
|
+
hasCustom = true;
|
|
1291
|
+
}
|
|
1292
|
+
}
|
|
1293
|
+
return hasCustom ? custom : undefined;
|
|
1294
|
+
}
|
|
1295
|
+
/**
|
|
1296
|
+
* Content-based pattern removal for elements that can't be detected by
|
|
1297
|
+
* CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
|
|
1298
|
+
*/
|
|
1299
|
+
removeByContentPattern(mainContent, debugRemovals) {
|
|
1300
|
+
// Remove read time metadata (e.g. "Mar 4th 2026 | 3 min read")
|
|
1301
|
+
// Only removes leaf elements whose text is PURELY date + read time,
|
|
1302
|
+
// not mixed with other meaningful content like tag names.
|
|
1303
|
+
const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time'));
|
|
1304
|
+
for (const el of candidates) {
|
|
1305
|
+
if (!el.parentNode)
|
|
1306
|
+
continue;
|
|
1307
|
+
if (el.closest('pre') || el.closest('code'))
|
|
1308
|
+
continue;
|
|
1309
|
+
const text = el.textContent?.trim() || '';
|
|
1310
|
+
const words = text.split(/\s+/).length;
|
|
1311
|
+
// Match date + read time in short elements
|
|
1312
|
+
if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
|
|
1313
|
+
// Ensure this is a leaf-ish element, not a large container
|
|
1314
|
+
if (el.querySelectorAll('p, div, section, article').length === 0) {
|
|
1315
|
+
// Verify the text is ONLY date + read time metadata
|
|
1316
|
+
// by stripping all date/time words and checking nothing remains
|
|
1317
|
+
let cleaned = text;
|
|
1318
|
+
for (const pattern of METADATA_STRIP_PATTERNS) {
|
|
1319
|
+
cleaned = cleaned.replace(pattern, '');
|
|
1320
|
+
}
|
|
1321
|
+
if (cleaned.trim().length > 0)
|
|
1322
|
+
continue;
|
|
1323
|
+
if (this.debug && debugRemovals) {
|
|
1324
|
+
debugRemovals.push({
|
|
1325
|
+
step: 'removeByContentPattern',
|
|
1326
|
+
reason: 'read time metadata',
|
|
1327
|
+
text: (0, utils_1.textPreview)(el)
|
|
1328
|
+
});
|
|
1329
|
+
}
|
|
1330
|
+
el.remove();
|
|
1331
|
+
}
|
|
1332
|
+
}
|
|
1333
|
+
}
|
|
1334
|
+
// Remove standalone time/date elements near the start or end of content.
|
|
1335
|
+
// A <time> in its own paragraph at the boundary is metadata (publish date),
|
|
1336
|
+
// but <time> inline within prose should be preserved (see issue #136).
|
|
1337
|
+
const timeElements = Array.from(mainContent.querySelectorAll('time'));
|
|
1338
|
+
const contentText = mainContent.textContent || '';
|
|
1339
|
+
for (const time of timeElements) {
|
|
1340
|
+
if (!time.parentNode)
|
|
1341
|
+
continue;
|
|
1342
|
+
// Walk up through inline/formatting wrappers only (i, em, span, b, strong)
|
|
1343
|
+
// Stop at block elements to avoid removing containers with other content.
|
|
1344
|
+
let target = time;
|
|
1345
|
+
let targetText = target.textContent?.trim() || '';
|
|
1346
|
+
while (target.parentElement && target.parentElement !== mainContent) {
|
|
1347
|
+
const parentTag = target.parentElement.tagName.toLowerCase();
|
|
1348
|
+
const parentText = target.parentElement.textContent?.trim() || '';
|
|
1349
|
+
// If parent is a <p> that only wraps this time, include it
|
|
1350
|
+
if (parentTag === 'p' && parentText === targetText) {
|
|
1351
|
+
target = target.parentElement;
|
|
1352
|
+
break;
|
|
1353
|
+
}
|
|
1354
|
+
// Only walk through inline formatting wrappers
|
|
1355
|
+
if (['i', 'em', 'span', 'b', 'strong', 'small'].includes(parentTag) &&
|
|
1356
|
+
parentText === targetText) {
|
|
1357
|
+
target = target.parentElement;
|
|
1358
|
+
targetText = parentText;
|
|
1359
|
+
continue;
|
|
1360
|
+
}
|
|
1361
|
+
break;
|
|
1362
|
+
}
|
|
1363
|
+
const text = target.textContent?.trim() || '';
|
|
1364
|
+
const words = text.split(/\s+/).length;
|
|
1365
|
+
if (words > 10)
|
|
1366
|
+
continue;
|
|
1367
|
+
// Check if this element is near the start or end of mainContent
|
|
1368
|
+
const pos = contentText.indexOf(text);
|
|
1369
|
+
const distFromEnd = contentText.length - (pos + text.length);
|
|
1370
|
+
if (pos > 200 && distFromEnd > 200)
|
|
1371
|
+
continue;
|
|
1372
|
+
if (this.debug && debugRemovals) {
|
|
1373
|
+
debugRemovals.push({
|
|
1374
|
+
step: 'removeByContentPattern',
|
|
1375
|
+
reason: 'boundary date element',
|
|
1376
|
+
text: (0, utils_1.textPreview)(target)
|
|
1377
|
+
});
|
|
1378
|
+
}
|
|
1379
|
+
target.remove();
|
|
1380
|
+
}
|
|
1381
|
+
// Remove section breadcrumbs
|
|
1382
|
+
// Short elements containing a link to a parent section of the current URL.
|
|
1383
|
+
const url = this.options.url || this.doc.URL || '';
|
|
1384
|
+
let urlPath = '';
|
|
1385
|
+
try {
|
|
1386
|
+
urlPath = new URL(url).pathname;
|
|
1387
|
+
}
|
|
1388
|
+
catch { }
|
|
1389
|
+
if (urlPath) {
|
|
1390
|
+
const shortElements = mainContent.querySelectorAll('div, span, p');
|
|
1391
|
+
for (const el of shortElements) {
|
|
1392
|
+
if (!el.parentNode)
|
|
1393
|
+
continue;
|
|
1394
|
+
const text = el.textContent?.trim() || '';
|
|
1395
|
+
const words = text.split(/\s+/).length;
|
|
1396
|
+
if (words > 10)
|
|
1397
|
+
continue;
|
|
1398
|
+
// Must be a leaf-ish element (no block children)
|
|
1399
|
+
if (el.querySelectorAll('p, div, section, article').length > 0)
|
|
1400
|
+
continue;
|
|
1401
|
+
const link = el.querySelector('a[href]');
|
|
1402
|
+
if (!link)
|
|
1403
|
+
continue;
|
|
1404
|
+
try {
|
|
1405
|
+
const linkPath = new URL(link.getAttribute('href') || '', url).pathname;
|
|
1406
|
+
if (linkPath !== '/' && linkPath !== urlPath && urlPath.startsWith(linkPath)) {
|
|
1407
|
+
if (this.debug && debugRemovals) {
|
|
1408
|
+
debugRemovals.push({
|
|
1409
|
+
step: 'removeByContentPattern',
|
|
1410
|
+
reason: 'section breadcrumb',
|
|
1411
|
+
text: (0, utils_1.textPreview)(el)
|
|
1412
|
+
});
|
|
1413
|
+
}
|
|
1414
|
+
el.remove();
|
|
1415
|
+
}
|
|
1416
|
+
}
|
|
1417
|
+
catch { }
|
|
1418
|
+
}
|
|
1419
|
+
}
|
|
1420
|
+
// Remove boilerplate sentences and trailing non-content.
|
|
1421
|
+
// Search elements for end-of-article boilerplate, then truncate
|
|
1422
|
+
// from the best ancestor that has siblings to remove.
|
|
1423
|
+
const fullText = mainContent.textContent || '';
|
|
1424
|
+
const boilerplateElements = mainContent.querySelectorAll('p, div, span, section');
|
|
1425
|
+
for (const el of boilerplateElements) {
|
|
1426
|
+
if (!el.parentNode)
|
|
1427
|
+
continue;
|
|
1428
|
+
const text = el.textContent?.trim() || '';
|
|
1429
|
+
const words = text.split(/\s+/).length;
|
|
1430
|
+
if (words > 50 || words < 3)
|
|
1431
|
+
continue;
|
|
1432
|
+
for (const pattern of BOILERPLATE_PATTERNS) {
|
|
1433
|
+
if (pattern.test(text)) {
|
|
1434
|
+
// Walk up to find an ancestor that has next siblings to truncate.
|
|
1435
|
+
// Don't walk all the way to mainContent's direct child — if there's
|
|
1436
|
+
// a single wrapper div, that would remove everything.
|
|
1437
|
+
let target = el;
|
|
1438
|
+
while (target.parentElement && target.parentElement !== mainContent) {
|
|
1439
|
+
if (target.nextElementSibling)
|
|
1440
|
+
break;
|
|
1441
|
+
target = target.parentElement;
|
|
1442
|
+
}
|
|
1443
|
+
// Only truncate if there's substantial content before the boilerplate
|
|
1444
|
+
const targetText = target.textContent || '';
|
|
1445
|
+
const targetPos = fullText.indexOf(targetText);
|
|
1446
|
+
if (targetPos < 200)
|
|
1447
|
+
continue;
|
|
1448
|
+
// Collect ancestors before modifying the DOM
|
|
1449
|
+
const ancestors = [];
|
|
1450
|
+
let anc = target.parentElement;
|
|
1451
|
+
while (anc && anc !== mainContent) {
|
|
1452
|
+
ancestors.push(anc);
|
|
1453
|
+
anc = anc.parentElement;
|
|
1454
|
+
}
|
|
1455
|
+
// Remove target element and its following siblings
|
|
1456
|
+
this.removeTrailingSiblings(target, true, debugRemovals);
|
|
1457
|
+
// Cascade upward: remove following siblings at each
|
|
1458
|
+
// ancestor level too. Everything after the boilerplate
|
|
1459
|
+
// in document order is non-content.
|
|
1460
|
+
for (const ancestor of ancestors) {
|
|
1461
|
+
this.removeTrailingSiblings(ancestor, false, debugRemovals);
|
|
1462
|
+
}
|
|
1463
|
+
return;
|
|
1464
|
+
}
|
|
1465
|
+
}
|
|
1466
|
+
}
|
|
1467
|
+
}
|
|
1468
|
+
/**
|
|
1469
|
+
* Remove an element's following siblings, and optionally the element itself.
|
|
1470
|
+
*/
|
|
1471
|
+
removeTrailingSiblings(element, removeSelf, debugRemovals) {
|
|
1472
|
+
let sibling = element.nextElementSibling;
|
|
1473
|
+
while (sibling) {
|
|
1474
|
+
const next = sibling.nextElementSibling;
|
|
1475
|
+
if (this.debug && debugRemovals) {
|
|
1476
|
+
debugRemovals.push({
|
|
1477
|
+
step: 'removeByContentPattern',
|
|
1478
|
+
reason: 'trailing non-content',
|
|
1479
|
+
text: (0, utils_1.textPreview)(sibling)
|
|
1480
|
+
});
|
|
1481
|
+
}
|
|
1482
|
+
sibling.remove();
|
|
1483
|
+
sibling = next;
|
|
1484
|
+
}
|
|
1485
|
+
if (removeSelf) {
|
|
1486
|
+
if (this.debug && debugRemovals) {
|
|
1487
|
+
debugRemovals.push({
|
|
1488
|
+
step: 'removeByContentPattern',
|
|
1489
|
+
reason: 'boilerplate text',
|
|
1490
|
+
text: (0, utils_1.textPreview)(element)
|
|
1491
|
+
});
|
|
1492
|
+
}
|
|
1493
|
+
element.remove();
|
|
1494
|
+
}
|
|
1495
|
+
}
|
|
1190
1496
|
}
|
|
1191
1497
|
exports.Defuddle = Defuddle;
|
|
1192
1498
|
//# sourceMappingURL=defuddle.js.map
|