defuddle 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -29
- package/dist/cli.js +15 -46
- package/dist/cli.js.map +1 -1
- package/dist/constants.d.ts +9 -0
- package/dist/constants.js +33 -9
- package/dist/constants.js.map +1 -1
- package/dist/defuddle.d.ts +50 -2
- package/dist/defuddle.js +615 -238
- package/dist/defuddle.js.map +1 -1
- package/dist/elements/code.js +31 -9
- package/dist/elements/code.js.map +1 -1
- package/dist/elements/footnotes.js +2 -1
- package/dist/elements/footnotes.js.map +1 -1
- package/dist/elements/headings.js +42 -50
- package/dist/elements/headings.js.map +1 -1
- package/dist/extractor-registry.d.ts +1 -0
- package/dist/extractor-registry.js +3 -0
- package/dist/extractor-registry.js.map +1 -1
- package/dist/extractors/_base.d.ts +6 -0
- package/dist/extractors/_base.js +8 -0
- package/dist/extractors/_base.js.map +1 -1
- package/dist/extractors/github.d.ts +10 -2
- package/dist/extractors/github.js +158 -71
- package/dist/extractors/github.js.map +1 -1
- package/dist/extractors/hackernews.js +18 -72
- package/dist/extractors/hackernews.js.map +1 -1
- package/dist/extractors/reddit.d.ts +1 -2
- package/dist/extractors/reddit.js +41 -94
- package/dist/extractors/reddit.js.map +1 -1
- package/dist/extractors/x-oembed.d.ts +0 -1
- package/dist/extractors/x-oembed.js +20 -27
- package/dist/extractors/x-oembed.js.map +1 -1
- package/dist/extractors/youtube.d.ts +57 -0
- package/dist/extractors/youtube.js +619 -10
- package/dist/extractors/youtube.js.map +1 -1
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/markdown.js +5 -0
- package/dist/markdown.js.map +1 -1
- package/dist/metadata.d.ts +5 -0
- package/dist/metadata.js +28 -0
- package/dist/metadata.js.map +1 -1
- package/dist/node.d.ts +12 -5
- package/dist/node.js +53 -22
- package/dist/node.js.map +1 -1
- package/dist/scoring.d.ts +6 -1
- package/dist/scoring.js +69 -22
- package/dist/scoring.js.map +1 -1
- package/dist/standardize.js +152 -63
- package/dist/standardize.js.map +1 -1
- package/dist/types.d.ts +9 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +9 -0
- package/dist/utils/dom.js +20 -0
- package/dist/utils/dom.js.map +1 -1
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/dist/utils.d.ts +6 -0
- package/dist/utils.js +36 -0
- package/dist/utils.js.map +1 -1
- package/package.json +3 -4
package/dist/defuddle.js
CHANGED
|
@@ -9,6 +9,23 @@ const footnotes_1 = require("./elements/footnotes");
|
|
|
9
9
|
const scoring_1 = require("./scoring");
|
|
10
10
|
const utils_1 = require("./utils");
|
|
11
11
|
const dom_1 = require("./utils/dom");
|
|
12
|
+
/** Keys from extractor variables that map to top-level DefuddleResponse fields */
|
|
13
|
+
const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
|
|
14
|
+
// Content pattern detection constants
|
|
15
|
+
const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
|
|
16
|
+
const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
|
|
17
|
+
const BOILERPLATE_PATTERNS = [
|
|
18
|
+
/^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
|
|
19
|
+
/^A version of this (?:article|story) (?:appeared|was published) in\b/i,
|
|
20
|
+
/^Originally (?:published|appeared) (?:in|on|at)\b/i,
|
|
21
|
+
];
|
|
22
|
+
const METADATA_STRIP_PATTERNS = [
|
|
23
|
+
/\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi,
|
|
24
|
+
/\b\d+(?:st|nd|rd|th)?\b/g,
|
|
25
|
+
/\bmin(?:ute)?s?\b/gi,
|
|
26
|
+
/\bread\b/gi,
|
|
27
|
+
/[|·•—–\-,.\s]/g,
|
|
28
|
+
];
|
|
12
29
|
class Defuddle {
|
|
13
30
|
/**
|
|
14
31
|
* Create a new Defuddle instance
|
|
@@ -16,10 +33,23 @@ class Defuddle {
|
|
|
16
33
|
* @param options - Options for parsing
|
|
17
34
|
*/
|
|
18
35
|
constructor(doc, options = {}) {
|
|
36
|
+
this._schemaOrgData = undefined;
|
|
37
|
+
this._schemaOrgExtracted = false;
|
|
19
38
|
this.doc = doc;
|
|
20
39
|
this.options = options;
|
|
21
40
|
this.debug = options.debug || false;
|
|
22
41
|
}
|
|
42
|
+
/**
|
|
43
|
+
* Lazily extract and cache schema.org data. Must be called before
|
|
44
|
+
* parse() strips script tags from the document.
|
|
45
|
+
*/
|
|
46
|
+
getSchemaOrgData() {
|
|
47
|
+
if (!this._schemaOrgExtracted) {
|
|
48
|
+
this._schemaOrgData = this._extractSchemaOrgData(this.doc);
|
|
49
|
+
this._schemaOrgExtracted = true;
|
|
50
|
+
}
|
|
51
|
+
return this._schemaOrgData;
|
|
52
|
+
}
|
|
23
53
|
/**
|
|
24
54
|
* Parse the document and extract its main content
|
|
25
55
|
*/
|
|
@@ -42,13 +72,44 @@ class Defuddle {
|
|
|
42
72
|
}
|
|
43
73
|
}
|
|
44
74
|
// If still very little content, the page may be an index/listing page
|
|
75
|
+
// or a page that reveals content at runtime from a hidden wrapper.
|
|
76
|
+
// Retry once with hidden-element removal disabled.
|
|
77
|
+
if (result.wordCount < 50) {
|
|
78
|
+
this._log('Still very little content, retrying without hidden-element removal');
|
|
79
|
+
const hiddenRetry = this.parseInternal({
|
|
80
|
+
removeHiddenElements: false
|
|
81
|
+
});
|
|
82
|
+
if (hiddenRetry.wordCount > result.wordCount * 2) {
|
|
83
|
+
this._log('Hidden-element retry produced more content');
|
|
84
|
+
result = hiddenRetry;
|
|
85
|
+
}
|
|
86
|
+
// Try targeting the largest hidden subtree directly to avoid body-level
|
|
87
|
+
// leftovers (e.g. FPS counters) when hidden content is the real article.
|
|
88
|
+
const hiddenSelector = this.findLargestHiddenContentSelector();
|
|
89
|
+
if (hiddenSelector) {
|
|
90
|
+
this._log('Retrying with hidden content selector:', hiddenSelector);
|
|
91
|
+
const hiddenSelectorRetry = this.parseInternal({
|
|
92
|
+
removeHiddenElements: false,
|
|
93
|
+
removePartialSelectors: false,
|
|
94
|
+
contentSelector: hiddenSelector
|
|
95
|
+
});
|
|
96
|
+
if (hiddenSelectorRetry.wordCount > result.wordCount ||
|
|
97
|
+
(hiddenSelectorRetry.wordCount > Math.max(20, result.wordCount * 0.7) &&
|
|
98
|
+
hiddenSelectorRetry.content.length < result.content.length)) {
|
|
99
|
+
this._log('Hidden-selector retry produced better focused content');
|
|
100
|
+
result = hiddenSelectorRetry;
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
// If still very little content, the page may be an index/listing page
|
|
45
105
|
// where card elements were scored as non-content or removed by partial
|
|
46
106
|
// selectors (e.g. "post-preview"). Retry with both disabled.
|
|
47
107
|
if (result.wordCount < 50) {
|
|
48
108
|
this._log('Still very little content, retrying without scoring/partial selectors (possible index page)');
|
|
49
109
|
const indexRetry = this.parseInternal({
|
|
50
110
|
removeLowScoring: false,
|
|
51
|
-
removePartialSelectors: false
|
|
111
|
+
removePartialSelectors: false,
|
|
112
|
+
removeContentPatterns: false
|
|
52
113
|
});
|
|
53
114
|
if (indexRetry.wordCount > result.wordCount) {
|
|
54
115
|
this._log('Index page retry produced more content');
|
|
@@ -64,17 +125,17 @@ class Defuddle {
|
|
|
64
125
|
// longer than what we extracted, the scorer likely picked the wrong
|
|
65
126
|
// element from a feed. Find the correct element in the DOM.
|
|
66
127
|
const schemaText = this._getSchemaText(result.schemaOrgData);
|
|
67
|
-
if (schemaText && this.
|
|
128
|
+
if (schemaText && this.countHtmlWords(schemaText) > result.wordCount) {
|
|
68
129
|
const contentHtml = this._findContentBySchemaText(schemaText);
|
|
69
130
|
if (contentHtml) {
|
|
70
131
|
this._log('Found DOM content matching schema.org text');
|
|
71
132
|
result.content = contentHtml;
|
|
72
|
-
result.wordCount = this.
|
|
133
|
+
result.wordCount = this.countHtmlWords(contentHtml);
|
|
73
134
|
}
|
|
74
135
|
else {
|
|
75
136
|
this._log('Using schema.org text as content (DOM element not found)');
|
|
76
137
|
result.content = schemaText;
|
|
77
|
-
result.wordCount = this.
|
|
138
|
+
result.wordCount = this.countHtmlWords(schemaText);
|
|
78
139
|
}
|
|
79
140
|
}
|
|
80
141
|
return result;
|
|
@@ -125,8 +186,7 @@ class Defuddle {
|
|
|
125
186
|
el.removeAttribute(attr.name);
|
|
126
187
|
}
|
|
127
188
|
else if (['href', 'src', 'action', 'formaction', 'xlink:href'].includes(name)) {
|
|
128
|
-
|
|
129
|
-
if (val.startsWith('javascript:') || val.startsWith('data:text/html')) {
|
|
189
|
+
if ((0, dom_1.isDangerousUrl)(attr.value)) {
|
|
130
190
|
el.removeAttribute(attr.name);
|
|
131
191
|
}
|
|
132
192
|
}
|
|
@@ -149,7 +209,7 @@ class Defuddle {
|
|
|
149
209
|
const searchPhrase = firstPara.substring(0, 100).trim();
|
|
150
210
|
if (!searchPhrase)
|
|
151
211
|
return '';
|
|
152
|
-
const schemaWordCount = this.
|
|
212
|
+
const schemaWordCount = this.countHtmlWords(schemaText);
|
|
153
213
|
// Find the smallest element whose text contains the search phrase
|
|
154
214
|
// and whose word count is close to the schema text's word count
|
|
155
215
|
let bestMatch = null;
|
|
@@ -159,7 +219,7 @@ class Defuddle {
|
|
|
159
219
|
const elText = (el.textContent || '');
|
|
160
220
|
if (!elText.includes(searchPhrase))
|
|
161
221
|
continue;
|
|
162
|
-
const elWords =
|
|
222
|
+
const elWords = (0, utils_1.countWords)(elText);
|
|
163
223
|
// Element should contain roughly the same amount of text
|
|
164
224
|
// (allow some slack for surrounding whitespace / minor extras)
|
|
165
225
|
if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
|
|
@@ -211,6 +271,27 @@ class Defuddle {
|
|
|
211
271
|
}
|
|
212
272
|
return html;
|
|
213
273
|
}
|
|
274
|
+
findLargestHiddenContentSelector() {
|
|
275
|
+
const body = this.doc.body;
|
|
276
|
+
if (!body)
|
|
277
|
+
return undefined;
|
|
278
|
+
const candidates = Array.from(body.querySelectorAll(constants_1.HIDDEN_EXACT_SKIP_SELECTOR)).filter(el => {
|
|
279
|
+
const className = el.getAttribute('class') || '';
|
|
280
|
+
return !className.includes('math');
|
|
281
|
+
});
|
|
282
|
+
let best = null;
|
|
283
|
+
let bestWords = 0;
|
|
284
|
+
for (const el of candidates) {
|
|
285
|
+
const words = (0, utils_1.countWords)(el.textContent || '');
|
|
286
|
+
if (words > bestWords) {
|
|
287
|
+
best = el;
|
|
288
|
+
bestWords = words;
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
if (!best || bestWords < 30)
|
|
292
|
+
return undefined;
|
|
293
|
+
return this.getElementSelector(best);
|
|
294
|
+
}
|
|
214
295
|
/**
|
|
215
296
|
* Get the largest available src from an img element,
|
|
216
297
|
* checking srcset for higher-resolution versions.
|
|
@@ -245,68 +326,109 @@ class Defuddle {
|
|
|
245
326
|
return url;
|
|
246
327
|
}
|
|
247
328
|
/**
|
|
248
|
-
* Parse the document
|
|
329
|
+
* Parse the document asynchronously. Checks for extractors that prefer
|
|
330
|
+
* async (e.g. YouTube transcripts) before sync, then falls back to async
|
|
331
|
+
* extractors if sync parse yields no content.
|
|
249
332
|
*/
|
|
250
333
|
async parseAsync() {
|
|
334
|
+
if (this.options.useAsync !== false) {
|
|
335
|
+
const asyncResult = await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry));
|
|
336
|
+
if (asyncResult)
|
|
337
|
+
return asyncResult;
|
|
338
|
+
}
|
|
251
339
|
const result = this.parse();
|
|
252
340
|
if (result.wordCount > 0 || this.options.useAsync === false) {
|
|
253
341
|
return result;
|
|
254
342
|
}
|
|
343
|
+
return (await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry))) ?? result;
|
|
344
|
+
}
|
|
345
|
+
/**
|
|
346
|
+
* Fetch only async variables (e.g. transcript) without re-parsing.
|
|
347
|
+
* Safe to call after parse() — uses cached schema.org data since
|
|
348
|
+
* parse() strips script tags from the document.
|
|
349
|
+
*/
|
|
350
|
+
async fetchAsyncVariables() {
|
|
351
|
+
if (this.options.useAsync === false)
|
|
352
|
+
return null;
|
|
353
|
+
try {
|
|
354
|
+
const url = this.options.url || this.doc.URL;
|
|
355
|
+
const schemaOrgData = this.getSchemaOrgData();
|
|
356
|
+
const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
|
|
357
|
+
if (extractor) {
|
|
358
|
+
const extracted = await extractor.extractAsync();
|
|
359
|
+
return this.getExtractorVariables(extracted.variables) || null;
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
catch (error) {
|
|
363
|
+
console.error('Defuddle', 'Error fetching async variables:', error);
|
|
364
|
+
}
|
|
365
|
+
return null;
|
|
366
|
+
}
|
|
367
|
+
async tryAsyncExtractor(finder) {
|
|
255
368
|
try {
|
|
256
369
|
const url = this.options.url || this.doc.URL;
|
|
257
|
-
const schemaOrgData = this.
|
|
258
|
-
const extractor =
|
|
370
|
+
const schemaOrgData = this.getSchemaOrgData();
|
|
371
|
+
const extractor = finder(this.doc, url, schemaOrgData);
|
|
259
372
|
if (extractor) {
|
|
260
373
|
const startTime = Date.now();
|
|
261
374
|
const extracted = await extractor.extractAsync();
|
|
262
|
-
const contentHtml = this.resolveContentUrls(extracted.contentHtml);
|
|
263
375
|
const pageMetaTags = this._collectMetaTags();
|
|
264
376
|
const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
|
|
265
|
-
|
|
266
|
-
return {
|
|
267
|
-
content: contentHtml,
|
|
268
|
-
title: extracted.variables?.title || metadata.title,
|
|
269
|
-
description: metadata.description,
|
|
270
|
-
domain: metadata.domain,
|
|
271
|
-
favicon: metadata.favicon,
|
|
272
|
-
image: metadata.image,
|
|
273
|
-
published: extracted.variables?.published || metadata.published,
|
|
274
|
-
author: extracted.variables?.author || metadata.author,
|
|
275
|
-
site: extracted.variables?.site || metadata.site,
|
|
276
|
-
schemaOrgData: metadata.schemaOrgData,
|
|
277
|
-
wordCount: this.countWords(extracted.contentHtml),
|
|
278
|
-
parseTime: Math.round(endTime - startTime),
|
|
279
|
-
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
|
|
280
|
-
metaTags: pageMetaTags
|
|
281
|
-
};
|
|
377
|
+
return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
|
|
282
378
|
}
|
|
283
379
|
}
|
|
284
380
|
catch (error) {
|
|
285
381
|
console.error('Defuddle', 'Error in async extraction:', error);
|
|
286
382
|
}
|
|
287
|
-
return
|
|
383
|
+
return null;
|
|
288
384
|
}
|
|
289
385
|
/**
|
|
290
386
|
* Internal parse method that does the actual work
|
|
291
387
|
*/
|
|
292
388
|
parseInternal(overrideOptions = {}) {
|
|
293
389
|
const startTime = Date.now();
|
|
390
|
+
// Guard against empty/broken documents (e.g. empty HTML, bot-blocked pages)
|
|
391
|
+
if (!this.doc.documentElement) {
|
|
392
|
+
const url = this.options.url || '';
|
|
393
|
+
return {
|
|
394
|
+
content: '',
|
|
395
|
+
title: '',
|
|
396
|
+
description: '',
|
|
397
|
+
domain: url ? new URL(url).hostname : '',
|
|
398
|
+
favicon: '',
|
|
399
|
+
image: '',
|
|
400
|
+
language: '',
|
|
401
|
+
parseTime: Date.now() - startTime,
|
|
402
|
+
published: '',
|
|
403
|
+
author: '',
|
|
404
|
+
site: '',
|
|
405
|
+
schemaOrgData: null,
|
|
406
|
+
wordCount: 0,
|
|
407
|
+
};
|
|
408
|
+
}
|
|
294
409
|
const options = {
|
|
295
410
|
removeExactSelectors: true,
|
|
296
411
|
removePartialSelectors: true,
|
|
297
412
|
removeHiddenElements: true,
|
|
298
413
|
removeLowScoring: true,
|
|
299
414
|
removeSmallImages: true,
|
|
415
|
+
removeContentPatterns: true,
|
|
300
416
|
standardize: true,
|
|
301
417
|
...this.options,
|
|
302
418
|
...overrideOptions
|
|
303
419
|
};
|
|
304
420
|
const debugRemovals = [];
|
|
305
|
-
// Extract schema.org data
|
|
306
|
-
const schemaOrgData = this.
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
421
|
+
// Extract schema.org data (cached — must happen before _stripUnsafeElements removes scripts)
|
|
422
|
+
const schemaOrgData = this.getSchemaOrgData();
|
|
423
|
+
// Cache meta tags and metadata across retries
|
|
424
|
+
if (!this._metaTags) {
|
|
425
|
+
this._metaTags = this._collectMetaTags();
|
|
426
|
+
}
|
|
427
|
+
const pageMetaTags = this._metaTags;
|
|
428
|
+
if (!this._metadata) {
|
|
429
|
+
this._metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
|
|
430
|
+
}
|
|
431
|
+
const metadata = this._metadata;
|
|
310
432
|
if (options.removeImages) {
|
|
311
433
|
this.removeImages(this.doc);
|
|
312
434
|
}
|
|
@@ -316,35 +438,28 @@ class Defuddle {
|
|
|
316
438
|
const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
|
|
317
439
|
if (extractor && extractor.canExtract()) {
|
|
318
440
|
const extracted = extractor.extract();
|
|
319
|
-
|
|
320
|
-
const endTime = Date.now();
|
|
321
|
-
// console.log('Using extractor:', extractor.constructor.name.replace('Extractor', ''));
|
|
322
|
-
return {
|
|
323
|
-
content: contentHtml,
|
|
324
|
-
title: extracted.variables?.title || metadata.title,
|
|
325
|
-
description: metadata.description,
|
|
326
|
-
domain: metadata.domain,
|
|
327
|
-
favicon: metadata.favicon,
|
|
328
|
-
image: metadata.image,
|
|
329
|
-
published: extracted.variables?.published || metadata.published,
|
|
330
|
-
author: extracted.variables?.author || metadata.author,
|
|
331
|
-
site: extracted.variables?.site || metadata.site,
|
|
332
|
-
schemaOrgData: metadata.schemaOrgData,
|
|
333
|
-
wordCount: this.countWords(extracted.contentHtml),
|
|
334
|
-
parseTime: Math.round(endTime - startTime),
|
|
335
|
-
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
|
|
336
|
-
metaTags: pageMetaTags
|
|
337
|
-
};
|
|
441
|
+
return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
|
|
338
442
|
}
|
|
339
443
|
// Continue if there is no extractor...
|
|
340
|
-
// Evaluate mobile styles and sizes on original document
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
444
|
+
// Evaluate mobile styles and sizes on original document (cached across retries)
|
|
445
|
+
if (!this._mobileStyles) {
|
|
446
|
+
this._mobileStyles = this._evaluateMediaQueries(this.doc);
|
|
447
|
+
}
|
|
448
|
+
const mobileStyles = this._mobileStyles;
|
|
449
|
+
// Find small images in original document (cached across retries)
|
|
450
|
+
if (!this._smallImages) {
|
|
451
|
+
this._smallImages = this.findSmallImages(this.doc);
|
|
452
|
+
}
|
|
453
|
+
const smallImages = this._smallImages;
|
|
344
454
|
// Clone document
|
|
345
455
|
const clone = this.doc.cloneNode(true);
|
|
456
|
+
// Merge adjacent text nodes that some DOM implementations (e.g. linkedom)
|
|
457
|
+
// create when parsing HTML entities like &#39; (apostrophe)
|
|
458
|
+
clone.body?.normalize();
|
|
346
459
|
// Flatten shadow DOM content into the clone
|
|
347
460
|
this.flattenShadowRoots(this.doc, clone);
|
|
461
|
+
// Resolve React streaming SSR suspense boundaries
|
|
462
|
+
this.resolveStreamedContent(clone);
|
|
348
463
|
// Apply mobile styles to clone
|
|
349
464
|
this.applyMobileStyles(clone, mobileStyles);
|
|
350
465
|
// Find main content
|
|
@@ -357,12 +472,12 @@ class Defuddle {
|
|
|
357
472
|
mainContent = this.findMainContent(clone);
|
|
358
473
|
}
|
|
359
474
|
if (!mainContent) {
|
|
360
|
-
const fallbackContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
|
|
475
|
+
const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
|
|
361
476
|
const endTime = Date.now();
|
|
362
477
|
return {
|
|
363
478
|
content: fallbackContent,
|
|
364
479
|
...metadata,
|
|
365
|
-
wordCount: this.
|
|
480
|
+
wordCount: this.countHtmlWords(fallbackContent),
|
|
366
481
|
parseTime: Math.round(endTime - startTime),
|
|
367
482
|
metaTags: pageMetaTags
|
|
368
483
|
};
|
|
@@ -382,11 +497,15 @@ class Defuddle {
|
|
|
382
497
|
// Remove non-content blocks by scoring
|
|
383
498
|
// Tries to find lists, navigation based on text content and link density
|
|
384
499
|
if (options.removeLowScoring) {
|
|
385
|
-
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals);
|
|
500
|
+
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
|
|
386
501
|
}
|
|
387
502
|
// Remove clutter using selectors
|
|
388
503
|
if (options.removeExactSelectors || options.removePartialSelectors) {
|
|
389
|
-
this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals);
|
|
504
|
+
this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
|
|
505
|
+
}
|
|
506
|
+
// Remove elements by content patterns (read time, boilerplate, article cards)
|
|
507
|
+
if (options.removeContentPatterns && mainContent) {
|
|
508
|
+
this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
|
|
390
509
|
}
|
|
391
510
|
// Normalize the main content
|
|
392
511
|
if (options.standardize) {
|
|
@@ -399,7 +518,7 @@ class Defuddle {
|
|
|
399
518
|
const result = {
|
|
400
519
|
content,
|
|
401
520
|
...metadata,
|
|
402
|
-
wordCount: this.
|
|
521
|
+
wordCount: this.countHtmlWords(content),
|
|
403
522
|
parseTime: Math.round(endTime - startTime),
|
|
404
523
|
metaTags: pageMetaTags
|
|
405
524
|
};
|
|
@@ -413,29 +532,29 @@ class Defuddle {
|
|
|
413
532
|
}
|
|
414
533
|
catch (error) {
|
|
415
534
|
console.error('Defuddle', 'Error processing document:', error);
|
|
416
|
-
const errorContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
|
|
535
|
+
const errorContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
|
|
417
536
|
const endTime = Date.now();
|
|
418
537
|
return {
|
|
419
538
|
content: errorContent,
|
|
420
539
|
...metadata,
|
|
421
|
-
wordCount: this.
|
|
540
|
+
wordCount: this.countHtmlWords(errorContent),
|
|
422
541
|
parseTime: Math.round(endTime - startTime),
|
|
423
542
|
metaTags: pageMetaTags
|
|
424
543
|
};
|
|
425
544
|
}
|
|
426
545
|
}
|
|
427
|
-
|
|
428
|
-
//
|
|
429
|
-
const
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
.
|
|
435
|
-
.replace(
|
|
436
|
-
.
|
|
437
|
-
.
|
|
438
|
-
return
|
|
546
|
+
countHtmlWords(content) {
|
|
547
|
+
// Strip HTML tags and decode common entities without DOM parsing
|
|
548
|
+
const text = content
|
|
549
|
+
.replace(/<[^>]*>/g, ' ')
|
|
550
|
+
.replace(/ /gi, ' ')
|
|
551
|
+
.replace(/&/gi, '&')
|
|
552
|
+
.replace(/</gi, '<')
|
|
553
|
+
.replace(/>/gi, '>')
|
|
554
|
+
.replace(/"/gi, '"')
|
|
555
|
+
.replace(/&#\d+;/g, ' ')
|
|
556
|
+
.replace(/&\w+;/g, ' ');
|
|
557
|
+
return (0, utils_1.countWords)(text);
|
|
439
558
|
}
|
|
440
559
|
// Make all other methods private by removing the static keyword and using private
|
|
441
560
|
_log(...args) {
|
|
@@ -447,6 +566,8 @@ class Defuddle {
|
|
|
447
566
|
const mobileStyles = [];
|
|
448
567
|
const maxWidthRegex = /max-width[^:]*:\s*(\d+)/;
|
|
449
568
|
try {
|
|
569
|
+
if (!doc.styleSheets)
|
|
570
|
+
return mobileStyles;
|
|
450
571
|
// Get all styles, including inline styles
|
|
451
572
|
const sheets = Array.from(doc.styleSheets).filter(sheet => {
|
|
452
573
|
try {
|
|
@@ -535,36 +656,34 @@ class Defuddle {
|
|
|
535
656
|
removeHiddenElements(doc, debugRemovals) {
|
|
536
657
|
let count = 0;
|
|
537
658
|
const elementsToRemove = new Map();
|
|
538
|
-
//
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
//
|
|
542
|
-
const
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
659
|
+
// Check inline styles and CSS class-based hidden patterns.
|
|
660
|
+
const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i;
|
|
661
|
+
// Only use getComputedStyle in browser environments where it's meaningful.
|
|
662
|
+
// In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable.
|
|
663
|
+
const defaultView = doc.defaultView;
|
|
664
|
+
const isBrowser = typeof window !== 'undefined' && defaultView === window;
|
|
665
|
+
const allElements = doc.querySelectorAll('*');
|
|
666
|
+
for (const element of allElements) {
|
|
667
|
+
// Skip elements that contain math — sites like Wikipedia wrap MathML
|
|
668
|
+
// in display:none spans for accessibility (the visible version is an
|
|
669
|
+
// image/SVG fallback). We need to preserve these for math extraction.
|
|
670
|
+
if (element.querySelector('math, [data-mathml], .katex-mathml') ||
|
|
671
|
+
element.tagName.toLowerCase() === 'math') {
|
|
672
|
+
continue;
|
|
673
|
+
}
|
|
674
|
+
// Check inline style for hidden patterns
|
|
675
|
+
const style = element.getAttribute('style');
|
|
676
|
+
if (style && hiddenStylePattern.test(style)) {
|
|
677
|
+
const reason = style.includes('display') ? 'display:none' :
|
|
678
|
+
style.includes('visibility') ? 'visibility:hidden' : 'opacity:0';
|
|
679
|
+
elementsToRemove.set(element, reason);
|
|
680
|
+
count++;
|
|
681
|
+
continue;
|
|
682
|
+
}
|
|
683
|
+
// Use getComputedStyle only in real browser environments
|
|
684
|
+
if (isBrowser) {
|
|
547
685
|
try {
|
|
548
|
-
|
|
549
|
-
}
|
|
550
|
-
catch (e) {
|
|
551
|
-
// If we can't get computed style, check inline styles
|
|
552
|
-
const style = element.getAttribute('style');
|
|
553
|
-
if (!style)
|
|
554
|
-
return null;
|
|
555
|
-
// Create a temporary style element to parse inline styles
|
|
556
|
-
const tempStyle = doc.createElement('style');
|
|
557
|
-
tempStyle.textContent = `* { ${style} }`;
|
|
558
|
-
doc.head.appendChild(tempStyle);
|
|
559
|
-
const computedStyle = element.ownerDocument.defaultView?.getComputedStyle(element);
|
|
560
|
-
doc.head.removeChild(tempStyle);
|
|
561
|
-
return computedStyle;
|
|
562
|
-
}
|
|
563
|
-
});
|
|
564
|
-
// Write phase - mark elements for removal
|
|
565
|
-
batch.forEach((element, index) => {
|
|
566
|
-
const computedStyle = styles[index];
|
|
567
|
-
if (computedStyle) {
|
|
686
|
+
const computedStyle = defaultView.getComputedStyle(element);
|
|
568
687
|
let reason = '';
|
|
569
688
|
if (computedStyle.display === 'none')
|
|
570
689
|
reason = 'display:none';
|
|
@@ -575,25 +694,24 @@ class Defuddle {
|
|
|
575
694
|
if (reason) {
|
|
576
695
|
elementsToRemove.set(element, reason);
|
|
577
696
|
count++;
|
|
697
|
+
continue;
|
|
578
698
|
}
|
|
579
699
|
}
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
}
|
|
593
|
-
}
|
|
700
|
+
catch (e) { }
|
|
701
|
+
}
|
|
702
|
+
// Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
|
|
703
|
+
// "sm:hidden", "not-machine:hidden")
|
|
704
|
+
const className = element.getAttribute('class') || '';
|
|
705
|
+
if (className) {
|
|
706
|
+
const tokens = className.split(/\s+/);
|
|
707
|
+
for (const token of tokens) {
|
|
708
|
+
if (token === 'hidden' || token.endsWith(':hidden') || token === 'invisible' || token.endsWith(':invisible')) {
|
|
709
|
+
elementsToRemove.set(element, `class:${token}`);
|
|
710
|
+
count++;
|
|
711
|
+
break;
|
|
594
712
|
}
|
|
595
713
|
}
|
|
596
|
-
}
|
|
714
|
+
}
|
|
597
715
|
}
|
|
598
716
|
// Batch remove all hidden elements
|
|
599
717
|
elementsToRemove.forEach((reason, el) => {
|
|
@@ -608,7 +726,7 @@ class Defuddle {
|
|
|
608
726
|
});
|
|
609
727
|
this._log('Removed hidden elements:', count);
|
|
610
728
|
}
|
|
611
|
-
removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals) {
|
|
729
|
+
removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals, skipHiddenExactSelectors = false) {
|
|
612
730
|
const startTime = Date.now();
|
|
613
731
|
let exactSelectorCount = 0;
|
|
614
732
|
let partialSelectorCount = 0;
|
|
@@ -616,9 +734,17 @@ class Defuddle {
|
|
|
616
734
|
const elementsToRemove = new Map();
|
|
617
735
|
// First collect elements matching exact selectors
|
|
618
736
|
if (removeExact) {
|
|
619
|
-
const exactElements = doc.querySelectorAll(constants_1.
|
|
737
|
+
const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS_JOINED);
|
|
620
738
|
exactElements.forEach(el => {
|
|
621
739
|
if (el?.parentNode) {
|
|
740
|
+
if (skipHiddenExactSelectors) {
|
|
741
|
+
const hiddenAncestor = el.closest(constants_1.HIDDEN_EXACT_SKIP_SELECTOR);
|
|
742
|
+
const role = (el.getAttribute('role') || '').toLowerCase();
|
|
743
|
+
if (el.matches(constants_1.HIDDEN_EXACT_SELECTOR) ||
|
|
744
|
+
(hiddenAncestor && role === 'dialog')) {
|
|
745
|
+
return;
|
|
746
|
+
}
|
|
747
|
+
}
|
|
622
748
|
// Skip elements inside code blocks (e.g. syntax highlighting spans)
|
|
623
749
|
if (el.closest('pre, code')) {
|
|
624
750
|
return;
|
|
@@ -629,16 +755,12 @@ class Defuddle {
|
|
|
629
755
|
});
|
|
630
756
|
}
|
|
631
757
|
if (removePartial) {
|
|
632
|
-
// Pre-compile regexes
|
|
633
|
-
const combinedPattern = constants_1.PARTIAL_SELECTORS.join('|');
|
|
634
|
-
const partialRegex = new RegExp(combinedPattern, 'i');
|
|
635
|
-
// Pre-compile individual regexes for debug pattern identification
|
|
758
|
+
// Pre-compile individual regexes for debug pattern identification only
|
|
636
759
|
const individualRegexes = this.debug
|
|
637
760
|
? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') }))
|
|
638
761
|
: null;
|
|
639
|
-
//
|
|
640
|
-
const
|
|
641
|
-
const allElements = doc.querySelectorAll(attributeSelector);
|
|
762
|
+
// Use pre-built attribute selector for elements we care about
|
|
763
|
+
const allElements = doc.querySelectorAll(constants_1.TEST_ATTRIBUTES_SELECTOR);
|
|
642
764
|
// Process elements for partial matches
|
|
643
765
|
allElements.forEach(el => {
|
|
644
766
|
// Skip if already marked for removal
|
|
@@ -666,7 +788,7 @@ class Defuddle {
|
|
|
666
788
|
return;
|
|
667
789
|
}
|
|
668
790
|
// Check for partial match using single regex test
|
|
669
|
-
if (
|
|
791
|
+
if (constants_1.PARTIAL_SELECTORS_REGEX.test(attrs)) {
|
|
670
792
|
const matchedPattern = individualRegexes
|
|
671
793
|
? individualRegexes.find(r => r.regex.test(attrs))?.pattern
|
|
672
794
|
: undefined;
|
|
@@ -719,106 +841,50 @@ class Defuddle {
|
|
|
719
841
|
findSmallImages(doc) {
|
|
720
842
|
const MIN_DIMENSION = 33;
|
|
721
843
|
const smallImages = new Set();
|
|
722
|
-
const transformRegex = /scale\(([\d.]+)\)/;
|
|
723
|
-
const startTime = Date.now();
|
|
724
844
|
let processedCount = 0;
|
|
725
|
-
|
|
726
|
-
const
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
return element.ownerDocument.defaultView?.getComputedStyle(element);
|
|
753
|
-
}
|
|
754
|
-
catch (e) {
|
|
755
|
-
return null;
|
|
756
|
-
}
|
|
757
|
-
});
|
|
758
|
-
// Get bounding rectangles if available
|
|
759
|
-
const rects = batch.map(({ element }) => {
|
|
760
|
-
try {
|
|
761
|
-
return element.getBoundingClientRect();
|
|
762
|
-
}
|
|
763
|
-
catch (e) {
|
|
764
|
-
return null;
|
|
765
|
-
}
|
|
766
|
-
});
|
|
767
|
-
// Process phase - no DOM operations
|
|
768
|
-
batch.forEach((measurement, index) => {
|
|
769
|
-
try {
|
|
770
|
-
const style = styles[index];
|
|
771
|
-
const rect = rects[index];
|
|
772
|
-
if (!style)
|
|
773
|
-
return;
|
|
774
|
-
// Get transform scale in the same batch
|
|
775
|
-
const transform = style.transform;
|
|
776
|
-
const scale = transform ?
|
|
777
|
-
parseFloat(transform.match(transformRegex)?.[1] || '1') : 1;
|
|
778
|
-
// Calculate effective dimensions
|
|
779
|
-
const widths = [
|
|
780
|
-
measurement.naturalWidth,
|
|
781
|
-
measurement.attrWidth,
|
|
782
|
-
parseInt(style.width) || 0,
|
|
783
|
-
rect ? rect.width * scale : 0
|
|
784
|
-
].filter(dim => typeof dim === 'number' && dim > 0);
|
|
785
|
-
const heights = [
|
|
786
|
-
measurement.naturalHeight,
|
|
787
|
-
measurement.attrHeight,
|
|
788
|
-
parseInt(style.height) || 0,
|
|
789
|
-
rect ? rect.height * scale : 0
|
|
790
|
-
].filter(dim => typeof dim === 'number' && dim > 0);
|
|
791
|
-
// Decision phase - no DOM operations
|
|
792
|
-
if (widths.length > 0 && heights.length > 0) {
|
|
793
|
-
const effectiveWidth = Math.min(...widths);
|
|
794
|
-
const effectiveHeight = Math.min(...heights);
|
|
795
|
-
if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
|
|
796
|
-
const identifier = this.getElementIdentifier(measurement.element);
|
|
797
|
-
if (identifier) {
|
|
798
|
-
smallImages.add(identifier);
|
|
799
|
-
processedCount++;
|
|
800
|
-
}
|
|
801
|
-
}
|
|
802
|
-
}
|
|
803
|
-
}
|
|
804
|
-
catch (e) {
|
|
805
|
-
if (this.debug) {
|
|
806
|
-
console.warn('Defuddle: Failed to process element dimensions:', e);
|
|
807
|
-
}
|
|
808
|
-
}
|
|
809
|
-
});
|
|
845
|
+
const elements = doc.querySelectorAll('img, svg');
|
|
846
|
+
const defaultView = doc.defaultView;
|
|
847
|
+
const isBrowser = typeof window !== 'undefined' && defaultView === window;
|
|
848
|
+
for (const element of elements) {
|
|
849
|
+
const attrWidth = parseInt(element.getAttribute('width') || '0');
|
|
850
|
+
const attrHeight = parseInt(element.getAttribute('height') || '0');
|
|
851
|
+
// Check inline style dimensions
|
|
852
|
+
const style = element.getAttribute('style') || '';
|
|
853
|
+
const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
|
|
854
|
+
const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
|
|
855
|
+
// Use getComputedStyle and getBoundingClientRect only in browser
|
|
856
|
+
let computedWidth = 0, computedHeight = 0;
|
|
857
|
+
if (isBrowser) {
|
|
858
|
+
try {
|
|
859
|
+
const cs = defaultView.getComputedStyle(element);
|
|
860
|
+
computedWidth = parseInt(cs.width) || 0;
|
|
861
|
+
computedHeight = parseInt(cs.height) || 0;
|
|
862
|
+
}
|
|
863
|
+
catch (e) { }
|
|
864
|
+
try {
|
|
865
|
+
const rect = element.getBoundingClientRect();
|
|
866
|
+
if (rect.width > 0)
|
|
867
|
+
computedWidth = computedWidth || rect.width;
|
|
868
|
+
if (rect.height > 0)
|
|
869
|
+
computedHeight = computedHeight || rect.height;
|
|
870
|
+
}
|
|
871
|
+
catch (e) { }
|
|
810
872
|
}
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
873
|
+
const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0);
|
|
874
|
+
const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0);
|
|
875
|
+
if (widths.length > 0 && heights.length > 0) {
|
|
876
|
+
const effectiveWidth = Math.min(...widths);
|
|
877
|
+
const effectiveHeight = Math.min(...heights);
|
|
878
|
+
if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
|
|
879
|
+
const identifier = this.getElementIdentifier(element);
|
|
880
|
+
if (identifier) {
|
|
881
|
+
smallImages.add(identifier);
|
|
882
|
+
processedCount++;
|
|
883
|
+
}
|
|
814
884
|
}
|
|
815
885
|
}
|
|
816
886
|
}
|
|
817
|
-
|
|
818
|
-
this._log('Found small elements:', {
|
|
819
|
-
count: processedCount,
|
|
820
|
-
processingTime: `${(endTime - startTime).toFixed(2)}ms`
|
|
821
|
-
});
|
|
887
|
+
this._log('Found small elements:', processedCount);
|
|
822
888
|
return smallImages;
|
|
823
889
|
}
|
|
824
890
|
removeSmallImages(doc, smallImages) {
|
|
@@ -909,7 +975,7 @@ class Defuddle {
|
|
|
909
975
|
let best = top;
|
|
910
976
|
for (let i = 1; i < candidates.length; i++) {
|
|
911
977
|
const child = candidates[i];
|
|
912
|
-
const childWords = (child.element.textContent || '')
|
|
978
|
+
const childWords = (0, utils_1.countWords)(child.element.textContent || '');
|
|
913
979
|
if (child.selectorIndex < best.selectorIndex && best.element.contains(child.element) && childWords > 50) {
|
|
914
980
|
// Count how many candidates share this selector index inside
|
|
915
981
|
// the top element. Use top (not best) as the stable reference
|
|
@@ -953,13 +1019,11 @@ class Defuddle {
|
|
|
953
1019
|
}
|
|
954
1020
|
findContentByScoring(doc) {
|
|
955
1021
|
const candidates = [];
|
|
956
|
-
constants_1.
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
}
|
|
962
|
-
});
|
|
1022
|
+
doc.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR).forEach((element) => {
|
|
1023
|
+
const score = scoring_1.ContentScorer.scoreElement(element);
|
|
1024
|
+
if (score > 0) {
|
|
1025
|
+
candidates.push({ score, element });
|
|
1026
|
+
}
|
|
963
1027
|
});
|
|
964
1028
|
return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null;
|
|
965
1029
|
}
|
|
@@ -990,11 +1054,17 @@ class Defuddle {
|
|
|
990
1054
|
if (!baseUrl)
|
|
991
1055
|
return;
|
|
992
1056
|
const resolve = (url) => {
|
|
1057
|
+
// Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
|
|
1058
|
+
// Normalize these before URL resolution.
|
|
1059
|
+
const normalized = url
|
|
1060
|
+
.trim()
|
|
1061
|
+
.replace(/^\\?["']+/, '')
|
|
1062
|
+
.replace(/\\?["']+$/, '');
|
|
993
1063
|
try {
|
|
994
|
-
return new URL(
|
|
1064
|
+
return new URL(normalized, baseUrl).href;
|
|
995
1065
|
}
|
|
996
1066
|
catch {
|
|
997
|
-
return url;
|
|
1067
|
+
return normalized || url;
|
|
998
1068
|
}
|
|
999
1069
|
};
|
|
1000
1070
|
element.querySelectorAll('[href]').forEach(el => {
|
|
@@ -1050,12 +1120,14 @@ class Defuddle {
|
|
|
1050
1120
|
* Walks both trees in parallel so positional correspondence is exact.
|
|
1051
1121
|
*/
|
|
1052
1122
|
flattenShadowRoots(original, clone) {
|
|
1053
|
-
|
|
1123
|
+
if (!original.body || !clone.body)
|
|
1124
|
+
return;
|
|
1125
|
+
const origElements = Array.from(original.body.querySelectorAll('*'));
|
|
1054
1126
|
// Find the first element with a shadow root (also serves as the hasShadowRoots check)
|
|
1055
1127
|
const firstShadow = origElements.find(el => el.shadowRoot);
|
|
1056
1128
|
if (!firstShadow)
|
|
1057
1129
|
return;
|
|
1058
|
-
const cloneElements = Array.from(clone.body.
|
|
1130
|
+
const cloneElements = Array.from(clone.body.querySelectorAll('*'));
|
|
1059
1131
|
// Check if we can directly read shadow DOM content (main world / Node.js).
|
|
1060
1132
|
// In content script isolated worlds, shadowRoot exists but content is empty.
|
|
1061
1133
|
const canReadShadow = (firstShadow.shadowRoot?.childNodes?.length ?? 0) > 0;
|
|
@@ -1096,6 +1168,68 @@ class Defuddle {
|
|
|
1096
1168
|
}
|
|
1097
1169
|
}
|
|
1098
1170
|
}
|
|
1171
|
+
/**
|
|
1172
|
+
* Resolve React streaming SSR suspense boundaries.
|
|
1173
|
+
* React's streaming SSR places content in hidden divs (id="S:0") and
|
|
1174
|
+
* template placeholders (id="B:0") with $RC scripts to swap them.
|
|
1175
|
+
* Since we don't execute scripts, we perform the swap manually.
|
|
1176
|
+
*/
|
|
1177
|
+
resolveStreamedContent(doc) {
|
|
1178
|
+
// Find $RC("B:X","S:X") calls in inline scripts
|
|
1179
|
+
const scripts = doc.querySelectorAll('script');
|
|
1180
|
+
const swaps = [];
|
|
1181
|
+
const rcPattern = /\$RC\("(B:\d+)","(S:\d+)"\)/g;
|
|
1182
|
+
for (const script of scripts) {
|
|
1183
|
+
const text = script.textContent || '';
|
|
1184
|
+
if (!text.includes('$RC('))
|
|
1185
|
+
continue;
|
|
1186
|
+
rcPattern.lastIndex = 0;
|
|
1187
|
+
let match;
|
|
1188
|
+
while ((match = rcPattern.exec(text)) !== null) {
|
|
1189
|
+
swaps.push({ templateId: match[1], contentId: match[2] });
|
|
1190
|
+
}
|
|
1191
|
+
}
|
|
1192
|
+
if (swaps.length === 0)
|
|
1193
|
+
return;
|
|
1194
|
+
let swapCount = 0;
|
|
1195
|
+
for (const { templateId, contentId } of swaps) {
|
|
1196
|
+
const template = doc.getElementById(templateId);
|
|
1197
|
+
const content = doc.getElementById(contentId);
|
|
1198
|
+
if (!template || !content)
|
|
1199
|
+
continue;
|
|
1200
|
+
const parent = template.parentNode;
|
|
1201
|
+
if (!parent)
|
|
1202
|
+
continue;
|
|
1203
|
+
// Remove the fallback/skeleton content after the template
|
|
1204
|
+
// until the <!--/$--> comment marker
|
|
1205
|
+
let next = template.nextSibling;
|
|
1206
|
+
let foundMarker = false;
|
|
1207
|
+
while (next) {
|
|
1208
|
+
const following = next.nextSibling;
|
|
1209
|
+
if (next.nodeType === 8 && next.data === '/$') {
|
|
1210
|
+
next.remove();
|
|
1211
|
+
foundMarker = true;
|
|
1212
|
+
break;
|
|
1213
|
+
}
|
|
1214
|
+
next.remove();
|
|
1215
|
+
next = following;
|
|
1216
|
+
}
|
|
1217
|
+
// Skip swap if marker wasn't found — malformed streaming output
|
|
1218
|
+
if (!foundMarker)
|
|
1219
|
+
continue;
|
|
1220
|
+
// Insert content children before the template position
|
|
1221
|
+
while (content.firstChild) {
|
|
1222
|
+
parent.insertBefore(content.firstChild, template);
|
|
1223
|
+
}
|
|
1224
|
+
// Clean up the template and hidden div
|
|
1225
|
+
template.remove();
|
|
1226
|
+
content.remove();
|
|
1227
|
+
swapCount++;
|
|
1228
|
+
}
|
|
1229
|
+
if (swapCount > 0) {
|
|
1230
|
+
this._log('Resolved streamed content:', swapCount, 'suspense boundaries');
|
|
1231
|
+
}
|
|
1232
|
+
}
|
|
1099
1233
|
/**
|
|
1100
1234
|
* Replace a shadow DOM host element with a div containing its shadow content.
|
|
1101
1235
|
* Custom elements (tag names with hyphens) would re-initialize when inserted
|
|
@@ -1187,6 +1321,249 @@ class Defuddle {
|
|
|
1187
1321
|
_decodeHTMLEntities(text) {
|
|
1188
1322
|
return (0, dom_1.decodeHTMLEntities)(this.doc, text);
|
|
1189
1323
|
}
|
|
1324
|
+
/**
|
|
1325
|
+
* Build a DefuddleResponse from an extractor result with metadata
|
|
1326
|
+
*/
|
|
1327
|
+
buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags) {
|
|
1328
|
+
const contentHtml = this.resolveContentUrls(extracted.contentHtml);
|
|
1329
|
+
const variables = this.getExtractorVariables(extracted.variables);
|
|
1330
|
+
return {
|
|
1331
|
+
content: contentHtml,
|
|
1332
|
+
title: extracted.variables?.title || metadata.title,
|
|
1333
|
+
description: metadata.description,
|
|
1334
|
+
domain: metadata.domain,
|
|
1335
|
+
favicon: metadata.favicon,
|
|
1336
|
+
image: metadata.image,
|
|
1337
|
+
language: extracted.variables?.language || metadata.language,
|
|
1338
|
+
published: extracted.variables?.published || metadata.published,
|
|
1339
|
+
author: extracted.variables?.author || metadata.author,
|
|
1340
|
+
site: extracted.variables?.site || metadata.site,
|
|
1341
|
+
schemaOrgData: metadata.schemaOrgData,
|
|
1342
|
+
wordCount: this.countHtmlWords(extracted.contentHtml),
|
|
1343
|
+
parseTime: Math.round(Date.now() - startTime),
|
|
1344
|
+
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
|
|
1345
|
+
metaTags: pageMetaTags,
|
|
1346
|
+
...(variables ? { variables } : {}),
|
|
1347
|
+
};
|
|
1348
|
+
}
|
|
1349
|
+
/**
|
|
1350
|
+
* Filter extractor variables to only include custom ones
|
|
1351
|
+
* (exclude standard fields that are already mapped to top-level properties)
|
|
1352
|
+
*/
|
|
1353
|
+
getExtractorVariables(variables) {
|
|
1354
|
+
if (!variables)
|
|
1355
|
+
return undefined;
|
|
1356
|
+
const custom = {};
|
|
1357
|
+
let hasCustom = false;
|
|
1358
|
+
for (const [key, value] of Object.entries(variables)) {
|
|
1359
|
+
if (!STANDARD_VARIABLE_KEYS.has(key)) {
|
|
1360
|
+
custom[key] = value;
|
|
1361
|
+
hasCustom = true;
|
|
1362
|
+
}
|
|
1363
|
+
}
|
|
1364
|
+
return hasCustom ? custom : undefined;
|
|
1365
|
+
}
|
|
1366
|
+
/**
|
|
1367
|
+
* Content-based pattern removal for elements that can't be detected by
|
|
1368
|
+
* CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
|
|
1369
|
+
*/
|
|
1370
|
+
removeByContentPattern(mainContent, debugRemovals) {
|
|
1371
|
+
// Remove read time metadata (e.g. "Mar 4th 2026 | 3 min read")
|
|
1372
|
+
// Only removes leaf elements whose text is PURELY date + read time,
|
|
1373
|
+
// not mixed with other meaningful content like tag names.
|
|
1374
|
+
const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time'));
|
|
1375
|
+
for (const el of candidates) {
|
|
1376
|
+
if (!el.parentNode)
|
|
1377
|
+
continue;
|
|
1378
|
+
if (el.closest('pre') || el.closest('code'))
|
|
1379
|
+
continue;
|
|
1380
|
+
const text = el.textContent?.trim() || '';
|
|
1381
|
+
const words = (0, utils_1.countWords)(text);
|
|
1382
|
+
// Match date + read time in short elements
|
|
1383
|
+
if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
|
|
1384
|
+
// Ensure this is a leaf-ish element, not a large container
|
|
1385
|
+
if (el.querySelectorAll('p, div, section, article').length === 0) {
|
|
1386
|
+
// Verify the text is ONLY date + read time metadata
|
|
1387
|
+
// by stripping all date/time words and checking nothing remains
|
|
1388
|
+
let cleaned = text;
|
|
1389
|
+
for (const pattern of METADATA_STRIP_PATTERNS) {
|
|
1390
|
+
cleaned = cleaned.replace(pattern, '');
|
|
1391
|
+
}
|
|
1392
|
+
if (cleaned.trim().length > 0)
|
|
1393
|
+
continue;
|
|
1394
|
+
if (this.debug && debugRemovals) {
|
|
1395
|
+
debugRemovals.push({
|
|
1396
|
+
step: 'removeByContentPattern',
|
|
1397
|
+
reason: 'read time metadata',
|
|
1398
|
+
text: (0, utils_1.textPreview)(el)
|
|
1399
|
+
});
|
|
1400
|
+
}
|
|
1401
|
+
el.remove();
|
|
1402
|
+
}
|
|
1403
|
+
}
|
|
1404
|
+
}
|
|
1405
|
+
// Remove standalone time/date elements near the start or end of content.
|
|
1406
|
+
// A <time> in its own paragraph at the boundary is metadata (publish date),
|
|
1407
|
+
// but <time> inline within prose should be preserved (see issue #136).
|
|
1408
|
+
const timeElements = Array.from(mainContent.querySelectorAll('time'));
|
|
1409
|
+
const contentText = mainContent.textContent || '';
|
|
1410
|
+
for (const time of timeElements) {
|
|
1411
|
+
if (!time.parentNode)
|
|
1412
|
+
continue;
|
|
1413
|
+
// Walk up through inline/formatting wrappers only (i, em, span, b, strong)
|
|
1414
|
+
// Stop at block elements to avoid removing containers with other content.
|
|
1415
|
+
let target = time;
|
|
1416
|
+
let targetText = target.textContent?.trim() || '';
|
|
1417
|
+
while (target.parentElement && target.parentElement !== mainContent) {
|
|
1418
|
+
const parentTag = target.parentElement.tagName.toLowerCase();
|
|
1419
|
+
const parentText = target.parentElement.textContent?.trim() || '';
|
|
1420
|
+
// If parent is a <p> that only wraps this time, include it
|
|
1421
|
+
if (parentTag === 'p' && parentText === targetText) {
|
|
1422
|
+
target = target.parentElement;
|
|
1423
|
+
break;
|
|
1424
|
+
}
|
|
1425
|
+
// Only walk through inline formatting wrappers
|
|
1426
|
+
if (['i', 'em', 'span', 'b', 'strong', 'small'].includes(parentTag) &&
|
|
1427
|
+
parentText === targetText) {
|
|
1428
|
+
target = target.parentElement;
|
|
1429
|
+
targetText = parentText;
|
|
1430
|
+
continue;
|
|
1431
|
+
}
|
|
1432
|
+
break;
|
|
1433
|
+
}
|
|
1434
|
+
const text = target.textContent?.trim() || '';
|
|
1435
|
+
const words = (0, utils_1.countWords)(text);
|
|
1436
|
+
if (words > 10)
|
|
1437
|
+
continue;
|
|
1438
|
+
// Check if this element is near the start or end of mainContent
|
|
1439
|
+
const pos = contentText.indexOf(text);
|
|
1440
|
+
const distFromEnd = contentText.length - (pos + text.length);
|
|
1441
|
+
if (pos > 200 && distFromEnd > 200)
|
|
1442
|
+
continue;
|
|
1443
|
+
if (this.debug && debugRemovals) {
|
|
1444
|
+
debugRemovals.push({
|
|
1445
|
+
step: 'removeByContentPattern',
|
|
1446
|
+
reason: 'boundary date element',
|
|
1447
|
+
text: (0, utils_1.textPreview)(target)
|
|
1448
|
+
});
|
|
1449
|
+
}
|
|
1450
|
+
target.remove();
|
|
1451
|
+
}
|
|
1452
|
+
// Remove section breadcrumbs
|
|
1453
|
+
// Short elements containing a link to a parent section of the current URL.
|
|
1454
|
+
const url = this.options.url || this.doc.URL || '';
|
|
1455
|
+
let urlPath = '';
|
|
1456
|
+
try {
|
|
1457
|
+
urlPath = new URL(url).pathname;
|
|
1458
|
+
}
|
|
1459
|
+
catch { }
|
|
1460
|
+
if (urlPath) {
|
|
1461
|
+
const shortElements = mainContent.querySelectorAll('div, span, p');
|
|
1462
|
+
for (const el of shortElements) {
|
|
1463
|
+
if (!el.parentNode)
|
|
1464
|
+
continue;
|
|
1465
|
+
const text = el.textContent?.trim() || '';
|
|
1466
|
+
const words = (0, utils_1.countWords)(text);
|
|
1467
|
+
if (words > 10)
|
|
1468
|
+
continue;
|
|
1469
|
+
// Must be a leaf-ish element (no block children)
|
|
1470
|
+
if (el.querySelectorAll('p, div, section, article').length > 0)
|
|
1471
|
+
continue;
|
|
1472
|
+
const link = el.querySelector('a[href]');
|
|
1473
|
+
if (!link)
|
|
1474
|
+
continue;
|
|
1475
|
+
try {
|
|
1476
|
+
const linkPath = new URL(link.getAttribute('href') || '', url).pathname;
|
|
1477
|
+
if (linkPath !== '/' && linkPath !== urlPath && urlPath.startsWith(linkPath)) {
|
|
1478
|
+
if (this.debug && debugRemovals) {
|
|
1479
|
+
debugRemovals.push({
|
|
1480
|
+
step: 'removeByContentPattern',
|
|
1481
|
+
reason: 'section breadcrumb',
|
|
1482
|
+
text: (0, utils_1.textPreview)(el)
|
|
1483
|
+
});
|
|
1484
|
+
}
|
|
1485
|
+
el.remove();
|
|
1486
|
+
}
|
|
1487
|
+
}
|
|
1488
|
+
catch { }
|
|
1489
|
+
}
|
|
1490
|
+
}
|
|
1491
|
+
// Remove boilerplate sentences and trailing non-content.
|
|
1492
|
+
// Search elements for end-of-article boilerplate, then truncate
|
|
1493
|
+
// from the best ancestor that has siblings to remove.
|
|
1494
|
+
const fullText = mainContent.textContent || '';
|
|
1495
|
+
const boilerplateElements = mainContent.querySelectorAll('p, div, span, section');
|
|
1496
|
+
for (const el of boilerplateElements) {
|
|
1497
|
+
if (!el.parentNode)
|
|
1498
|
+
continue;
|
|
1499
|
+
const text = el.textContent?.trim() || '';
|
|
1500
|
+
const words = (0, utils_1.countWords)(text);
|
|
1501
|
+
if (words > 50 || words < 3)
|
|
1502
|
+
continue;
|
|
1503
|
+
for (const pattern of BOILERPLATE_PATTERNS) {
|
|
1504
|
+
if (pattern.test(text)) {
|
|
1505
|
+
// Walk up to find an ancestor that has next siblings to truncate.
|
|
1506
|
+
// Don't walk all the way to mainContent's direct child — if there's
|
|
1507
|
+
// a single wrapper div, that would remove everything.
|
|
1508
|
+
let target = el;
|
|
1509
|
+
while (target.parentElement && target.parentElement !== mainContent) {
|
|
1510
|
+
if (target.nextElementSibling)
|
|
1511
|
+
break;
|
|
1512
|
+
target = target.parentElement;
|
|
1513
|
+
}
|
|
1514
|
+
// Only truncate if there's substantial content before the boilerplate
|
|
1515
|
+
const targetText = target.textContent || '';
|
|
1516
|
+
const targetPos = fullText.indexOf(targetText);
|
|
1517
|
+
if (targetPos < 200)
|
|
1518
|
+
continue;
|
|
1519
|
+
// Collect ancestors before modifying the DOM
|
|
1520
|
+
const ancestors = [];
|
|
1521
|
+
let anc = target.parentElement;
|
|
1522
|
+
while (anc && anc !== mainContent) {
|
|
1523
|
+
ancestors.push(anc);
|
|
1524
|
+
anc = anc.parentElement;
|
|
1525
|
+
}
|
|
1526
|
+
// Remove target element and its following siblings
|
|
1527
|
+
this.removeTrailingSiblings(target, true, debugRemovals);
|
|
1528
|
+
// Cascade upward: remove following siblings at each
|
|
1529
|
+
// ancestor level too. Everything after the boilerplate
|
|
1530
|
+
// in document order is non-content.
|
|
1531
|
+
for (const ancestor of ancestors) {
|
|
1532
|
+
this.removeTrailingSiblings(ancestor, false, debugRemovals);
|
|
1533
|
+
}
|
|
1534
|
+
return;
|
|
1535
|
+
}
|
|
1536
|
+
}
|
|
1537
|
+
}
|
|
1538
|
+
}
|
|
1539
|
+
/**
|
|
1540
|
+
* Remove an element's following siblings, and optionally the element itself.
|
|
1541
|
+
*/
|
|
1542
|
+
removeTrailingSiblings(element, removeSelf, debugRemovals) {
|
|
1543
|
+
let sibling = element.nextElementSibling;
|
|
1544
|
+
while (sibling) {
|
|
1545
|
+
const next = sibling.nextElementSibling;
|
|
1546
|
+
if (this.debug && debugRemovals) {
|
|
1547
|
+
debugRemovals.push({
|
|
1548
|
+
step: 'removeByContentPattern',
|
|
1549
|
+
reason: 'trailing non-content',
|
|
1550
|
+
text: (0, utils_1.textPreview)(sibling)
|
|
1551
|
+
});
|
|
1552
|
+
}
|
|
1553
|
+
sibling.remove();
|
|
1554
|
+
sibling = next;
|
|
1555
|
+
}
|
|
1556
|
+
if (removeSelf) {
|
|
1557
|
+
if (this.debug && debugRemovals) {
|
|
1558
|
+
debugRemovals.push({
|
|
1559
|
+
step: 'removeByContentPattern',
|
|
1560
|
+
reason: 'boilerplate text',
|
|
1561
|
+
text: (0, utils_1.textPreview)(element)
|
|
1562
|
+
});
|
|
1563
|
+
}
|
|
1564
|
+
element.remove();
|
|
1565
|
+
}
|
|
1566
|
+
}
|
|
1190
1567
|
}
|
|
1191
1568
|
exports.Defuddle = Defuddle;
|
|
1192
1569
|
//# sourceMappingURL=defuddle.js.map
|