defuddle 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/README.md +49 -29
  2. package/dist/cli.js +15 -46
  3. package/dist/cli.js.map +1 -1
  4. package/dist/constants.d.ts +9 -0
  5. package/dist/constants.js +33 -9
  6. package/dist/constants.js.map +1 -1
  7. package/dist/defuddle.d.ts +50 -2
  8. package/dist/defuddle.js +615 -238
  9. package/dist/defuddle.js.map +1 -1
  10. package/dist/elements/code.js +31 -9
  11. package/dist/elements/code.js.map +1 -1
  12. package/dist/elements/footnotes.js +2 -1
  13. package/dist/elements/footnotes.js.map +1 -1
  14. package/dist/elements/headings.js +42 -50
  15. package/dist/elements/headings.js.map +1 -1
  16. package/dist/extractor-registry.d.ts +1 -0
  17. package/dist/extractor-registry.js +3 -0
  18. package/dist/extractor-registry.js.map +1 -1
  19. package/dist/extractors/_base.d.ts +6 -0
  20. package/dist/extractors/_base.js +8 -0
  21. package/dist/extractors/_base.js.map +1 -1
  22. package/dist/extractors/github.d.ts +10 -2
  23. package/dist/extractors/github.js +158 -71
  24. package/dist/extractors/github.js.map +1 -1
  25. package/dist/extractors/hackernews.js +18 -72
  26. package/dist/extractors/hackernews.js.map +1 -1
  27. package/dist/extractors/reddit.d.ts +1 -2
  28. package/dist/extractors/reddit.js +41 -94
  29. package/dist/extractors/reddit.js.map +1 -1
  30. package/dist/extractors/x-oembed.d.ts +0 -1
  31. package/dist/extractors/x-oembed.js +20 -27
  32. package/dist/extractors/x-oembed.js.map +1 -1
  33. package/dist/extractors/youtube.d.ts +57 -0
  34. package/dist/extractors/youtube.js +619 -10
  35. package/dist/extractors/youtube.js.map +1 -1
  36. package/dist/index.full.js +1 -1
  37. package/dist/index.js +1 -1
  38. package/dist/markdown.js +5 -0
  39. package/dist/markdown.js.map +1 -1
  40. package/dist/metadata.d.ts +5 -0
  41. package/dist/metadata.js +28 -0
  42. package/dist/metadata.js.map +1 -1
  43. package/dist/node.d.ts +12 -5
  44. package/dist/node.js +53 -22
  45. package/dist/node.js.map +1 -1
  46. package/dist/scoring.d.ts +6 -1
  47. package/dist/scoring.js +69 -22
  48. package/dist/scoring.js.map +1 -1
  49. package/dist/standardize.js +152 -63
  50. package/dist/standardize.js.map +1 -1
  51. package/dist/types.d.ts +9 -0
  52. package/dist/utils/comments.d.ts +44 -0
  53. package/dist/utils/comments.js +103 -0
  54. package/dist/utils/comments.js.map +1 -0
  55. package/dist/utils/dom.d.ts +9 -0
  56. package/dist/utils/dom.js +20 -0
  57. package/dist/utils/dom.js.map +1 -1
  58. package/dist/utils/linkedom-compat.d.ts +5 -0
  59. package/dist/utils/linkedom-compat.js +23 -0
  60. package/dist/utils/linkedom-compat.js.map +1 -0
  61. package/dist/utils/transcript.d.ts +37 -0
  62. package/dist/utils/transcript.js +61 -0
  63. package/dist/utils/transcript.js.map +1 -0
  64. package/dist/utils.d.ts +6 -0
  65. package/dist/utils.js +36 -0
  66. package/dist/utils.js.map +1 -1
  67. package/package.json +3 -4
package/dist/defuddle.js CHANGED
@@ -9,6 +9,23 @@ const footnotes_1 = require("./elements/footnotes");
9
9
  const scoring_1 = require("./scoring");
10
10
  const utils_1 = require("./utils");
11
11
  const dom_1 = require("./utils/dom");
12
+ /** Keys from extractor variables that map to top-level DefuddleResponse fields */
13
+ const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
14
+ // Content pattern detection constants
15
+ const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
16
+ const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
17
+ const BOILERPLATE_PATTERNS = [
18
+ /^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
19
+ /^A version of this (?:article|story) (?:appeared|was published) in\b/i,
20
+ /^Originally (?:published|appeared) (?:in|on|at)\b/i,
21
+ ];
22
+ const METADATA_STRIP_PATTERNS = [
23
+ /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi,
24
+ /\b\d+(?:st|nd|rd|th)?\b/g,
25
+ /\bmin(?:ute)?s?\b/gi,
26
+ /\bread\b/gi,
27
+ /[|·•—–\-,.\s]/g,
28
+ ];
12
29
  class Defuddle {
13
30
  /**
14
31
  * Create a new Defuddle instance
@@ -16,10 +33,23 @@ class Defuddle {
16
33
  * @param options - Options for parsing
17
34
  */
18
35
  constructor(doc, options = {}) {
36
+ this._schemaOrgData = undefined;
37
+ this._schemaOrgExtracted = false;
19
38
  this.doc = doc;
20
39
  this.options = options;
21
40
  this.debug = options.debug || false;
22
41
  }
42
+ /**
43
+ * Lazily extract and cache schema.org data. Must be called before
44
+ * parse() strips script tags from the document.
45
+ */
46
+ getSchemaOrgData() {
47
+ if (!this._schemaOrgExtracted) {
48
+ this._schemaOrgData = this._extractSchemaOrgData(this.doc);
49
+ this._schemaOrgExtracted = true;
50
+ }
51
+ return this._schemaOrgData;
52
+ }
23
53
  /**
24
54
  * Parse the document and extract its main content
25
55
  */
@@ -42,13 +72,44 @@ class Defuddle {
42
72
  }
43
73
  }
44
74
  // If still very little content, the page may be an index/listing page
75
+ // or a page that reveals content at runtime from a hidden wrapper.
76
+ // Retry once with hidden-element removal disabled.
77
+ if (result.wordCount < 50) {
78
+ this._log('Still very little content, retrying without hidden-element removal');
79
+ const hiddenRetry = this.parseInternal({
80
+ removeHiddenElements: false
81
+ });
82
+ if (hiddenRetry.wordCount > result.wordCount * 2) {
83
+ this._log('Hidden-element retry produced more content');
84
+ result = hiddenRetry;
85
+ }
86
+ // Try targeting the largest hidden subtree directly to avoid body-level
87
+ // leftovers (e.g. FPS counters) when hidden content is the real article.
88
+ const hiddenSelector = this.findLargestHiddenContentSelector();
89
+ if (hiddenSelector) {
90
+ this._log('Retrying with hidden content selector:', hiddenSelector);
91
+ const hiddenSelectorRetry = this.parseInternal({
92
+ removeHiddenElements: false,
93
+ removePartialSelectors: false,
94
+ contentSelector: hiddenSelector
95
+ });
96
+ if (hiddenSelectorRetry.wordCount > result.wordCount ||
97
+ (hiddenSelectorRetry.wordCount > Math.max(20, result.wordCount * 0.7) &&
98
+ hiddenSelectorRetry.content.length < result.content.length)) {
99
+ this._log('Hidden-selector retry produced better focused content');
100
+ result = hiddenSelectorRetry;
101
+ }
102
+ }
103
+ }
104
+ // If still very little content, the page may be an index/listing page
45
105
  // where card elements were scored as non-content or removed by partial
46
106
  // selectors (e.g. "post-preview"). Retry with both disabled.
47
107
  if (result.wordCount < 50) {
48
108
  this._log('Still very little content, retrying without scoring/partial selectors (possible index page)');
49
109
  const indexRetry = this.parseInternal({
50
110
  removeLowScoring: false,
51
- removePartialSelectors: false
111
+ removePartialSelectors: false,
112
+ removeContentPatterns: false
52
113
  });
53
114
  if (indexRetry.wordCount > result.wordCount) {
54
115
  this._log('Index page retry produced more content');
@@ -64,17 +125,17 @@ class Defuddle {
64
125
  // longer than what we extracted, the scorer likely picked the wrong
65
126
  // element from a feed. Find the correct element in the DOM.
66
127
  const schemaText = this._getSchemaText(result.schemaOrgData);
67
- if (schemaText && this.countWords(schemaText) > result.wordCount) {
128
+ if (schemaText && this.countHtmlWords(schemaText) > result.wordCount) {
68
129
  const contentHtml = this._findContentBySchemaText(schemaText);
69
130
  if (contentHtml) {
70
131
  this._log('Found DOM content matching schema.org text');
71
132
  result.content = contentHtml;
72
- result.wordCount = this.countWords(contentHtml);
133
+ result.wordCount = this.countHtmlWords(contentHtml);
73
134
  }
74
135
  else {
75
136
  this._log('Using schema.org text as content (DOM element not found)');
76
137
  result.content = schemaText;
77
- result.wordCount = this.countWords(schemaText);
138
+ result.wordCount = this.countHtmlWords(schemaText);
78
139
  }
79
140
  }
80
141
  return result;
@@ -125,8 +186,7 @@ class Defuddle {
125
186
  el.removeAttribute(attr.name);
126
187
  }
127
188
  else if (['href', 'src', 'action', 'formaction', 'xlink:href'].includes(name)) {
128
- const val = attr.value.replace(/[\s\u0000-\u001F]+/g, '').toLowerCase();
129
- if (val.startsWith('javascript:') || val.startsWith('data:text/html')) {
189
+ if ((0, dom_1.isDangerousUrl)(attr.value)) {
130
190
  el.removeAttribute(attr.name);
131
191
  }
132
192
  }
@@ -149,7 +209,7 @@ class Defuddle {
149
209
  const searchPhrase = firstPara.substring(0, 100).trim();
150
210
  if (!searchPhrase)
151
211
  return '';
152
- const schemaWordCount = this.countWords(schemaText);
212
+ const schemaWordCount = this.countHtmlWords(schemaText);
153
213
  // Find the smallest element whose text contains the search phrase
154
214
  // and whose word count is close to the schema text's word count
155
215
  let bestMatch = null;
@@ -159,7 +219,7 @@ class Defuddle {
159
219
  const elText = (el.textContent || '');
160
220
  if (!elText.includes(searchPhrase))
161
221
  continue;
162
- const elWords = elText.trim().split(/\s+/).length;
222
+ const elWords = (0, utils_1.countWords)(elText);
163
223
  // Element should contain roughly the same amount of text
164
224
  // (allow some slack for surrounding whitespace / minor extras)
165
225
  if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
@@ -211,6 +271,27 @@ class Defuddle {
211
271
  }
212
272
  return html;
213
273
  }
274
+ findLargestHiddenContentSelector() {
275
+ const body = this.doc.body;
276
+ if (!body)
277
+ return undefined;
278
+ const candidates = Array.from(body.querySelectorAll(constants_1.HIDDEN_EXACT_SKIP_SELECTOR)).filter(el => {
279
+ const className = el.getAttribute('class') || '';
280
+ return !className.includes('math');
281
+ });
282
+ let best = null;
283
+ let bestWords = 0;
284
+ for (const el of candidates) {
285
+ const words = (0, utils_1.countWords)(el.textContent || '');
286
+ if (words > bestWords) {
287
+ best = el;
288
+ bestWords = words;
289
+ }
290
+ }
291
+ if (!best || bestWords < 30)
292
+ return undefined;
293
+ return this.getElementSelector(best);
294
+ }
214
295
  /**
215
296
  * Get the largest available src from an img element,
216
297
  * checking srcset for higher-resolution versions.
@@ -245,68 +326,109 @@ class Defuddle {
245
326
  return url;
246
327
  }
247
328
  /**
248
- * Parse the document, falling back to async extractors if sync parse yields no content
329
+ * Parse the document asynchronously. Checks for extractors that prefer
330
+ * async (e.g. YouTube transcripts) before sync, then falls back to async
331
+ * extractors if sync parse yields no content.
249
332
  */
250
333
  async parseAsync() {
334
+ if (this.options.useAsync !== false) {
335
+ const asyncResult = await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry));
336
+ if (asyncResult)
337
+ return asyncResult;
338
+ }
251
339
  const result = this.parse();
252
340
  if (result.wordCount > 0 || this.options.useAsync === false) {
253
341
  return result;
254
342
  }
343
+ return (await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry))) ?? result;
344
+ }
345
+ /**
346
+ * Fetch only async variables (e.g. transcript) without re-parsing.
347
+ * Safe to call after parse() — uses cached schema.org data since
348
+ * parse() strips script tags from the document.
349
+ */
350
+ async fetchAsyncVariables() {
351
+ if (this.options.useAsync === false)
352
+ return null;
353
+ try {
354
+ const url = this.options.url || this.doc.URL;
355
+ const schemaOrgData = this.getSchemaOrgData();
356
+ const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
357
+ if (extractor) {
358
+ const extracted = await extractor.extractAsync();
359
+ return this.getExtractorVariables(extracted.variables) || null;
360
+ }
361
+ }
362
+ catch (error) {
363
+ console.error('Defuddle', 'Error fetching async variables:', error);
364
+ }
365
+ return null;
366
+ }
367
+ async tryAsyncExtractor(finder) {
255
368
  try {
256
369
  const url = this.options.url || this.doc.URL;
257
- const schemaOrgData = this._extractSchemaOrgData(this.doc);
258
- const extractor = extractor_registry_1.ExtractorRegistry.findAsyncExtractor(this.doc, url, schemaOrgData);
370
+ const schemaOrgData = this.getSchemaOrgData();
371
+ const extractor = finder(this.doc, url, schemaOrgData);
259
372
  if (extractor) {
260
373
  const startTime = Date.now();
261
374
  const extracted = await extractor.extractAsync();
262
- const contentHtml = this.resolveContentUrls(extracted.contentHtml);
263
375
  const pageMetaTags = this._collectMetaTags();
264
376
  const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
265
- const endTime = Date.now();
266
- return {
267
- content: contentHtml,
268
- title: extracted.variables?.title || metadata.title,
269
- description: metadata.description,
270
- domain: metadata.domain,
271
- favicon: metadata.favicon,
272
- image: metadata.image,
273
- published: extracted.variables?.published || metadata.published,
274
- author: extracted.variables?.author || metadata.author,
275
- site: extracted.variables?.site || metadata.site,
276
- schemaOrgData: metadata.schemaOrgData,
277
- wordCount: this.countWords(extracted.contentHtml),
278
- parseTime: Math.round(endTime - startTime),
279
- extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
280
- metaTags: pageMetaTags
281
- };
377
+ return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
282
378
  }
283
379
  }
284
380
  catch (error) {
285
381
  console.error('Defuddle', 'Error in async extraction:', error);
286
382
  }
287
- return result;
383
+ return null;
288
384
  }
289
385
  /**
290
386
  * Internal parse method that does the actual work
291
387
  */
292
388
  parseInternal(overrideOptions = {}) {
293
389
  const startTime = Date.now();
390
+ // Guard against empty/broken documents (e.g. empty HTML, bot-blocked pages)
391
+ if (!this.doc.documentElement) {
392
+ const url = this.options.url || '';
393
+ return {
394
+ content: '',
395
+ title: '',
396
+ description: '',
397
+ domain: url ? new URL(url).hostname : '',
398
+ favicon: '',
399
+ image: '',
400
+ language: '',
401
+ parseTime: Date.now() - startTime,
402
+ published: '',
403
+ author: '',
404
+ site: '',
405
+ schemaOrgData: null,
406
+ wordCount: 0,
407
+ };
408
+ }
294
409
  const options = {
295
410
  removeExactSelectors: true,
296
411
  removePartialSelectors: true,
297
412
  removeHiddenElements: true,
298
413
  removeLowScoring: true,
299
414
  removeSmallImages: true,
415
+ removeContentPatterns: true,
300
416
  standardize: true,
301
417
  ...this.options,
302
418
  ...overrideOptions
303
419
  };
304
420
  const debugRemovals = [];
305
- // Extract schema.org data
306
- const schemaOrgData = this._extractSchemaOrgData(this.doc);
307
- const pageMetaTags = this._collectMetaTags();
308
- // Extract metadata
309
- const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
421
+ // Extract schema.org data (cached — must happen before _stripUnsafeElements removes scripts)
422
+ const schemaOrgData = this.getSchemaOrgData();
423
+ // Cache meta tags and metadata across retries
424
+ if (!this._metaTags) {
425
+ this._metaTags = this._collectMetaTags();
426
+ }
427
+ const pageMetaTags = this._metaTags;
428
+ if (!this._metadata) {
429
+ this._metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
430
+ }
431
+ const metadata = this._metadata;
310
432
  if (options.removeImages) {
311
433
  this.removeImages(this.doc);
312
434
  }
@@ -316,35 +438,28 @@ class Defuddle {
316
438
  const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
317
439
  if (extractor && extractor.canExtract()) {
318
440
  const extracted = extractor.extract();
319
- const contentHtml = this.resolveContentUrls(extracted.contentHtml);
320
- const endTime = Date.now();
321
- // console.log('Using extractor:', extractor.constructor.name.replace('Extractor', ''));
322
- return {
323
- content: contentHtml,
324
- title: extracted.variables?.title || metadata.title,
325
- description: metadata.description,
326
- domain: metadata.domain,
327
- favicon: metadata.favicon,
328
- image: metadata.image,
329
- published: extracted.variables?.published || metadata.published,
330
- author: extracted.variables?.author || metadata.author,
331
- site: extracted.variables?.site || metadata.site,
332
- schemaOrgData: metadata.schemaOrgData,
333
- wordCount: this.countWords(extracted.contentHtml),
334
- parseTime: Math.round(endTime - startTime),
335
- extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
336
- metaTags: pageMetaTags
337
- };
441
+ return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
338
442
  }
339
443
  // Continue if there is no extractor...
340
- // Evaluate mobile styles and sizes on original document
341
- const mobileStyles = this._evaluateMediaQueries(this.doc);
342
- // Find small images in original document, excluding lazy-loaded ones
343
- const smallImages = this.findSmallImages(this.doc);
444
+ // Evaluate mobile styles and sizes on original document (cached across retries)
445
+ if (!this._mobileStyles) {
446
+ this._mobileStyles = this._evaluateMediaQueries(this.doc);
447
+ }
448
+ const mobileStyles = this._mobileStyles;
449
+ // Find small images in original document (cached across retries)
450
+ if (!this._smallImages) {
451
+ this._smallImages = this.findSmallImages(this.doc);
452
+ }
453
+ const smallImages = this._smallImages;
344
454
  // Clone document
345
455
  const clone = this.doc.cloneNode(true);
456
+ // Merge adjacent text nodes that some DOM implementations (e.g. linkedom)
457
+ // create when parsing HTML entities like &#39;
458
+ clone.body?.normalize();
346
459
  // Flatten shadow DOM content into the clone
347
460
  this.flattenShadowRoots(this.doc, clone);
461
+ // Resolve React streaming SSR suspense boundaries
462
+ this.resolveStreamedContent(clone);
348
463
  // Apply mobile styles to clone
349
464
  this.applyMobileStyles(clone, mobileStyles);
350
465
  // Find main content
@@ -357,12 +472,12 @@ class Defuddle {
357
472
  mainContent = this.findMainContent(clone);
358
473
  }
359
474
  if (!mainContent) {
360
- const fallbackContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
475
+ const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
361
476
  const endTime = Date.now();
362
477
  return {
363
478
  content: fallbackContent,
364
479
  ...metadata,
365
- wordCount: this.countWords(fallbackContent),
480
+ wordCount: this.countHtmlWords(fallbackContent),
366
481
  parseTime: Math.round(endTime - startTime),
367
482
  metaTags: pageMetaTags
368
483
  };
@@ -382,11 +497,15 @@ class Defuddle {
382
497
  // Remove non-content blocks by scoring
383
498
  // Tries to find lists, navigation based on text content and link density
384
499
  if (options.removeLowScoring) {
385
- scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals);
500
+ scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
386
501
  }
387
502
  // Remove clutter using selectors
388
503
  if (options.removeExactSelectors || options.removePartialSelectors) {
389
- this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals);
504
+ this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
505
+ }
506
+ // Remove elements by content patterns (read time, boilerplate, article cards)
507
+ if (options.removeContentPatterns && mainContent) {
508
+ this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
390
509
  }
391
510
  // Normalize the main content
392
511
  if (options.standardize) {
@@ -399,7 +518,7 @@ class Defuddle {
399
518
  const result = {
400
519
  content,
401
520
  ...metadata,
402
- wordCount: this.countWords(content),
521
+ wordCount: this.countHtmlWords(content),
403
522
  parseTime: Math.round(endTime - startTime),
404
523
  metaTags: pageMetaTags
405
524
  };
@@ -413,29 +532,29 @@ class Defuddle {
413
532
  }
414
533
  catch (error) {
415
534
  console.error('Defuddle', 'Error processing document:', error);
416
- const errorContent = this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body));
535
+ const errorContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
417
536
  const endTime = Date.now();
418
537
  return {
419
538
  content: errorContent,
420
539
  ...metadata,
421
- wordCount: this.countWords(errorContent),
540
+ wordCount: this.countHtmlWords(errorContent),
422
541
  parseTime: Math.round(endTime - startTime),
423
542
  metaTags: pageMetaTags
424
543
  };
425
544
  }
426
545
  }
427
- countWords(content) {
428
- // Parse HTML content to extract text
429
- const tempDiv = this.doc.createElement('div');
430
- tempDiv.appendChild((0, dom_1.parseHTML)(this.doc, content));
431
- // Get text content, removing extra whitespace
432
- const text = tempDiv.textContent || '';
433
- const words = text
434
- .trim()
435
- .replace(/\s+/g, ' ') // Replace multiple spaces with single space
436
- .split(' ')
437
- .filter(word => word.length > 0); // Filter out empty strings
438
- return words.length;
546
+ countHtmlWords(content) {
547
+ // Strip HTML tags and decode common entities without DOM parsing
548
+ const text = content
549
+ .replace(/<[^>]*>/g, ' ')
550
+ .replace(/&nbsp;/gi, ' ')
551
+ .replace(/&amp;/gi, '&')
552
+ .replace(/&lt;/gi, '<')
553
+ .replace(/&gt;/gi, '>')
554
+ .replace(/&quot;/gi, '"')
555
+ .replace(/&#\d+;/g, ' ')
556
+ .replace(/&\w+;/g, ' ');
557
+ return (0, utils_1.countWords)(text);
439
558
  }
440
559
  // Make all other methods private by removing the static keyword and using private
441
560
  _log(...args) {
@@ -447,6 +566,8 @@ class Defuddle {
447
566
  const mobileStyles = [];
448
567
  const maxWidthRegex = /max-width[^:]*:\s*(\d+)/;
449
568
  try {
569
+ if (!doc.styleSheets)
570
+ return mobileStyles;
450
571
  // Get all styles, including inline styles
451
572
  const sheets = Array.from(doc.styleSheets).filter(sheet => {
452
573
  try {
@@ -535,36 +656,34 @@ class Defuddle {
535
656
  removeHiddenElements(doc, debugRemovals) {
536
657
  let count = 0;
537
658
  const elementsToRemove = new Map();
538
- // Use querySelectorAll instead of getElementsByTagName because
539
- // linkedom's cloneNode does not wire up live HTMLCollections.
540
- const allElements = Array.from(doc.querySelectorAll('*'));
541
- // Process styles in batches to minimize layout thrashing
542
- const BATCH_SIZE = 100;
543
- for (let i = 0; i < allElements.length; i += BATCH_SIZE) {
544
- const batch = allElements.slice(i, i + BATCH_SIZE);
545
- // Read phase - gather all computedStyles
546
- const styles = batch.map(element => {
659
+ // Check inline styles and CSS class-based hidden patterns.
660
+ const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i;
661
+ // Only use getComputedStyle in browser environments where it's meaningful.
662
+ // In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable.
663
+ const defaultView = doc.defaultView;
664
+ const isBrowser = typeof window !== 'undefined' && defaultView === window;
665
+ const allElements = doc.querySelectorAll('*');
666
+ for (const element of allElements) {
667
+ // Skip elements that contain math — sites like Wikipedia wrap MathML
668
+ // in display:none spans for accessibility (the visible version is an
669
+ // image/SVG fallback). We need to preserve these for math extraction.
670
+ if (element.querySelector('math, [data-mathml], .katex-mathml') ||
671
+ element.tagName.toLowerCase() === 'math') {
672
+ continue;
673
+ }
674
+ // Check inline style for hidden patterns
675
+ const style = element.getAttribute('style');
676
+ if (style && hiddenStylePattern.test(style)) {
677
+ const reason = style.includes('display') ? 'display:none' :
678
+ style.includes('visibility') ? 'visibility:hidden' : 'opacity:0';
679
+ elementsToRemove.set(element, reason);
680
+ count++;
681
+ continue;
682
+ }
683
+ // Use getComputedStyle only in real browser environments
684
+ if (isBrowser) {
547
685
  try {
548
- return element.ownerDocument.defaultView?.getComputedStyle(element);
549
- }
550
- catch (e) {
551
- // If we can't get computed style, check inline styles
552
- const style = element.getAttribute('style');
553
- if (!style)
554
- return null;
555
- // Create a temporary style element to parse inline styles
556
- const tempStyle = doc.createElement('style');
557
- tempStyle.textContent = `* { ${style} }`;
558
- doc.head.appendChild(tempStyle);
559
- const computedStyle = element.ownerDocument.defaultView?.getComputedStyle(element);
560
- doc.head.removeChild(tempStyle);
561
- return computedStyle;
562
- }
563
- });
564
- // Write phase - mark elements for removal
565
- batch.forEach((element, index) => {
566
- const computedStyle = styles[index];
567
- if (computedStyle) {
686
+ const computedStyle = defaultView.getComputedStyle(element);
568
687
  let reason = '';
569
688
  if (computedStyle.display === 'none')
570
689
  reason = 'display:none';
@@ -575,25 +694,24 @@ class Defuddle {
575
694
  if (reason) {
576
695
  elementsToRemove.set(element, reason);
577
696
  count++;
697
+ continue;
578
698
  }
579
699
  }
580
- // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
581
- // "sm:hidden", "not-machine:hidden") which JSDOM/linkedom can't
582
- // resolve through computed styles.
583
- if (!elementsToRemove.has(element)) {
584
- const className = element.getAttribute('class') || '';
585
- if (className) {
586
- const tokens = className.split(/\s+/);
587
- for (const token of tokens) {
588
- if (token === 'hidden' || token.endsWith(':hidden')) {
589
- elementsToRemove.set(element, `class:${token}`);
590
- count++;
591
- break;
592
- }
593
- }
700
+ catch (e) { }
701
+ }
702
+ // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
703
+ // "sm:hidden", "not-machine:hidden")
704
+ const className = element.getAttribute('class') || '';
705
+ if (className) {
706
+ const tokens = className.split(/\s+/);
707
+ for (const token of tokens) {
708
+ if (token === 'hidden' || token.endsWith(':hidden') || token === 'invisible' || token.endsWith(':invisible')) {
709
+ elementsToRemove.set(element, `class:${token}`);
710
+ count++;
711
+ break;
594
712
  }
595
713
  }
596
- });
714
+ }
597
715
  }
598
716
  // Batch remove all hidden elements
599
717
  elementsToRemove.forEach((reason, el) => {
@@ -608,7 +726,7 @@ class Defuddle {
608
726
  });
609
727
  this._log('Removed hidden elements:', count);
610
728
  }
611
- removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals) {
729
+ removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals, skipHiddenExactSelectors = false) {
612
730
  const startTime = Date.now();
613
731
  let exactSelectorCount = 0;
614
732
  let partialSelectorCount = 0;
@@ -616,9 +734,17 @@ class Defuddle {
616
734
  const elementsToRemove = new Map();
617
735
  // First collect elements matching exact selectors
618
736
  if (removeExact) {
619
- const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS.join(','));
737
+ const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS_JOINED);
620
738
  exactElements.forEach(el => {
621
739
  if (el?.parentNode) {
740
+ if (skipHiddenExactSelectors) {
741
+ const hiddenAncestor = el.closest(constants_1.HIDDEN_EXACT_SKIP_SELECTOR);
742
+ const role = (el.getAttribute('role') || '').toLowerCase();
743
+ if (el.matches(constants_1.HIDDEN_EXACT_SELECTOR) ||
744
+ (hiddenAncestor && role === 'dialog')) {
745
+ return;
746
+ }
747
+ }
622
748
  // Skip elements inside code blocks (e.g. syntax highlighting spans)
623
749
  if (el.closest('pre, code')) {
624
750
  return;
@@ -629,16 +755,12 @@ class Defuddle {
629
755
  });
630
756
  }
631
757
  if (removePartial) {
632
- // Pre-compile regexes and combine into a single regex for better performance
633
- const combinedPattern = constants_1.PARTIAL_SELECTORS.join('|');
634
- const partialRegex = new RegExp(combinedPattern, 'i');
635
- // Pre-compile individual regexes for debug pattern identification
758
+ // Pre-compile individual regexes for debug pattern identification only
636
759
  const individualRegexes = this.debug
637
760
  ? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') }))
638
761
  : null;
639
- // Create an efficient attribute selector for elements we care about
640
- const attributeSelector = constants_1.TEST_ATTRIBUTES.map(attr => `[${attr}]`).join(',');
641
- const allElements = doc.querySelectorAll(attributeSelector);
762
+ // Use pre-built attribute selector for elements we care about
763
+ const allElements = doc.querySelectorAll(constants_1.TEST_ATTRIBUTES_SELECTOR);
642
764
  // Process elements for partial matches
643
765
  allElements.forEach(el => {
644
766
  // Skip if already marked for removal
@@ -666,7 +788,7 @@ class Defuddle {
666
788
  return;
667
789
  }
668
790
  // Check for partial match using single regex test
669
- if (partialRegex.test(attrs)) {
791
+ if (constants_1.PARTIAL_SELECTORS_REGEX.test(attrs)) {
670
792
  const matchedPattern = individualRegexes
671
793
  ? individualRegexes.find(r => r.regex.test(attrs))?.pattern
672
794
  : undefined;
@@ -719,106 +841,50 @@ class Defuddle {
719
841
  findSmallImages(doc) {
720
842
  const MIN_DIMENSION = 33;
721
843
  const smallImages = new Set();
722
- const transformRegex = /scale\(([\d.]+)\)/;
723
- const startTime = Date.now();
724
844
  let processedCount = 0;
725
- // 1. Read phase - Gather all elements in a single pass
726
- const elements = [
727
- ...Array.from(doc.getElementsByTagName('img')),
728
- ...Array.from(doc.getElementsByTagName('svg'))
729
- ];
730
- if (elements.length === 0) {
731
- return smallImages;
732
- }
733
- // 2. Batch process - Collect all measurements in one go
734
- const measurements = elements.map(element => ({
735
- element,
736
- // Static attributes (no reflow)
737
- naturalWidth: element.tagName.toLowerCase() === 'img' ?
738
- parseInt(element.getAttribute('width') || '0') || 0 : 0,
739
- naturalHeight: element.tagName.toLowerCase() === 'img' ?
740
- parseInt(element.getAttribute('height') || '0') || 0 : 0,
741
- attrWidth: parseInt(element.getAttribute('width') || '0'),
742
- attrHeight: parseInt(element.getAttribute('height') || '0')
743
- }));
744
- // 3. Batch compute styles - Process in chunks to avoid long tasks
745
- const BATCH_SIZE = 50;
746
- for (let i = 0; i < measurements.length; i += BATCH_SIZE) {
747
- const batch = measurements.slice(i, i + BATCH_SIZE);
748
- try {
749
- // Read phase - compute all styles at once
750
- const styles = batch.map(({ element }) => {
751
- try {
752
- return element.ownerDocument.defaultView?.getComputedStyle(element);
753
- }
754
- catch (e) {
755
- return null;
756
- }
757
- });
758
- // Get bounding rectangles if available
759
- const rects = batch.map(({ element }) => {
760
- try {
761
- return element.getBoundingClientRect();
762
- }
763
- catch (e) {
764
- return null;
765
- }
766
- });
767
- // Process phase - no DOM operations
768
- batch.forEach((measurement, index) => {
769
- try {
770
- const style = styles[index];
771
- const rect = rects[index];
772
- if (!style)
773
- return;
774
- // Get transform scale in the same batch
775
- const transform = style.transform;
776
- const scale = transform ?
777
- parseFloat(transform.match(transformRegex)?.[1] || '1') : 1;
778
- // Calculate effective dimensions
779
- const widths = [
780
- measurement.naturalWidth,
781
- measurement.attrWidth,
782
- parseInt(style.width) || 0,
783
- rect ? rect.width * scale : 0
784
- ].filter(dim => typeof dim === 'number' && dim > 0);
785
- const heights = [
786
- measurement.naturalHeight,
787
- measurement.attrHeight,
788
- parseInt(style.height) || 0,
789
- rect ? rect.height * scale : 0
790
- ].filter(dim => typeof dim === 'number' && dim > 0);
791
- // Decision phase - no DOM operations
792
- if (widths.length > 0 && heights.length > 0) {
793
- const effectiveWidth = Math.min(...widths);
794
- const effectiveHeight = Math.min(...heights);
795
- if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
796
- const identifier = this.getElementIdentifier(measurement.element);
797
- if (identifier) {
798
- smallImages.add(identifier);
799
- processedCount++;
800
- }
801
- }
802
- }
803
- }
804
- catch (e) {
805
- if (this.debug) {
806
- console.warn('Defuddle: Failed to process element dimensions:', e);
807
- }
808
- }
809
- });
845
+ const elements = doc.querySelectorAll('img, svg');
846
+ const defaultView = doc.defaultView;
847
+ const isBrowser = typeof window !== 'undefined' && defaultView === window;
848
+ for (const element of elements) {
849
+ const attrWidth = parseInt(element.getAttribute('width') || '0');
850
+ const attrHeight = parseInt(element.getAttribute('height') || '0');
851
+ // Check inline style dimensions
852
+ const style = element.getAttribute('style') || '';
853
+ const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
854
+ const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
855
+ // Use getComputedStyle and getBoundingClientRect only in browser
856
+ let computedWidth = 0, computedHeight = 0;
857
+ if (isBrowser) {
858
+ try {
859
+ const cs = defaultView.getComputedStyle(element);
860
+ computedWidth = parseInt(cs.width) || 0;
861
+ computedHeight = parseInt(cs.height) || 0;
862
+ }
863
+ catch (e) { }
864
+ try {
865
+ const rect = element.getBoundingClientRect();
866
+ if (rect.width > 0)
867
+ computedWidth = computedWidth || rect.width;
868
+ if (rect.height > 0)
869
+ computedHeight = computedHeight || rect.height;
870
+ }
871
+ catch (e) { }
810
872
  }
811
- catch (e) {
812
- if (this.debug) {
813
- console.warn('Defuddle: Failed to process batch:', e);
873
+ const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0);
874
+ const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0);
875
+ if (widths.length > 0 && heights.length > 0) {
876
+ const effectiveWidth = Math.min(...widths);
877
+ const effectiveHeight = Math.min(...heights);
878
+ if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
879
+ const identifier = this.getElementIdentifier(element);
880
+ if (identifier) {
881
+ smallImages.add(identifier);
882
+ processedCount++;
883
+ }
814
884
  }
815
885
  }
816
886
  }
817
- const endTime = Date.now();
818
- this._log('Found small elements:', {
819
- count: processedCount,
820
- processingTime: `${(endTime - startTime).toFixed(2)}ms`
821
- });
887
+ this._log('Found small elements:', processedCount);
822
888
  return smallImages;
823
889
  }
824
890
  removeSmallImages(doc, smallImages) {
@@ -909,7 +975,7 @@ class Defuddle {
909
975
  let best = top;
910
976
  for (let i = 1; i < candidates.length; i++) {
911
977
  const child = candidates[i];
912
- const childWords = (child.element.textContent || '').split(/\s+/).length;
978
+ const childWords = (0, utils_1.countWords)(child.element.textContent || '');
913
979
  if (child.selectorIndex < best.selectorIndex && best.element.contains(child.element) && childWords > 50) {
914
980
  // Count how many candidates share this selector index inside
915
981
  // the top element. Use top (not best) as the stable reference
@@ -953,13 +1019,11 @@ class Defuddle {
953
1019
  }
954
1020
  findContentByScoring(doc) {
955
1021
  const candidates = [];
956
- constants_1.BLOCK_ELEMENTS.forEach((tag) => {
957
- Array.from(doc.getElementsByTagName(tag)).forEach((element) => {
958
- const score = scoring_1.ContentScorer.scoreElement(element);
959
- if (score > 0) {
960
- candidates.push({ score, element });
961
- }
962
- });
1022
+ doc.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR).forEach((element) => {
1023
+ const score = scoring_1.ContentScorer.scoreElement(element);
1024
+ if (score > 0) {
1025
+ candidates.push({ score, element });
1026
+ }
963
1027
  });
964
1028
  return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null;
965
1029
  }
@@ -990,11 +1054,17 @@ class Defuddle {
990
1054
  if (!baseUrl)
991
1055
  return;
992
1056
  const resolve = (url) => {
1057
+ // Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
1058
+ // Normalize these before URL resolution.
1059
+ const normalized = url
1060
+ .trim()
1061
+ .replace(/^\\?["']+/, '')
1062
+ .replace(/\\?["']+$/, '');
993
1063
  try {
994
- return new URL(url, baseUrl).href;
1064
+ return new URL(normalized, baseUrl).href;
995
1065
  }
996
1066
  catch {
997
- return url;
1067
+ return normalized || url;
998
1068
  }
999
1069
  };
1000
1070
  element.querySelectorAll('[href]').forEach(el => {
@@ -1050,12 +1120,14 @@ class Defuddle {
1050
1120
  * Walks both trees in parallel so positional correspondence is exact.
1051
1121
  */
1052
1122
  flattenShadowRoots(original, clone) {
1053
- const origElements = Array.from(original.body.getElementsByTagName('*'));
1123
+ if (!original.body || !clone.body)
1124
+ return;
1125
+ const origElements = Array.from(original.body.querySelectorAll('*'));
1054
1126
  // Find the first element with a shadow root (also serves as the hasShadowRoots check)
1055
1127
  const firstShadow = origElements.find(el => el.shadowRoot);
1056
1128
  if (!firstShadow)
1057
1129
  return;
1058
- const cloneElements = Array.from(clone.body.getElementsByTagName('*'));
1130
+ const cloneElements = Array.from(clone.body.querySelectorAll('*'));
1059
1131
  // Check if we can directly read shadow DOM content (main world / Node.js).
1060
1132
  // In content script isolated worlds, shadowRoot exists but content is empty.
1061
1133
  const canReadShadow = (firstShadow.shadowRoot?.childNodes?.length ?? 0) > 0;
@@ -1096,6 +1168,68 @@ class Defuddle {
1096
1168
  }
1097
1169
  }
1098
1170
  }
1171
+ /**
1172
+ * Resolve React streaming SSR suspense boundaries.
1173
+ * React's streaming SSR places content in hidden divs (id="S:0") and
1174
+ * template placeholders (id="B:0") with $RC scripts to swap them.
1175
+ * Since we don't execute scripts, we perform the swap manually.
1176
+ */
1177
+ resolveStreamedContent(doc) {
1178
+ // Find $RC("B:X","S:X") calls in inline scripts
1179
+ const scripts = doc.querySelectorAll('script');
1180
+ const swaps = [];
1181
+ const rcPattern = /\$RC\("(B:\d+)","(S:\d+)"\)/g;
1182
+ for (const script of scripts) {
1183
+ const text = script.textContent || '';
1184
+ if (!text.includes('$RC('))
1185
+ continue;
1186
+ rcPattern.lastIndex = 0;
1187
+ let match;
1188
+ while ((match = rcPattern.exec(text)) !== null) {
1189
+ swaps.push({ templateId: match[1], contentId: match[2] });
1190
+ }
1191
+ }
1192
+ if (swaps.length === 0)
1193
+ return;
1194
+ let swapCount = 0;
1195
+ for (const { templateId, contentId } of swaps) {
1196
+ const template = doc.getElementById(templateId);
1197
+ const content = doc.getElementById(contentId);
1198
+ if (!template || !content)
1199
+ continue;
1200
+ const parent = template.parentNode;
1201
+ if (!parent)
1202
+ continue;
1203
+ // Remove the fallback/skeleton content after the template
1204
+ // until the <!--/$--> comment marker
1205
+ let next = template.nextSibling;
1206
+ let foundMarker = false;
1207
+ while (next) {
1208
+ const following = next.nextSibling;
1209
+ if (next.nodeType === 8 && next.data === '/$') {
1210
+ next.remove();
1211
+ foundMarker = true;
1212
+ break;
1213
+ }
1214
+ next.remove();
1215
+ next = following;
1216
+ }
1217
+ // Skip swap if marker wasn't found — malformed streaming output
1218
+ if (!foundMarker)
1219
+ continue;
1220
+ // Insert content children before the template position
1221
+ while (content.firstChild) {
1222
+ parent.insertBefore(content.firstChild, template);
1223
+ }
1224
+ // Clean up the template and hidden div
1225
+ template.remove();
1226
+ content.remove();
1227
+ swapCount++;
1228
+ }
1229
+ if (swapCount > 0) {
1230
+ this._log('Resolved streamed content:', swapCount, 'suspense boundaries');
1231
+ }
1232
+ }
1099
1233
  /**
1100
1234
  * Replace a shadow DOM host element with a div containing its shadow content.
1101
1235
  * Custom elements (tag names with hyphens) would re-initialize when inserted
@@ -1187,6 +1321,249 @@ class Defuddle {
1187
1321
  _decodeHTMLEntities(text) {
1188
1322
  return (0, dom_1.decodeHTMLEntities)(this.doc, text);
1189
1323
  }
1324
+ /**
1325
+ * Build a DefuddleResponse from an extractor result with metadata
1326
+ */
1327
+ buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags) {
1328
+ const contentHtml = this.resolveContentUrls(extracted.contentHtml);
1329
+ const variables = this.getExtractorVariables(extracted.variables);
1330
+ return {
1331
+ content: contentHtml,
1332
+ title: extracted.variables?.title || metadata.title,
1333
+ description: metadata.description,
1334
+ domain: metadata.domain,
1335
+ favicon: metadata.favicon,
1336
+ image: metadata.image,
1337
+ language: extracted.variables?.language || metadata.language,
1338
+ published: extracted.variables?.published || metadata.published,
1339
+ author: extracted.variables?.author || metadata.author,
1340
+ site: extracted.variables?.site || metadata.site,
1341
+ schemaOrgData: metadata.schemaOrgData,
1342
+ wordCount: this.countHtmlWords(extracted.contentHtml),
1343
+ parseTime: Math.round(Date.now() - startTime),
1344
+ extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
1345
+ metaTags: pageMetaTags,
1346
+ ...(variables ? { variables } : {}),
1347
+ };
1348
+ }
1349
+ /**
1350
+ * Filter extractor variables to only include custom ones
1351
+ * (exclude standard fields that are already mapped to top-level properties)
1352
+ */
1353
+ getExtractorVariables(variables) {
1354
+ if (!variables)
1355
+ return undefined;
1356
+ const custom = {};
1357
+ let hasCustom = false;
1358
+ for (const [key, value] of Object.entries(variables)) {
1359
+ if (!STANDARD_VARIABLE_KEYS.has(key)) {
1360
+ custom[key] = value;
1361
+ hasCustom = true;
1362
+ }
1363
+ }
1364
+ return hasCustom ? custom : undefined;
1365
+ }
1366
+ /**
1367
+ * Content-based pattern removal for elements that can't be detected by
1368
+ * CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
1369
+ */
1370
+ removeByContentPattern(mainContent, debugRemovals) {
1371
+ // Remove read time metadata (e.g. "Mar 4th 2026 | 3 min read")
1372
+ // Only removes leaf elements whose text is PURELY date + read time,
1373
+ // not mixed with other meaningful content like tag names.
1374
+ const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time'));
1375
+ for (const el of candidates) {
1376
+ if (!el.parentNode)
1377
+ continue;
1378
+ if (el.closest('pre') || el.closest('code'))
1379
+ continue;
1380
+ const text = el.textContent?.trim() || '';
1381
+ const words = (0, utils_1.countWords)(text);
1382
+ // Match date + read time in short elements
1383
+ if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
1384
+ // Ensure this is a leaf-ish element, not a large container
1385
+ if (el.querySelectorAll('p, div, section, article').length === 0) {
1386
+ // Verify the text is ONLY date + read time metadata
1387
+ // by stripping all date/time words and checking nothing remains
1388
+ let cleaned = text;
1389
+ for (const pattern of METADATA_STRIP_PATTERNS) {
1390
+ cleaned = cleaned.replace(pattern, '');
1391
+ }
1392
+ if (cleaned.trim().length > 0)
1393
+ continue;
1394
+ if (this.debug && debugRemovals) {
1395
+ debugRemovals.push({
1396
+ step: 'removeByContentPattern',
1397
+ reason: 'read time metadata',
1398
+ text: (0, utils_1.textPreview)(el)
1399
+ });
1400
+ }
1401
+ el.remove();
1402
+ }
1403
+ }
1404
+ }
1405
+ // Remove standalone time/date elements near the start or end of content.
1406
+ // A <time> in its own paragraph at the boundary is metadata (publish date),
1407
+ // but <time> inline within prose should be preserved (see issue #136).
1408
+ const timeElements = Array.from(mainContent.querySelectorAll('time'));
1409
+ const contentText = mainContent.textContent || '';
1410
+ for (const time of timeElements) {
1411
+ if (!time.parentNode)
1412
+ continue;
1413
+ // Walk up through inline/formatting wrappers only (i, em, span, b, strong)
1414
+ // Stop at block elements to avoid removing containers with other content.
1415
+ let target = time;
1416
+ let targetText = target.textContent?.trim() || '';
1417
+ while (target.parentElement && target.parentElement !== mainContent) {
1418
+ const parentTag = target.parentElement.tagName.toLowerCase();
1419
+ const parentText = target.parentElement.textContent?.trim() || '';
1420
+ // If parent is a <p> that only wraps this time, include it
1421
+ if (parentTag === 'p' && parentText === targetText) {
1422
+ target = target.parentElement;
1423
+ break;
1424
+ }
1425
+ // Only walk through inline formatting wrappers
1426
+ if (['i', 'em', 'span', 'b', 'strong', 'small'].includes(parentTag) &&
1427
+ parentText === targetText) {
1428
+ target = target.parentElement;
1429
+ targetText = parentText;
1430
+ continue;
1431
+ }
1432
+ break;
1433
+ }
1434
+ const text = target.textContent?.trim() || '';
1435
+ const words = (0, utils_1.countWords)(text);
1436
+ if (words > 10)
1437
+ continue;
1438
+ // Check if this element is near the start or end of mainContent
1439
+ const pos = contentText.indexOf(text);
1440
+ const distFromEnd = contentText.length - (pos + text.length);
1441
+ if (pos > 200 && distFromEnd > 200)
1442
+ continue;
1443
+ if (this.debug && debugRemovals) {
1444
+ debugRemovals.push({
1445
+ step: 'removeByContentPattern',
1446
+ reason: 'boundary date element',
1447
+ text: (0, utils_1.textPreview)(target)
1448
+ });
1449
+ }
1450
+ target.remove();
1451
+ }
1452
+ // Remove section breadcrumbs
1453
+ // Short elements containing a link to a parent section of the current URL.
1454
+ const url = this.options.url || this.doc.URL || '';
1455
+ let urlPath = '';
1456
+ try {
1457
+ urlPath = new URL(url).pathname;
1458
+ }
1459
+ catch { }
1460
+ if (urlPath) {
1461
+ const shortElements = mainContent.querySelectorAll('div, span, p');
1462
+ for (const el of shortElements) {
1463
+ if (!el.parentNode)
1464
+ continue;
1465
+ const text = el.textContent?.trim() || '';
1466
+ const words = (0, utils_1.countWords)(text);
1467
+ if (words > 10)
1468
+ continue;
1469
+ // Must be a leaf-ish element (no block children)
1470
+ if (el.querySelectorAll('p, div, section, article').length > 0)
1471
+ continue;
1472
+ const link = el.querySelector('a[href]');
1473
+ if (!link)
1474
+ continue;
1475
+ try {
1476
+ const linkPath = new URL(link.getAttribute('href') || '', url).pathname;
1477
+ if (linkPath !== '/' && linkPath !== urlPath && urlPath.startsWith(linkPath)) {
1478
+ if (this.debug && debugRemovals) {
1479
+ debugRemovals.push({
1480
+ step: 'removeByContentPattern',
1481
+ reason: 'section breadcrumb',
1482
+ text: (0, utils_1.textPreview)(el)
1483
+ });
1484
+ }
1485
+ el.remove();
1486
+ }
1487
+ }
1488
+ catch { }
1489
+ }
1490
+ }
1491
+ // Remove boilerplate sentences and trailing non-content.
1492
+ // Search elements for end-of-article boilerplate, then truncate
1493
+ // from the best ancestor that has siblings to remove.
1494
+ const fullText = mainContent.textContent || '';
1495
+ const boilerplateElements = mainContent.querySelectorAll('p, div, span, section');
1496
+ for (const el of boilerplateElements) {
1497
+ if (!el.parentNode)
1498
+ continue;
1499
+ const text = el.textContent?.trim() || '';
1500
+ const words = (0, utils_1.countWords)(text);
1501
+ if (words > 50 || words < 3)
1502
+ continue;
1503
+ for (const pattern of BOILERPLATE_PATTERNS) {
1504
+ if (pattern.test(text)) {
1505
+ // Walk up to find an ancestor that has next siblings to truncate.
1506
+ // Don't walk all the way to mainContent's direct child — if there's
1507
+ // a single wrapper div, that would remove everything.
1508
+ let target = el;
1509
+ while (target.parentElement && target.parentElement !== mainContent) {
1510
+ if (target.nextElementSibling)
1511
+ break;
1512
+ target = target.parentElement;
1513
+ }
1514
+ // Only truncate if there's substantial content before the boilerplate
1515
+ const targetText = target.textContent || '';
1516
+ const targetPos = fullText.indexOf(targetText);
1517
+ if (targetPos < 200)
1518
+ continue;
1519
+ // Collect ancestors before modifying the DOM
1520
+ const ancestors = [];
1521
+ let anc = target.parentElement;
1522
+ while (anc && anc !== mainContent) {
1523
+ ancestors.push(anc);
1524
+ anc = anc.parentElement;
1525
+ }
1526
+ // Remove target element and its following siblings
1527
+ this.removeTrailingSiblings(target, true, debugRemovals);
1528
+ // Cascade upward: remove following siblings at each
1529
+ // ancestor level too. Everything after the boilerplate
1530
+ // in document order is non-content.
1531
+ for (const ancestor of ancestors) {
1532
+ this.removeTrailingSiblings(ancestor, false, debugRemovals);
1533
+ }
1534
+ return;
1535
+ }
1536
+ }
1537
+ }
1538
+ }
1539
+ /**
1540
+ * Remove an element's following siblings, and optionally the element itself.
1541
+ */
1542
+ removeTrailingSiblings(element, removeSelf, debugRemovals) {
1543
+ let sibling = element.nextElementSibling;
1544
+ while (sibling) {
1545
+ const next = sibling.nextElementSibling;
1546
+ if (this.debug && debugRemovals) {
1547
+ debugRemovals.push({
1548
+ step: 'removeByContentPattern',
1549
+ reason: 'trailing non-content',
1550
+ text: (0, utils_1.textPreview)(sibling)
1551
+ });
1552
+ }
1553
+ sibling.remove();
1554
+ sibling = next;
1555
+ }
1556
+ if (removeSelf) {
1557
+ if (this.debug && debugRemovals) {
1558
+ debugRemovals.push({
1559
+ step: 'removeByContentPattern',
1560
+ reason: 'boilerplate text',
1561
+ text: (0, utils_1.textPreview)(element)
1562
+ });
1563
+ }
1564
+ element.remove();
1565
+ }
1566
+ }
1190
1567
  }
1191
1568
  exports.Defuddle = Defuddle;
1192
1569
  //# sourceMappingURL=defuddle.js.map