defuddle 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/README.md +23 -8
  2. package/dist/cli.js +2 -1
  3. package/dist/cli.js.map +1 -1
  4. package/dist/constants.d.ts +2 -0
  5. package/dist/constants.js +12 -1
  6. package/dist/constants.js.map +1 -1
  7. package/dist/defuddle.d.ts +48 -1
  8. package/dist/defuddle.js +519 -213
  9. package/dist/defuddle.js.map +1 -1
  10. package/dist/elements/footnotes.js +2 -1
  11. package/dist/elements/footnotes.js.map +1 -1
  12. package/dist/extractor-registry.d.ts +1 -0
  13. package/dist/extractor-registry.js +3 -0
  14. package/dist/extractor-registry.js.map +1 -1
  15. package/dist/extractors/_base.d.ts +6 -0
  16. package/dist/extractors/_base.js +8 -0
  17. package/dist/extractors/_base.js.map +1 -1
  18. package/dist/extractors/github.d.ts +10 -2
  19. package/dist/extractors/github.js +158 -71
  20. package/dist/extractors/github.js.map +1 -1
  21. package/dist/extractors/hackernews.js +18 -72
  22. package/dist/extractors/hackernews.js.map +1 -1
  23. package/dist/extractors/reddit.d.ts +1 -2
  24. package/dist/extractors/reddit.js +41 -94
  25. package/dist/extractors/reddit.js.map +1 -1
  26. package/dist/extractors/x-oembed.d.ts +0 -1
  27. package/dist/extractors/x-oembed.js +20 -27
  28. package/dist/extractors/x-oembed.js.map +1 -1
  29. package/dist/extractors/youtube.d.ts +37 -0
  30. package/dist/extractors/youtube.js +409 -9
  31. package/dist/extractors/youtube.js.map +1 -1
  32. package/dist/index.full.js +1 -1
  33. package/dist/index.js +1 -1
  34. package/dist/metadata.d.ts +5 -0
  35. package/dist/metadata.js +28 -0
  36. package/dist/metadata.js.map +1 -1
  37. package/dist/node.js +0 -5
  38. package/dist/node.js.map +1 -1
  39. package/dist/scoring.d.ts +6 -1
  40. package/dist/scoring.js +66 -19
  41. package/dist/scoring.js.map +1 -1
  42. package/dist/standardize.js +64 -60
  43. package/dist/standardize.js.map +1 -1
  44. package/dist/types.d.ts +9 -0
  45. package/dist/utils/comments.d.ts +44 -0
  46. package/dist/utils/comments.js +103 -0
  47. package/dist/utils/comments.js.map +1 -0
  48. package/dist/utils/dom.d.ts +9 -0
  49. package/dist/utils/dom.js +20 -0
  50. package/dist/utils/dom.js.map +1 -1
  51. package/dist/utils/transcript.d.ts +37 -0
  52. package/dist/utils/transcript.js +61 -0
  53. package/dist/utils/transcript.js.map +1 -0
  54. package/package.json +1 -1
package/dist/defuddle.js CHANGED
@@ -9,6 +9,23 @@ const footnotes_1 = require("./elements/footnotes");
9
9
  const scoring_1 = require("./scoring");
10
10
  const utils_1 = require("./utils");
11
11
  const dom_1 = require("./utils/dom");
12
+ /** Keys from extractor variables that map to top-level DefuddleResponse fields */
13
+ const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
14
+ // Content pattern detection constants
15
+ const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i;
16
+ const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i;
17
+ const BOILERPLATE_PATTERNS = [
18
+ /^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
19
+ /^A version of this (?:article|story) (?:appeared|was published) in\b/i,
20
+ /^Originally (?:published|appeared) (?:in|on|at)\b/i,
21
+ ];
22
+ const METADATA_STRIP_PATTERNS = [
23
+ /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi,
24
+ /\b\d+(?:st|nd|rd|th)?\b/g,
25
+ /\bmin(?:ute)?s?\b/gi,
26
+ /\bread\b/gi,
27
+ /[|·•—–\-,.\s]/g,
28
+ ];
12
29
  class Defuddle {
13
30
  /**
14
31
  * Create a new Defuddle instance
@@ -16,10 +33,23 @@ class Defuddle {
16
33
  * @param options - Options for parsing
17
34
  */
18
35
  constructor(doc, options = {}) {
36
+ this._schemaOrgData = undefined;
37
+ this._schemaOrgExtracted = false;
19
38
  this.doc = doc;
20
39
  this.options = options;
21
40
  this.debug = options.debug || false;
22
41
  }
42
+ /**
43
+ * Lazily extract and cache schema.org data. Must be called before
44
+ * parse() strips script tags from the document.
45
+ */
46
+ getSchemaOrgData() {
47
+ if (!this._schemaOrgExtracted) {
48
+ this._schemaOrgData = this._extractSchemaOrgData(this.doc);
49
+ this._schemaOrgExtracted = true;
50
+ }
51
+ return this._schemaOrgData;
52
+ }
23
53
  /**
24
54
  * Parse the document and extract its main content
25
55
  */
@@ -48,7 +78,8 @@ class Defuddle {
48
78
  this._log('Still very little content, retrying without scoring/partial selectors (possible index page)');
49
79
  const indexRetry = this.parseInternal({
50
80
  removeLowScoring: false,
51
- removePartialSelectors: false
81
+ removePartialSelectors: false,
82
+ removeContentPatterns: false
52
83
  });
53
84
  if (indexRetry.wordCount > result.wordCount) {
54
85
  this._log('Index page retry produced more content');
@@ -125,8 +156,7 @@ class Defuddle {
125
156
  el.removeAttribute(attr.name);
126
157
  }
127
158
  else if (['href', 'src', 'action', 'formaction', 'xlink:href'].includes(name)) {
128
- const val = attr.value.replace(/[\s\u0000-\u001F]+/g, '').toLowerCase();
129
- if (val.startsWith('javascript:') || val.startsWith('data:text/html')) {
159
+ if ((0, dom_1.isDangerousUrl)(attr.value)) {
130
160
  el.removeAttribute(attr.name);
131
161
  }
132
162
  }
@@ -245,46 +275,61 @@ class Defuddle {
245
275
  return url;
246
276
  }
247
277
  /**
248
- * Parse the document, falling back to async extractors if sync parse yields no content
278
+ * Parse the document asynchronously. Checks for extractors that prefer
279
+ * async (e.g. YouTube transcripts) before sync, then falls back to async
280
+ * extractors if sync parse yields no content.
249
281
  */
250
282
  async parseAsync() {
283
+ if (this.options.useAsync !== false) {
284
+ const asyncResult = await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry));
285
+ if (asyncResult)
286
+ return asyncResult;
287
+ }
251
288
  const result = this.parse();
252
289
  if (result.wordCount > 0 || this.options.useAsync === false) {
253
290
  return result;
254
291
  }
292
+ return (await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry))) ?? result;
293
+ }
294
+ /**
295
+ * Fetch only async variables (e.g. transcript) without re-parsing.
296
+ * Safe to call after parse() — uses cached schema.org data since
297
+ * parse() strips script tags from the document.
298
+ */
299
+ async fetchAsyncVariables() {
300
+ if (this.options.useAsync === false)
301
+ return null;
255
302
  try {
256
303
  const url = this.options.url || this.doc.URL;
257
- const schemaOrgData = this._extractSchemaOrgData(this.doc);
258
- const extractor = extractor_registry_1.ExtractorRegistry.findAsyncExtractor(this.doc, url, schemaOrgData);
304
+ const schemaOrgData = this.getSchemaOrgData();
305
+ const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData);
306
+ if (extractor) {
307
+ const extracted = await extractor.extractAsync();
308
+ return this.getExtractorVariables(extracted.variables) || null;
309
+ }
310
+ }
311
+ catch (error) {
312
+ console.error('Defuddle', 'Error fetching async variables:', error);
313
+ }
314
+ return null;
315
+ }
316
+ async tryAsyncExtractor(finder) {
317
+ try {
318
+ const url = this.options.url || this.doc.URL;
319
+ const schemaOrgData = this.getSchemaOrgData();
320
+ const extractor = finder(this.doc, url, schemaOrgData);
259
321
  if (extractor) {
260
322
  const startTime = Date.now();
261
323
  const extracted = await extractor.extractAsync();
262
- const contentHtml = this.resolveContentUrls(extracted.contentHtml);
263
324
  const pageMetaTags = this._collectMetaTags();
264
325
  const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
265
- const endTime = Date.now();
266
- return {
267
- content: contentHtml,
268
- title: extracted.variables?.title || metadata.title,
269
- description: metadata.description,
270
- domain: metadata.domain,
271
- favicon: metadata.favicon,
272
- image: metadata.image,
273
- published: extracted.variables?.published || metadata.published,
274
- author: extracted.variables?.author || metadata.author,
275
- site: extracted.variables?.site || metadata.site,
276
- schemaOrgData: metadata.schemaOrgData,
277
- wordCount: this.countWords(extracted.contentHtml),
278
- parseTime: Math.round(endTime - startTime),
279
- extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
280
- metaTags: pageMetaTags
281
- };
326
+ return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
282
327
  }
283
328
  }
284
329
  catch (error) {
285
330
  console.error('Defuddle', 'Error in async extraction:', error);
286
331
  }
287
- return result;
332
+ return null;
288
333
  }
289
334
  /**
290
335
  * Internal parse method that does the actual work
@@ -297,16 +342,23 @@ class Defuddle {
297
342
  removeHiddenElements: true,
298
343
  removeLowScoring: true,
299
344
  removeSmallImages: true,
345
+ removeContentPatterns: true,
300
346
  standardize: true,
301
347
  ...this.options,
302
348
  ...overrideOptions
303
349
  };
304
350
  const debugRemovals = [];
305
- // Extract schema.org data
306
- const schemaOrgData = this._extractSchemaOrgData(this.doc);
307
- const pageMetaTags = this._collectMetaTags();
308
- // Extract metadata
309
- const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
351
+ // Extract schema.org data (cached — must happen before _stripUnsafeElements removes scripts)
352
+ const schemaOrgData = this.getSchemaOrgData();
353
+ // Cache meta tags and metadata across retries
354
+ if (!this._metaTags) {
355
+ this._metaTags = this._collectMetaTags();
356
+ }
357
+ const pageMetaTags = this._metaTags;
358
+ if (!this._metadata) {
359
+ this._metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
360
+ }
361
+ const metadata = this._metadata;
310
362
  if (options.removeImages) {
311
363
  this.removeImages(this.doc);
312
364
  }
@@ -316,35 +368,25 @@ class Defuddle {
316
368
  const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
317
369
  if (extractor && extractor.canExtract()) {
318
370
  const extracted = extractor.extract();
319
- const contentHtml = this.resolveContentUrls(extracted.contentHtml);
320
- const endTime = Date.now();
321
- // console.log('Using extractor:', extractor.constructor.name.replace('Extractor', ''));
322
- return {
323
- content: contentHtml,
324
- title: extracted.variables?.title || metadata.title,
325
- description: metadata.description,
326
- domain: metadata.domain,
327
- favicon: metadata.favicon,
328
- image: metadata.image,
329
- published: extracted.variables?.published || metadata.published,
330
- author: extracted.variables?.author || metadata.author,
331
- site: extracted.variables?.site || metadata.site,
332
- schemaOrgData: metadata.schemaOrgData,
333
- wordCount: this.countWords(extracted.contentHtml),
334
- parseTime: Math.round(endTime - startTime),
335
- extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
336
- metaTags: pageMetaTags
337
- };
371
+ return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
338
372
  }
339
373
  // Continue if there is no extractor...
340
- // Evaluate mobile styles and sizes on original document
341
- const mobileStyles = this._evaluateMediaQueries(this.doc);
342
- // Find small images in original document, excluding lazy-loaded ones
343
- const smallImages = this.findSmallImages(this.doc);
374
+ // Evaluate mobile styles and sizes on original document (cached across retries)
375
+ if (!this._mobileStyles) {
376
+ this._mobileStyles = this._evaluateMediaQueries(this.doc);
377
+ }
378
+ const mobileStyles = this._mobileStyles;
379
+ // Find small images in original document (cached across retries)
380
+ if (!this._smallImages) {
381
+ this._smallImages = this.findSmallImages(this.doc);
382
+ }
383
+ const smallImages = this._smallImages;
344
384
  // Clone document
345
385
  const clone = this.doc.cloneNode(true);
346
386
  // Flatten shadow DOM content into the clone
347
387
  this.flattenShadowRoots(this.doc, clone);
388
+ // Resolve React streaming SSR suspense boundaries
389
+ this.resolveStreamedContent(clone);
348
390
  // Apply mobile styles to clone
349
391
  this.applyMobileStyles(clone, mobileStyles);
350
392
  // Find main content
@@ -382,12 +424,16 @@ class Defuddle {
382
424
  // Remove non-content blocks by scoring
383
425
  // Tries to find lists, navigation based on text content and link density
384
426
  if (options.removeLowScoring) {
385
- scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals);
427
+ scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
386
428
  }
387
429
  // Remove clutter using selectors
388
430
  if (options.removeExactSelectors || options.removePartialSelectors) {
389
431
  this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals);
390
432
  }
433
+ // Remove elements by content patterns (read time, boilerplate, article cards)
434
+ if (options.removeContentPatterns && mainContent) {
435
+ this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
436
+ }
391
437
  // Normalize the main content
392
438
  if (options.standardize) {
393
439
  (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
@@ -425,17 +471,33 @@ class Defuddle {
425
471
  }
426
472
  }
427
473
  countWords(content) {
428
- // Parse HTML content to extract text
429
- const tempDiv = this.doc.createElement('div');
430
- tempDiv.appendChild((0, dom_1.parseHTML)(this.doc, content));
431
- // Get text content, removing extra whitespace
432
- const text = tempDiv.textContent || '';
433
- const words = text
434
- .trim()
435
- .replace(/\s+/g, ' ') // Replace multiple spaces with single space
436
- .split(' ')
437
- .filter(word => word.length > 0); // Filter out empty strings
438
- return words.length;
474
+ // Strip HTML tags and decode common entities without DOM parsing
475
+ const text = content
476
+ .replace(/<[^>]*>/g, ' ')
477
+ .replace(/&nbsp;/gi, ' ')
478
+ .replace(/&amp;/gi, '&')
479
+ .replace(/&lt;/gi, '<')
480
+ .replace(/&gt;/gi, '>')
481
+ .replace(/&quot;/gi, '"')
482
+ .replace(/&#\d+;/g, ' ')
483
+ .replace(/&\w+;/g, ' ');
484
+ const trimmed = text.trim();
485
+ if (!trimmed)
486
+ return 0;
487
+ // Count words by splitting on whitespace
488
+ let count = 0;
489
+ let inWord = false;
490
+ for (let i = 0; i < trimmed.length; i++) {
491
+ const isSpace = trimmed.charCodeAt(i) <= 32;
492
+ if (!isSpace && !inWord) {
493
+ count++;
494
+ inWord = true;
495
+ }
496
+ else if (isSpace) {
497
+ inWord = false;
498
+ }
499
+ }
500
+ return count;
439
501
  }
440
502
  // Make all other methods private by removing the static keyword and using private
441
503
  _log(...args) {
@@ -535,36 +597,34 @@ class Defuddle {
535
597
  removeHiddenElements(doc, debugRemovals) {
536
598
  let count = 0;
537
599
  const elementsToRemove = new Map();
538
- // Use querySelectorAll instead of getElementsByTagName because
539
- // linkedom's cloneNode does not wire up live HTMLCollections.
540
- const allElements = Array.from(doc.querySelectorAll('*'));
541
- // Process styles in batches to minimize layout thrashing
542
- const BATCH_SIZE = 100;
543
- for (let i = 0; i < allElements.length; i += BATCH_SIZE) {
544
- const batch = allElements.slice(i, i + BATCH_SIZE);
545
- // Read phase - gather all computedStyles
546
- const styles = batch.map(element => {
600
+ // Check inline styles and CSS class-based hidden patterns.
601
+ const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i;
602
+ // Only use getComputedStyle in browser environments where it's meaningful.
603
+ // In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable.
604
+ const defaultView = doc.defaultView;
605
+ const isBrowser = typeof window !== 'undefined' && defaultView === window;
606
+ const allElements = doc.querySelectorAll('*');
607
+ for (const element of allElements) {
608
+ // Skip elements that contain math — sites like Wikipedia wrap MathML
609
+ // in display:none spans for accessibility (the visible version is an
610
+ // image/SVG fallback). We need to preserve these for math extraction.
611
+ if (element.querySelector('math, [data-mathml], .katex-mathml') ||
612
+ element.tagName.toLowerCase() === 'math') {
613
+ continue;
614
+ }
615
+ // Check inline style for hidden patterns
616
+ const style = element.getAttribute('style');
617
+ if (style && hiddenStylePattern.test(style)) {
618
+ const reason = style.includes('display') ? 'display:none' :
619
+ style.includes('visibility') ? 'visibility:hidden' : 'opacity:0';
620
+ elementsToRemove.set(element, reason);
621
+ count++;
622
+ continue;
623
+ }
624
+ // Use getComputedStyle only in real browser environments
625
+ if (isBrowser) {
547
626
  try {
548
- return element.ownerDocument.defaultView?.getComputedStyle(element);
549
- }
550
- catch (e) {
551
- // If we can't get computed style, check inline styles
552
- const style = element.getAttribute('style');
553
- if (!style)
554
- return null;
555
- // Create a temporary style element to parse inline styles
556
- const tempStyle = doc.createElement('style');
557
- tempStyle.textContent = `* { ${style} }`;
558
- doc.head.appendChild(tempStyle);
559
- const computedStyle = element.ownerDocument.defaultView?.getComputedStyle(element);
560
- doc.head.removeChild(tempStyle);
561
- return computedStyle;
562
- }
563
- });
564
- // Write phase - mark elements for removal
565
- batch.forEach((element, index) => {
566
- const computedStyle = styles[index];
567
- if (computedStyle) {
627
+ const computedStyle = defaultView.getComputedStyle(element);
568
628
  let reason = '';
569
629
  if (computedStyle.display === 'none')
570
630
  reason = 'display:none';
@@ -575,25 +635,24 @@ class Defuddle {
575
635
  if (reason) {
576
636
  elementsToRemove.set(element, reason);
577
637
  count++;
638
+ continue;
578
639
  }
579
640
  }
580
- // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
581
- // "sm:hidden", "not-machine:hidden") which JSDOM/linkedom can't
582
- // resolve through computed styles.
583
- if (!elementsToRemove.has(element)) {
584
- const className = element.getAttribute('class') || '';
585
- if (className) {
586
- const tokens = className.split(/\s+/);
587
- for (const token of tokens) {
588
- if (token === 'hidden' || token.endsWith(':hidden')) {
589
- elementsToRemove.set(element, `class:${token}`);
590
- count++;
591
- break;
592
- }
593
- }
641
+ catch (e) { }
642
+ }
643
+ // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
644
+ // "sm:hidden", "not-machine:hidden")
645
+ const className = element.getAttribute('class') || '';
646
+ if (className) {
647
+ const tokens = className.split(/\s+/);
648
+ for (const token of tokens) {
649
+ if (token === 'hidden' || token.endsWith(':hidden')) {
650
+ elementsToRemove.set(element, `class:${token}`);
651
+ count++;
652
+ break;
594
653
  }
595
654
  }
596
- });
655
+ }
597
656
  }
598
657
  // Batch remove all hidden elements
599
658
  elementsToRemove.forEach((reason, el) => {
@@ -719,106 +778,50 @@ class Defuddle {
719
778
  findSmallImages(doc) {
720
779
  const MIN_DIMENSION = 33;
721
780
  const smallImages = new Set();
722
- const transformRegex = /scale\(([\d.]+)\)/;
723
- const startTime = Date.now();
724
781
  let processedCount = 0;
725
- // 1. Read phase - Gather all elements in a single pass
726
- const elements = [
727
- ...Array.from(doc.getElementsByTagName('img')),
728
- ...Array.from(doc.getElementsByTagName('svg'))
729
- ];
730
- if (elements.length === 0) {
731
- return smallImages;
732
- }
733
- // 2. Batch process - Collect all measurements in one go
734
- const measurements = elements.map(element => ({
735
- element,
736
- // Static attributes (no reflow)
737
- naturalWidth: element.tagName.toLowerCase() === 'img' ?
738
- parseInt(element.getAttribute('width') || '0') || 0 : 0,
739
- naturalHeight: element.tagName.toLowerCase() === 'img' ?
740
- parseInt(element.getAttribute('height') || '0') || 0 : 0,
741
- attrWidth: parseInt(element.getAttribute('width') || '0'),
742
- attrHeight: parseInt(element.getAttribute('height') || '0')
743
- }));
744
- // 3. Batch compute styles - Process in chunks to avoid long tasks
745
- const BATCH_SIZE = 50;
746
- for (let i = 0; i < measurements.length; i += BATCH_SIZE) {
747
- const batch = measurements.slice(i, i + BATCH_SIZE);
748
- try {
749
- // Read phase - compute all styles at once
750
- const styles = batch.map(({ element }) => {
751
- try {
752
- return element.ownerDocument.defaultView?.getComputedStyle(element);
753
- }
754
- catch (e) {
755
- return null;
756
- }
757
- });
758
- // Get bounding rectangles if available
759
- const rects = batch.map(({ element }) => {
760
- try {
761
- return element.getBoundingClientRect();
762
- }
763
- catch (e) {
764
- return null;
765
- }
766
- });
767
- // Process phase - no DOM operations
768
- batch.forEach((measurement, index) => {
769
- try {
770
- const style = styles[index];
771
- const rect = rects[index];
772
- if (!style)
773
- return;
774
- // Get transform scale in the same batch
775
- const transform = style.transform;
776
- const scale = transform ?
777
- parseFloat(transform.match(transformRegex)?.[1] || '1') : 1;
778
- // Calculate effective dimensions
779
- const widths = [
780
- measurement.naturalWidth,
781
- measurement.attrWidth,
782
- parseInt(style.width) || 0,
783
- rect ? rect.width * scale : 0
784
- ].filter(dim => typeof dim === 'number' && dim > 0);
785
- const heights = [
786
- measurement.naturalHeight,
787
- measurement.attrHeight,
788
- parseInt(style.height) || 0,
789
- rect ? rect.height * scale : 0
790
- ].filter(dim => typeof dim === 'number' && dim > 0);
791
- // Decision phase - no DOM operations
792
- if (widths.length > 0 && heights.length > 0) {
793
- const effectiveWidth = Math.min(...widths);
794
- const effectiveHeight = Math.min(...heights);
795
- if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
796
- const identifier = this.getElementIdentifier(measurement.element);
797
- if (identifier) {
798
- smallImages.add(identifier);
799
- processedCount++;
800
- }
801
- }
802
- }
803
- }
804
- catch (e) {
805
- if (this.debug) {
806
- console.warn('Defuddle: Failed to process element dimensions:', e);
807
- }
808
- }
809
- });
782
+ const elements = doc.querySelectorAll('img, svg');
783
+ const defaultView = doc.defaultView;
784
+ const isBrowser = typeof window !== 'undefined' && defaultView === window;
785
+ for (const element of elements) {
786
+ const attrWidth = parseInt(element.getAttribute('width') || '0');
787
+ const attrHeight = parseInt(element.getAttribute('height') || '0');
788
+ // Check inline style dimensions
789
+ const style = element.getAttribute('style') || '';
790
+ const styleWidth = parseInt(style.match(/width\s*:\s*(\d+)/)?.[1] || '0');
791
+ const styleHeight = parseInt(style.match(/height\s*:\s*(\d+)/)?.[1] || '0');
792
+ // Use getComputedStyle and getBoundingClientRect only in browser
793
+ let computedWidth = 0, computedHeight = 0;
794
+ if (isBrowser) {
795
+ try {
796
+ const cs = defaultView.getComputedStyle(element);
797
+ computedWidth = parseInt(cs.width) || 0;
798
+ computedHeight = parseInt(cs.height) || 0;
799
+ }
800
+ catch (e) { }
801
+ try {
802
+ const rect = element.getBoundingClientRect();
803
+ if (rect.width > 0)
804
+ computedWidth = computedWidth || rect.width;
805
+ if (rect.height > 0)
806
+ computedHeight = computedHeight || rect.height;
807
+ }
808
+ catch (e) { }
810
809
  }
811
- catch (e) {
812
- if (this.debug) {
813
- console.warn('Defuddle: Failed to process batch:', e);
810
+ const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0);
811
+ const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0);
812
+ if (widths.length > 0 && heights.length > 0) {
813
+ const effectiveWidth = Math.min(...widths);
814
+ const effectiveHeight = Math.min(...heights);
815
+ if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
816
+ const identifier = this.getElementIdentifier(element);
817
+ if (identifier) {
818
+ smallImages.add(identifier);
819
+ processedCount++;
820
+ }
814
821
  }
815
822
  }
816
823
  }
817
- const endTime = Date.now();
818
- this._log('Found small elements:', {
819
- count: processedCount,
820
- processingTime: `${(endTime - startTime).toFixed(2)}ms`
821
- });
824
+ this._log('Found small elements:', processedCount);
822
825
  return smallImages;
823
826
  }
824
827
  removeSmallImages(doc, smallImages) {
@@ -953,13 +956,11 @@ class Defuddle {
953
956
  }
954
957
  findContentByScoring(doc) {
955
958
  const candidates = [];
956
- constants_1.BLOCK_ELEMENTS.forEach((tag) => {
957
- Array.from(doc.getElementsByTagName(tag)).forEach((element) => {
958
- const score = scoring_1.ContentScorer.scoreElement(element);
959
- if (score > 0) {
960
- candidates.push({ score, element });
961
- }
962
- });
959
+ doc.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR).forEach((element) => {
960
+ const score = scoring_1.ContentScorer.scoreElement(element);
961
+ if (score > 0) {
962
+ candidates.push({ score, element });
963
+ }
963
964
  });
964
965
  return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null;
965
966
  }
@@ -1050,12 +1051,12 @@ class Defuddle {
1050
1051
  * Walks both trees in parallel so positional correspondence is exact.
1051
1052
  */
1052
1053
  flattenShadowRoots(original, clone) {
1053
- const origElements = Array.from(original.body.getElementsByTagName('*'));
1054
+ const origElements = Array.from(original.body.querySelectorAll('*'));
1054
1055
  // Find the first element with a shadow root (also serves as the hasShadowRoots check)
1055
1056
  const firstShadow = origElements.find(el => el.shadowRoot);
1056
1057
  if (!firstShadow)
1057
1058
  return;
1058
- const cloneElements = Array.from(clone.body.getElementsByTagName('*'));
1059
+ const cloneElements = Array.from(clone.body.querySelectorAll('*'));
1059
1060
  // Check if we can directly read shadow DOM content (main world / Node.js).
1060
1061
  // In content script isolated worlds, shadowRoot exists but content is empty.
1061
1062
  const canReadShadow = (firstShadow.shadowRoot?.childNodes?.length ?? 0) > 0;
@@ -1096,6 +1097,68 @@ class Defuddle {
1096
1097
  }
1097
1098
  }
1098
1099
  }
1100
+ /**
1101
+ * Resolve React streaming SSR suspense boundaries.
1102
+ * React's streaming SSR places content in hidden divs (id="S:0") and
1103
+ * template placeholders (id="B:0") with $RC scripts to swap them.
1104
+ * Since we don't execute scripts, we perform the swap manually.
1105
+ */
1106
+ resolveStreamedContent(doc) {
1107
+ // Find $RC("B:X","S:X") calls in inline scripts
1108
+ const scripts = doc.querySelectorAll('script');
1109
+ const swaps = [];
1110
+ const rcPattern = /\$RC\("(B:\d+)","(S:\d+)"\)/g;
1111
+ for (const script of scripts) {
1112
+ const text = script.textContent || '';
1113
+ if (!text.includes('$RC('))
1114
+ continue;
1115
+ rcPattern.lastIndex = 0;
1116
+ let match;
1117
+ while ((match = rcPattern.exec(text)) !== null) {
1118
+ swaps.push({ templateId: match[1], contentId: match[2] });
1119
+ }
1120
+ }
1121
+ if (swaps.length === 0)
1122
+ return;
1123
+ let swapCount = 0;
1124
+ for (const { templateId, contentId } of swaps) {
1125
+ const template = doc.getElementById(templateId);
1126
+ const content = doc.getElementById(contentId);
1127
+ if (!template || !content)
1128
+ continue;
1129
+ const parent = template.parentNode;
1130
+ if (!parent)
1131
+ continue;
1132
+ // Remove the fallback/skeleton content after the template
1133
+ // until the <!--/$--> comment marker
1134
+ let next = template.nextSibling;
1135
+ let foundMarker = false;
1136
+ while (next) {
1137
+ const following = next.nextSibling;
1138
+ if (next.nodeType === 8 && next.data === '/$') {
1139
+ next.remove();
1140
+ foundMarker = true;
1141
+ break;
1142
+ }
1143
+ next.remove();
1144
+ next = following;
1145
+ }
1146
+ // Skip swap if marker wasn't found — malformed streaming output
1147
+ if (!foundMarker)
1148
+ continue;
1149
+ // Insert content children before the template position
1150
+ while (content.firstChild) {
1151
+ parent.insertBefore(content.firstChild, template);
1152
+ }
1153
+ // Clean up the template and hidden div
1154
+ template.remove();
1155
+ content.remove();
1156
+ swapCount++;
1157
+ }
1158
+ if (swapCount > 0) {
1159
+ this._log('Resolved streamed content:', swapCount, 'suspense boundaries');
1160
+ }
1161
+ }
1099
1162
  /**
1100
1163
  * Replace a shadow DOM host element with a div containing its shadow content.
1101
1164
  * Custom elements (tag names with hyphens) would re-initialize when inserted
@@ -1187,6 +1250,249 @@ class Defuddle {
1187
1250
  _decodeHTMLEntities(text) {
1188
1251
  return (0, dom_1.decodeHTMLEntities)(this.doc, text);
1189
1252
  }
1253
+ /**
1254
+ * Build a DefuddleResponse from an extractor result with metadata
1255
+ */
1256
+ buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags) {
1257
+ const contentHtml = this.resolveContentUrls(extracted.contentHtml);
1258
+ const variables = this.getExtractorVariables(extracted.variables);
1259
+ return {
1260
+ content: contentHtml,
1261
+ title: extracted.variables?.title || metadata.title,
1262
+ description: metadata.description,
1263
+ domain: metadata.domain,
1264
+ favicon: metadata.favicon,
1265
+ image: metadata.image,
1266
+ language: extracted.variables?.language || metadata.language,
1267
+ published: extracted.variables?.published || metadata.published,
1268
+ author: extracted.variables?.author || metadata.author,
1269
+ site: extracted.variables?.site || metadata.site,
1270
+ schemaOrgData: metadata.schemaOrgData,
1271
+ wordCount: this.countWords(extracted.contentHtml),
1272
+ parseTime: Math.round(Date.now() - startTime),
1273
+ extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
1274
+ metaTags: pageMetaTags,
1275
+ ...(variables ? { variables } : {}),
1276
+ };
1277
+ }
1278
+ /**
1279
+ * Filter extractor variables to only include custom ones
1280
+ * (exclude standard fields that are already mapped to top-level properties)
1281
+ */
1282
+ getExtractorVariables(variables) {
1283
+ if (!variables)
1284
+ return undefined;
1285
+ const custom = {};
1286
+ let hasCustom = false;
1287
+ for (const [key, value] of Object.entries(variables)) {
1288
+ if (!STANDARD_VARIABLE_KEYS.has(key)) {
1289
+ custom[key] = value;
1290
+ hasCustom = true;
1291
+ }
1292
+ }
1293
+ return hasCustom ? custom : undefined;
1294
+ }
1295
+ /**
1296
+ * Content-based pattern removal for elements that can't be detected by
1297
+ * CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
1298
+ */
1299
+ removeByContentPattern(mainContent, debugRemovals) {
1300
+ // Remove read time metadata (e.g. "Mar 4th 2026 | 3 min read")
1301
+ // Only removes leaf elements whose text is PURELY date + read time,
1302
+ // not mixed with other meaningful content like tag names.
1303
+ const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time'));
1304
+ for (const el of candidates) {
1305
+ if (!el.parentNode)
1306
+ continue;
1307
+ if (el.closest('pre') || el.closest('code'))
1308
+ continue;
1309
+ const text = el.textContent?.trim() || '';
1310
+ const words = text.split(/\s+/).length;
1311
+ // Match date + read time in short elements
1312
+ if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
1313
+ // Ensure this is a leaf-ish element, not a large container
1314
+ if (el.querySelectorAll('p, div, section, article').length === 0) {
1315
+ // Verify the text is ONLY date + read time metadata
1316
+ // by stripping all date/time words and checking nothing remains
1317
+ let cleaned = text;
1318
+ for (const pattern of METADATA_STRIP_PATTERNS) {
1319
+ cleaned = cleaned.replace(pattern, '');
1320
+ }
1321
+ if (cleaned.trim().length > 0)
1322
+ continue;
1323
+ if (this.debug && debugRemovals) {
1324
+ debugRemovals.push({
1325
+ step: 'removeByContentPattern',
1326
+ reason: 'read time metadata',
1327
+ text: (0, utils_1.textPreview)(el)
1328
+ });
1329
+ }
1330
+ el.remove();
1331
+ }
1332
+ }
1333
+ }
1334
+ // Remove standalone time/date elements near the start or end of content.
1335
+ // A <time> in its own paragraph at the boundary is metadata (publish date),
1336
+ // but <time> inline within prose should be preserved (see issue #136).
1337
+ const timeElements = Array.from(mainContent.querySelectorAll('time'));
1338
+ const contentText = mainContent.textContent || '';
1339
+ for (const time of timeElements) {
1340
+ if (!time.parentNode)
1341
+ continue;
1342
+ // Walk up through inline/formatting wrappers only (i, em, span, b, strong)
1343
+ // Stop at block elements to avoid removing containers with other content.
1344
+ let target = time;
1345
+ let targetText = target.textContent?.trim() || '';
1346
+ while (target.parentElement && target.parentElement !== mainContent) {
1347
+ const parentTag = target.parentElement.tagName.toLowerCase();
1348
+ const parentText = target.parentElement.textContent?.trim() || '';
1349
+ // If parent is a <p> that only wraps this time, include it
1350
+ if (parentTag === 'p' && parentText === targetText) {
1351
+ target = target.parentElement;
1352
+ break;
1353
+ }
1354
+ // Only walk through inline formatting wrappers
1355
+ if (['i', 'em', 'span', 'b', 'strong', 'small'].includes(parentTag) &&
1356
+ parentText === targetText) {
1357
+ target = target.parentElement;
1358
+ targetText = parentText;
1359
+ continue;
1360
+ }
1361
+ break;
1362
+ }
1363
+ const text = target.textContent?.trim() || '';
1364
+ const words = text.split(/\s+/).length;
1365
+ if (words > 10)
1366
+ continue;
1367
+ // Check if this element is near the start or end of mainContent
1368
+ const pos = contentText.indexOf(text);
1369
+ const distFromEnd = contentText.length - (pos + text.length);
1370
+ if (pos > 200 && distFromEnd > 200)
1371
+ continue;
1372
+ if (this.debug && debugRemovals) {
1373
+ debugRemovals.push({
1374
+ step: 'removeByContentPattern',
1375
+ reason: 'boundary date element',
1376
+ text: (0, utils_1.textPreview)(target)
1377
+ });
1378
+ }
1379
+ target.remove();
1380
+ }
1381
+ // Remove section breadcrumbs
1382
+ // Short elements containing a link to a parent section of the current URL.
1383
+ const url = this.options.url || this.doc.URL || '';
1384
+ let urlPath = '';
1385
+ try {
1386
+ urlPath = new URL(url).pathname;
1387
+ }
1388
+ catch { }
1389
+ if (urlPath) {
1390
+ const shortElements = mainContent.querySelectorAll('div, span, p');
1391
+ for (const el of shortElements) {
1392
+ if (!el.parentNode)
1393
+ continue;
1394
+ const text = el.textContent?.trim() || '';
1395
+ const words = text.split(/\s+/).length;
1396
+ if (words > 10)
1397
+ continue;
1398
+ // Must be a leaf-ish element (no block children)
1399
+ if (el.querySelectorAll('p, div, section, article').length > 0)
1400
+ continue;
1401
+ const link = el.querySelector('a[href]');
1402
+ if (!link)
1403
+ continue;
1404
+ try {
1405
+ const linkPath = new URL(link.getAttribute('href') || '', url).pathname;
1406
+ if (linkPath !== '/' && linkPath !== urlPath && urlPath.startsWith(linkPath)) {
1407
+ if (this.debug && debugRemovals) {
1408
+ debugRemovals.push({
1409
+ step: 'removeByContentPattern',
1410
+ reason: 'section breadcrumb',
1411
+ text: (0, utils_1.textPreview)(el)
1412
+ });
1413
+ }
1414
+ el.remove();
1415
+ }
1416
+ }
1417
+ catch { }
1418
+ }
1419
+ }
1420
+ // Remove boilerplate sentences and trailing non-content.
1421
+ // Search elements for end-of-article boilerplate, then truncate
1422
+ // from the best ancestor that has siblings to remove.
1423
+ const fullText = mainContent.textContent || '';
1424
+ const boilerplateElements = mainContent.querySelectorAll('p, div, span, section');
1425
+ for (const el of boilerplateElements) {
1426
+ if (!el.parentNode)
1427
+ continue;
1428
+ const text = el.textContent?.trim() || '';
1429
+ const words = text.split(/\s+/).length;
1430
+ if (words > 50 || words < 3)
1431
+ continue;
1432
+ for (const pattern of BOILERPLATE_PATTERNS) {
1433
+ if (pattern.test(text)) {
1434
+ // Walk up to find an ancestor that has next siblings to truncate.
1435
+ // Don't walk all the way to mainContent's direct child — if there's
1436
+ // a single wrapper div, that would remove everything.
1437
+ let target = el;
1438
+ while (target.parentElement && target.parentElement !== mainContent) {
1439
+ if (target.nextElementSibling)
1440
+ break;
1441
+ target = target.parentElement;
1442
+ }
1443
+ // Only truncate if there's substantial content before the boilerplate
1444
+ const targetText = target.textContent || '';
1445
+ const targetPos = fullText.indexOf(targetText);
1446
+ if (targetPos < 200)
1447
+ continue;
1448
+ // Collect ancestors before modifying the DOM
1449
+ const ancestors = [];
1450
+ let anc = target.parentElement;
1451
+ while (anc && anc !== mainContent) {
1452
+ ancestors.push(anc);
1453
+ anc = anc.parentElement;
1454
+ }
1455
+ // Remove target element and its following siblings
1456
+ this.removeTrailingSiblings(target, true, debugRemovals);
1457
+ // Cascade upward: remove following siblings at each
1458
+ // ancestor level too. Everything after the boilerplate
1459
+ // in document order is non-content.
1460
+ for (const ancestor of ancestors) {
1461
+ this.removeTrailingSiblings(ancestor, false, debugRemovals);
1462
+ }
1463
+ return;
1464
+ }
1465
+ }
1466
+ }
1467
+ }
1468
+ /**
1469
+ * Remove an element's following siblings, and optionally the element itself.
1470
+ */
1471
+ removeTrailingSiblings(element, removeSelf, debugRemovals) {
1472
+ let sibling = element.nextElementSibling;
1473
+ while (sibling) {
1474
+ const next = sibling.nextElementSibling;
1475
+ if (this.debug && debugRemovals) {
1476
+ debugRemovals.push({
1477
+ step: 'removeByContentPattern',
1478
+ reason: 'trailing non-content',
1479
+ text: (0, utils_1.textPreview)(sibling)
1480
+ });
1481
+ }
1482
+ sibling.remove();
1483
+ sibling = next;
1484
+ }
1485
+ if (removeSelf) {
1486
+ if (this.debug && debugRemovals) {
1487
+ debugRemovals.push({
1488
+ step: 'removeByContentPattern',
1489
+ reason: 'boilerplate text',
1490
+ text: (0, utils_1.textPreview)(element)
1491
+ });
1492
+ }
1493
+ element.remove();
1494
+ }
1495
+ }
1190
1496
  }
1191
1497
  exports.Defuddle = Defuddle;
1192
1498
  //# sourceMappingURL=defuddle.js.map