@mz1999/defuddle 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. package/LICENSE +21 -0
  2. package/README.md +371 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +145 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/constants.d.ts +24 -0
  7. package/dist/constants.js +950 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/defuddle.d.ts +136 -0
  10. package/dist/defuddle.js +1816 -0
  11. package/dist/defuddle.js.map +1 -0
  12. package/dist/elements/callouts.d.ts +6 -0
  13. package/dist/elements/callouts.js +74 -0
  14. package/dist/elements/callouts.js.map +1 -0
  15. package/dist/elements/code.d.ts +5 -0
  16. package/dist/elements/code.js +346 -0
  17. package/dist/elements/code.js.map +1 -0
  18. package/dist/elements/footnotes.d.ts +5 -0
  19. package/dist/elements/footnotes.js +619 -0
  20. package/dist/elements/footnotes.js.map +1 -0
  21. package/dist/elements/headings.d.ts +11 -0
  22. package/dist/elements/headings.js +100 -0
  23. package/dist/elements/headings.js.map +1 -0
  24. package/dist/elements/images.d.ts +8 -0
  25. package/dist/elements/images.js +877 -0
  26. package/dist/elements/images.js.map +1 -0
  27. package/dist/elements/math.base.d.ts +9 -0
  28. package/dist/elements/math.base.js +195 -0
  29. package/dist/elements/math.base.js.map +1 -0
  30. package/dist/elements/math.core.d.ts +7 -0
  31. package/dist/elements/math.core.js +52 -0
  32. package/dist/elements/math.core.js.map +1 -0
  33. package/dist/elements/math.d.ts +2 -0
  34. package/dist/elements/math.full.d.ts +8 -0
  35. package/dist/elements/math.js +7 -0
  36. package/dist/elements/math.js.map +1 -0
  37. package/dist/extractor-registry.d.ts +16 -0
  38. package/dist/extractor-registry.js +140 -0
  39. package/dist/extractor-registry.js.map +1 -0
  40. package/dist/extractors/_base.d.ts +22 -0
  41. package/dist/extractors/_base.js +27 -0
  42. package/dist/extractors/_base.js.map +1 -0
  43. package/dist/extractors/_conversation.d.ts +9 -0
  44. package/dist/extractors/_conversation.js +78 -0
  45. package/dist/extractors/_conversation.js.map +1 -0
  46. package/dist/extractors/chatgpt.d.ts +14 -0
  47. package/dist/extractors/chatgpt.js +138 -0
  48. package/dist/extractors/chatgpt.js.map +1 -0
  49. package/dist/extractors/claude.d.ts +10 -0
  50. package/dist/extractors/claude.js +91 -0
  51. package/dist/extractors/claude.js.map +1 -0
  52. package/dist/extractors/gemini.d.ts +14 -0
  53. package/dist/extractors/gemini.js +111 -0
  54. package/dist/extractors/gemini.js.map +1 -0
  55. package/dist/extractors/github.d.ts +20 -0
  56. package/dist/extractors/github.js +251 -0
  57. package/dist/extractors/github.js.map +1 -0
  58. package/dist/extractors/grok.d.ts +15 -0
  59. package/dist/extractors/grok.js +142 -0
  60. package/dist/extractors/grok.js.map +1 -0
  61. package/dist/extractors/hackernews.d.ts +21 -0
  62. package/dist/extractors/hackernews.js +155 -0
  63. package/dist/extractors/hackernews.js.map +1 -0
  64. package/dist/extractors/reddit.d.ts +22 -0
  65. package/dist/extractors/reddit.js +197 -0
  66. package/dist/extractors/reddit.js.map +1 -0
  67. package/dist/extractors/twitter.d.ts +16 -0
  68. package/dist/extractors/twitter.js +204 -0
  69. package/dist/extractors/twitter.js.map +1 -0
  70. package/dist/extractors/x-article.d.ts +24 -0
  71. package/dist/extractors/x-article.js +267 -0
  72. package/dist/extractors/x-article.js.map +1 -0
  73. package/dist/extractors/x-oembed.d.ts +20 -0
  74. package/dist/extractors/x-oembed.js +350 -0
  75. package/dist/extractors/x-oembed.js.map +1 -0
  76. package/dist/extractors/youtube.d.ts +87 -0
  77. package/dist/extractors/youtube.js +869 -0
  78. package/dist/extractors/youtube.js.map +1 -0
  79. package/dist/fetch.d.ts +18 -0
  80. package/dist/fetch.js +265 -0
  81. package/dist/fetch.js.map +1 -0
  82. package/dist/index.d.ts +3 -0
  83. package/dist/index.full.d.ts +12 -0
  84. package/dist/index.full.js +1 -0
  85. package/dist/index.js +1 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/markdown.d.ts +30 -0
  88. package/dist/markdown.js +661 -0
  89. package/dist/markdown.js.map +1 -0
  90. package/dist/metadata.d.ts +25 -0
  91. package/dist/metadata.js +426 -0
  92. package/dist/metadata.js.map +1 -0
  93. package/dist/node.d.ts +19 -0
  94. package/dist/node.js +78 -0
  95. package/dist/node.js.map +1 -0
  96. package/dist/scoring.d.ts +31 -0
  97. package/dist/scoring.js +472 -0
  98. package/dist/scoring.js.map +1 -0
  99. package/dist/standardize.d.ts +2 -0
  100. package/dist/standardize.js +1101 -0
  101. package/dist/standardize.js.map +1 -0
  102. package/dist/types/extractors.d.ts +41 -0
  103. package/dist/types/extractors.js +3 -0
  104. package/dist/types/extractors.js.map +1 -0
  105. package/dist/types.d.ts +135 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/comments.d.ts +44 -0
  109. package/dist/utils/comments.js +103 -0
  110. package/dist/utils/comments.js.map +1 -0
  111. package/dist/utils/dom.d.ts +42 -0
  112. package/dist/utils/dom.js +104 -0
  113. package/dist/utils/dom.js.map +1 -0
  114. package/dist/utils/linkedom-compat.d.ts +5 -0
  115. package/dist/utils/linkedom-compat.js +23 -0
  116. package/dist/utils/linkedom-compat.js.map +1 -0
  117. package/dist/utils/transcript.d.ts +37 -0
  118. package/dist/utils/transcript.js +61 -0
  119. package/dist/utils/transcript.js.map +1 -0
  120. package/dist/utils.d.ts +13 -0
  121. package/dist/utils.js +98 -0
  122. package/dist/utils.js.map +1 -0
  123. package/package.json +107 -0
@@ -0,0 +1,1816 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.Defuddle = void 0;
4
+ const metadata_1 = require("./metadata");
5
+ const headings_1 = require("./elements/headings");
6
+ const extractor_registry_1 = require("./extractor-registry");
7
+ const constants_1 = require("./constants");
8
+ const standardize_1 = require("./standardize");
9
+ const footnotes_1 = require("./elements/footnotes");
10
+ const callouts_1 = require("./elements/callouts");
11
+ const scoring_1 = require("./scoring");
12
+ const utils_1 = require("./utils");
13
+ const dom_1 = require("./utils/dom");
14
+ /** Names of extractor variables that correspond to top-level DefuddleResponse fields. */
15
+ const STANDARD_VARIABLE_KEYS = new Set(['title', 'author', 'published', 'site', 'description', 'image', 'language']);
16
+ // Content pattern detection constants. NOTE(review): the code that consumes these is outside this view.
17
+ const STYLE_WIDTH_PATTERN = /width\s*:\s*(\d+)/; // captures the integer after "width:"; unanchored, so "max-width:" also matches
18
+ const STYLE_HEIGHT_PATTERN = /height\s*:\s*(\d+)/; // captures the integer after "height:"; unanchored, so "min-height:" also matches
19
+ const CONTENT_DATE_PATTERN = /(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2}/i; // month name or abbreviation followed by a 1-2 digit day, e.g. "March 5"
20
+ const CONTENT_READ_TIME_PATTERN = /\d+\s*min(?:ute)?s?\s+read\b/i; // e.g. "5 min read", "12 minutes read"
21
+ const BOILERPLATE_PATTERNS = [ // republication notices; each is anchored to the start of the text and case-insensitive
22
+ /^This (?:article|story|piece) (?:appeared|was published|originally appeared) in\b/i,
23
+ /^A version of this (?:article|story) (?:appeared|was published) in\b/i,
24
+ /^Originally (?:published|appeared) (?:in|on|at)\b/i,
25
+ ];
26
+ const METADATA_STRIP_PATTERNS = [ // all global; presumably applied in turn to strip date/read-time tokens from a string (verify against caller)
27
+ /\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b/gi, // month names, full or abbreviated
28
+ /\b\d+(?:st|nd|rd|th)?\b/g, // numbers with an optional ordinal suffix
29
+ /\bmin(?:ute)?s?\b/gi, // "min", "mins", "minute", "minutes"
30
+ /\bread\b/gi, // the word "read"
31
+ /[|·•—–\-,.\s]/g, // separator punctuation and whitespace
32
+ ];
33
+ class Defuddle {
34
+ /**
35
+ * Create a new Defuddle instance
36
+ * @param doc - The document to parse
37
+ * @param options - Options for parsing
38
+ */
39
+ constructor(doc, options = {}) {
40
+ this._schemaOrgData = undefined;
41
+ this._schemaOrgExtracted = false;
42
+ this.doc = doc;
43
+ this.options = options;
44
+ this.debug = options.debug || false;
45
+ }
46
+ /**
47
+ * Lazily extract and cache schema.org data. Must be called before
48
+ * parse() strips script tags from the document.
49
+ */
50
+ getSchemaOrgData() {
51
+ if (!this._schemaOrgExtracted) {
52
+ this._schemaOrgData = this._extractSchemaOrgData(this.doc);
53
+ this._schemaOrgExtracted = true;
54
+ }
55
+ return this._schemaOrgData;
56
+ }
57
+ /**
58
+ * Parse the document and extract its main content
59
+ */
60
+ parse() {
61
+ // Try first with default settings
62
+ let result = this.parseInternal();
63
+ // If result has very little content, try again without clutter removal
64
+ if (result.wordCount < 200) {
65
+ this._log('Initial parse returned very little content, trying again');
66
+ const retryResult = this.parseInternal({
67
+ removePartialSelectors: false
68
+ });
69
+ // Only use the retry if it produces significantly more content.
70
+ // A small increase likely means partial selectors correctly removed
71
+ // clutter (author blocks, related articles, etc.) from a short article.
72
+ // A large increase (2x+) suggests partial selectors were too aggressive.
73
+ if (retryResult.wordCount > result.wordCount * 2) {
74
+ this._log('Retry produced more content');
75
+ result = retryResult;
76
+ }
77
+ }
78
+ // If still very little content, the page may be an index/listing page
79
+ // or a page that reveals content at runtime from a hidden wrapper.
80
+ // Retry once with hidden-element removal disabled.
81
+ if (result.wordCount < 50) {
82
+ this._log('Still very little content, retrying without hidden-element removal');
83
+ const hiddenRetry = this.parseInternal({
84
+ removeHiddenElements: false
85
+ });
86
+ if (hiddenRetry.wordCount > result.wordCount * 2) {
87
+ this._log('Hidden-element retry produced more content');
88
+ result = hiddenRetry;
89
+ }
90
+ // Try targeting the largest hidden subtree directly to avoid body-level
91
+ // leftovers (e.g. FPS counters) when hidden content is the real article.
92
+ const hiddenSelector = this.findLargestHiddenContentSelector();
93
+ if (hiddenSelector) {
94
+ this._log('Retrying with hidden content selector:', hiddenSelector);
95
+ const hiddenSelectorRetry = this.parseInternal({
96
+ removeHiddenElements: false,
97
+ removePartialSelectors: false,
98
+ contentSelector: hiddenSelector
99
+ });
100
+ if (hiddenSelectorRetry.wordCount > result.wordCount ||
101
+ (hiddenSelectorRetry.wordCount > Math.max(20, result.wordCount * 0.7) &&
102
+ hiddenSelectorRetry.content.length < result.content.length)) {
103
+ this._log('Hidden-selector retry produced better focused content');
104
+ result = hiddenSelectorRetry;
105
+ }
106
+ }
107
+ }
108
+ // If still very little content, the page may be an index/listing page
109
+ // where card elements were scored as non-content or removed by partial
110
+ // selectors (e.g. "post-preview"). Retry with both disabled.
111
+ if (result.wordCount < 50) {
112
+ this._log('Still very little content, retrying without scoring/partial selectors (possible index page)');
113
+ const indexRetry = this.parseInternal({
114
+ removeLowScoring: false,
115
+ removePartialSelectors: false,
116
+ removeContentPatterns: false
117
+ });
118
+ if (indexRetry.wordCount > result.wordCount) {
119
+ this._log('Index page retry produced more content');
120
+ result = indexRetry;
121
+ }
122
+ }
123
+ // Strip dangerous elements from this.doc before any fallback paths
124
+ // that read from it (e.g. _findContentBySchemaText).
125
+ // This must happen after parseInternal, which needs script tags
126
+ // for schema.org extraction, site-specific extractors, and math.
127
+ this._stripUnsafeElements();
128
+ // If schema.org has a SocialMediaPosting with text content that is
129
+ // longer than what we extracted, the scorer likely picked the wrong
130
+ // element from a feed. Find the correct element in the DOM.
131
+ const schemaText = this._getSchemaText(result.schemaOrgData);
132
+ if (schemaText && this.countHtmlWords(schemaText) > result.wordCount) {
133
+ const contentHtml = this._findContentBySchemaText(schemaText);
134
+ if (contentHtml) {
135
+ this._log('Found DOM content matching schema.org text');
136
+ result.content = contentHtml;
137
+ result.wordCount = this.countHtmlWords(contentHtml);
138
+ }
139
+ else {
140
+ this._log('Using schema.org text as content (DOM element not found)');
141
+ result.content = schemaText;
142
+ result.wordCount = this.countHtmlWords(schemaText);
143
+ }
144
+ }
145
+ return result;
146
+ }
147
+ /**
148
+ * Extract text content from schema.org data (e.g. SocialMediaPosting, Article)
149
+ */
150
+ _getSchemaText(schemaOrgData, depth = 0) {
151
+ if (!schemaOrgData || depth > 10)
152
+ return '';
153
+ const items = Array.isArray(schemaOrgData) ? schemaOrgData : [schemaOrgData];
154
+ for (const item of items) {
155
+ // Recurse into nested arrays
156
+ if (Array.isArray(item)) {
157
+ const found = this._getSchemaText(item, depth + 1);
158
+ if (found)
159
+ return found;
160
+ continue;
161
+ }
162
+ if (item?.text && typeof item.text === 'string') {
163
+ return item.text;
164
+ }
165
+ if (item?.articleBody && typeof item.articleBody === 'string') {
166
+ return item.articleBody;
167
+ }
168
+ // Traverse @graph arrays (common in JSON-LD with multiple entities)
169
+ if (item?.['@graph'] && Array.isArray(item['@graph'])) {
170
+ const found = this._getSchemaText(item['@graph'], depth + 1);
171
+ if (found)
172
+ return found;
173
+ }
174
+ }
175
+ return '';
176
+ }
177
+ /**
178
+ * Remove dangerous elements and attributes from this.doc.
179
+ * Called after parseInternal so that extractors and schema extraction
180
+ * can still read script tags they depend on.
181
+ */
182
+ _stripUnsafeElements() {
183
+ const body = this.doc.body;
184
+ if (!body)
185
+ return;
186
+ // Remove dangerous elements. Iframes are kept — same-origin policy
187
+ // isolates them, and they're widely used for legitimate media embeds.
188
+ // Dangerous iframe attributes (srcdoc, javascript: src) are stripped
189
+ // in the attribute pass below. Math scripts are preserved for LaTeX
190
+ // content (matching the EXACT_SELECTORS approach).
191
+ const dangerousElements = body.querySelectorAll('script:not([type^="math/"]), style, noscript, frame, frameset, object, embed, applet, base');
192
+ for (const el of dangerousElements)
193
+ el.remove();
194
+ // Remove event handler attributes, dangerous URIs, and srcdoc
195
+ const allElements = body.querySelectorAll('*');
196
+ for (const el of allElements) {
197
+ for (const attr of Array.from(el.attributes)) {
198
+ const name = attr.name.toLowerCase();
199
+ if (name.startsWith('on')) {
200
+ el.removeAttribute(attr.name);
201
+ }
202
+ else if (name === 'srcdoc') {
203
+ el.removeAttribute(attr.name);
204
+ }
205
+ else if (['href', 'src', 'action', 'formaction', 'xlink:href'].includes(name)) {
206
+ if ((0, dom_1.isDangerousUrl)(attr.value)) {
207
+ el.removeAttribute(attr.name);
208
+ }
209
+ }
210
+ }
211
+ }
212
+ }
213
+ /**
214
+ * Find the smallest DOM element whose text contains the search phrase
215
+ * and whose word count is at least 80% of the expected count.
216
+ * Shared by _findSchemaContentElement and _findContentBySchemaText.
217
+ */
218
+ _findElementBySchemaText(root, schemaText) {
219
+ const firstPara = schemaText.split(/\n\s*\n/)[0]?.trim() || '';
220
+ const searchPhrase = firstPara.substring(0, 100).trim();
221
+ if (!searchPhrase)
222
+ return null;
223
+ const schemaWordCount = (0, utils_1.countWords)(schemaText);
224
+ let bestMatch = null;
225
+ let bestSize = Infinity;
226
+ const allElements = root.querySelectorAll('*');
227
+ for (const el of allElements) {
228
+ if (el === root)
229
+ continue;
230
+ const elText = el.textContent || '';
231
+ if (!elText.includes(searchPhrase))
232
+ continue;
233
+ const elWords = (0, utils_1.countWords)(elText);
234
+ if (elWords >= schemaWordCount * 0.8 && elWords < bestSize) {
235
+ bestSize = elWords;
236
+ bestMatch = el;
237
+ }
238
+ }
239
+ return bestMatch;
240
+ }
241
+ /**
242
+ * Find a DOM element whose text matches the schema.org text content.
243
+ * Used when the content scorer picked the wrong element from a feed page.
244
+ * Returns the element's inner HTML including sibling media (images, etc.)
245
+ */
246
+ _findContentBySchemaText(schemaText) {
247
+ const body = this.doc.body;
248
+ if (!body)
249
+ return '';
250
+ const bestMatch = this._findElementBySchemaText(body, schemaText);
251
+ if (!bestMatch)
252
+ return '';
253
+ // Read the largest sibling image src BEFORE resolveRelativeUrls
254
+ // can mangle comma-containing CDN URLs in srcset attributes
255
+ let imageSrc = '';
256
+ let imageAlt = '';
257
+ const parent = bestMatch.parentElement;
258
+ if (parent && parent !== body) {
259
+ const images = parent.querySelectorAll('img');
260
+ let largestImg = null;
261
+ let largestArea = 0;
262
+ for (const img of images) {
263
+ if (bestMatch.contains(img))
264
+ continue;
265
+ const w = parseInt(img.getAttribute('width') || '0', 10);
266
+ const h = parseInt(img.getAttribute('height') || '0', 10);
267
+ const area = w * h;
268
+ if (area > largestArea) {
269
+ largestArea = area;
270
+ largestImg = img;
271
+ }
272
+ }
273
+ if (largestImg) {
274
+ imageSrc = this._getLargestImageSrc(largestImg);
275
+ imageAlt = largestImg.getAttribute('alt') || '';
276
+ try {
277
+ const baseUrl = this.options.url || this.doc.URL;
278
+ if (baseUrl)
279
+ imageSrc = new URL(imageSrc, baseUrl).href;
280
+ }
281
+ catch { }
282
+ }
283
+ }
284
+ // Remove heading anchor links before serialization (e.g. <h2>Title<a href="#foo">#</a></h2>)
285
+ (0, headings_1.removeHeadingAnchors)(bestMatch);
286
+ // Now resolve URLs in the text content
287
+ this.resolveRelativeUrls(bestMatch);
288
+ let html = (0, dom_1.serializeHTML)(bestMatch);
289
+ if (imageSrc) {
290
+ const img = this.doc.createElement('img');
291
+ img.setAttribute('src', imageSrc);
292
+ img.setAttribute('alt', imageAlt);
293
+ html += img.outerHTML;
294
+ }
295
+ return html;
296
+ }
297
+ findLargestHiddenContentSelector() {
298
+ const body = this.doc.body;
299
+ if (!body)
300
+ return undefined;
301
+ const candidates = Array.from(body.querySelectorAll(constants_1.HIDDEN_EXACT_SKIP_SELECTOR)).filter(el => {
302
+ const className = el.getAttribute('class') || '';
303
+ return !className.includes('math');
304
+ });
305
+ let best = null;
306
+ let bestWords = 0;
307
+ for (const el of candidates) {
308
+ const words = (0, utils_1.countWords)(el.textContent || '');
309
+ if (words > bestWords) {
310
+ best = el;
311
+ bestWords = words;
312
+ }
313
+ }
314
+ if (!best || bestWords < 30)
315
+ return undefined;
316
+ return this.getElementSelector(best);
317
+ }
318
+ /**
319
+ * Get the largest available src from an img element,
320
+ * checking srcset for higher-resolution versions.
321
+ */
322
+ _getLargestImageSrc(img) {
323
+ const srcset = img.getAttribute('srcset') || '';
324
+ if (!srcset)
325
+ return img.getAttribute('src') || '';
326
+ // Parse srcset entries: each ends with a width descriptor (e.g. "424w")
327
+ // URLs may contain commas (e.g. Substack CDN), so split on width descriptors
328
+ const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?)w/g;
329
+ let bestUrl = '';
330
+ let bestWidth = 0;
331
+ let match;
332
+ let lastIndex = 0;
333
+ while ((match = entryPattern.exec(srcset)) !== null) {
334
+ let url = match[1].trim();
335
+ if (lastIndex > 0) {
336
+ url = url.replace(/^,\s*/, '');
337
+ }
338
+ lastIndex = entryPattern.lastIndex;
339
+ const width = parseFloat(match[2]);
340
+ if (url && width > bestWidth) {
341
+ bestWidth = width;
342
+ bestUrl = url;
343
+ }
344
+ }
345
+ let url = bestUrl || img.getAttribute('src') || '';
346
+ // Strip CDN width/crop constraints to get the full resolution image
347
+ // (e.g. Cloudinary-style params: ,w_852,c_limit → removed)
348
+ url = url.replace(/,w_\d+/g, '').replace(/,c_\w+/g, '');
349
+ return url;
350
+ }
351
+ /**
352
+ * Parse the document asynchronously. Checks for extractors that prefer
353
+ * async (e.g. YouTube transcripts) before sync, then falls back to async
354
+ * extractors if sync parse yields no content.
355
+ */
356
+ async parseAsync() {
357
+ if (this.options.useAsync !== false) {
358
+ const asyncResult = await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry));
359
+ if (asyncResult)
360
+ return asyncResult;
361
+ }
362
+ const result = this.parse();
363
+ if (result.wordCount > 0 || this.options.useAsync === false) {
364
+ return result;
365
+ }
366
+ return (await this.tryAsyncExtractor(extractor_registry_1.ExtractorRegistry.findAsyncExtractor.bind(extractor_registry_1.ExtractorRegistry))) ?? result;
367
+ }
368
+ /**
369
+ * Fetch only async variables (e.g. transcript) without re-parsing.
370
+ * Safe to call after parse() — uses cached schema.org data since
371
+ * parse() strips script tags from the document.
372
+ */
373
+ async fetchAsyncVariables() {
374
+ if (this.options.useAsync === false)
375
+ return null;
376
+ try {
377
+ const url = this.options.url || this.doc.URL;
378
+ const schemaOrgData = this.getSchemaOrgData();
379
+ const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
380
+ const extractor = extractor_registry_1.ExtractorRegistry.findPreferredAsyncExtractor(this.doc, url, schemaOrgData, extractorOpts);
381
+ if (extractor) {
382
+ const extracted = await extractor.extractAsync();
383
+ return this.getExtractorVariables(extracted.variables) || null;
384
+ }
385
+ }
386
+ catch (error) {
387
+ console.error('Defuddle', 'Error fetching async variables:', error);
388
+ }
389
+ return null;
390
+ }
391
+ async tryAsyncExtractor(finder) {
392
+ try {
393
+ const url = this.options.url || this.doc.URL;
394
+ const schemaOrgData = this.getSchemaOrgData();
395
+ const extractorOpts = { includeReplies: this.options.includeReplies ?? 'extractors', language: this.options.language };
396
+ const extractor = finder(this.doc, url, schemaOrgData, extractorOpts);
397
+ if (extractor) {
398
+ const startTime = Date.now();
399
+ const extracted = await extractor.extractAsync();
400
+ const pageMetaTags = this._collectMetaTags();
401
+ const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
402
+ return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
403
+ }
404
+ }
405
+ catch (error) {
406
+ console.error('Defuddle', 'Error in async extraction:', error);
407
+ }
408
+ return null;
409
+ }
410
+ /**
411
+ * Internal parse method that does the actual work
412
+ */
413
+ parseInternal(overrideOptions = {}) {
414
+ const startTime = Date.now();
415
+ // Guard against empty/broken documents (e.g. empty HTML, bot-blocked pages)
416
+ if (!this.doc.documentElement) {
417
+ const url = this.options.url || '';
418
+ return {
419
+ content: '',
420
+ title: '',
421
+ description: '',
422
+ domain: url ? new URL(url).hostname : '',
423
+ favicon: '',
424
+ image: '',
425
+ language: '',
426
+ parseTime: Date.now() - startTime,
427
+ published: '',
428
+ author: '',
429
+ site: '',
430
+ schemaOrgData: null,
431
+ wordCount: 0,
432
+ };
433
+ }
434
+ const options = {
435
+ removeExactSelectors: true,
436
+ removePartialSelectors: true,
437
+ removeHiddenElements: true,
438
+ removeLowScoring: true,
439
+ removeSmallImages: true,
440
+ removeContentPatterns: true,
441
+ standardize: true,
442
+ includeReplies: 'extractors',
443
+ ...this.options,
444
+ ...overrideOptions
445
+ };
446
+ const debugRemovals = [];
447
+ // Extract schema.org data (cached — must happen before _stripUnsafeElements removes scripts)
448
+ const schemaOrgData = this.getSchemaOrgData();
449
+ // Cache meta tags and metadata across retries
450
+ if (!this._metaTags) {
451
+ this._metaTags = this._collectMetaTags();
452
+ }
453
+ const pageMetaTags = this._metaTags;
454
+ if (!this._metadata) {
455
+ this._metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData, pageMetaTags);
456
+ }
457
+ const metadata = this._metadata;
458
+ if (options.removeImages) {
459
+ this.removeImages(this.doc);
460
+ }
461
+ try {
462
+ // Use site-specific extractor first, if there is one
463
+ const url = options.url || this.doc.URL;
464
+ const extractorOpts = {
465
+ includeReplies: options.includeReplies,
466
+ language: options.language,
467
+ };
468
+ const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData, extractorOpts);
469
+ if (extractor && extractor.canExtract()) {
470
+ const extracted = extractor.extract();
471
+ return this.buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags);
472
+ }
473
+ // Continue if there is no extractor...
474
+ // Evaluate mobile styles and sizes on original document (cached across retries)
475
+ if (!this._mobileStyles) {
476
+ this._mobileStyles = this._evaluateMediaQueries(this.doc);
477
+ }
478
+ const mobileStyles = this._mobileStyles;
479
+ // Find small images in original document (cached across retries)
480
+ if (!this._smallImages) {
481
+ this._smallImages = this.findSmallImages(this.doc);
482
+ }
483
+ const smallImages = this._smallImages;
484
+ // Clone document
485
+ const clone = this.doc.cloneNode(true);
486
+ // Merge adjacent text nodes that some DOM implementations (e.g. linkedom)
487
+ // create when parsing HTML entities like &#39;
488
+ clone.body?.normalize();
489
+ // Flatten shadow DOM content into the clone
490
+ this.flattenShadowRoots(this.doc, clone);
491
+ // Resolve React streaming SSR suspense boundaries
492
+ this.resolveStreamedContent(clone);
493
+ // Apply mobile styles to clone
494
+ this.applyMobileStyles(clone, mobileStyles);
495
+ // Find main content
496
+ let mainContent = null;
497
+ if (options.contentSelector) {
498
+ mainContent = clone.querySelector(options.contentSelector);
499
+ this._log('Using contentSelector:', options.contentSelector, mainContent ? 'found' : 'not found');
500
+ }
501
+ if (!mainContent) {
502
+ mainContent = this.findMainContent(clone);
503
+ }
504
+ // If we fell back to <body>, try using schema.org articleBody/text
505
+ // to find a more specific content element within the DOM.
506
+ if (mainContent && mainContent.tagName.toLowerCase() === 'body') {
507
+ const schemaText = this._getSchemaText(schemaOrgData);
508
+ if (schemaText) {
509
+ const schemaContent = this._findElementBySchemaText(clone.body, schemaText);
510
+ if (schemaContent) {
511
+ this._log('Found content element via schema.org text');
512
+ mainContent = schemaContent;
513
+ }
514
+ }
515
+ }
516
+ if (!mainContent) {
517
+ const fallbackContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
518
+ const endTime = Date.now();
519
+ return {
520
+ content: fallbackContent,
521
+ ...metadata,
522
+ wordCount: this.countHtmlWords(fallbackContent),
523
+ parseTime: Math.round(endTime - startTime),
524
+ metaTags: pageMetaTags
525
+ };
526
+ }
527
+ // Remove <wbr> elements — word break opportunity hints that carry no
528
+ // content but cause unwanted whitespace during standardization.
529
+ mainContent.querySelectorAll('wbr').forEach(el => el.remove());
530
+ // Standardize footnotes before cleanup (CSS sidenotes use display:none)
531
+ if (options.standardize) {
532
+ (0, footnotes_1.standardizeFootnotes)(mainContent);
533
+ (0, callouts_1.standardizeCallouts)(mainContent);
534
+ }
535
+ // Remove small images
536
+ if (options.removeSmallImages) {
537
+ this.removeSmallImages(clone, smallImages);
538
+ }
539
+ // Remove hidden elements using computed styles
540
+ if (options.removeHiddenElements) {
541
+ this.removeHiddenElements(clone, debugRemovals);
542
+ }
543
+ // Remove clutter using selectors — deterministic removal of known
544
+ // non-content elements (nav, footer, .sidebar, etc.) by class/id.
545
+ // Runs before scoring so the heuristic scorer sees a cleaner DOM.
546
+ if (options.removeExactSelectors || options.removePartialSelectors) {
547
+ this.removeBySelector(clone, options.removeExactSelectors, options.removePartialSelectors, mainContent, debugRemovals, options.removeHiddenElements === false);
548
+ }
549
+ // Remove non-content blocks by scoring — heuristic removal based
550
+ // on link density, text ratios, and navigation indicators.
551
+ if (options.removeLowScoring) {
552
+ scoring_1.ContentScorer.scoreAndRemove(clone, this.debug, debugRemovals, mainContent);
553
+ }
554
+ // Remove elements by content patterns (read time, boilerplate, article cards)
555
+ if (options.removeContentPatterns && mainContent) {
556
+ this.removeByContentPattern(mainContent, this.debug ? debugRemovals : undefined);
557
+ }
558
+ // Normalize the main content
559
+ if (options.standardize) {
560
+ (0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
561
+ }
562
+ // Resolve relative URLs to absolute
563
+ this.resolveRelativeUrls(mainContent);
564
+ const content = mainContent.outerHTML;
565
+ const endTime = Date.now();
566
+ const result = {
567
+ content,
568
+ ...metadata,
569
+ wordCount: this.countHtmlWords(content),
570
+ parseTime: Math.round(endTime - startTime),
571
+ metaTags: pageMetaTags
572
+ };
573
+ if (this.debug) {
574
+ result.debug = {
575
+ contentSelector: this.getElementSelector(mainContent),
576
+ removals: debugRemovals
577
+ };
578
+ }
579
+ return result;
580
+ }
581
+ catch (error) {
582
+ console.error('Defuddle', 'Error processing document:', error);
583
+ const errorContent = this.doc.body ? this.resolveContentUrls((0, dom_1.serializeHTML)(this.doc.body)) : '';
584
+ const endTime = Date.now();
585
+ return {
586
+ content: errorContent,
587
+ ...metadata,
588
+ wordCount: this.countHtmlWords(errorContent),
589
+ parseTime: Math.round(endTime - startTime),
590
+ metaTags: pageMetaTags
591
+ };
592
+ }
593
+ }
594
+ countHtmlWords(content) {
595
+ // Strip HTML tags and decode common entities without DOM parsing
596
+ const text = content
597
+ .replace(/<[^>]*>/g, ' ')
598
+ .replace(/&nbsp;/gi, ' ')
599
+ .replace(/&amp;/gi, '&')
600
+ .replace(/&lt;/gi, '<')
601
+ .replace(/&gt;/gi, '>')
602
+ .replace(/&quot;/gi, '"')
603
+ .replace(/&#\d+;/g, ' ')
604
+ .replace(/&\w+;/g, ' ');
605
+ return (0, utils_1.countWords)(text);
606
+ }
607
+ _log(...args) {
608
+ if (this.debug) {
609
+ console.log('Defuddle:', ...args);
610
+ }
611
+ }
612
+ _evaluateMediaQueries(doc) {
613
+ const mobileStyles = [];
614
+ const maxWidthRegex = /max-width[^:]*:\s*(\d+)/;
615
+ try {
616
+ if (!doc.styleSheets)
617
+ return mobileStyles;
618
+ // Get all styles, including inline styles
619
+ const sheets = Array.from(doc.styleSheets).filter(sheet => {
620
+ try {
621
+ // Access rules once to check validity
622
+ sheet.cssRules;
623
+ return true;
624
+ }
625
+ catch (e) {
626
+ // Expected error for cross-origin stylesheets or Node.js environment
627
+ if (e instanceof DOMException && e.name === 'SecurityError') {
628
+ return false;
629
+ }
630
+ return false;
631
+ }
632
+ });
633
+ // Process all sheets in a single pass
634
+ const mediaRules = sheets.flatMap(sheet => {
635
+ try {
636
+ // Check if we're in a browser environment where CSSMediaRule is available
637
+ if (typeof CSSMediaRule === 'undefined') {
638
+ return [];
639
+ }
640
+ return Array.from(sheet.cssRules)
641
+ .filter((rule) => rule instanceof CSSMediaRule &&
642
+ rule.conditionText.includes('max-width'));
643
+ }
644
+ catch (e) {
645
+ if (this.debug) {
646
+ console.warn('Defuddle: Failed to process stylesheet:', e);
647
+ }
648
+ return [];
649
+ }
650
+ });
651
+ // Process all media rules in a single pass
652
+ mediaRules.forEach(rule => {
653
+ const match = rule.conditionText.match(maxWidthRegex);
654
+ if (match) {
655
+ const maxWidth = parseInt(match[1]);
656
+ if (constants_1.MOBILE_WIDTH <= maxWidth) {
657
+ // Batch process all style rules
658
+ const styleRules = Array.from(rule.cssRules)
659
+ .filter((r) => r instanceof CSSStyleRule);
660
+ styleRules.forEach(cssRule => {
661
+ try {
662
+ mobileStyles.push({
663
+ selector: cssRule.selectorText,
664
+ styles: cssRule.style.cssText
665
+ });
666
+ }
667
+ catch (e) {
668
+ if (this.debug) {
669
+ console.warn('Defuddle: Failed to process CSS rule:', e);
670
+ }
671
+ }
672
+ });
673
+ }
674
+ }
675
+ });
676
+ }
677
+ catch (e) {
678
+ console.error('Defuddle: Error evaluating media queries:', e);
679
+ }
680
+ return mobileStyles;
681
+ }
682
+ applyMobileStyles(doc, mobileStyles) {
683
+ let appliedCount = 0;
684
+ mobileStyles.forEach(({ selector, styles }) => {
685
+ try {
686
+ const elements = doc.querySelectorAll(selector);
687
+ elements.forEach(element => {
688
+ element.setAttribute('style', (element.getAttribute('style') || '') + styles);
689
+ appliedCount++;
690
+ });
691
+ }
692
+ catch (e) {
693
+ console.error('Defuddle', 'Error applying styles for selector:', selector, e);
694
+ }
695
+ });
696
+ }
697
+ removeImages(doc) {
698
+ const images = doc.getElementsByTagName('img');
699
+ Array.from(images).forEach(image => {
700
+ image.remove();
701
+ });
702
+ }
703
+ removeHiddenElements(doc, debugRemovals) {
704
+ let count = 0;
705
+ const elementsToRemove = new Map();
706
+ // Check inline styles and CSS class-based hidden patterns.
707
+ const hiddenStylePattern = /(?:^|;\s*)(?:display\s*:\s*none|visibility\s*:\s*hidden|opacity\s*:\s*0)(?:\s*;|\s*$)/i;
708
+ // Only use getComputedStyle in browser environments where it's meaningful.
709
+ // In JSDOM/linkedom without stylesheets, it's extremely slow and unreliable.
710
+ const defaultView = doc.defaultView;
711
+ const isBrowser = typeof window !== 'undefined' && defaultView === window;
712
+ const allElements = doc.querySelectorAll('*');
713
+ for (const element of allElements) {
714
+ // Skip elements that contain math — sites like Wikipedia wrap MathML
715
+ // in display:none spans for accessibility (the visible version is an
716
+ // image/SVG fallback). We need to preserve these for math extraction.
717
+ if (element.querySelector('math, [data-mathml], .katex-mathml') ||
718
+ element.tagName.toLowerCase() === 'math') {
719
+ continue;
720
+ }
721
+ // Check inline style for hidden patterns
722
+ const style = element.getAttribute('style');
723
+ if (style && hiddenStylePattern.test(style)) {
724
+ const reason = style.includes('display') ? 'display:none' :
725
+ style.includes('visibility') ? 'visibility:hidden' : 'opacity:0';
726
+ elementsToRemove.set(element, reason);
727
+ count++;
728
+ continue;
729
+ }
730
+ // Use getComputedStyle only in real browser environments
731
+ if (isBrowser) {
732
+ try {
733
+ const computedStyle = defaultView.getComputedStyle(element);
734
+ let reason = '';
735
+ if (computedStyle.display === 'none')
736
+ reason = 'display:none';
737
+ else if (computedStyle.visibility === 'hidden')
738
+ reason = 'visibility:hidden';
739
+ else if (computedStyle.opacity === '0')
740
+ reason = 'opacity:0';
741
+ if (reason) {
742
+ elementsToRemove.set(element, reason);
743
+ count++;
744
+ continue;
745
+ }
746
+ }
747
+ catch (e) { }
748
+ }
749
+ // Detect CSS framework hidden utilities (e.g. Tailwind's "hidden",
750
+ // "sm:hidden", "not-machine:hidden")
751
+ const className = element.getAttribute('class') || '';
752
+ if (className) {
753
+ const tokens = className.split(/\s+/);
754
+ for (const token of tokens) {
755
+ if (token === 'hidden' || token.endsWith(':hidden') || token === 'invisible' || token.endsWith(':invisible')) {
756
+ elementsToRemove.set(element, `class:${token}`);
757
+ count++;
758
+ break;
759
+ }
760
+ }
761
+ }
762
+ }
763
+ // Batch remove all hidden elements
764
+ elementsToRemove.forEach((reason, el) => {
765
+ if (this.debug && debugRemovals) {
766
+ debugRemovals.push({
767
+ step: 'removeHiddenElements',
768
+ reason,
769
+ text: (0, utils_1.textPreview)(el)
770
+ });
771
+ }
772
+ el.remove();
773
+ });
774
+ this._log('Removed hidden elements:', count);
775
+ }
776
+ removeBySelector(doc, removeExact = true, removePartial = true, mainContent, debugRemovals, skipHiddenExactSelectors = false) {
777
+ const startTime = Date.now();
778
+ let exactSelectorCount = 0;
779
+ let partialSelectorCount = 0;
780
+ // Track all elements to be removed, with their match type
781
+ const elementsToRemove = new Map();
782
+ // First collect elements matching exact selectors
783
+ if (removeExact) {
784
+ const exactElements = doc.querySelectorAll(constants_1.EXACT_SELECTORS_JOINED);
785
+ exactElements.forEach(el => {
786
+ if (el?.parentNode) {
787
+ if (skipHiddenExactSelectors) {
788
+ const hiddenAncestor = el.closest(constants_1.HIDDEN_EXACT_SKIP_SELECTOR);
789
+ const role = (el.getAttribute('role') || '').toLowerCase();
790
+ if (el.matches(constants_1.HIDDEN_EXACT_SELECTOR) ||
791
+ (hiddenAncestor && role === 'dialog')) {
792
+ return;
793
+ }
794
+ }
795
+ // Skip elements inside code blocks (e.g. syntax highlighting spans)
796
+ if (el.closest('pre, code')) {
797
+ return;
798
+ }
799
+ elementsToRemove.set(el, { type: 'exact' });
800
+ exactSelectorCount++;
801
+ }
802
+ });
803
+ }
804
+ if (removePartial) {
805
+ // Pre-compile individual regexes for debug pattern identification only
806
+ const individualRegexes = this.debug
807
+ ? constants_1.PARTIAL_SELECTORS.map(p => ({ pattern: p, regex: new RegExp(p, 'i') }))
808
+ : null;
809
+ // Use pre-built attribute selector for elements we care about
810
+ const allElements = doc.querySelectorAll(constants_1.TEST_ATTRIBUTES_SELECTOR);
811
+ // Process elements for partial matches
812
+ allElements.forEach(el => {
813
+ // Skip if already marked for removal
814
+ if (elementsToRemove.has(el)) {
815
+ return;
816
+ }
817
+ // Skip code elements and elements containing code blocks
818
+ // where class names indicate language/syntax, not page structure
819
+ const tag = el.tagName;
820
+ if (tag === 'CODE' || tag === 'PRE' || el.querySelector('pre') || el.closest('code, pre')) {
821
+ return;
822
+ }
823
+ // Get all relevant attributes and combine into a single string
824
+ const attrs = constants_1.TEST_ATTRIBUTES.map(attr => {
825
+ if (attr === 'class') {
826
+ return (0, dom_1.getClassName)(el);
827
+ }
828
+ if (attr === 'id') {
829
+ return el.id || '';
830
+ }
831
+ return el.getAttribute(attr) || '';
832
+ }).join(' ').toLowerCase();
833
+ // Skip if no attributes to check
834
+ if (!attrs.trim()) {
835
+ return;
836
+ }
837
+ // Check for partial match using single regex test
838
+ if (constants_1.PARTIAL_SELECTORS_REGEX.test(attrs)) {
839
+ const matchedPattern = individualRegexes
840
+ ? individualRegexes.find(r => r.regex.test(attrs))?.pattern
841
+ : undefined;
842
+ elementsToRemove.set(el, { type: 'partial', selector: matchedPattern });
843
+ partialSelectorCount++;
844
+ }
845
+ });
846
+ }
847
+ // Remove all collected elements in a single pass
848
+ // Skip elements that are ancestors of mainContent to avoid disconnecting it
849
+ // Skip footnote list containers, their parents, and immediate children
850
+ // Skip anchor links inside headings - the heading transform handles these
851
+ elementsToRemove.forEach(({ type, selector }, el) => {
852
+ if (mainContent && el.contains(mainContent)) {
853
+ return;
854
+ }
855
+ if (el.tagName === 'A' && el.closest('h1, h2, h3, h4, h5, h6')) {
856
+ return;
857
+ }
858
+ try {
859
+ if (el.matches(constants_1.FOOTNOTE_LIST_SELECTORS) || el.querySelector(constants_1.FOOTNOTE_LIST_SELECTORS)) {
860
+ return;
861
+ }
862
+ // Protect immediate children of footnote containers (e.g. wikidot div.footnote-footer)
863
+ const parent = el.parentElement;
864
+ if (parent && parent.matches(constants_1.FOOTNOTE_LIST_SELECTORS)) {
865
+ return;
866
+ }
867
+ }
868
+ catch (e) { }
869
+ if (this.debug && debugRemovals) {
870
+ debugRemovals.push({
871
+ step: 'removeBySelector',
872
+ selector: type === 'exact' ? 'exact' : selector,
873
+ reason: type === 'exact' ? 'exact selector match' : `partial match: ${selector}`,
874
+ text: (0, utils_1.textPreview)(el)
875
+ });
876
+ }
877
+ el.remove();
878
+ });
879
+ const endTime = Date.now();
880
+ this._log('Removed clutter elements:', {
881
+ exactSelectors: exactSelectorCount,
882
+ partialSelectors: partialSelectorCount,
883
+ total: elementsToRemove.size,
884
+ processingTime: `${(endTime - startTime).toFixed(2)}ms`
885
+ });
886
+ }
887
+ // Find small IMG and SVG elements
888
+ findSmallImages(doc) {
889
+ const MIN_DIMENSION = 33;
890
+ const smallImages = new Set();
891
+ let processedCount = 0;
892
+ const elements = doc.querySelectorAll('img, svg');
893
+ const defaultView = doc.defaultView;
894
+ const isBrowser = typeof window !== 'undefined' && defaultView === window;
895
+ for (const element of elements) {
896
+ const attrWidth = parseInt(element.getAttribute('width') || '0');
897
+ const attrHeight = parseInt(element.getAttribute('height') || '0');
898
+ // Check inline style dimensions
899
+ const style = element.getAttribute('style') || '';
900
+ const styleWidth = parseInt(style.match(STYLE_WIDTH_PATTERN)?.[1] || '0');
901
+ const styleHeight = parseInt(style.match(STYLE_HEIGHT_PATTERN)?.[1] || '0');
902
+ // Use getComputedStyle and getBoundingClientRect only in browser
903
+ let computedWidth = 0, computedHeight = 0;
904
+ if (isBrowser) {
905
+ try {
906
+ const cs = defaultView.getComputedStyle(element);
907
+ computedWidth = parseInt(cs.width) || 0;
908
+ computedHeight = parseInt(cs.height) || 0;
909
+ }
910
+ catch (e) { }
911
+ try {
912
+ const rect = element.getBoundingClientRect();
913
+ if (rect.width > 0)
914
+ computedWidth = computedWidth || rect.width;
915
+ if (rect.height > 0)
916
+ computedHeight = computedHeight || rect.height;
917
+ }
918
+ catch (e) { }
919
+ }
920
+ const widths = [attrWidth, styleWidth, computedWidth].filter(d => d > 0);
921
+ const heights = [attrHeight, styleHeight, computedHeight].filter(d => d > 0);
922
+ if (widths.length > 0 && heights.length > 0) {
923
+ const effectiveWidth = Math.min(...widths);
924
+ const effectiveHeight = Math.min(...heights);
925
+ if (effectiveWidth < MIN_DIMENSION || effectiveHeight < MIN_DIMENSION) {
926
+ const identifier = this.getElementIdentifier(element);
927
+ if (identifier) {
928
+ smallImages.add(identifier);
929
+ processedCount++;
930
+ }
931
+ }
932
+ }
933
+ }
934
+ this._log('Found small elements:', processedCount);
935
+ return smallImages;
936
+ }
937
+ removeSmallImages(doc, smallImages) {
938
+ let removedCount = 0;
939
+ ['img', 'svg'].forEach(tag => {
940
+ const elements = doc.getElementsByTagName(tag);
941
+ Array.from(elements).forEach(element => {
942
+ const identifier = this.getElementIdentifier(element);
943
+ if (identifier && smallImages.has(identifier)) {
944
+ element.remove();
945
+ removedCount++;
946
+ }
947
+ });
948
+ });
949
+ this._log('Removed small elements:', removedCount);
950
+ }
951
+ getElementIdentifier(element) {
952
+ // Try to create a unique identifier using various attributes
953
+ if (element.tagName.toLowerCase() === 'img') {
954
+ // For lazy-loaded images, use data-src as identifier if available
955
+ const dataSrc = element.getAttribute('data-src');
956
+ if (dataSrc)
957
+ return `src:${dataSrc}`;
958
+ const src = element.getAttribute('src') || '';
959
+ const srcset = element.getAttribute('srcset') || '';
960
+ const dataSrcset = element.getAttribute('data-srcset');
961
+ if (src)
962
+ return `src:${src}`;
963
+ if (srcset)
964
+ return `srcset:${srcset}`;
965
+ if (dataSrcset)
966
+ return `srcset:${dataSrcset}`;
967
+ }
968
+ const id = element.id || '';
969
+ const className = (0, dom_1.getClassName)(element);
970
+ const viewBox = element.tagName.toLowerCase() === 'svg' ? element.getAttribute('viewBox') || '' : '';
971
+ if (id)
972
+ return `id:${id}`;
973
+ if (viewBox)
974
+ return `viewBox:${viewBox}`;
975
+ if (className)
976
+ return `class:${className}`;
977
+ return null;
978
+ }
979
+ findMainContent(doc) {
980
+ // Find all potential content containers
981
+ const candidates = [];
982
+ constants_1.ENTRY_POINT_ELEMENTS.forEach((selector, index) => {
983
+ const elements = doc.querySelectorAll(selector);
984
+ elements.forEach(element => {
985
+ // Base score from selector priority (earlier = higher)
986
+ let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) * 40;
987
+ // Add score based on content analysis
988
+ score += scoring_1.ContentScorer.scoreElement(element);
989
+ candidates.push({ element, score, selectorIndex: index });
990
+ });
991
+ });
992
+ if (candidates.length === 0) {
993
+ // Fall back to scoring block elements
994
+ return this.findContentByScoring(doc);
995
+ }
996
+ // Sort by score descending
997
+ candidates.sort((a, b) => b.score - a.score);
998
+ if (this.debug) {
999
+ this._log('Content candidates:', candidates.map(c => ({
1000
+ element: c.element.tagName,
1001
+ selector: this.getElementSelector(c.element),
1002
+ score: c.score
1003
+ })));
1004
+ }
1005
+ // If we only matched body, try table-based detection
1006
+ if (candidates.length === 1 && candidates[0].element.tagName.toLowerCase() === 'body') {
1007
+ const tableContent = this.findTableBasedContent(doc);
1008
+ if (tableContent) {
1009
+ return tableContent;
1010
+ }
1011
+ }
1012
+ // If the top candidate contains a child candidate that matched a
1013
+ // higher-priority selector, prefer the most specific (deepest) child.
1014
+ // This prevents e.g. <main> from winning over a contained <article>
1015
+ // just because sibling noise inflates the parent's content score.
1016
+ // Only prefer the child if it has meaningful content (>50 words),
1017
+ // otherwise it may be an empty card element (e.g. related article cards).
1018
+ // Skip this when the parent contains multiple children matching the
1019
+ // same selector — that indicates a listing/portfolio page where the
1020
+ // parent is the real content container.
1021
+ const top = candidates[0];
1022
+ let best = top;
1023
+ for (let i = 1; i < candidates.length; i++) {
1024
+ const child = candidates[i];
1025
+ const childWords = (0, utils_1.countWords)(child.element.textContent || '');
1026
+ if (child.selectorIndex < best.selectorIndex && best.element.contains(child.element) && childWords > 50) {
1027
+ // Count how many candidates share this selector index inside
1028
+ // the top element. Use top (not best) as the stable reference
1029
+ // so the check isn't affected by earlier iterations.
1030
+ let siblingsAtIndex = 0;
1031
+ for (const c of candidates) {
1032
+ if (c.selectorIndex === child.selectorIndex && top.element.contains(c.element)) {
1033
+ if (++siblingsAtIndex > 1)
1034
+ break;
1035
+ }
1036
+ }
1037
+ if (siblingsAtIndex > 1) {
1038
+ // Multiple articles/cards inside the parent — it's a listing page
1039
+ continue;
1040
+ }
1041
+ best = child;
1042
+ }
1043
+ }
1044
+ if (best !== top) {
1045
+ return best.element;
1046
+ }
1047
+ return top.element;
1048
+ }
1049
+ findTableBasedContent(doc) {
1050
+ // First check if this looks like an old-style table-based layout
1051
+ const tables = Array.from(doc.getElementsByTagName('table'));
1052
+ const hasTableLayout = tables.some(table => {
1053
+ const width = parseInt(table.getAttribute('width') || '0');
1054
+ const style = this.getComputedStyle(table);
1055
+ return width > 400 ||
1056
+ (style?.width?.includes('px') && parseInt(style.width) > 400) ||
1057
+ table.getAttribute('align') === 'center' ||
1058
+ (table.className || '').toLowerCase().includes('content') ||
1059
+ (table.className || '').toLowerCase().includes('article');
1060
+ });
1061
+ if (!hasTableLayout) {
1062
+ return null; // Don't try table-based extraction for modern layouts
1063
+ }
1064
+ const cells = Array.from(doc.getElementsByTagName('td'));
1065
+ return scoring_1.ContentScorer.findBestElement(cells);
1066
+ }
1067
+ findContentByScoring(doc) {
1068
+ const candidates = [];
1069
+ doc.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR).forEach((element) => {
1070
+ const score = scoring_1.ContentScorer.scoreElement(element);
1071
+ if (score > 0) {
1072
+ candidates.push({ score, element });
1073
+ }
1074
+ });
1075
+ return candidates.length > 0 ? candidates.sort((a, b) => b.score - a.score)[0].element : null;
1076
+ }
1077
+ getElementSelector(element) {
1078
+ const parts = [];
1079
+ let current = element;
1080
+ while (current && current !== this.doc.documentElement) {
1081
+ let selector = current.tagName.toLowerCase();
1082
+ if (current.id) {
1083
+ selector += '#' + current.id;
1084
+ }
1085
+ else if ((0, dom_1.getClassName)(current)) {
1086
+ selector += '.' + (0, dom_1.getClassName)(current).trim().split(/\s+/).join('.');
1087
+ }
1088
+ parts.unshift(selector);
1089
+ current = current.parentElement;
1090
+ }
1091
+ return parts.join(' > ');
1092
+ }
1093
+ getComputedStyle(element) {
1094
+ return (0, utils_1.getComputedStyle)(element);
1095
+ }
1096
+ /**
1097
+ * Resolve relative URLs to absolute within a DOM element
1098
+ */
1099
+ resolveRelativeUrls(element) {
1100
+ const docUrl = this.options.url || this.doc.URL;
1101
+ if (!docUrl)
1102
+ return;
1103
+ // Respect <base href> for relative URL resolution, matching browser behavior
1104
+ let baseUrl = docUrl;
1105
+ const baseEl = this.doc.querySelector('base[href]');
1106
+ if (baseEl) {
1107
+ const baseHref = baseEl.getAttribute('href');
1108
+ if (baseHref) {
1109
+ try {
1110
+ baseUrl = new URL(baseHref, docUrl).href;
1111
+ }
1112
+ catch {
1113
+ // Invalid base href, fall back to document URL
1114
+ }
1115
+ }
1116
+ }
1117
+ const resolve = (url) => {
1118
+ // Some pages ship escaped quoted hrefs like \"mailto:...\" in server templates.
1119
+ // Normalize these before URL resolution.
1120
+ const normalized = url
1121
+ .trim()
1122
+ .replace(/^\\?["']+/, '')
1123
+ .replace(/\\?["']+$/, '');
1124
+ try {
1125
+ return new URL(normalized, baseUrl).href;
1126
+ }
1127
+ catch {
1128
+ return normalized || url;
1129
+ }
1130
+ };
1131
+ element.querySelectorAll('[href]').forEach(el => {
1132
+ const href = el.getAttribute('href');
1133
+ if (href)
1134
+ el.setAttribute('href', resolve(href));
1135
+ });
1136
+ element.querySelectorAll('[src]').forEach(el => {
1137
+ const src = el.getAttribute('src');
1138
+ if (src)
1139
+ el.setAttribute('src', resolve(src));
1140
+ });
1141
+ element.querySelectorAll('[srcset]').forEach(el => {
1142
+ const srcset = el.getAttribute('srcset');
1143
+ if (srcset) {
1144
+ // Parse srcset using width/density descriptors as delimiters,
1145
+ // not commas — URLs may contain commas (e.g. CDN transform params)
1146
+ const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g;
1147
+ const entries = [];
1148
+ let match;
1149
+ let lastIdx = 0;
1150
+ while ((match = entryPattern.exec(srcset)) !== null) {
1151
+ let url = match[1].trim();
1152
+ if (lastIdx > 0) {
1153
+ url = url.replace(/^,\s*/, '');
1154
+ }
1155
+ lastIdx = entryPattern.lastIndex;
1156
+ entries.push(`${resolve(url)} ${match[2]}`);
1157
+ }
1158
+ if (entries.length > 0) {
1159
+ el.setAttribute('srcset', entries.join(', '));
1160
+ }
1161
+ else {
1162
+ // Fallback: simple comma split for srcsets without descriptors
1163
+ const resolved = srcset.split(',').map(entry => {
1164
+ const parts = entry.trim().split(/\s+/);
1165
+ if (parts[0])
1166
+ parts[0] = resolve(parts[0]);
1167
+ return parts.join(' ');
1168
+ }).join(', ');
1169
+ el.setAttribute('srcset', resolved);
1170
+ }
1171
+ }
1172
+ });
1173
+ element.querySelectorAll('[poster]').forEach(el => {
1174
+ const poster = el.getAttribute('poster');
1175
+ if (poster)
1176
+ el.setAttribute('poster', resolve(poster));
1177
+ });
1178
+ }
1179
+ /**
1180
+ * Flatten shadow DOM content into a cloned document.
1181
+ * Walks both trees in parallel so positional correspondence is exact.
1182
+ */
1183
+ flattenShadowRoots(original, clone) {
1184
+ if (!original.body || !clone.body)
1185
+ return;
1186
+ const origElements = Array.from(original.body.querySelectorAll('*'));
1187
+ // Find the first element with a shadow root (also serves as the hasShadowRoots check)
1188
+ const firstShadow = origElements.find(el => el.shadowRoot);
1189
+ if (!firstShadow)
1190
+ return;
1191
+ const cloneElements = Array.from(clone.body.querySelectorAll('*'));
1192
+ // Check if we can directly read shadow DOM content (main world / Node.js).
1193
+ // In content script isolated worlds, shadowRoot exists but content is empty.
1194
+ const canReadShadow = (firstShadow.shadowRoot?.childNodes?.length ?? 0) > 0;
1195
+ if (canReadShadow) {
1196
+ // Direct traversal works (main world / Node.js)
1197
+ for (let i = origElements.length - 1; i >= 0; i--) {
1198
+ const origEl = origElements[i];
1199
+ if (!origEl.shadowRoot)
1200
+ continue;
1201
+ const cloneEl = cloneElements[i];
1202
+ if (!cloneEl)
1203
+ continue;
1204
+ const shadowHtml = origEl.shadowRoot.innerHTML;
1205
+ if (shadowHtml.length > 0) {
1206
+ this.replaceShadowHost(cloneEl, shadowHtml, clone);
1207
+ }
1208
+ }
1209
+ }
1210
+ else {
1211
+ // Content script isolated world — read data-defuddle-shadow attributes
1212
+ // stamped by an external main-world script.
1213
+ const shadowData = [];
1214
+ for (let i = 0; i < origElements.length; i++) {
1215
+ const origEl = origElements[i];
1216
+ const shadowHtml = origEl.getAttribute('data-defuddle-shadow');
1217
+ if (!shadowHtml)
1218
+ continue;
1219
+ const cloneEl = cloneElements[i];
1220
+ if (!cloneEl)
1221
+ continue;
1222
+ shadowData.push({ cloneEl, html: shadowHtml });
1223
+ // Clean up temporary attributes from both original and clone
1224
+ origEl.removeAttribute('data-defuddle-shadow');
1225
+ cloneEl.removeAttribute('data-defuddle-shadow');
1226
+ }
1227
+ for (const { cloneEl, html } of shadowData) {
1228
+ this.replaceShadowHost(cloneEl, html, clone);
1229
+ }
1230
+ }
1231
+ }
1232
+ /**
1233
+ * Resolve React streaming SSR suspense boundaries.
1234
+ * React's streaming SSR places content in hidden divs (id="S:0") and
1235
+ * template placeholders (id="B:0") with $RC scripts to swap them.
1236
+ * Since we don't execute scripts, we perform the swap manually.
1237
+ */
1238
+ resolveStreamedContent(doc) {
1239
+ // Find $RC("B:X","S:X") calls in inline scripts
1240
+ const scripts = doc.querySelectorAll('script');
1241
+ const swaps = [];
1242
+ const rcPattern = /\$RC\("(B:\d+)","(S:\d+)"\)/g;
1243
+ for (const script of scripts) {
1244
+ const text = script.textContent || '';
1245
+ if (!text.includes('$RC('))
1246
+ continue;
1247
+ rcPattern.lastIndex = 0;
1248
+ let match;
1249
+ while ((match = rcPattern.exec(text)) !== null) {
1250
+ swaps.push({ templateId: match[1], contentId: match[2] });
1251
+ }
1252
+ }
1253
+ if (swaps.length === 0)
1254
+ return;
1255
+ let swapCount = 0;
1256
+ for (const { templateId, contentId } of swaps) {
1257
+ const template = doc.getElementById(templateId);
1258
+ const content = doc.getElementById(contentId);
1259
+ if (!template || !content)
1260
+ continue;
1261
+ const parent = template.parentNode;
1262
+ if (!parent)
1263
+ continue;
1264
+ // Remove the fallback/skeleton content after the template
1265
+ // until the <!--/$--> comment marker
1266
+ let next = template.nextSibling;
1267
+ let foundMarker = false;
1268
+ while (next) {
1269
+ const following = next.nextSibling;
1270
+ if (next.nodeType === 8 && next.data === '/$') {
1271
+ next.remove();
1272
+ foundMarker = true;
1273
+ break;
1274
+ }
1275
+ next.remove();
1276
+ next = following;
1277
+ }
1278
+ // Skip swap if marker wasn't found — malformed streaming output
1279
+ if (!foundMarker)
1280
+ continue;
1281
+ // Insert content children before the template position
1282
+ while (content.firstChild) {
1283
+ parent.insertBefore(content.firstChild, template);
1284
+ }
1285
+ // Clean up the template and hidden div
1286
+ template.remove();
1287
+ content.remove();
1288
+ swapCount++;
1289
+ }
1290
+ if (swapCount > 0) {
1291
+ this._log('Resolved streamed content:', swapCount, 'suspense boundaries');
1292
+ }
1293
+ }
1294
+ /**
1295
+ * Replace a shadow DOM host element with a div containing its shadow content.
1296
+ * Custom elements (tag names with hyphens) would re-initialize when inserted
1297
+ * into a live DOM, recreating their shadow roots and hiding the content.
1298
+ */
1299
+ replaceShadowHost(el, shadowHtml, doc) {
1300
+ const fragment = (0, dom_1.parseHTML)(doc, shadowHtml);
1301
+ if (el.tagName.includes('-')) {
1302
+ // Custom element — replace with a div to prevent re-initialization
1303
+ const div = doc.createElement('div');
1304
+ div.appendChild(fragment);
1305
+ el.parentNode?.replaceChild(div, el);
1306
+ }
1307
+ else {
1308
+ el.textContent = '';
1309
+ el.appendChild(fragment);
1310
+ }
1311
+ }
1312
+ /**
1313
+ * Resolve relative URLs in an HTML string
1314
+ */
1315
+ resolveContentUrls(html) {
1316
+ const baseUrl = this.options.url || this.doc.URL;
1317
+ if (!baseUrl)
1318
+ return html;
1319
+ const container = this.doc.createElement('div');
1320
+ container.appendChild((0, dom_1.parseHTML)(this.doc, html));
1321
+ this.resolveRelativeUrls(container);
1322
+ return (0, dom_1.serializeHTML)(container);
1323
+ }
1324
+ _extractSchemaOrgData(doc) {
1325
+ const schemaScripts = doc.querySelectorAll('script[type="application/ld+json"]');
1326
+ const rawSchemaItems = [];
1327
+ schemaScripts.forEach(script => {
1328
+ let jsonContent = script.textContent || '';
1329
+ try {
1330
+ jsonContent = jsonContent
1331
+ .replace(/\/\*[\s\S]*?\*\/|^\s*\/\/.*$/gm, '')
1332
+ .replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, '$1')
1333
+ .replace(/^\s*(\*\/|\/\*)\s*|\s*(\*\/|\/\*)\s*$/g, '')
1334
+ .trim();
1335
+ const jsonData = JSON.parse(jsonContent);
1336
+ if (jsonData['@graph'] && Array.isArray(jsonData['@graph'])) {
1337
+ rawSchemaItems.push(...jsonData['@graph']);
1338
+ }
1339
+ else {
1340
+ rawSchemaItems.push(jsonData);
1341
+ }
1342
+ }
1343
+ catch (error) {
1344
+ console.error('Defuddle: Error parsing schema.org data:', error);
1345
+ if (this.debug) {
1346
+ console.error('Defuddle: Problematic JSON content:', jsonContent);
1347
+ }
1348
+ }
1349
+ });
1350
+ const decodeStringsInObject = (item) => {
1351
+ if (typeof item === 'string') {
1352
+ return this._decodeHTMLEntities(item);
1353
+ }
1354
+ else if (Array.isArray(item)) {
1355
+ return item.map(decodeStringsInObject);
1356
+ }
1357
+ else if (typeof item === 'object' && item !== null) {
1358
+ const newItem = {};
1359
+ for (const key in item) {
1360
+ if (Object.prototype.hasOwnProperty.call(item, key)) {
1361
+ newItem[key] = decodeStringsInObject(item[key]);
1362
+ }
1363
+ }
1364
+ return newItem;
1365
+ }
1366
+ return item;
1367
+ };
1368
+ return rawSchemaItems.map(decodeStringsInObject);
1369
+ }
1370
+ _collectMetaTags() {
1371
+ const pageMetaTags = [];
1372
+ this.doc.querySelectorAll('meta').forEach(meta => {
1373
+ const name = meta.getAttribute('name');
1374
+ const property = meta.getAttribute('property');
1375
+ let content = meta.getAttribute('content');
1376
+ if (content) {
1377
+ pageMetaTags.push({ name, property, content: this._decodeHTMLEntities(content) });
1378
+ }
1379
+ });
1380
+ return pageMetaTags;
1381
+ }
1382
+ _decodeHTMLEntities(text) {
1383
+ return (0, dom_1.decodeHTMLEntities)(this.doc, text);
1384
+ }
1385
+ /**
1386
+ * Build a DefuddleResponse from an extractor result with metadata
1387
+ */
1388
+ buildExtractorResponse(extracted, metadata, startTime, extractor, pageMetaTags) {
1389
+ const contentHtml = this.resolveContentUrls(extracted.contentHtml);
1390
+ const variables = this.getExtractorVariables(extracted.variables);
1391
+ return {
1392
+ content: contentHtml,
1393
+ title: extracted.variables?.title || metadata.title,
1394
+ description: metadata.description,
1395
+ domain: metadata.domain,
1396
+ favicon: metadata.favicon,
1397
+ image: metadata.image,
1398
+ language: extracted.variables?.language || metadata.language,
1399
+ published: extracted.variables?.published || metadata.published,
1400
+ author: extracted.variables?.author || metadata.author,
1401
+ site: extracted.variables?.site || metadata.site,
1402
+ schemaOrgData: metadata.schemaOrgData,
1403
+ wordCount: this.countHtmlWords(extracted.contentHtml),
1404
+ parseTime: Math.round(Date.now() - startTime),
1405
+ extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase(),
1406
+ metaTags: pageMetaTags,
1407
+ ...(variables ? { variables } : {}),
1408
+ };
1409
+ }
1410
+ /**
1411
+ * Filter extractor variables to only include custom ones
1412
+ * (exclude standard fields that are already mapped to top-level properties)
1413
+ */
1414
+ getExtractorVariables(variables) {
1415
+ if (!variables)
1416
+ return undefined;
1417
+ const custom = {};
1418
+ let hasCustom = false;
1419
+ for (const [key, value] of Object.entries(variables)) {
1420
+ if (!STANDARD_VARIABLE_KEYS.has(key)) {
1421
+ custom[key] = value;
1422
+ hasCustom = true;
1423
+ }
1424
+ }
1425
+ return hasCustom ? custom : undefined;
1426
+ }
1427
+ /**
1428
+ * Content-based pattern removal for elements that can't be detected by
1429
+ * CSS selectors (e.g. Tailwind/CSS-in-JS sites with non-semantic class names).
1430
+ */
1431
+ removeByContentPattern(mainContent, debugRemovals) {
1432
+ // Remove read time metadata (e.g. "Mar 4th 2026 | 3 min read")
1433
+ // Only removes leaf elements whose text is PURELY date + read time,
1434
+ // not mixed with other meaningful content like tag names.
1435
+ const candidates = Array.from(mainContent.querySelectorAll('p, span, div, time'));
1436
+ for (const el of candidates) {
1437
+ if (!el.parentNode)
1438
+ continue;
1439
+ if (el.closest('pre') || el.closest('code'))
1440
+ continue;
1441
+ const text = el.textContent?.trim() || '';
1442
+ const words = (0, utils_1.countWords)(text);
1443
+ // Match date + read time in short elements
1444
+ if (words <= 15 && CONTENT_DATE_PATTERN.test(text) && CONTENT_READ_TIME_PATTERN.test(text)) {
1445
+ // Ensure this is a leaf-ish element, not a large container
1446
+ if (el.querySelectorAll('p, div, section, article').length === 0) {
1447
+ // Verify the text is ONLY date + read time metadata
1448
+ // by stripping all date/time words and checking nothing remains
1449
+ let cleaned = text;
1450
+ for (const pattern of METADATA_STRIP_PATTERNS) {
1451
+ cleaned = cleaned.replace(pattern, '');
1452
+ }
1453
+ if (cleaned.trim().length > 0)
1454
+ continue;
1455
+ if (this.debug && debugRemovals) {
1456
+ debugRemovals.push({
1457
+ step: 'removeByContentPattern',
1458
+ reason: 'read time metadata',
1459
+ text: (0, utils_1.textPreview)(el)
1460
+ });
1461
+ }
1462
+ el.remove();
1463
+ }
1464
+ }
1465
+ }
1466
+ // Remove standalone time/date elements near the start or end of content.
1467
+ // A <time> in its own paragraph at the boundary is metadata (publish date),
1468
+ // but <time> inline within prose should be preserved (see issue #136).
1469
+ const timeElements = Array.from(mainContent.querySelectorAll('time'));
1470
+ const contentText = mainContent.textContent || '';
1471
+ for (const time of timeElements) {
1472
+ if (!time.parentNode)
1473
+ continue;
1474
+ // Walk up through inline/formatting wrappers only (i, em, span, b, strong)
1475
+ // Stop at block elements to avoid removing containers with other content.
1476
+ let target = time;
1477
+ let targetText = target.textContent?.trim() || '';
1478
+ while (target.parentElement && target.parentElement !== mainContent) {
1479
+ const parentTag = target.parentElement.tagName.toLowerCase();
1480
+ const parentText = target.parentElement.textContent?.trim() || '';
1481
+ // If parent is a <p> that only wraps this time, include it
1482
+ if (parentTag === 'p' && parentText === targetText) {
1483
+ target = target.parentElement;
1484
+ break;
1485
+ }
1486
+ // Only walk through inline formatting wrappers
1487
+ if (['i', 'em', 'span', 'b', 'strong', 'small'].includes(parentTag) &&
1488
+ parentText === targetText) {
1489
+ target = target.parentElement;
1490
+ targetText = parentText;
1491
+ continue;
1492
+ }
1493
+ break;
1494
+ }
1495
+ const text = target.textContent?.trim() || '';
1496
+ const words = (0, utils_1.countWords)(text);
1497
+ if (words > 10)
1498
+ continue;
1499
+ // Check if this element is near the start or end of mainContent
1500
+ const pos = contentText.indexOf(text);
1501
+ const distFromEnd = contentText.length - (pos + text.length);
1502
+ if (pos > 200 && distFromEnd > 200)
1503
+ continue;
1504
+ if (this.debug && debugRemovals) {
1505
+ debugRemovals.push({
1506
+ step: 'removeByContentPattern',
1507
+ reason: 'boundary date element',
1508
+ text: (0, utils_1.textPreview)(target)
1509
+ });
1510
+ }
1511
+ target.remove();
1512
+ }
1513
+ // Remove blog post metadata lists near content boundaries.
1514
+ // These are short <ul>/<ol> elements where every item is a brief
1515
+ // label + value pair (date, reading time, share, etc.) with no
1516
+ // prose sentences. Detected structurally: all items are very short,
1517
+ // none contain sentence-ending punctuation, and the total text is minimal.
1518
+ const metadataLists = mainContent.querySelectorAll('ul, ol');
1519
+ for (const list of metadataLists) {
1520
+ if (!list.parentNode)
1521
+ continue;
1522
+ const items = Array.from(list.children).filter(el => el.tagName === 'LI');
1523
+ if (items.length < 2 || items.length > 8)
1524
+ continue;
1525
+ // Must be near the start or end of content
1526
+ const listText = list.textContent?.trim() || '';
1527
+ const listPos = contentText.indexOf(listText);
1528
+ const distFromEnd = contentText.length - (listPos + listText.length);
1529
+ if (listPos > 500 && distFromEnd > 500)
1530
+ continue;
1531
+ // Skip lists introduced by a preceding paragraph (e.g. "Features include:")
1532
+ // — those are content lists, not standalone metadata
1533
+ const prevSibling = list.previousElementSibling;
1534
+ if (prevSibling) {
1535
+ const prevText = prevSibling.textContent?.trim() || '';
1536
+ if (prevText.endsWith(':'))
1537
+ continue;
1538
+ }
1539
+ // Every item must be very short (label + value) with no prose
1540
+ let isMetadata = true;
1541
+ for (const item of items) {
1542
+ const text = item.textContent?.trim() || '';
1543
+ const words = (0, utils_1.countWords)(text);
1544
+ if (words > 8) {
1545
+ isMetadata = false;
1546
+ break;
1547
+ }
1548
+ // Prose has sentence-ending punctuation; metadata doesn't
1549
+ if (/[.!?]$/.test(text)) {
1550
+ isMetadata = false;
1551
+ break;
1552
+ }
1553
+ }
1554
+ if (!isMetadata)
1555
+ continue;
1556
+ // Total text should be very short — this is metadata, not content
1557
+ if ((0, utils_1.countWords)(listText) > 30)
1558
+ continue;
1559
+ // Walk up to find the container to remove (e.g. a wrapper div)
1560
+ let target = list;
1561
+ while (target.parentElement && target.parentElement !== mainContent) {
1562
+ const parentText = target.parentElement.textContent?.trim() || '';
1563
+ if (parentText !== listText)
1564
+ break;
1565
+ target = target.parentElement;
1566
+ }
1567
+ if (this.debug && debugRemovals) {
1568
+ debugRemovals.push({
1569
+ step: 'removeByContentPattern',
1570
+ reason: 'blog metadata list',
1571
+ text: (0, utils_1.textPreview)(target)
1572
+ });
1573
+ }
1574
+ target.remove();
1575
+ }
1576
+ // Remove section breadcrumbs
1577
+ // Short elements containing a link to a parent section of the current URL.
1578
+ const url = this.options.url || this.doc.URL || '';
1579
+ let urlPath = '';
1580
+ let pageHost = '';
1581
+ try {
1582
+ const parsedUrl = new URL(url);
1583
+ urlPath = parsedUrl.pathname;
1584
+ pageHost = parsedUrl.hostname.replace(/^www\./, '');
1585
+ }
1586
+ catch { }
1587
+ if (urlPath) {
1588
+ const shortElements = mainContent.querySelectorAll('div, span, p');
1589
+ for (const el of shortElements) {
1590
+ if (!el.parentNode)
1591
+ continue;
1592
+ const text = el.textContent?.trim() || '';
1593
+ const words = (0, utils_1.countWords)(text);
1594
+ if (words > 10)
1595
+ continue;
1596
+ // Must be a leaf-ish element (no block children)
1597
+ if (el.querySelectorAll('p, div, section, article').length > 0)
1598
+ continue;
1599
+ const link = el.querySelector('a[href]');
1600
+ if (!link)
1601
+ continue;
1602
+ try {
1603
+ const linkPath = new URL(link.getAttribute('href') || '', url).pathname;
1604
+ if (linkPath !== '/' && linkPath !== urlPath && urlPath.startsWith(linkPath)) {
1605
+ if (this.debug && debugRemovals) {
1606
+ debugRemovals.push({
1607
+ step: 'removeByContentPattern',
1608
+ reason: 'section breadcrumb',
1609
+ text: (0, utils_1.textPreview)(el)
1610
+ });
1611
+ }
1612
+ el.remove();
1613
+ }
1614
+ }
1615
+ catch { }
1616
+ }
1617
+ }
1618
+ // Remove trailing external link lists — a heading + list of purely
1619
+ // off-site links as the last content block (affiliate picks, product
1620
+ // roundups, etc.). Only removed when nothing meaningful follows.
1621
+ if (pageHost) {
1622
+ const headings = mainContent.querySelectorAll('h2, h3, h4, h5, h6');
1623
+ for (const heading of headings) {
1624
+ if (!heading.parentNode)
1625
+ continue;
1626
+ const list = heading.nextElementSibling;
1627
+ if (!list || (list.tagName !== 'UL' && list.tagName !== 'OL'))
1628
+ continue;
1629
+ const items = Array.from(list.children).filter(el => el.tagName === 'LI');
1630
+ if (items.length < 2)
1631
+ continue;
1632
+ // The list must be the last meaningful block — nothing after it
1633
+ // except whitespace or empty elements. Walk up through ancestors
1634
+ // to check siblings at each level up to mainContent.
1635
+ let trailingContent = false;
1636
+ let checkEl = list;
1637
+ while (checkEl && checkEl !== mainContent) {
1638
+ let sibling = checkEl.nextElementSibling;
1639
+ while (sibling) {
1640
+ if ((sibling.textContent?.trim() || '').length > 0) {
1641
+ trailingContent = true;
1642
+ break;
1643
+ }
1644
+ sibling = sibling.nextElementSibling;
1645
+ }
1646
+ if (trailingContent)
1647
+ break;
1648
+ checkEl = checkEl.parentElement;
1649
+ }
1650
+ if (trailingContent)
1651
+ continue;
1652
+ // Every list item must be primarily a link pointing off-site
1653
+ let allExternalLinks = true;
1654
+ for (const item of items) {
1655
+ const links = item.querySelectorAll('a[href]');
1656
+ if (links.length === 0) {
1657
+ allExternalLinks = false;
1658
+ break;
1659
+ }
1660
+ const itemText = item.textContent?.trim() || '';
1661
+ let linkTextLen = 0;
1662
+ for (const link of links) {
1663
+ linkTextLen += (link.textContent?.trim() || '').length;
1664
+ try {
1665
+ const linkHost = new URL(link.getAttribute('href') || '', url).hostname.replace(/^www\./, '');
1666
+ if (linkHost === pageHost) {
1667
+ allExternalLinks = false;
1668
+ break;
1669
+ }
1670
+ }
1671
+ catch { }
1672
+ }
1673
+ if (!allExternalLinks)
1674
+ break;
1675
+ if (linkTextLen < itemText.length * 0.6) {
1676
+ allExternalLinks = false;
1677
+ break;
1678
+ }
1679
+ }
1680
+ if (!allExternalLinks)
1681
+ continue;
1682
+ if (this.debug && debugRemovals) {
1683
+ debugRemovals.push({
1684
+ step: 'removeByContentPattern',
1685
+ reason: 'trailing external link list',
1686
+ text: (0, utils_1.textPreview)(heading)
1687
+ });
1688
+ debugRemovals.push({
1689
+ step: 'removeByContentPattern',
1690
+ reason: 'trailing external link list',
1691
+ text: (0, utils_1.textPreview)(list)
1692
+ });
1693
+ }
1694
+ list.remove();
1695
+ heading.remove();
1696
+ }
1697
+ }
1698
+ // Remove trailing thin sections — the last few direct children of
1699
+ // mainContent that contain a heading but very little prose. These are
1700
+ // typically CTAs, newsletter prompts, or promotional sections that
1701
+ // have been partially stripped by prior removal steps.
1702
+ const totalWords = (0, utils_1.countWords)(mainContent.textContent || '');
1703
+ if (totalWords > 300) {
1704
+ // Walk backwards from the last direct child of mainContent,
1705
+ // collecting trailing elements that are thin (empty or very short prose).
1706
+ // Exclude SVG text (path data) from word counts — it's not prose.
1707
+ const trailingEls = [];
1708
+ let trailingWords = 0;
1709
+ let child = mainContent.lastElementChild;
1710
+ while (child) {
1711
+ // Count prose words, excluding SVG path data which inflates word counts
1712
+ let svgWords = 0;
1713
+ for (const svg of child.querySelectorAll('svg')) {
1714
+ svgWords += (0, utils_1.countWords)(svg.textContent || '');
1715
+ }
1716
+ const words = (0, utils_1.countWords)(child.textContent?.trim() || '') - svgWords;
1717
+ if (words > 25)
1718
+ break;
1719
+ trailingWords += words;
1720
+ trailingEls.push(child);
1721
+ child = child.previousElementSibling;
1722
+ }
1723
+ // Must have a heading in the trailing elements and total < 15% of content.
1724
+ // Skip if trailing elements contain content indicators (math, code, tables, images).
1725
+ if (trailingEls.length >= 1 && trailingWords < totalWords * 0.15) {
1726
+ const hasHeading = trailingEls.some(el => /^H[1-6]$/.test(el.tagName) || el.querySelector('h1, h2, h3, h4, h5, h6'));
1727
+ const hasContent = trailingEls.some(el => el.querySelector(constants_1.CONTENT_ELEMENT_SELECTOR));
1728
+ if (hasHeading && !hasContent) {
1729
+ for (const el of trailingEls) {
1730
+ if (this.debug && debugRemovals) {
1731
+ debugRemovals.push({ step: 'removeByContentPattern', reason: 'trailing thin section', text: (0, utils_1.textPreview)(el) });
1732
+ }
1733
+ el.remove();
1734
+ }
1735
+ }
1736
+ }
1737
+ }
1738
+ // Remove boilerplate sentences and trailing non-content.
1739
+ // Search elements for end-of-article boilerplate, then truncate
1740
+ // from the best ancestor that has siblings to remove.
1741
+ const fullText = mainContent.textContent || '';
1742
+ const boilerplateElements = mainContent.querySelectorAll('p, div, span, section');
1743
+ for (const el of boilerplateElements) {
1744
+ if (!el.parentNode)
1745
+ continue;
1746
+ const text = el.textContent?.trim() || '';
1747
+ const words = (0, utils_1.countWords)(text);
1748
+ if (words > 50 || words < 3)
1749
+ continue;
1750
+ for (const pattern of BOILERPLATE_PATTERNS) {
1751
+ if (pattern.test(text)) {
1752
+ // Walk up to find an ancestor that has next siblings to truncate.
1753
+ // Don't walk all the way to mainContent's direct child — if there's
1754
+ // a single wrapper div, that would remove everything.
1755
+ let target = el;
1756
+ while (target.parentElement && target.parentElement !== mainContent) {
1757
+ if (target.nextElementSibling)
1758
+ break;
1759
+ target = target.parentElement;
1760
+ }
1761
+ // Only truncate if there's substantial content before the boilerplate
1762
+ const targetText = target.textContent || '';
1763
+ const targetPos = fullText.indexOf(targetText);
1764
+ if (targetPos < 200)
1765
+ continue;
1766
+ // Collect ancestors before modifying the DOM
1767
+ const ancestors = [];
1768
+ let anc = target.parentElement;
1769
+ while (anc && anc !== mainContent) {
1770
+ ancestors.push(anc);
1771
+ anc = anc.parentElement;
1772
+ }
1773
+ // Remove target element and its following siblings
1774
+ this.removeTrailingSiblings(target, true, debugRemovals);
1775
+ // Cascade upward: remove following siblings at each
1776
+ // ancestor level too. Everything after the boilerplate
1777
+ // in document order is non-content.
1778
+ for (const ancestor of ancestors) {
1779
+ this.removeTrailingSiblings(ancestor, false, debugRemovals);
1780
+ }
1781
+ return;
1782
+ }
1783
+ }
1784
+ }
1785
+ }
1786
+ /**
1787
+ * Remove an element's following siblings, and optionally the element itself.
1788
+ */
1789
+ removeTrailingSiblings(element, removeSelf, debugRemovals) {
1790
+ let sibling = element.nextElementSibling;
1791
+ while (sibling) {
1792
+ const next = sibling.nextElementSibling;
1793
+ if (this.debug && debugRemovals) {
1794
+ debugRemovals.push({
1795
+ step: 'removeByContentPattern',
1796
+ reason: 'trailing non-content',
1797
+ text: (0, utils_1.textPreview)(sibling)
1798
+ });
1799
+ }
1800
+ sibling.remove();
1801
+ sibling = next;
1802
+ }
1803
+ if (removeSelf) {
1804
+ if (this.debug && debugRemovals) {
1805
+ debugRemovals.push({
1806
+ step: 'removeByContentPattern',
1807
+ reason: 'boilerplate text',
1808
+ text: (0, utils_1.textPreview)(element)
1809
+ });
1810
+ }
1811
+ element.remove();
1812
+ }
1813
+ }
1814
+ }
1815
+ exports.Defuddle = Defuddle;
1816
+ //# sourceMappingURL=defuddle.js.map