@mz1999/defuddle 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +371 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +145 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/constants.d.ts +24 -0
  7. package/dist/constants.js +950 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/defuddle.d.ts +136 -0
  10. package/dist/defuddle.js +1816 -0
  11. package/dist/defuddle.js.map +1 -0
  12. package/dist/elements/callouts.d.ts +6 -0
  13. package/dist/elements/callouts.js +74 -0
  14. package/dist/elements/callouts.js.map +1 -0
  15. package/dist/elements/code.d.ts +5 -0
  16. package/dist/elements/code.js +346 -0
  17. package/dist/elements/code.js.map +1 -0
  18. package/dist/elements/footnotes.d.ts +5 -0
  19. package/dist/elements/footnotes.js +619 -0
  20. package/dist/elements/footnotes.js.map +1 -0
  21. package/dist/elements/headings.d.ts +11 -0
  22. package/dist/elements/headings.js +100 -0
  23. package/dist/elements/headings.js.map +1 -0
  24. package/dist/elements/images.d.ts +8 -0
  25. package/dist/elements/images.js +877 -0
  26. package/dist/elements/images.js.map +1 -0
  27. package/dist/elements/math.base.d.ts +9 -0
  28. package/dist/elements/math.base.js +195 -0
  29. package/dist/elements/math.base.js.map +1 -0
  30. package/dist/elements/math.core.d.ts +7 -0
  31. package/dist/elements/math.core.js +52 -0
  32. package/dist/elements/math.core.js.map +1 -0
  33. package/dist/elements/math.d.ts +2 -0
  34. package/dist/elements/math.full.d.ts +8 -0
  35. package/dist/elements/math.js +7 -0
  36. package/dist/elements/math.js.map +1 -0
  37. package/dist/extractor-registry.d.ts +16 -0
  38. package/dist/extractor-registry.js +140 -0
  39. package/dist/extractor-registry.js.map +1 -0
  40. package/dist/extractors/_base.d.ts +22 -0
  41. package/dist/extractors/_base.js +27 -0
  42. package/dist/extractors/_base.js.map +1 -0
  43. package/dist/extractors/_conversation.d.ts +9 -0
  44. package/dist/extractors/_conversation.js +78 -0
  45. package/dist/extractors/_conversation.js.map +1 -0
  46. package/dist/extractors/chatgpt.d.ts +14 -0
  47. package/dist/extractors/chatgpt.js +138 -0
  48. package/dist/extractors/chatgpt.js.map +1 -0
  49. package/dist/extractors/claude.d.ts +10 -0
  50. package/dist/extractors/claude.js +91 -0
  51. package/dist/extractors/claude.js.map +1 -0
  52. package/dist/extractors/gemini.d.ts +14 -0
  53. package/dist/extractors/gemini.js +111 -0
  54. package/dist/extractors/gemini.js.map +1 -0
  55. package/dist/extractors/github.d.ts +20 -0
  56. package/dist/extractors/github.js +251 -0
  57. package/dist/extractors/github.js.map +1 -0
  58. package/dist/extractors/grok.d.ts +15 -0
  59. package/dist/extractors/grok.js +142 -0
  60. package/dist/extractors/grok.js.map +1 -0
  61. package/dist/extractors/hackernews.d.ts +21 -0
  62. package/dist/extractors/hackernews.js +155 -0
  63. package/dist/extractors/hackernews.js.map +1 -0
  64. package/dist/extractors/reddit.d.ts +22 -0
  65. package/dist/extractors/reddit.js +197 -0
  66. package/dist/extractors/reddit.js.map +1 -0
  67. package/dist/extractors/twitter.d.ts +16 -0
  68. package/dist/extractors/twitter.js +204 -0
  69. package/dist/extractors/twitter.js.map +1 -0
  70. package/dist/extractors/x-article.d.ts +24 -0
  71. package/dist/extractors/x-article.js +267 -0
  72. package/dist/extractors/x-article.js.map +1 -0
  73. package/dist/extractors/x-oembed.d.ts +20 -0
  74. package/dist/extractors/x-oembed.js +350 -0
  75. package/dist/extractors/x-oembed.js.map +1 -0
  76. package/dist/extractors/youtube.d.ts +87 -0
  77. package/dist/extractors/youtube.js +869 -0
  78. package/dist/extractors/youtube.js.map +1 -0
  79. package/dist/fetch.d.ts +18 -0
  80. package/dist/fetch.js +265 -0
  81. package/dist/fetch.js.map +1 -0
  82. package/dist/index.d.ts +3 -0
  83. package/dist/index.full.d.ts +12 -0
  84. package/dist/index.full.js +1 -0
  85. package/dist/index.js +1 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/markdown.d.ts +30 -0
  88. package/dist/markdown.js +661 -0
  89. package/dist/markdown.js.map +1 -0
  90. package/dist/metadata.d.ts +25 -0
  91. package/dist/metadata.js +426 -0
  92. package/dist/metadata.js.map +1 -0
  93. package/dist/node.d.ts +19 -0
  94. package/dist/node.js +78 -0
  95. package/dist/node.js.map +1 -0
  96. package/dist/scoring.d.ts +31 -0
  97. package/dist/scoring.js +472 -0
  98. package/dist/scoring.js.map +1 -0
  99. package/dist/standardize.d.ts +2 -0
  100. package/dist/standardize.js +1101 -0
  101. package/dist/standardize.js.map +1 -0
  102. package/dist/types/extractors.d.ts +41 -0
  103. package/dist/types/extractors.js +3 -0
  104. package/dist/types/extractors.js.map +1 -0
  105. package/dist/types.d.ts +135 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/comments.d.ts +44 -0
  109. package/dist/utils/comments.js +103 -0
  110. package/dist/utils/comments.js.map +1 -0
  111. package/dist/utils/dom.d.ts +42 -0
  112. package/dist/utils/dom.js +104 -0
  113. package/dist/utils/dom.js.map +1 -0
  114. package/dist/utils/linkedom-compat.d.ts +5 -0
  115. package/dist/utils/linkedom-compat.js +23 -0
  116. package/dist/utils/linkedom-compat.js.map +1 -0
  117. package/dist/utils/transcript.d.ts +37 -0
  118. package/dist/utils/transcript.js +61 -0
  119. package/dist/utils/transcript.js.map +1 -0
  120. package/dist/utils.d.ts +13 -0
  121. package/dist/utils.js +98 -0
  122. package/dist/utils.js.map +1 -0
  123. package/package.json +107 -0
@@ -0,0 +1,1101 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.standardizeContent = standardizeContent;
4
+ const constants_1 = require("./constants");
5
+ const math_1 = require("./elements/math");
6
+ const code_1 = require("./elements/code");
7
+ const headings_1 = require("./elements/headings");
8
+ const images_1 = require("./elements/images");
9
+ const utils_1 = require("./utils");
10
+ const dom_1 = require("./utils/dom");
11
+ // Module-level debug flag, set by standardizeContent for child functions
12
+ let _debug = false;
13
/**
 * Rebuild a `div[role="list"]` subtree as a real <ul>/<ol> element.
 *
 * The list becomes an <ol> when the first `.label` found inside a list item
 * looks like "1)", otherwise a <ul>. Each list item contributes one <li> that
 * receives the children of the item's `.content` element; items without a
 * `.content` element yield an empty <li>.
 *
 * Only items that belong to `listEl` itself are turned into <li>s of this
 * list. Nested role-lists are converted by recursion, so nested items are no
 * longer visited a second time as if they were top-level items (which used to
 * append empty <li>s and flatten lists nested more than two levels deep).
 *
 * @param listEl The `div[role="list"]` element to convert (left in place; the caller swaps it).
 * @param doc Document used to create the replacement elements.
 * @returns The newly created <ul> or <ol> element.
 */
function convertRoleList(listEl, doc) {
    const firstLabel = listEl.querySelector('div[role="listitem"] .label');
    const labelText = firstLabel?.textContent?.trim() || '';
    const list = doc.createElement(/^\d+\)/.test(labelText) ? 'ol' : 'ul');
    // A node is owned by this list when its nearest role-list ancestor is listEl.
    const ownedByThisList = (node) => node.parentElement?.closest('div[role="list"]') === listEl;
    // Snapshot the owned items before mutating the tree.
    const items = Array.from(listEl.querySelectorAll('div[role="listitem"]')).filter(ownedByThisList);
    items.forEach(item => {
        const li = doc.createElement('li');
        const content = item.querySelector('.content');
        if (content) {
            // Convert any paragraph divs inside content (including ones in nested lists)
            content.querySelectorAll('div[role="paragraph"]').forEach(div => {
                const p = doc.createElement('p');
                (0, dom_1.transferContent)(div, p);
                div.replaceWith(p);
            });
            // Convert directly nested lists; deeper levels are handled by the recursion
            Array.from(content.querySelectorAll('div[role="list"]'))
                .filter(ownedByThisList)
                .forEach(nestedList => {
                    nestedList.replaceWith(convertRoleList(nestedList, doc));
                });
            (0, dom_1.transferContent)(content, li);
        }
        list.appendChild(li);
    });
    return list;
}

// Rules applied by standardizeElements: each rule pairs a CSS selector with a
// transform that returns the node which replaces the matched element.
const ELEMENT_STANDARDIZATION_RULES = [
    ...math_1.mathRules,
    ...code_1.codeBlockRules,
    ...headings_1.headingRules,
    ...images_1.imageRules,
    // Convert divs with paragraph role to actual paragraphs
    {
        selector: 'div[data-testid^="paragraph"], div[role="paragraph"]',
        element: 'p',
        transform: (el, doc) => {
            const p = doc.createElement('p');
            (0, dom_1.transferContent)(el, p);
            // Copy allowed attributes
            Array.from(el.attributes).forEach(attr => {
                if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
                    p.setAttribute(attr.name, attr.value);
                }
            });
            return p;
        }
    },
    // Convert divs with list roles to actual lists (ordered/unordered, any nesting depth)
    {
        selector: 'div[role="list"]',
        element: 'ul',
        transform: (el, doc) => convertRoleList(el, doc)
    },
    // List items that were not consumed by a role-list conversion above
    {
        selector: 'div[role="listitem"]',
        element: 'li',
        // Custom handler for list item content
        transform: (el, doc) => {
            const content = el.querySelector('.content');
            if (!content)
                return el;
            // Convert any paragraph divs inside content
            const paragraphDivs = content.querySelectorAll('div[role="paragraph"]');
            paragraphDivs.forEach(div => {
                const p = doc.createElement('p');
                (0, dom_1.transferContent)(div, p);
                div.replaceWith(p);
            });
            // The item is replaced by its `.content` element, not by a new <li>
            return content;
        }
    }
];
111
/**
 * Standardize an extracted content subtree in place.
 *
 * Always runs: space normalization, comment removal, heading normalization,
 * <pre> wrapping of preformatted <code>, and rule-based element conversion.
 * Outside debug mode the full cleanup pipeline follows; in debug mode only
 * attribute stripping, trailing-heading removal and <br> cleanup run, and
 * wrapper flattening is skipped.
 *
 * Also stores `debug` in the module-level `_debug` flag read by the helpers.
 *
 * @param element Root element to clean (mutated).
 * @param metadata Object whose `title` is compared against the first heading.
 * @param doc Document used to create replacement nodes.
 * @param debug When true, keep structure and debug attributes.
 */
function standardizeContent(element, metadata, doc, debug = false) {
    _debug = debug;
    standardizeSpaces(element);
    removeHtmlComments(element);
    // H1s become H2s; the first H2 is dropped when it repeats the title
    standardizeHeadings(element, metadata.title, doc);
    // Must happen before attribute stripping removes the inline style
    wrapPreformattedCode(element, doc);
    standardizeElements(element, doc);

    if (debug) {
        // Basic cleanup only, so the original structure stays inspectable
        stripUnwantedAttributes(element, debug);
        removeTrailingHeadings(element);
        stripExtraBrElements(element);
        (0, utils_1.logDebug)(_debug, 'Debug mode: Skipping div flattening to preserve structure');
        return;
    }

    const unwrapMatching = (selector) => {
        for (const match of Array.from(element.querySelectorAll(selector))) {
            unwrapElement(match);
        }
    };

    // First pass of div flattening
    flattenWrapperElements(element, doc);
    stripUnwantedAttributes(element, debug);
    // Spans left without attributes carry no information
    unwrapBareSpans(element);
    // Links inside inline code: markdown can't render links in backtick code
    unwrapMatching('code a');
    // javascript: links: keep the text, drop the link
    unwrapMatching('a[href^="javascript:"]');
    // Anchor links that wrap headings (e.g. clickable section headers)
    for (const anchor of Array.from(element.querySelectorAll('a[href^="#"]'))) {
        if (anchor.querySelector('h1, h2, h3, h4, h5, h6')) {
            unwrapElement(anchor);
        }
    }
    // Heading anchor links (e.g. <h2>Title<a href="#title">#</a></h2>)
    (0, headings_1.removeHeadingAnchors)(element);
    // Obsolete plugin elements
    for (const plugin of Array.from(element.querySelectorAll('object, embed, applet'))) {
        plugin.remove();
    }
    removeEmptyElements(element);
    removeTrailingHeadings(element);
    // Orphaned leading/trailing <hr> elements
    removeOrphanedDividers(element);
    // Final pass of div flattening after the cleanup above
    flattenWrapperElements(element, doc);
    stripExtraBrElements(element);
    removeEmptyLines(element, doc);
}
166
/**
 * Wrap <code> elements whose inline style declares `white-space: pre…`
 * in a new <pre> element, so they get treated as code blocks.
 *
 * <code> elements already inside a <pre> are left alone. The style match is
 * case-insensitive because CSS property names and keywords are
 * case-insensitive (e.g. `WHITE-SPACE: PRE`).
 *
 * @param element Root element to search (mutated).
 * @param doc Document used to create the <pre> wrappers.
 */
function wrapPreformattedCode(element, doc) {
    const preformattedStyle = /white-space\s*:\s*pre/i;
    // Snapshot the matches: wrapping moves nodes around while we iterate
    for (const code of Array.from(element.querySelectorAll('code'))) {
        if (code.closest('pre'))
            continue;
        const style = code.getAttribute('style') || '';
        if (!preformattedStyle.test(style))
            continue;
        const pre = doc.createElement('pre');
        // Put the <pre> where the <code> was, then move the <code> into it
        code.parentNode?.insertBefore(pre, code);
        pre.appendChild(code);
    }
}
186
/**
 * Replace non-breaking spaces (U+00A0) with regular spaces in every text node
 * under `element`, leaving the contents of <pre> and <code> untouched.
 *
 * @param element Root node to process (text nodes are mutated in place).
 */
function standardizeSpaces(element) {
    const { isElement, isTextNode } = utils_1;
    // Explicit stack instead of recursion; children are pushed in reverse so
    // nodes are still visited in document order.
    const pending = [element];
    while (pending.length > 0) {
        const current = pending.pop();
        if (isElement(current)) {
            const name = current.tagName.toLowerCase();
            if (name === 'pre' || name === 'code') {
                continue; // preformatted content keeps its exact characters
            }
        }
        if (isTextNode(current)) {
            const before = current.textContent || '';
            const after = before.replace(/\xA0/g, ' ');
            if (after !== before) {
                current.textContent = after;
            }
        }
        if (current.hasChildNodes()) {
            const kids = Array.from(current.childNodes);
            for (let i = kids.length - 1; i >= 0; i--) {
                pending.push(kids[i]);
            }
        }
    }
}
211
/**
 * Remove headings at the end of the content that have no text after them.
 *
 * Headings are checked from last to first. A heading is trailing when none of
 * its following siblings, nor the following siblings of any ancestor below
 * `element`, contain non-whitespace text. Processing stops at the first
 * heading that is followed by text.
 *
 * @param element Root element to clean (mutated).
 */
function removeTrailingHeadings(element) {
    const { isElement, isTextNode, logDebug } = utils_1;
    // Only text and element nodes contribute text; other node types are ignored
    const carriesText = (node) => (isTextNode(node) || isElement(node)) && (node.textContent || '').trim() !== '';
    const isFollowedByText = (start) => {
        let node = start;
        for (;;) {
            for (let sibling = node.nextSibling; sibling; sibling = sibling.nextSibling) {
                if (carriesText(sibling)) {
                    return true;
                }
            }
            // Nothing at this level: look after the parent, but never past the root
            const parent = node.parentElement;
            if (!parent || parent === element) {
                return false;
            }
            node = parent;
        }
    };
    const headings = Array.from(element.querySelectorAll('h1, h2, h3, h4, h5, h6'));
    let removed = 0;
    for (let i = headings.length - 1; i >= 0; i--) {
        if (isFollowedByText(headings[i])) {
            break;
        }
        headings[i].remove();
        removed++;
    }
    if (removed > 0) {
        logDebug(_debug, 'Removed trailing headings:', removed);
    }
}
257
/**
 * Remove <hr> elements that sit at the very start or very end of `element`,
 * ignoring whitespace-only text nodes around them. Dividers between other
 * content are kept.
 *
 * @param element Root element whose direct children are inspected (mutated).
 */
function removeOrphanedDividers(element) {
    const { isElement, isTextNode } = utils_1;
    const isBlankText = (node) => isTextNode(node) && !(node.textContent || '').trim();
    const isDivider = (node) => isElement(node) && node.tagName.toLowerCase() === 'hr';
    // `edge` selects the end to start from; `step` walks inward from that end.
    const stripFrom = (edge, step) => {
        for (;;) {
            let node = element[edge];
            while (node && isBlankText(node)) {
                node = node[step];
            }
            if (!node || !isDivider(node)) {
                return;
            }
            node.remove();
        }
    };
    stripFrom('firstChild', 'nextSibling');
    stripFrom('lastChild', 'previousSibling');
}
285
/**
 * Demote every <h1> under `element` to an <h2> (keeping its content and the
 * attributes listed in ALLOWED_ATTRIBUTES), then remove the first <h2> when its
 * text equals the title after whitespace and case normalization.
 *
 * A missing title (undefined/null) is treated as an empty string, so no
 * heading is removed and no error is thrown.
 *
 * @param element Root element to process (mutated).
 * @param title Document title to compare against the first heading.
 * @param doc Document used to create the replacement <h2> elements.
 */
function standardizeHeadings(element, title, doc) {
    const normalizeText = (text) => {
        return text
            .replace(/\u00A0/g, ' ') // Convert non-breaking spaces to regular spaces
            .replace(/\s+/g, ' ') // Normalize all whitespace to single spaces
            .trim()
            .toLowerCase();
    };
    // Snapshot the collection: replacing nodes would otherwise change it under us
    const h1s = Array.from(element.getElementsByTagName('h1'));
    h1s.forEach(h1 => {
        const h2 = doc.createElement('h2');
        (0, dom_1.transferContent)(h1, h2);
        // Copy allowed attributes
        Array.from(h1.attributes).forEach(attr => {
            if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
                h2.setAttribute(attr.name, attr.value);
            }
        });
        h1.parentNode?.replaceChild(h2, h1);
    });
    // Remove first H2 if it matches title
    const firstH2 = element.getElementsByTagName('h2')[0];
    if (!firstH2) {
        return;
    }
    // Guard against a missing title so normalizeText never receives undefined
    const normalizedTitle = normalizeText(String(title ?? ''));
    if (normalizedTitle && normalizedTitle === normalizeText(firstH2.textContent || '')) {
        firstH2.remove();
    }
}
316
/**
 * Remove every HTML comment node under `element`.
 *
 * Comments are collected with a TreeWalker first and detached afterwards, so
 * the walk is not disturbed by removals. The number of comments found is
 * reported through the debug logger.
 *
 * @param element Root element to clean (mutated).
 */
function removeHtmlComments(element) {
    const SHOW_COMMENT = 128; // NodeFilter.SHOW_COMMENT
    const walker = element.ownerDocument.createTreeWalker(element, SHOW_COMMENT);
    const found = [];
    while (walker.nextNode()) {
        found.push(walker.currentNode);
    }
    found.forEach(comment => {
        comment.parentNode?.removeChild(comment);
    });
    (0, utils_1.logDebug)(_debug, 'Removed HTML comments:', found.length);
}
331
/**
 * Remove attributes that are not allow-listed from `element` and all of its
 * descendants.
 *
 * Always kept: footnote ids (`fnref:…`, `fn:…`, `footnotes`), `language-…`
 * classes on <code>, the `footnote-backref` class and `callout…` classes.
 * SVG elements are skipped entirely. In debug mode, attributes in
 * ALLOWED_ATTRIBUTES_DEBUG and all `data-*` attributes are kept as well.
 *
 * @param element Root element to process (mutated).
 * @param debug Whether the more permissive debug allow-list applies.
 */
function stripUnwantedAttributes(element, debug) {
    const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
    let stripped = 0;
    // Attributes that later processing steps rely on, regardless of allow-lists
    const isProtected = (tag, name, value) => {
        if (name === 'id') {
            return value.startsWith('fnref:') || value.startsWith('fn:') || value === 'footnotes';
        }
        if (name === 'class') {
            return (tag === 'code' && value.startsWith('language-')) ||
                value === 'footnote-backref' ||
                /^callout(?:-|$)/.test(value);
        }
        return false;
    };
    const isAllowed = (name) => {
        if (constants_1.ALLOWED_ATTRIBUTES.has(name)) {
            return true;
        }
        if (!debug) {
            return false;
        }
        return constants_1.ALLOWED_ATTRIBUTES_DEBUG.has(name) || name.startsWith('data-');
    };
    const cleanElement = (el) => {
        // SVG elements keep all of their attributes
        if (el.tagName.toLowerCase() === 'svg' || el.namespaceURI === SVG_NAMESPACE) {
            return;
        }
        const tag = el.tagName.toLowerCase();
        // Snapshot: removing attributes while iterating the live map would skip entries
        for (const attr of Array.from(el.attributes)) {
            const name = attr.name.toLowerCase();
            if (isProtected(tag, name, attr.value) || isAllowed(name)) {
                continue;
            }
            el.removeAttribute(attr.name);
            stripped++;
        }
    };
    cleanElement(element);
    for (const descendant of Array.from(element.querySelectorAll('*'))) {
        cleanElement(descendant);
    }
    (0, utils_1.logDebug)(_debug, 'Stripped attributes:', stripped);
}
378
/**
 * Replace `el` with its own children: every child is moved in front of `el`
 * in the parent (order preserved), then `el` is removed.
 *
 * A detached element (no parent) is left untouched. Without this guard the
 * loop could never make progress, because nothing can be moved out of an
 * element that has no parent to receive its children.
 *
 * @param el Element to unwrap.
 */
function unwrapElement(el) {
    const parent = el.parentNode;
    if (!parent) {
        return;
    }
    while (el.firstChild) {
        parent.insertBefore(el.firstChild, el);
    }
    el.remove();
}
384
/**
 * Unwrap every <span> that has no attributes, keeping its children in place.
 *
 * Spans are visited in reverse document order so inner spans are unwrapped
 * before the spans that contain them. If anything was unwrapped,
 * `element.normalize()` merges the adjacent text nodes left behind.
 *
 * @param element Root element to process (mutated).
 */
function unwrapBareSpans(element) {
    const candidates = Array.from(element.querySelectorAll('span'));
    let unwrapped = 0;
    for (let i = candidates.length - 1; i >= 0; i--) {
        const span = candidates[i];
        const host = span.parentNode;
        // Skip spans that are detached or still carry attributes
        if (!host || span.attributes.length > 0) {
            continue;
        }
        for (let child = span.firstChild; child; child = span.firstChild) {
            host.insertBefore(child, span);
        }
        span.remove();
        unwrapped++;
    }
    if (unwrapped > 0) {
        element.normalize();
    }
    (0, utils_1.logDebug)(_debug, 'Unwrapped bare spans:', unwrapped);
}
409
/**
 * Remove elements under `element` that carry no content.
 *
 * An element is removed when it is not listed in ALLOWED_EMPTY_ELEMENTS and
 * either
 *  - it is a <div> made up solely of <span>s that are empty or hold a single
 *    comma, with nothing but whitespace between them, or
 *  - it has no text (a non-breaking space counts as text) and no child
 *    nodes other than whitespace-only text nodes.
 *
 * Elements are visited in reverse document order, so children are checked
 * before their parents and nested empties collapse in one pass.
 *
 * @param element Root element to clean (mutated; the root itself is never removed).
 */
function removeEmptyElements(element) {
    const { isElement, isTextNode, logDebug } = utils_1;
    let removedCount = 0;
    const hasVisibleText = (text) => text.trim().length > 0 || text.includes('\u00A0');
    // True for a <div> whose only content is comma/empty <span>s. Direct text
    // of the div is checked too, so a div such as `Authors: <span>,</span>`
    // is no longer treated as empty and thrown away together with its text.
    const isCommaOnlyDiv = (el) => {
        if (el.tagName.toLowerCase() !== 'div' || el.children.length === 0) {
            return false;
        }
        for (const node of Array.from(el.childNodes)) {
            if (isTextNode(node)) {
                if ((node.textContent || '').trim() !== '') {
                    return false;
                }
            }
            else if (isElement(node)) {
                if (node.tagName.toLowerCase() !== 'span') {
                    return false;
                }
                const content = node.textContent?.trim() || '';
                if (content !== ',' && content !== '') {
                    return false;
                }
            }
        }
        return true;
    };
    const isEmptyElement = (el) => {
        if (constants_1.ALLOWED_EMPTY_ELEMENTS.has(el.tagName.toLowerCase())) {
            return false;
        }
        if (isCommaOnlyDiv(el)) {
            return true;
        }
        if (hasVisibleText(el.textContent || '')) {
            return false;
        }
        if (!el.hasChildNodes()) {
            return true;
        }
        // Any non-text child (e.g. an allowed empty element such as an image)
        // makes the element worth keeping.
        for (const node of Array.from(el.childNodes)) {
            if (!isTextNode(node)) {
                return false;
            }
            if (hasVisibleText(node.textContent || '')) {
                return false;
            }
        }
        return true;
    };
    // querySelectorAll returns document order; reversed, descendants come first
    const allElements = Array.from(element.querySelectorAll('*')).reverse();
    for (const el of allElements) {
        if (el.parentNode && isEmptyElement(el)) {
            el.remove();
            removedCount++;
        }
    }
    logDebug(_debug, 'Removed empty elements:', removedCount);
}
463
/**
 * Limit every run of consecutive <br> elements to two.
 *
 * Two <br>s are consecutive when only whitespace-only text nodes separate
 * them. The third and later <br>s of a run are removed. The removal count and
 * elapsed time are reported through the debug logger.
 *
 * @param element Root element to process (mutated).
 */
function stripExtraBrElements(element) {
    const { isTextNode, logDebug } = utils_1;
    const startedAt = Date.now();
    const MAX_RUN_LENGTH = 2;
    let dropped = 0;
    let run = [];
    // Remove everything past the allowed run length, then start a fresh run
    const flushRun = () => {
        for (const extra of run.slice(MAX_RUN_LENGTH)) {
            extra.remove();
            dropped++;
        }
        run = [];
    };
    // Nearest previous sibling that is not a whitespace-only text node
    const previousMeaningfulSibling = (br) => {
        let node = br.previousSibling;
        while (node && isTextNode(node) && !node.textContent?.trim()) {
            node = node.previousSibling;
        }
        return node;
    };
    for (const br of Array.from(element.getElementsByTagName('br'))) {
        const continuesRun = run.length > 0 && previousMeaningfulSibling(br) === run[run.length - 1];
        if (continuesRun) {
            run.push(br);
        }
        else {
            flushRun();
            run = [br];
        }
    }
    flushRun();
    const finishedAt = Date.now();
    logDebug(_debug, 'Standardized br elements:', {
        removed: dropped,
        processingTime: `${(finishedAt - startedAt).toFixed(2)}ms`
    });
}
513
/**
 * Move leading or trailing whitespace out of `node`: the whitespace is
 * trimmed from the node's first/last text child, and a single space is
 * inserted next to `node` unless the adjacent text sibling already provides
 * one on that side.
 *
 * @param node Element whose edge whitespace should be moved (mutated).
 * @param doc Document used to create the space text node.
 * @param direction 'leading' for the start; any other value is treated as trailing.
 * @returns 1 when whitespace was moved, 0 when nothing changed.
 */
function moveWhitespaceOutside(node, doc, direction) {
    const { isTextNode } = utils_1;
    const atStart = direction === 'leading';
    const edge = atStart ? node.firstChild : node.lastChild;
    if (!edge || !isTextNode(edge)) {
        return 0;
    }
    const original = edge.textContent || '';
    const stripped = original.replace(atStart ? /^\s+/ : /\s+$/, '');
    // Nothing to trim, or nowhere to put the space
    if (stripped === original || !node.parentNode) {
        return 0;
    }
    edge.textContent = stripped;
    // Ensure a space exists on the outside
    const neighbor = atStart ? node.previousSibling : node.nextSibling;
    let alreadySpaced = false;
    if (neighbor && isTextNode(neighbor)) {
        const neighborText = neighbor.textContent || '';
        alreadySpaced = atStart ? neighborText.endsWith(' ') : neighborText.startsWith(' ');
    }
    if (!alreadySpaced) {
        const reference = atStart ? node : node.nextSibling;
        node.parentNode.insertBefore(doc.createTextNode(' '), reference);
    }
    return 1;
}
533
/**
 * Clean up whitespace under `element` in two passes, skipping <pre>/<code>.
 *
 * Pass 1 (text nodes, children first): remove text nodes that are empty or
 * consist only of zero-width/invisible characters; in the remaining ones
 * collapse newlines, tabs and repeated spaces, drop whitespace before
 * punctuation, strip zero-width characters (U+200C is kept) and collapse
 * repeated non-breaking spaces.
 *
 * Pass 2 (elements, children first): merge adjacent text nodes, trim blank
 * text nodes at the start/end of each element, move edge whitespace of inline
 * elements outside of them, and insert a space between adjacent children of
 * non-block elements where one is missing.
 *
 * The debug log reports a combined count of removed nodes and characters
 * plus the elapsed time.
 *
 * @param element Root element to clean (mutated).
 * @param doc Document used to create space text nodes.
 */
function removeEmptyLines(element, doc) {
    const { isElement, isTextNode, logDebug } = utils_1;
    const startedAt = Date.now();
    let tally = 0;
    const isVerbatimTag = (name) => name === 'pre' || name === 'code';
    const INVISIBLE_ONLY = /^[\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
    // Blank-edge patterns: block elements also treat spaces and &nbsp; as blank
    const BLANK_IN_BLOCK = /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/;
    const BLANK_IN_INLINE = /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
    const STARTS_WITH_CLOSER = /^[,.!?:;)\]]/;
    const ENDS_WITH_OPENER = /[,.!?:;(\[]\s*$/;

    // First pass: remove empty text nodes and tidy the text of the others
    const tidyTextNodes = (node) => {
        if (isElement(node) && isVerbatimTag(node.tagName.toLowerCase())) {
            return;
        }
        // Children first (depth-first), on a snapshot since nodes may be removed
        for (const child of Array.from(node.childNodes)) {
            tidyTextNodes(child);
        }
        if (!isTextNode(node)) {
            return;
        }
        const raw = node.textContent || '';
        // Nodes with regular spaces or &nbsp; are preserved: they may separate words
        if (!raw || INVISIBLE_ONLY.test(raw)) {
            node.parentNode?.removeChild(node);
            tally++;
            return;
        }
        const cleaned = raw
            .replace(/[\n\r]+/g, ' ') // Newlines -> spaces (CSS white-space: normal behavior)
            .replace(/\t+/g, ' ') // Tabs -> spaces
            .replace(/ {2,}/g, ' ') // 2+ spaces -> 1 space
            .replace(/^[ ]+$/, ' ') // Space-only text between elements -> single space
            .replace(/\s+([,.!?:;])/g, '$1') // Remove spaces before punctuation
            // Zero-width characters, except ZWNJ \u200C used in Farsi
            .replace(/[\u200B\u200D\u200E\u200F\uFEFF]+/g, '')
            .replace(/(?:\xA0){2,}/g, '\xA0'); // Multiple &nbsp; -> single &nbsp;
        if (cleaned !== raw) {
            node.textContent = cleaned;
            tally += raw.length - cleaned.length;
        }
    };

    // Second pass: normalize spacing inside and between elements
    const tidyElement = (node) => {
        if (!isElement(node)) {
            return;
        }
        const name = node.tagName.toLowerCase();
        if (isVerbatimTag(name)) {
            return;
        }
        // Children first (depth-first)
        Array.from(node.childNodes)
            .filter(isElement)
            .forEach(child => tidyElement(child));
        node.normalize(); // Combine adjacent text nodes
        const isBlock = (0, utils_1.getComputedStyle)(node)?.display === 'block';
        const blankEdge = isBlock ? BLANK_IN_BLOCK : BLANK_IN_INLINE;
        // Drop blank text nodes from one end until something else is reached
        const trimEdge = (pickEdge) => {
            for (let edge = pickEdge(); edge && isTextNode(edge) && (edge.textContent || '').match(blankEdge); edge = pickEdge()) {
                node.removeChild(edge);
                tally++;
            }
        };
        trimEdge(() => node.firstChild);
        trimEdge(() => node.lastChild);
        // For inline elements, move leading/trailing spaces outside the element
        if (!isBlock && constants_1.INLINE_ELEMENTS.has(name) && node.parentNode) {
            tally += moveWhitespaceOutside(node, doc, 'leading');
            tally += moveWhitespaceOutside(node, doc, 'trailing');
        }
        if (isBlock) {
            return;
        }
        // Ensure there's a space between adjacent children where one is needed
        const kids = Array.from(node.childNodes);
        for (let i = 0; i < kids.length - 1; i++) {
            const current = kids[i];
            const next = kids[i + 1];
            // Only between elements, or between an element and text
            if (!isElement(current) && !isElement(next)) {
                continue;
            }
            const nextText = next.textContent || '';
            const currentText = current.textContent || '';
            // No space before closing punctuation/brackets ...
            if (STARTS_WITH_CLOSER.test(nextText)) {
                continue;
            }
            // ... nor after punctuation or an opening bracket
            if (ENDS_WITH_OPENER.test(currentText)) {
                continue;
            }
            const alreadySpaced = (isTextNode(current) && (current.textContent || '').endsWith(' ')) ||
                (isTextNode(next) && (next.textContent || '').startsWith(' '));
            if (alreadySpaced) {
                continue;
            }
            node.insertBefore(doc.createTextNode(' '), next);
        }
    };

    // Run both passes
    tidyTextNodes(element);
    tidyElement(element);
    const finishedAt = Date.now();
    logDebug(_debug, 'Removed empty lines:', {
        charactersRemoved: tally,
        processingTime: `${(finishedAt - startedAt).toFixed(2)}ms`
    });
}
653
/**
 * Rewrites non-standard markup under `element` into simpler, predictable HTML.
 *
 * Steps, in order:
 *  1. Apply every rule in ELEMENT_STANDARDIZATION_RULES that defines a `transform`.
 *  2. Replace arXiv LaTeXML equation tables with minimal <math> elements that carry
 *     the LaTeX source (done before attribute stripping removes the class names).
 *  3. Remove `span.ltx_note_outer` nodes.
 *  4. Replace `a.ltx_ref` links that contain a reference tag with their plain text.
 *  5. Unwrap tables that have no header cells and at most one cell per row.
 *  6. Replace <lite-youtube> elements with a YouTube embed <iframe>.
 *  7. Merge adjacent Verso code blocks.
 *
 * @param {Element} element - Root of the subtree to standardize; mutated in place.
 * @param {Document} doc - Document used to create replacement nodes.
 * @returns {void}
 */
function standardizeElements(element, doc) {
    let processedCount = 0;
    // Convert elements based on standardization rules
    ELEMENT_STANDARDIZATION_RULES.forEach(rule => {
        // A rule without a transform never changes the DOM, so don't query for it.
        if (!rule.transform) {
            return;
        }
        let elements;
        try {
            elements = element.querySelectorAll(rule.selector);
        }
        catch (e) {
            // Some selectors use :has() which isn't supported by jsdom/nwsapi.
            // Skip the rule gracefully in those environments, but leave a debug trace.
            (0, utils_1.logDebug)(_debug, 'Skipping unsupported selector:', rule.selector);
            return;
        }
        elements.forEach(el => {
            // The transform builds the replacement node for this element.
            el.replaceWith(rule.transform(el, doc));
            processedCount++;
        });
    });
    // arXiv LaTeXML: Convert equation tables to <math> elements before attribute stripping
    const equationTables = Array.from(element.querySelectorAll('table.ltx_equation, table.ltx_eqn_table, table.ltx_equationgroup'));
    equationTables.forEach(table => {
        const mathElements = table.querySelectorAll('math');
        if (mathElements.length === 0) {
            return;
        }
        const fragment = doc.createDocumentFragment();
        mathElements.forEach(mathEl => {
            // Extract LaTeX from alttext or annotation
            const alttext = mathEl.getAttribute('alttext');
            const annotation = mathEl.querySelector('annotation[encoding="application/x-tex"]');
            const latex = alttext || annotation?.textContent?.trim() || '';
            if (!latex) {
                return;
            }
            const isBlock = mathEl.getAttribute('display') === 'block' ||
                table.classList.contains('ltx_equation') ||
                table.classList.contains('ltx_equationgroup');
            const cleanMath = doc.createElement('math');
            cleanMath.setAttribute('xmlns', 'http://www.w3.org/1998/Math/MathML');
            cleanMath.setAttribute('display', isBlock ? 'block' : 'inline');
            cleanMath.setAttribute('data-latex', latex);
            cleanMath.textContent = latex;
            fragment.appendChild(cleanMath);
        });
        // Only replace the table when at least one formula was recovered.
        if (fragment.childNodes.length > 0) {
            table.replaceWith(fragment);
            processedCount++;
        }
    });
    // arXiv LaTeXML: Remove hidden ltx_note_outer spans (CSS display:none on arxiv.org)
    // These contain duplicated footnote marks and "footnotemark:" text
    const noteOuters = Array.from(element.querySelectorAll('span.ltx_note_outer'));
    noteOuters.forEach(outer => {
        outer.remove();
        processedCount++;
    });
    // arXiv LaTeXML: Unwrap ltx_ref_tag spans so cross-reference numbers are preserved
    // These spans (e.g. <span class="ltx_text ltx_ref_tag">1</span>) get stripped to bare
    // spans during attribute stripping, then unwrapped — but their parent <a> links get
    // removed by the exact selector `a[href^="#"][class*="ref" i]`. Fix by unwrapping the
    // link and keeping the text inline.
    const refLinks = Array.from(element.querySelectorAll('a.ltx_ref'));
    refLinks.forEach(link => {
        const refTag = link.querySelector('span.ltx_ref_tag, span.ltx_text.ltx_ref_tag');
        if (refTag) {
            // Replace the link with just the text content
            link.replaceWith(doc.createTextNode(link.textContent || ''));
            processedCount++;
        }
    });
    // Unwrap single-column layout tables (used for styling/positioning, not data)
    const tables = Array.from(element.querySelectorAll('table'));
    tables.forEach(table => {
        // A nested table may already have been detached when its ancestor was unwrapped.
        if (!table.parentNode) {
            return;
        }
        const directCells = Array.from(table.querySelectorAll('td, th'))
            .filter(cell => (0, dom_1.isDirectTableChild)(cell, table));
        // Skip data tables that have direct header cells
        if (directCells.some(cell => cell.tagName === 'TH')) {
            return;
        }
        const directRows = Array.from(table.querySelectorAll('tr'))
            .filter(row => (0, dom_1.isDirectTableChild)(row, table));
        if (directRows.length === 0) {
            return;
        }
        // Count cells per row in one pass instead of re-filtering all cells for every row.
        const cellsPerRow = new Map();
        for (const cell of directCells) {
            cellsPerRow.set(cell.parentNode, (cellsPerRow.get(cell.parentNode) ?? 0) + 1);
        }
        // Check that every row has at most one direct cell
        const isSingleColumn = directRows.every(row => (cellsPerRow.get(row) ?? 0) <= 1);
        if (!isSingleColumn) {
            return;
        }
        const fragment = doc.createDocumentFragment();
        directCells.forEach(cell => {
            while (cell.firstChild) {
                fragment.appendChild(cell.firstChild);
            }
        });
        table.replaceWith(fragment);
        processedCount++;
    });
    // Convert lite-youtube elements
    const liteYoutubeElements = element.querySelectorAll('lite-youtube');
    liteYoutubeElements.forEach(el => {
        const videoId = (el.getAttribute('videoid') || '').trim();
        if (!videoId) {
            return;
        }
        const iframe = doc.createElement('iframe');
        iframe.width = '560';
        iframe.height = '315';
        // The id comes from page markup: escape it so it cannot add path segments
        // or query parameters to the embed URL.
        iframe.src = `https://www.youtube.com/embed/${encodeURIComponent(videoId)}`;
        iframe.title = el.getAttribute('videotitle') || 'YouTube video player';
        iframe.frameBorder = '0';
        iframe.allow = 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share';
        iframe.setAttribute('allowfullscreen', '');
        el.replaceWith(iframe);
        processedCount++;
    });
    (0, utils_1.logDebug)(_debug, 'Converted embedded elements:', processedCount);
    // Verso (Lean docs) emits many adjacent command/output blocks.
    // Merge contiguous transformed blocks back into one readable block.
    mergeAdjacentVersoCodeBlocks(element);
}
775
/**
 * Merges runs of adjacent Verso code blocks into a single block.
 *
 * A block qualifies when it is a `<pre data-verso-code="true">` whose only element
 * child is a `<code>` with language `lean` or `lean4` (read from `data-lang`, or
 * from a `language-*` class). Consecutive qualifying siblings with the same
 * language, separated only by whitespace-only text nodes, form a run. The text of
 * the run is joined with newlines into the first block's `<code>`; the other
 * blocks and the whitespace between them are removed.
 *
 * @param {Element} root - Subtree to scan; mutated in place.
 * @returns {void}
 */
function mergeAdjacentVersoCodeBlocks(root) {
    // Returns the <code> child when it is the sole element child of `pre`, else null.
    const getCodeNode = (pre) => {
        let code = null;
        for (const child of pre.children) {
            if (child.tagName.toLowerCase() !== 'code') {
                return null;
            }
            if (code) {
                return null;
            }
            code = child;
        }
        return code;
    };
    // Lower-cased language from data-lang, falling back to a `language-xyz` class.
    const getLanguage = (code) => {
        const dataLang = (code.getAttribute('data-lang') || '').toLowerCase();
        if (dataLang) {
            return dataLang;
        }
        const className = code.getAttribute('class') || '';
        const match = className.match(/(?:^|\s)language-([a-z0-9_+-]+)(?:\s|$)/i);
        return match?.[1]?.toLowerCase() || '';
    };
    // Only visit parents of verso code blocks, not every element in the tree
    const candidates = root.querySelectorAll('pre[data-verso-code="true"]');
    const parents = new Set();
    for (const candidate of candidates) {
        const parent = candidate.parentElement;
        if (parent) {
            parents.add(parent);
        }
    }
    for (const container of parents) {
        // Snapshot the children: the live list changes while blocks are removed.
        const children = Array.from(container.childNodes);
        for (let i = 0; i < children.length; i++) {
            const startNode = children[i];
            if (!(0, utils_1.isElement)(startNode) || startNode.tagName.toLowerCase() !== 'pre') {
                continue;
            }
            if (startNode.getAttribute('data-verso-code') !== 'true') {
                continue;
            }
            const startCode = getCodeNode(startNode);
            if (!startCode) {
                continue;
            }
            const language = getLanguage(startCode);
            if (language !== 'lean' && language !== 'lean4') {
                continue;
            }
            const run = [{ pre: startNode, code: startCode }];
            // Whitespace confirmed to sit between two merged blocks.
            const betweenWhitespace = [];
            // Whitespace seen since the last merged block; it only counts as
            // "between" once another block actually joins the run.
            let pendingWhitespace = [];
            let j = i + 1;
            while (j < children.length) {
                const node = children[j];
                if ((0, utils_1.isTextNode)(node) && !(node.textContent || '').trim()) {
                    pendingWhitespace.push(node);
                    j++;
                    continue;
                }
                if (!(0, utils_1.isElement)(node) || node.tagName.toLowerCase() !== 'pre') {
                    break;
                }
                if (node.getAttribute('data-verso-code') !== 'true') {
                    break;
                }
                const code = getCodeNode(node);
                if (!code || getLanguage(code) !== language) {
                    break;
                }
                run.push({ pre: node, code });
                betweenWhitespace.push(...pendingWhitespace);
                pendingWhitespace = [];
                j++;
            }
            if (run.length <= 1) {
                continue;
            }
            // Drop one trailing newline per block, join, cap blank-line runs at one
            // empty line, and trim leading/trailing newlines of the result.
            const merged = run
                .map(({ code }) => (code.textContent || '').replace(/\r?\n$/, ''))
                .join('\n')
                .replace(/\n{3,}/g, '\n\n')
                .replace(/^\n+|\n+$/g, '');
            startCode.textContent = merged;
            for (let k = 1; k < run.length; k++) {
                run[k].pre.remove();
            }
            for (const node of betweenWhitespace) {
                node.parentNode?.removeChild(node);
            }
            // Continue scanning after the merged run.
            i = j - 1;
        }
    }
}
857
/**
 * Flattens purely structural block elements (wrappers) under `element`.
 *
 * Three passes are repeated until none of them changes the DOM:
 *  1. top-level block children of `element`;
 *  2. all elements matching BLOCK_ELEMENTS_SELECTOR, deepest first;
 *  3. a final cleanup that unwraps elements containing only <p> children or that
 *     are non-preserved wrappers.
 *
 * Depending on the case an element is removed (empty), unwrapped (its children
 * take its place), replaced by its single block child, or replaced by a <p>
 * (when it only holds text/inline content).
 *
 * @param {Element} element - Root of the subtree to flatten; mutated in place.
 * @param {Document} doc - Document used to create fragments and <p> elements.
 * @returns {void}
 */
function flattenWrapperElements(element, doc) {
    const SEMANTIC_CLASS_PATTERN = /(?:article|main|content|footnote|reference|bibliography)/;
    const WRAPPER_CLASS_PATTERN = /(?:wrapper|container|layout|row|col|grid|flex|outer|inner|content-area)/i;
    const PRESERVED_ROLES = ['article', 'main', 'navigation', 'banner', 'contentinfo'];
    const startTime = Date.now();
    let processedCount = 0;

    // True when `el` directly contains non-empty text or an inline element.
    // This helps prevent unwrapping divs that visually act as paragraphs.
    const hasDirectInlineContent = (el) => {
        for (const child of el.childNodes) {
            if ((0, utils_1.isTextNode)(child) && child.textContent?.trim()) {
                return true;
            }
            if ((0, utils_1.isElement)(child) && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())) {
                return true;
            }
        }
        return false;
    };

    // True when the class name contains one of the semantic keywords.
    const hasSemanticClass = (el) => {
        const className = (0, dom_1.getClassName)(el);
        return !!className && !!className.toLowerCase().match(SEMANTIC_CLASS_PATTERN);
    };

    // True when `el` has element children and none of them is an inline element.
    const hasOnlyNonInlineChildren = (el) => {
        const children = Array.from(el.children);
        return children.length > 0 &&
            !children.some(child => constants_1.INLINE_ELEMENTS.has(child.tagName.toLowerCase()));
    };

    // Number of ancestors (up to the document root) that are block elements.
    const countBlockAncestors = (el) => {
        let depth = 0;
        for (let parent = el.parentElement; parent; parent = parent.parentElement) {
            if (constants_1.BLOCK_ELEMENTS_SET.has(parent.tagName.toLowerCase())) {
                depth++;
            }
        }
        return depth;
    };

    // Replaces `el` with its own children and counts the change.
    const unwrap = (el) => {
        const fragment = doc.createDocumentFragment();
        while (el.firstChild) {
            fragment.appendChild(el.firstChild);
        }
        el.replaceWith(fragment);
        processedCount++;
    };

    const shouldPreserveElement = (el) => {
        // Check if element should be preserved
        if (constants_1.PRESERVE_ELEMENTS.has(el.tagName.toLowerCase())) {
            return true;
        }
        // Preserve callout structure (div.callout[data-callout] and children)
        if (el.getAttribute('data-callout') || el.closest?.('[data-callout]')) {
            return true;
        }
        // Check for semantic roles
        const role = el.getAttribute('role');
        if (role && PRESERVED_ROLES.includes(role)) {
            return true;
        }
        // Check for semantic classes
        if (hasSemanticClass(el)) {
            return true;
        }
        // Preserve containers whose direct children are themselves preserved/semantic.
        return Array.from(el.children).some(child => constants_1.PRESERVE_ELEMENTS.has(child.tagName.toLowerCase()) ||
            child.getAttribute('role') === 'article' ||
            hasSemanticClass(child));
    };

    const isWrapperElement = (el) => {
        // If it directly contains inline content, it's NOT a wrapper
        if (hasDirectInlineContent(el)) {
            return false;
        }
        // Check if it's just empty space
        if (!el.textContent?.trim()) {
            return true;
        }
        const children = Array.from(el.children);
        if (children.length === 0) {
            return true;
        }
        // Check if all children are block elements
        if (children.every(child => constants_1.BLOCK_LEVEL_ELEMENTS.has(child.tagName.toLowerCase()))) {
            return true;
        }
        // Check for common wrapper patterns
        if (WRAPPER_CLASS_PATTERN.test((0, dom_1.getClassName)(el).toLowerCase())) {
            return true;
        }
        // NOTE(review): hasDirectInlineContent(el) is already known to be false here,
        // so no direct text node is non-empty and this check always returns true;
        // the fallback below cannot be reached. Kept to preserve the stated intent.
        const hasDirectText = Array.from(el.childNodes)
            .some(node => (0, utils_1.isTextNode)(node) && node.textContent?.trim());
        if (!hasDirectText) {
            return true;
        }
        return hasOnlyNonInlineChildren(el);
    };

    // Applies the first matching flattening case to `el`; returns true if the DOM changed.
    const processElement = (el) => {
        // Skip processing if element has been removed or should be preserved
        if (!el.parentNode || shouldPreserveElement(el)) {
            return false;
        }
        const tagName = el.tagName.toLowerCase();
        // Case 1: Element is truly empty (no text content, no child elements) and not self-closing
        if (!constants_1.ALLOWED_EMPTY_ELEMENTS.has(tagName) && !el.children.length && !el.textContent?.trim()) {
            el.remove();
            processedCount++;
            return true;
        }
        // Case 2: Top-level element - be more aggressive
        if (el.parentElement === element && hasOnlyNonInlineChildren(el)) {
            unwrap(el);
            return true;
        }
        // Case 3: Wrapper element - merge up aggressively
        if (isWrapperElement(el)) {
            unwrap(el);
            return true;
        }
        // Case 4: Element only contains text and/or inline elements - convert to paragraph
        const childNodes = Array.from(el.childNodes);
        const hasOnlyInlineOrText = childNodes.length > 0 && childNodes.every(child => (0, utils_1.isTextNode)(child) ||
            ((0, utils_1.isElement)(child) && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())));
        if (hasOnlyInlineOrText && el.textContent?.trim()) { // Ensure there's actual content
            const p = doc.createElement('p');
            // Move all children (including inline tags like <font>) to the new <p>
            while (el.firstChild) {
                p.appendChild(el.firstChild);
            }
            el.replaceWith(p);
            processedCount++;
            return true;
        }
        // Case 5: Element has single child - unwrap only if child is block-level
        if (el.children.length === 1) {
            const child = el.firstElementChild;
            // Only unwrap if the single child is a block element and not preserved
            if (constants_1.BLOCK_ELEMENTS_SET.has(child.tagName.toLowerCase()) && !shouldPreserveElement(child)) {
                el.replaceWith(child);
                processedCount++;
                return true;
            }
        }
        // Case 6: Deeply nested element - merge up.
        // Only unwrap if nested AND does not contain direct inline content.
        // NOTE(review): Case 3 already unwraps every element without direct inline
        // content, so this branch looks unreachable; kept as a safety net.
        if (countBlockAncestors(el) > 0 && !hasDirectInlineContent(el)) {
            unwrap(el);
            return true;
        }
        return false;
    };

    // Runs processElement over a snapshot of elements; true if any call changed the DOM.
    const processAll = (elements) => {
        let modified = false;
        elements.forEach(el => {
            if (processElement(el)) {
                modified = true;
            }
        });
        return modified;
    };

    // First pass: Process top-level wrapper elements
    const processTopLevelElements = () => processAll(Array.from(element.children)
        .filter(el => constants_1.BLOCK_ELEMENTS_SET.has(el.tagName.toLowerCase())));

    // Second pass: Process remaining wrapper elements from deepest to shallowest
    const processRemainingElements = () => {
        const candidates = Array.from(element.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR));
        // Compute each depth once up front; the DOM does not change while sorting,
        // so this yields the same order as measuring inside the comparator.
        const depthOf = new Map(candidates.map(el => [el, countBlockAncestors(el)]));
        candidates.sort((a, b) => depthOf.get(b) - depthOf.get(a)); // Process deepest first
        return processAll(candidates);
    };

    // Final cleanup pass - aggressively flatten remaining wrapper elements
    const finalCleanup = () => {
        const remainingElements = Array.from(element.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR));
        let modified = false;
        remainingElements.forEach(el => {
            // Check if element only contains paragraphs
            const children = Array.from(el.children);
            const onlyParagraphs = children.length > 0 &&
                children.every(child => child.tagName.toLowerCase() === 'p');
            // Unwrap if it only contains paragraphs OR is a non-preserved wrapper element
            if (onlyParagraphs || (!shouldPreserveElement(el) && isWrapperElement(el))) {
                unwrap(el);
                modified = true;
            }
        });
        return modified;
    };

    // Execute all passes until no more changes. Every pass runs in each round,
    // even when an earlier pass in the same round already changed something.
    let keepProcessing;
    do {
        keepProcessing = false;
        if (processTopLevelElements()) {
            keepProcessing = true;
        }
        if (processRemainingElements()) {
            keepProcessing = true;
        }
        if (finalCleanup()) {
            keepProcessing = true;
        }
    } while (keepProcessing);

    const endTime = Date.now();
    (0, utils_1.logDebug)(_debug, 'Flattened wrapper elements:', {
        count: processedCount,
        processingTime: `${(endTime - startTime).toFixed(2)}ms`
    });
}
1101
+ //# sourceMappingURL=standardize.js.map