defuddle 0.5.3 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,830 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.standardizeContent = standardizeContent;
4
+ const constants_1 = require("./constants");
5
+ const math_full_1 = require("./elements/math.full");
6
+ const code_1 = require("./elements/code");
7
+ const footnotes_1 = require("./elements/footnotes");
8
+ const headings_1 = require("./elements/headings");
9
+ const images_1 = require("./elements/images");
10
+ const utils_1 = require("./utils");
11
// Selectors for ARIA-role markup that stands in for real lists and paragraphs.
const ROLE_LIST_SELECTOR = 'div[role="list"]';
const ROLE_LIST_ITEM_SELECTOR = 'div[role="listitem"]';
const ROLE_PARAGRAPH_SELECTOR = 'div[role="paragraph"]';
/**
 * Replaces every `div[role="paragraph"]` below `root` with a <p> carrying the same markup.
 * Attributes of the original div are not copied.
 */
function convertRoleParagraphs(root, doc) {
    root.querySelectorAll(ROLE_PARAGRAPH_SELECTOR).forEach(div => {
        const p = doc.createElement('p');
        p.innerHTML = div.innerHTML;
        div.replaceWith(p);
    });
}
/**
 * Returns true when the nearest `div[role="list"]` ancestor of `node` is `listEl`,
 * i.e. the node belongs to that list and not to a list nested inside it.
 */
function belongsToRoleList(node, listEl) {
    return node.parentElement?.closest(ROLE_LIST_SELECTOR) === listEl;
}
/**
 * Builds a real <ol>/<ul> from a `div[role="list"]`.
 *
 * The list is ordered when the first `.label` found inside a list item starts with
 * digits followed by `)` (e.g. "1)"). Only list items owned by `listEl` become <li>
 * children; lists nested inside an item's `.content` are converted recursively and
 * stay inside that item. Previously every descendant list item was also appended
 * to the outer list, which duplicated nested items at the top level.
 *
 * @param listEl the `div[role="list"]` element to convert (its subtree is mutated)
 * @param doc document used to create the new elements
 * @returns the newly created, detached list element
 */
function convertRoleList(listEl, doc) {
    const firstLabel = listEl.querySelector(`${ROLE_LIST_ITEM_SELECTOR} .label`);
    const labelText = firstLabel?.textContent?.trim() || '';
    const list = doc.createElement(/^\d+\)/.test(labelText) ? 'ol' : 'ul');
    // Snapshot ownership before any mutation: nested conversions detach nodes.
    const ownItems = Array.from(listEl.querySelectorAll(ROLE_LIST_ITEM_SELECTOR))
        .filter(item => belongsToRoleList(item, listEl));
    ownItems.forEach(item => {
        const li = doc.createElement('li');
        const content = item.querySelector('.content');
        if (content) {
            convertRoleParagraphs(content, doc);
            // Convert only the outermost nested lists here; deeper ones are
            // handled by the recursive call while they are still attached.
            Array.from(content.querySelectorAll(ROLE_LIST_SELECTOR))
                .filter(nested => belongsToRoleList(nested, listEl))
                .forEach(nested => nested.replaceWith(convertRoleList(nested, doc)));
            li.innerHTML = content.innerHTML;
        }
        list.appendChild(li);
    });
    return list;
}
/**
 * Ordered rules used by standardizeElements(). Each rule has a CSS `selector`,
 * a nominal target `element` name and a `transform(el, doc)` that returns the
 * node that replaces the matched element.
 */
const ELEMENT_STANDARDIZATION_RULES = [
    ...math_full_1.mathRules,
    ...code_1.codeBlockRules,
    ...headings_1.headingRules,
    ...images_1.imageRules,
    // Convert divs with paragraph role to actual paragraphs
    {
        selector: 'div[data-testid^="paragraph"], div[role="paragraph"]',
        element: 'p',
        transform: (el, doc) => {
            const p = doc.createElement('p');
            p.innerHTML = el.innerHTML;
            // Carry over only attributes on the allow-list
            Array.from(el.attributes).forEach(attr => {
                if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
                    p.setAttribute(attr.name, attr.value);
                }
            });
            return p;
        }
    },
    // Convert divs with list roles to actual lists (ol/ul decided per list)
    {
        selector: ROLE_LIST_SELECTOR,
        element: 'ul',
        transform: (el, doc) => convertRoleList(el, doc)
    },
    // List items that were not consumed by a list conversion
    {
        selector: ROLE_LIST_ITEM_SELECTOR,
        element: 'li',
        transform: (el, doc) => {
            const content = el.querySelector('.content');
            // Without a .content child the element is returned unchanged
            if (!content)
                return el;
            convertRoleParagraphs(content, doc);
            // Note: the `.content` element itself is returned, not a new <li>
            return content;
        }
    }
];
110
/**
 * Runs the full standardization pipeline over `element` in place.
 *
 * @param element root element whose subtree is cleaned up
 * @param metadata object whose `title` is compared against the first heading
 * @param doc document used to create replacement nodes
 * @param debug when true, structure-flattening and empty-node cleanup are skipped
 */
function standardizeContent(element, metadata, doc, debug = false) {
    // Steps shared by both modes, in a fixed order
    standardizeSpaces(element);
    removeHtmlComments(element);
    standardizeHeadings(element, metadata.title, doc);
    (0, footnotes_1.standardizeFootnotes)(element);
    standardizeElements(element, doc);
    if (debug) {
        // Debug mode: light cleanup only, wrappers are left untouched
        stripUnwantedAttributes(element, debug);
        removeTrailingHeadings(element);
        stripExtraBrElements(element);
        (0, utils_1.logDebug)('Debug mode: Skipping div flattening to preserve structure');
        return;
    }
    // Normal mode: flatten, clean, then flatten again because the cleanup
    // steps can expose new wrapper elements
    flattenWrapperElements(element, doc);
    stripUnwantedAttributes(element, debug);
    removeEmptyElements(element);
    removeTrailingHeadings(element);
    flattenWrapperElements(element, doc);
    stripExtraBrElements(element);
    removeEmptyLines(element, doc);
}
145
/**
 * Replaces non-breaking spaces (U+00A0) in text nodes below `element` with
 * regular spaces. Subtrees rooted at <pre> or <code> are left untouched.
 *
 * A run consisting of a single non-breaking space is kept when the text of the
 * previous sibling node ends with a word character and the text of the next
 * sibling node starts with one.
 * NOTE(review): the check looks at sibling nodes, not at the characters next to
 * the match inside the same text node - confirm this is intended.
 */
function standardizeSpaces(element) {
    const { ELEMENT_NODE, TEXT_NODE } = constants_1.NODE_TYPE;
    function walk(node) {
        if (node.nodeType === ELEMENT_NODE) {
            const tag = node.tagName.toLowerCase();
            if (tag === 'pre' || tag === 'code') {
                return;
            }
        }
        if (node.nodeType === TEXT_NODE) {
            const before = node.textContent || '';
            const after = before.replace(/\xA0+/g, (run) => {
                if (run.length !== 1) {
                    return ' '.repeat(run.length);
                }
                const leftChar = node.previousSibling?.textContent?.slice(-1);
                const rightChar = node.nextSibling?.textContent?.charAt(0);
                const joinsWords = leftChar?.match(/\w/) && rightChar?.match(/\w/);
                return joinsWords ? '\xA0' : ' ';
            });
            // Only write back when something changed
            if (after !== before) {
                node.textContent = after;
            }
        }
        if (node.hasChildNodes()) {
            // Snapshot the children before recursing
            for (const child of Array.from(node.childNodes)) {
                walk(child);
            }
        }
    }
    walk(element);
}
180
/**
 * Removes headings (h1-h6) at the end of `element` that are not followed by
 * any non-whitespace text.
 *
 * Headings are visited from last to first and the scan stops at the first
 * heading that does have content after it. The previous implementation used
 * `return` inside `forEach`, which only skipped the current callback and kept
 * examining every remaining heading.
 *
 * @param element root element; matching headings are removed from its subtree
 */
function removeTrailingHeadings(element) {
    const { ELEMENT_NODE, TEXT_NODE } = constants_1.NODE_TYPE;
    let removedCount = 0;
    // True when text follows `el` among its later siblings, or among the later
    // siblings of any ancestor strictly below `element`.
    const hasContentAfter = (el) => {
        for (let current = el; current; current = current.parentElement) {
            let following = '';
            for (let sibling = current.nextSibling; sibling; sibling = sibling.nextSibling) {
                if (sibling.nodeType === TEXT_NODE || sibling.nodeType === ELEMENT_NODE) {
                    following += sibling.textContent || '';
                }
            }
            if (following.trim()) {
                return true;
            }
            // Do not look past the children of the root element
            const parent = current.parentElement;
            if (!parent || parent === element) {
                return false;
            }
        }
        return false;
    };
    const headings = Array.from(element.querySelectorAll('h1, h2, h3, h4, h5, h6')).reverse();
    for (const heading of headings) {
        if (hasContentAfter(heading)) {
            // Everything earlier in the document is followed by content too
            break;
        }
        heading.remove();
        removedCount++;
    }
    if (removedCount > 0) {
        (0, utils_1.logDebug)('Removed trailing headings:', removedCount);
    }
}
226
/**
 * Demotes every <h1> below `element` to an <h2> (keeping allow-listed
 * attributes), then removes the first <h2> when its text equals `title`
 * after whitespace and case normalization.
 *
 * @param element root element to process
 * @param title document title; a missing or non-string title is treated as
 *   empty, so nothing is removed (previously this threw once an <h2> existed)
 * @param doc document used to create the replacement <h2> elements
 */
function standardizeHeadings(element, title, doc) {
    const normalizeText = (text) => {
        return text
            .replace(/\u00A0/g, ' ') // Convert non-breaking spaces to regular spaces
            .replace(/\s+/g, ' ') // Normalize all whitespace to single spaces
            .trim()
            .toLowerCase();
    };
    // Snapshot first: getElementsByTagName returns a live collection
    for (const h1 of Array.from(element.getElementsByTagName('h1'))) {
        const h2 = doc.createElement('h2');
        h2.innerHTML = h1.innerHTML;
        for (const attr of Array.from(h1.attributes)) {
            if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
                h2.setAttribute(attr.name, attr.value);
            }
        }
        h1.parentNode?.replaceChild(h2, h1);
    }
    const normalizedTitle = normalizeText(typeof title === 'string' ? title : '');
    if (!normalizedTitle) {
        return;
    }
    // Only the first h2 is a candidate for the duplicated page title
    const firstH2 = element.getElementsByTagName('h2')[0];
    if (firstH2 && normalizeText(firstH2.textContent || '') === normalizedTitle) {
        firstH2.remove();
    }
}
257
/**
 * Removes all HTML comment nodes inside `element`.
 *
 * Both the direct children of `element` and the children of every descendant
 * element are scanned. Previously only descendants were scanned, so comments
 * placed directly under the root survived.
 *
 * @param element root element to clean
 */
function removeHtmlComments(element) {
    const COMMENT_NODE = 8; // Node.COMMENT_NODE
    let removedCount = 0;
    const owners = [element, ...Array.from(element.getElementsByTagName('*'))];
    for (const owner of owners) {
        // Snapshot the children: removing while iterating a live list skips nodes
        for (const node of Array.from(owner.childNodes)) {
            if (node.nodeType === COMMENT_NODE) {
                node.remove();
                removedCount++;
            }
        }
    }
    (0, utils_1.logDebug)('Removed HTML comments:', removedCount);
}
273
/**
 * Removes attributes that are not on the allow-list from `element` and all of
 * its descendants. SVG elements are skipped entirely.
 *
 * Always kept: footnote ids (`fnref:*`, `fn:*`, `footnotes`), `language-*`
 * classes on <code>, and the `footnote-backref` class.
 *
 * @param element root element to process
 * @param debug when truthy, the debug allow-list and `data-*` attributes are kept too
 */
function stripUnwantedAttributes(element, debug) {
    const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
    let attributeCount = 0;
    // Attributes that must survive regardless of the allow-lists
    const isProtected = (tag, name, value) => {
        if (name === 'id') {
            return value.startsWith('fnref:') || value.startsWith('fn:') || value === 'footnotes';
        }
        if (name === 'class') {
            return (tag === 'code' && value.startsWith('language-')) || value === 'footnote-backref';
        }
        return false;
    };
    const isPermitted = (name) => {
        if (constants_1.ALLOWED_ATTRIBUTES.has(name)) {
            return true;
        }
        if (!debug) {
            return false;
        }
        return constants_1.ALLOWED_ATTRIBUTES_DEBUG.has(name) || name.startsWith('data-');
    };
    const scrub = (el) => {
        const tag = el.tagName.toLowerCase();
        if (tag === 'svg' || el.namespaceURI === SVG_NAMESPACE) {
            return;
        }
        // Iterate over a copy because attributes are removed along the way
        for (const attr of Array.from(el.attributes)) {
            const name = attr.name.toLowerCase();
            if (isProtected(tag, name, attr.value) || isPermitted(name)) {
                continue;
            }
            el.removeAttribute(attr.name);
            attributeCount++;
        }
    };
    scrub(element);
    element.querySelectorAll('*').forEach(el => scrub(el));
    (0, utils_1.logDebug)('Stripped attributes:', attributeCount);
}
319
/**
 * Repeatedly removes empty elements below `element` until a pass removes
 * nothing, so parents that become empty are removed in a later pass.
 *
 * An element is removed when it is not in ALLOWED_EMPTY_ELEMENTS and either
 *  - it is a <div> whose element children are all <span>s holding only a comma
 *    or nothing, or
 *  - it has no child nodes, or only whitespace text nodes without a
 *    non-breaking space.
 *
 * @param element root element; the root itself is never removed
 */
function removeEmptyElements(element) {
    const { TEXT_NODE } = constants_1.NODE_TYPE;
    const NBSP = '\u00A0';
    // trim() also strips U+00A0, so it has to be checked separately
    const isBlankTextNode = (node) => {
        if (node.nodeType !== TEXT_NODE) {
            return false;
        }
        const value = node.textContent || '';
        return value.trim().length === 0 && !value.includes(NBSP);
    };
    const isCommaOnlyDiv = (el) => {
        if (el.tagName.toLowerCase() !== 'div') {
            return false;
        }
        const kids = Array.from(el.children);
        return kids.length > 0 && kids.every(kid => {
            if (kid.tagName.toLowerCase() !== 'span') {
                return false;
            }
            return [',', '', ' '].includes(kid.textContent?.trim() || '');
        });
    };
    const isRemovable = (el) => {
        if (constants_1.ALLOWED_EMPTY_ELEMENTS.has(el.tagName.toLowerCase())) {
            return false;
        }
        if (isCommaOnlyDiv(el)) {
            return true;
        }
        const text = el.textContent || '';
        if (text.trim().length !== 0 || text.includes(NBSP)) {
            return false;
        }
        return !el.hasChildNodes() || Array.from(el.childNodes).every(isBlankTextNode);
    };
    let removedCount = 0;
    let iterations = 0;
    let batch;
    do {
        iterations++;
        batch = Array.from(element.getElementsByTagName('*')).filter(el => isRemovable(el));
        for (const el of batch) {
            el.remove();
            removedCount++;
        }
    } while (batch.length > 0);
    (0, utils_1.logDebug)('Removed empty elements:', removedCount, 'iterations:', iterations);
}
368
/**
 * Limits every run of consecutive <br> elements below `element` to two.
 * Two <br>s are consecutive when only whitespace-only text nodes separate them.
 *
 * @param element root element to process
 */
function stripExtraBrElements(element) {
    const { TEXT_NODE } = constants_1.NODE_TYPE;
    const MAX_KEPT = 2;
    const startTime = Date.now();
    let processedCount = 0;
    let run = [];
    // Drops everything beyond the first two entries of the current run
    const flushRun = () => {
        run.slice(MAX_KEPT).forEach(extra => {
            extra.remove();
            processedCount++;
        });
        run = [];
    };
    // Nearest previous sibling that is not a whitespace-only text node
    const previousMeaningfulSibling = (node) => {
        let candidate = node.previousSibling;
        while (candidate && candidate.nodeType === TEXT_NODE && !candidate.textContent?.trim()) {
            candidate = candidate.previousSibling;
        }
        return candidate;
    };
    for (const br of Array.from(element.getElementsByTagName('br'))) {
        const continuesRun = run.length > 0 && previousMeaningfulSibling(br) === run[run.length - 1];
        if (continuesRun) {
            run.push(br);
        }
        else {
            // Close the previous run before starting a new one at this <br>
            flushRun();
            run = [br];
        }
    }
    flushRun();
    const endTime = Date.now();
    (0, utils_1.logDebug)('Standardized br elements:', {
        removed: processedCount,
        processingTime: `${(endTime - startTime).toFixed(2)}ms`
    });
}
418
+ function removeEmptyLines(element, doc) {
419
+ let removedCount = 0;
420
+ const startTime = Date.now();
421
+ // First pass: remove empty text nodes
422
+ const removeEmptyTextNodes = (node) => {
423
+ // Skip if inside pre or code
424
+ if (node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) {
425
+ const tag = node.tagName.toLowerCase();
426
+ if (tag === 'pre' || tag === 'code') {
427
+ return;
428
+ }
429
+ }
430
+ // Process children first (depth-first)
431
+ const children = Array.from(node.childNodes);
432
+ children.forEach(removeEmptyTextNodes);
433
+ // Then handle this node
434
+ if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) {
435
+ const text = node.textContent || '';
436
+ // If it's completely empty or just special characters/whitespace, remove it
437
+ if (!text || text.match(/^[\u200C\u200B\u200D\u200E\u200F\uFEFF\xA0\s]*$/)) {
438
+ node.parentNode?.removeChild(node);
439
+ removedCount++;
440
+ }
441
+ else {
442
+ // Clean up the text content while preserving important spaces
443
+ const newText = text
444
+ .replace(/\n{3,}/g, '\n\n') // More than 2 newlines -> 2 newlines
445
+ .replace(/^[\n\r\t]+/, '') // Remove leading newlines/tabs (preserve spaces)
446
+ .replace(/[\n\r\t]+$/, '') // Remove trailing newlines/tabs (preserve spaces)
447
+ .replace(/[ \t]*\n[ \t]*/g, '\n') // Remove spaces around newlines
448
+ .replace(/[ \t]{3,}/g, ' ') // 3+ spaces -> 1 space
449
+ .replace(/^[ ]+$/, ' ') // Multiple spaces between elements -> single space
450
+ .replace(/\s+([,.!?:;])/g, '$1') // Remove spaces before punctuation
451
+ // Clean up zero-width characters and multiple non-breaking spaces
452
+ .replace(/[\u200C\u200B\u200D\u200E\u200F\uFEFF]+/g, '')
453
+ .replace(/(?:\xA0){2,}/g, '\xA0'); // Multiple &nbsp; -> single &nbsp;
454
+ if (newText !== text) {
455
+ node.textContent = newText;
456
+ removedCount += text.length - newText.length;
457
+ }
458
+ }
459
+ }
460
+ };
461
+ // Second pass: clean up empty elements and normalize spacing
462
+ const cleanupEmptyElements = (node) => {
463
+ if (!(0, utils_1.isElement)(node))
464
+ return;
465
+ // Skip pre and code elements
466
+ const tag = node.tagName.toLowerCase();
467
+ if (tag === 'pre' || tag === 'code') {
468
+ return;
469
+ }
470
+ // Process children first (depth-first)
471
+ Array.from(node.childNodes)
472
+ .filter(utils_1.isElement)
473
+ .forEach(cleanupEmptyElements);
474
+ // Then normalize this element's whitespace
475
+ node.normalize(); // Combine adjacent text nodes
476
+ // Special handling for block elements
477
+ const isBlockElement = (0, utils_1.getComputedStyle)(node)?.display === 'block';
478
+ // Only remove empty text nodes at the start and end if they contain just newlines/tabs
479
+ // For block elements, also remove spaces
480
+ const startPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
481
+ const endPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
482
+ while (node.firstChild &&
483
+ node.firstChild.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
484
+ (node.firstChild.textContent || '').match(startPattern)) {
485
+ node.removeChild(node.firstChild);
486
+ removedCount++;
487
+ }
488
+ while (node.lastChild &&
489
+ node.lastChild.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
490
+ (node.lastChild.textContent || '').match(endPattern)) {
491
+ node.removeChild(node.lastChild);
492
+ removedCount++;
493
+ }
494
+ // Ensure there's a space between inline elements if needed
495
+ if (!isBlockElement) {
496
+ const children = Array.from(node.childNodes);
497
+ for (let i = 0; i < children.length - 1; i++) {
498
+ const current = children[i];
499
+ const next = children[i + 1];
500
+ // Only add space between elements or between element and text
501
+ if ((0, utils_1.isElement)(current) || (0, utils_1.isElement)(next)) {
502
+ // Get the text content
503
+ const nextContent = next.textContent || '';
504
+ const currentContent = current.textContent || '';
505
+ // Don't add space if:
506
+ // 1. Next content starts with punctuation or closing parenthesis
507
+ // 2. Current content ends with punctuation or opening parenthesis
508
+ // 3. There's already a space
509
+ const nextStartsWithPunctuation = nextContent.match(/^[,.!?:;)\]]/);
510
+ const currentEndsWithPunctuation = currentContent.match(/[,.!?:;(\[]\s*$/);
511
+ const hasSpace = (current.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
512
+ (current.textContent || '').endsWith(' ')) ||
513
+ (next.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
514
+ (next.textContent || '').startsWith(' '));
515
+ // Only add space if none of the above conditions are true
516
+ if (!nextStartsWithPunctuation &&
517
+ !currentEndsWithPunctuation &&
518
+ !hasSpace) {
519
+ const space = doc.createTextNode(' ');
520
+ node.insertBefore(space, next);
521
+ }
522
+ }
523
+ }
524
+ }
525
+ };
526
+ // Run both passes
527
+ removeEmptyTextNodes(element);
528
+ cleanupEmptyElements(element);
529
+ const endTime = Date.now();
530
+ (0, utils_1.logDebug)('Removed empty lines:', {
531
+ charactersRemoved: removedCount,
532
+ processingTime: `${(endTime - startTime).toFixed(2)}ms`
533
+ });
534
+ }
535
/**
 * Applies every rule in ELEMENT_STANDARDIZATION_RULES to `element`, then
 * converts <lite-youtube> elements into standard YouTube <iframe> embeds.
 *
 * For each rule, elements matching `rule.selector` are replaced with the
 * result of `rule.transform(el, doc)`; rules without a transform are ignored.
 * The `videoid` attribute comes from the page being processed, so it is
 * percent-encoded before being placed in the embed URL. Well-formed YouTube
 * ids contain no characters that encoding changes.
 *
 * @param element root element to process
 * @param doc document used to create replacement elements
 */
function standardizeElements(element, doc) {
    let processedCount = 0;
    for (const rule of ELEMENT_STANDARDIZATION_RULES) {
        if (!rule.transform) {
            continue;
        }
        element.querySelectorAll(rule.selector).forEach(el => {
            el.replaceWith(rule.transform(el, doc));
            processedCount++;
        });
    }
    element.querySelectorAll('lite-youtube').forEach(el => {
        const videoId = el.getAttribute('videoid');
        if (!videoId)
            return;
        const iframe = doc.createElement('iframe');
        iframe.width = '560';
        iframe.height = '315';
        iframe.src = `https://www.youtube.com/embed/${encodeURIComponent(videoId)}`;
        iframe.title = el.getAttribute('videotitle') || 'YouTube video player';
        iframe.frameBorder = '0';
        iframe.allow = 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share';
        iframe.setAttribute('allowfullscreen', '');
        el.replaceWith(iframe);
        processedCount++;
    });
    (0, utils_1.logDebug)('Converted embedded elements:', processedCount);
}
568
/**
 * Flattens wrapper elements (tags listed in BLOCK_ELEMENTS) below `element`:
 * empty ones are removed, pure wrappers are replaced by their children, and
 * wrappers holding only text/inline content become <p> elements. Three passes
 * (top-level, deepest-first, final cleanup) repeat until none changes anything.
 *
 * Changes compared with the previous implementation:
 *  - a wrapper with direct text and a single block child is no longer replaced
 *    by that child, which silently discarded the text;
 *  - the wrapper test no longer calls `toLowerCase()` on a possibly non-string
 *    `className`; it is reduced to the one condition that decided its result;
 *  - nesting depth is computed once per element instead of on every sort comparison;
 *  - an unreachable "deeply nested" branch and a duplicated branch are gone.
 *
 * @param element root element whose subtree is flattened in place
 * @param doc document used to create fragments and <p> elements
 */
function flattenWrapperElements(element, doc) {
    const { ELEMENT_NODE, TEXT_NODE } = constants_1.NODE_TYPE;
    const SEMANTIC_ROLES = ['article', 'main', 'navigation', 'banner', 'contentinfo'];
    const SEMANTIC_CLASS_PATTERN = /(?:article|main|content|footnote|reference|bibliography)/;
    const startTime = Date.now();
    let processedCount = 0;
    const isBlockTag = (el) => constants_1.BLOCK_ELEMENTS.includes(el.tagName.toLowerCase());
    const isInlineTag = (el) => constants_1.INLINE_ELEMENTS.has(el.tagName.toLowerCase());
    // True when `el` has a direct text-node child with non-whitespace text
    const hasDirectText = (el) => Array.from(el.childNodes).some(child => child.nodeType === TEXT_NODE && Boolean(child.textContent?.trim()));
    // True when `el` directly contains text or an inline element. Such elements
    // act as visual paragraphs and must not be unwrapped.
    const hasDirectInlineContent = (el) => hasDirectText(el) ||
        Array.from(el.childNodes).some(child => child.nodeType === ELEMENT_NODE &&
            constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase()));
    // className is not a string on every element type, hence the typeof check
    const hasSemanticClass = (el) => typeof el.className === 'string' &&
        SEMANTIC_CLASS_PATTERN.test(el.className.toLowerCase());
    const shouldPreserveElement = (el) => {
        if (constants_1.PRESERVE_ELEMENTS.has(el.tagName.toLowerCase())) {
            return true;
        }
        const role = el.getAttribute('role');
        if (role && SEMANTIC_ROLES.includes(role)) {
            return true;
        }
        if (hasSemanticClass(el)) {
            return true;
        }
        // Also keep elements that directly contain something worth preserving
        return Array.from(el.children).some(child => constants_1.PRESERVE_ELEMENTS.has(child.tagName.toLowerCase()) ||
            child.getAttribute('role') === 'article' ||
            hasSemanticClass(child));
    };
    // An element is a wrapper exactly when it has no direct inline content.
    // The earlier extra checks (empty text, block-only children, wrapper-like
    // class names, ...) could only ever confirm that result.
    const isWrapperElement = (el) => !hasDirectInlineContent(el);
    // Replaces `el` with its own child nodes
    const unwrap = (el) => {
        const fragment = doc.createDocumentFragment();
        while (el.firstChild) {
            fragment.appendChild(el.firstChild);
        }
        el.replaceWith(fragment);
        processedCount++;
    };
    // Number of ancestors (up to the document root) whose tag is in BLOCK_ELEMENTS
    const countBlockAncestors = (el) => {
        let depth = 0;
        for (let parent = el.parentElement; parent; parent = parent.parentElement) {
            if (isBlockTag(parent)) {
                depth++;
            }
        }
        return depth;
    };
    // Processes one element; returns true when the DOM was changed
    const processElement = (el) => {
        if (!el.isConnected || shouldPreserveElement(el)) {
            return false;
        }
        // Case 1: no child nodes, or no text anywhere inside
        if (!el.hasChildNodes() || !el.textContent?.trim()) {
            el.remove();
            processedCount++;
            return true;
        }
        // Case 2: direct child of the root with element children, none inline
        if (el.parentElement === element) {
            const children = Array.from(el.children);
            if (children.length > 0 && !children.some(child => isInlineTag(child))) {
                unwrap(el);
                return true;
            }
        }
        // Case 3: pure wrapper
        if (isWrapperElement(el)) {
            unwrap(el);
            return true;
        }
        // Case 4: only text and/or inline elements - convert to a paragraph
        const childNodes = Array.from(el.childNodes);
        const hasOnlyInlineOrText = childNodes.length > 0 && childNodes.every(child => child.nodeType === TEXT_NODE ||
            (child.nodeType === ELEMENT_NODE && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())));
        if (hasOnlyInlineOrText && el.textContent?.trim()) {
            const p = doc.createElement('p');
            // Move (not copy) all children, inline tags included
            while (el.firstChild) {
                p.appendChild(el.firstChild);
            }
            el.replaceWith(p);
            processedCount++;
            return true;
        }
        // Case 5: a single block-level, non-preserved child replaces the element,
        // but only when that does not throw away text held directly by `el`
        if (el.children.length === 1 && !hasDirectText(el)) {
            const child = el.firstElementChild;
            if (isBlockTag(child) && !shouldPreserveElement(child)) {
                el.replaceWith(child);
                processedCount++;
                return true;
            }
        }
        return false;
    };
    // First pass: block-level direct children of the root
    const processTopLevelElements = () => {
        let modified = false;
        for (const el of Array.from(element.children).filter(child => isBlockTag(child))) {
            if (processElement(el)) {
                modified = true;
            }
        }
        return modified;
    };
    // Second pass: every block-level descendant, deepest first
    const processRemainingElements = () => {
        const candidates = Array.from(element.querySelectorAll(constants_1.BLOCK_ELEMENTS.join(',')));
        // Depths are taken before any mutation, once per element
        const depthOf = new Map(candidates.map(el => [el, countBlockAncestors(el)]));
        candidates.sort((a, b) => depthOf.get(b) - depthOf.get(a));
        let modified = false;
        for (const el of candidates) {
            if (processElement(el)) {
                modified = true;
            }
        }
        return modified;
    };
    // Final pass: unwrap paragraph-only containers (even preserved ones) and
    // any remaining non-preserved wrappers
    const finalCleanup = () => {
        let modified = false;
        for (const el of Array.from(element.querySelectorAll(constants_1.BLOCK_ELEMENTS.join(',')))) {
            const children = Array.from(el.children);
            const onlyParagraphs = children.length > 0 &&
                children.every(child => child.tagName.toLowerCase() === 'p');
            if (onlyParagraphs || (!shouldPreserveElement(el) && isWrapperElement(el))) {
                unwrap(el);
                modified = true;
            }
        }
        return modified;
    };
    // Run all three passes until a full round changes nothing
    let keepProcessing;
    do {
        keepProcessing = false;
        if (processTopLevelElements())
            keepProcessing = true;
        if (processRemainingElements())
            keepProcessing = true;
        if (finalCleanup())
            keepProcessing = true;
    } while (keepProcessing);
    const endTime = Date.now();
    (0, utils_1.logDebug)('Flattened wrapper elements:', {
        count: processedCount,
        processingTime: `${(endTime - startTime).toFixed(2)}ms`
    });
}
830
+ //# sourceMappingURL=standardize.js.map