defuddle 0.5.4 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.d.ts +0 -1
- package/dist/constants.js +88 -28
- package/dist/constants.js.map +1 -1
- package/dist/defuddle.d.ts +1 -14
- package/dist/defuddle.js +23 -907
- package/dist/defuddle.js.map +1 -1
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +779 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/elements/math.core.js +59 -0
- package/dist/elements/math.core.js.map +1 -0
- package/dist/elements/math.js +9 -0
- package/dist/elements/math.js.map +1 -0
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/scoring.d.ts +17 -0
- package/dist/scoring.js +216 -0
- package/dist/scoring.js.map +1 -1
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +830 -0
- package/dist/standardize.js.map +1 -0
- package/dist/utils.d.ts +4 -0
- package/dist/utils.js +38 -0
- package/dist/utils.js.map +1 -0
- package/package.json +1 -1
- package/dist/elements/math.full.js +0 -121
- package/dist/elements/math.full.js.map +0 -1
package/dist/defuddle.js
CHANGED
|
@@ -4,113 +4,9 @@ exports.Defuddle = void 0;
|
|
|
4
4
|
const metadata_1 = require("./metadata");
|
|
5
5
|
const extractor_registry_1 = require("./extractor-registry");
|
|
6
6
|
const constants_1 = require("./constants");
|
|
7
|
-
const
|
|
8
|
-
const code_1 = require("./elements/code");
|
|
9
|
-
const footnotes_1 = require("./elements/footnotes");
|
|
10
|
-
const headings_1 = require("./elements/headings");
|
|
7
|
+
const standardize_1 = require("./standardize");
|
|
11
8
|
const scoring_1 = require("./scoring");
|
|
12
|
-
const
|
|
13
|
-
...math_full_1.mathRules,
|
|
14
|
-
...code_1.codeBlockRules,
|
|
15
|
-
...headings_1.headingRules,
|
|
16
|
-
// Convert divs with paragraph role to actual paragraphs
|
|
17
|
-
{
|
|
18
|
-
selector: 'div[data-testid^="paragraph"], div[role="paragraph"]',
|
|
19
|
-
element: 'p',
|
|
20
|
-
transform: (el, doc) => {
|
|
21
|
-
const p = doc.createElement('p');
|
|
22
|
-
// Copy innerHTML
|
|
23
|
-
p.innerHTML = el.innerHTML;
|
|
24
|
-
// Copy allowed attributes
|
|
25
|
-
Array.from(el.attributes).forEach(attr => {
|
|
26
|
-
if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
|
|
27
|
-
p.setAttribute(attr.name, attr.value);
|
|
28
|
-
}
|
|
29
|
-
});
|
|
30
|
-
return p;
|
|
31
|
-
}
|
|
32
|
-
},
|
|
33
|
-
// Convert divs with list roles to actual lists
|
|
34
|
-
{
|
|
35
|
-
selector: 'div[role="list"]',
|
|
36
|
-
element: 'ul',
|
|
37
|
-
// Custom handler for list type detection and transformation
|
|
38
|
-
transform: (el, doc) => {
|
|
39
|
-
// First determine if this is an ordered list
|
|
40
|
-
const firstItem = el.querySelector('div[role="listitem"] .label');
|
|
41
|
-
const label = firstItem?.textContent?.trim() || '';
|
|
42
|
-
const isOrdered = label.match(/^\d+\)/);
|
|
43
|
-
// Create the appropriate list type
|
|
44
|
-
const list = doc.createElement(isOrdered ? 'ol' : 'ul');
|
|
45
|
-
// Process each list item
|
|
46
|
-
const items = el.querySelectorAll('div[role="listitem"]');
|
|
47
|
-
items.forEach(item => {
|
|
48
|
-
const li = doc.createElement('li');
|
|
49
|
-
const content = item.querySelector('.content');
|
|
50
|
-
if (content) {
|
|
51
|
-
// Convert any paragraph divs inside content
|
|
52
|
-
const paragraphDivs = content.querySelectorAll('div[role="paragraph"]');
|
|
53
|
-
paragraphDivs.forEach(div => {
|
|
54
|
-
const p = doc.createElement('p');
|
|
55
|
-
p.innerHTML = div.innerHTML;
|
|
56
|
-
div.replaceWith(p);
|
|
57
|
-
});
|
|
58
|
-
// Convert any nested lists recursively
|
|
59
|
-
const nestedLists = content.querySelectorAll('div[role="list"]');
|
|
60
|
-
nestedLists.forEach(nestedList => {
|
|
61
|
-
const firstNestedItem = nestedList.querySelector('div[role="listitem"] .label');
|
|
62
|
-
const nestedLabel = firstNestedItem?.textContent?.trim() || '';
|
|
63
|
-
const isNestedOrdered = nestedLabel.match(/^\d+\)/);
|
|
64
|
-
const newNestedList = doc.createElement(isNestedOrdered ? 'ol' : 'ul');
|
|
65
|
-
// Process nested items
|
|
66
|
-
const nestedItems = nestedList.querySelectorAll('div[role="listitem"]');
|
|
67
|
-
nestedItems.forEach(nestedItem => {
|
|
68
|
-
const nestedLi = doc.createElement('li');
|
|
69
|
-
const nestedContent = nestedItem.querySelector('.content');
|
|
70
|
-
if (nestedContent) {
|
|
71
|
-
// Convert paragraph divs in nested items
|
|
72
|
-
const nestedParagraphs = nestedContent.querySelectorAll('div[role="paragraph"]');
|
|
73
|
-
nestedParagraphs.forEach(div => {
|
|
74
|
-
const p = doc.createElement('p');
|
|
75
|
-
p.innerHTML = div.innerHTML;
|
|
76
|
-
div.replaceWith(p);
|
|
77
|
-
});
|
|
78
|
-
nestedLi.innerHTML = nestedContent.innerHTML;
|
|
79
|
-
}
|
|
80
|
-
newNestedList.appendChild(nestedLi);
|
|
81
|
-
});
|
|
82
|
-
nestedList.replaceWith(newNestedList);
|
|
83
|
-
});
|
|
84
|
-
li.innerHTML = content.innerHTML;
|
|
85
|
-
}
|
|
86
|
-
list.appendChild(li);
|
|
87
|
-
});
|
|
88
|
-
return list;
|
|
89
|
-
}
|
|
90
|
-
},
|
|
91
|
-
{
|
|
92
|
-
selector: 'div[role="listitem"]',
|
|
93
|
-
element: 'li',
|
|
94
|
-
// Custom handler for list item content
|
|
95
|
-
transform: (el, doc) => {
|
|
96
|
-
const content = el.querySelector('.content');
|
|
97
|
-
if (!content)
|
|
98
|
-
return el;
|
|
99
|
-
// Convert any paragraph divs inside content
|
|
100
|
-
const paragraphDivs = content.querySelectorAll('div[role="paragraph"]');
|
|
101
|
-
paragraphDivs.forEach(div => {
|
|
102
|
-
const p = doc.createElement('p');
|
|
103
|
-
p.innerHTML = div.innerHTML;
|
|
104
|
-
div.replaceWith(p);
|
|
105
|
-
});
|
|
106
|
-
return content;
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
];
|
|
110
|
-
// Type guard
|
|
111
|
-
function isElement(node) {
|
|
112
|
-
return node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE;
|
|
113
|
-
}
|
|
9
|
+
const utils_1 = require("./utils");
|
|
114
10
|
class Defuddle {
|
|
115
11
|
/**
|
|
116
12
|
* Create a new Defuddle instance
|
|
@@ -131,7 +27,7 @@ class Defuddle {
|
|
|
131
27
|
const schemaOrgData = metadata_1.MetadataExtractor.extractSchemaOrgData(this.doc);
|
|
132
28
|
const metadata = metadata_1.MetadataExtractor.extract(this.doc, schemaOrgData);
|
|
133
29
|
try {
|
|
134
|
-
//
|
|
30
|
+
// Use site-specific extractor first, if there is one
|
|
135
31
|
const url = this.options.url || this.doc.URL;
|
|
136
32
|
const extractor = extractor_registry_1.ExtractorRegistry.findExtractor(this.doc, url, schemaOrgData);
|
|
137
33
|
if (extractor && extractor.canExtract()) {
|
|
@@ -153,13 +49,14 @@ class Defuddle {
|
|
|
153
49
|
extractorType: extractor.constructor.name.replace('Extractor', '').toLowerCase()
|
|
154
50
|
};
|
|
155
51
|
}
|
|
156
|
-
//
|
|
52
|
+
// Continue if there is no extractor...
|
|
53
|
+
// Evaluate mobile styles and sizes on original document
|
|
157
54
|
const mobileStyles = this._evaluateMediaQueries(this.doc);
|
|
158
|
-
//
|
|
55
|
+
// Find small images in original document, excluding lazy-loaded ones
|
|
159
56
|
const smallImages = this.findSmallImages(this.doc);
|
|
160
57
|
// Clone document
|
|
161
58
|
const clone = this.doc.cloneNode(true);
|
|
162
|
-
// Apply mobile
|
|
59
|
+
// Apply mobile styles to clone
|
|
163
60
|
this.applyMobileStyles(clone, mobileStyles);
|
|
164
61
|
// Find main content
|
|
165
62
|
const mainContent = this.findMainContent(clone);
|
|
@@ -172,14 +69,18 @@ class Defuddle {
|
|
|
172
69
|
parseTime: Math.round(endTime - startTime)
|
|
173
70
|
};
|
|
174
71
|
}
|
|
175
|
-
// Remove small images
|
|
72
|
+
// Remove small images
|
|
176
73
|
this.removeSmallImages(clone, smallImages);
|
|
177
|
-
//
|
|
74
|
+
// Remove hidden elements using computed styles
|
|
178
75
|
this.removeHiddenElements(clone);
|
|
76
|
+
// Remove non-content blocks by scoring
|
|
77
|
+
// Tries to find lists, navigation based on text content and link density
|
|
78
|
+
scoring_1.ContentScorer.scoreAndRemove(clone, this.debug);
|
|
79
|
+
// Remove clutter using selectors
|
|
179
80
|
this.removeClutter(clone);
|
|
180
|
-
//
|
|
181
|
-
|
|
182
|
-
const content = mainContent
|
|
81
|
+
// Normalize the main content
|
|
82
|
+
(0, standardize_1.standardizeContent)(mainContent, metadata, this.doc, this.debug);
|
|
83
|
+
const content = mainContent.outerHTML;
|
|
183
84
|
const endTime = Date.now();
|
|
184
85
|
return {
|
|
185
86
|
content,
|
|
@@ -301,35 +202,10 @@ class Defuddle {
|
|
|
301
202
|
}
|
|
302
203
|
});
|
|
303
204
|
}
|
|
304
|
-
getWindow(doc) {
|
|
305
|
-
// First try defaultView
|
|
306
|
-
if (doc.defaultView) {
|
|
307
|
-
return doc.defaultView;
|
|
308
|
-
}
|
|
309
|
-
// Then try ownerWindow
|
|
310
|
-
if (doc.ownerWindow) {
|
|
311
|
-
return doc.ownerWindow;
|
|
312
|
-
}
|
|
313
|
-
// Finally try to get window from document
|
|
314
|
-
if (doc.window) {
|
|
315
|
-
return doc.window;
|
|
316
|
-
}
|
|
317
|
-
return null;
|
|
318
|
-
}
|
|
319
|
-
getComputedStyle(element) {
|
|
320
|
-
const win = this.getWindow(element.ownerDocument);
|
|
321
|
-
if (!win)
|
|
322
|
-
return null;
|
|
323
|
-
return win.getComputedStyle(element);
|
|
324
|
-
}
|
|
325
205
|
removeHiddenElements(doc) {
|
|
326
206
|
let count = 0;
|
|
327
207
|
const elementsToRemove = new Set();
|
|
328
|
-
//
|
|
329
|
-
const hiddenElements = doc.querySelectorAll(constants_1.HIDDEN_ELEMENT_SELECTORS);
|
|
330
|
-
hiddenElements.forEach(el => elementsToRemove.add(el));
|
|
331
|
-
count += hiddenElements.length;
|
|
332
|
-
// Second pass: Get all elements and check their styles
|
|
208
|
+
// Get all elements and check their styles
|
|
333
209
|
const allElements = Array.from(doc.getElementsByTagName('*'));
|
|
334
210
|
// Process styles in batches to minimize layout thrashing
|
|
335
211
|
const BATCH_SIZE = 100;
|
|
@@ -365,8 +241,7 @@ class Defuddle {
|
|
|
365
241
|
}
|
|
366
242
|
});
|
|
367
243
|
}
|
|
368
|
-
//
|
|
369
|
-
elementsToRemove.forEach(el => el.remove());
|
|
244
|
+
// Batch remove all hidden elements
|
|
370
245
|
this._log('Removed hidden elements:', count);
|
|
371
246
|
}
|
|
372
247
|
removeClutter(doc) {
|
|
@@ -425,754 +300,6 @@ class Defuddle {
|
|
|
425
300
|
processingTime: `${(endTime - startTime).toFixed(2)}ms`
|
|
426
301
|
});
|
|
427
302
|
}
|
|
428
|
-
flattenDivs(element) {
|
|
429
|
-
let processedCount = 0;
|
|
430
|
-
const startTime = Date.now();
|
|
431
|
-
// Process in batches to maintain performance
|
|
432
|
-
let keepProcessing = true;
|
|
433
|
-
// Helper function to check if an element directly contains inline content
|
|
434
|
-
// This helps prevent unwrapping divs that visually act as paragraphs.
|
|
435
|
-
function hasDirectInlineContent(el) {
|
|
436
|
-
for (const child of el.childNodes) {
|
|
437
|
-
// Check for non-empty text nodes
|
|
438
|
-
if (child.nodeType === constants_1.NODE_TYPE.TEXT_NODE && child.textContent?.trim()) {
|
|
439
|
-
return true;
|
|
440
|
-
}
|
|
441
|
-
// Check for element nodes that are considered inline
|
|
442
|
-
if (child.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())) {
|
|
443
|
-
return true;
|
|
444
|
-
}
|
|
445
|
-
}
|
|
446
|
-
return false;
|
|
447
|
-
}
|
|
448
|
-
const shouldPreserveElement = (el) => {
|
|
449
|
-
const tagName = el.tagName.toLowerCase();
|
|
450
|
-
// Check if element should be preserved
|
|
451
|
-
if (constants_1.PRESERVE_ELEMENTS.has(tagName))
|
|
452
|
-
return true;
|
|
453
|
-
// Check for semantic roles
|
|
454
|
-
const role = el.getAttribute('role');
|
|
455
|
-
if (role && ['article', 'main', 'navigation', 'banner', 'contentinfo'].includes(role)) {
|
|
456
|
-
return true;
|
|
457
|
-
}
|
|
458
|
-
// Check for semantic classes
|
|
459
|
-
const className = el.className;
|
|
460
|
-
if (typeof className === 'string' && className.toLowerCase().match(/(?:article|main|content|footnote|reference|bibliography)/)) {
|
|
461
|
-
return true;
|
|
462
|
-
}
|
|
463
|
-
// Check if div contains mixed content types that should be preserved
|
|
464
|
-
if (tagName === 'div') {
|
|
465
|
-
const children = Array.from(el.children);
|
|
466
|
-
const hasPreservedElements = children.some(child => constants_1.PRESERVE_ELEMENTS.has(child.tagName.toLowerCase()) ||
|
|
467
|
-
child.getAttribute('role') === 'article' ||
|
|
468
|
-
(child.className && typeof child.className === 'string' &&
|
|
469
|
-
child.className.toLowerCase().match(/(?:article|main|content|footnote|reference|bibliography)/)));
|
|
470
|
-
if (hasPreservedElements)
|
|
471
|
-
return true;
|
|
472
|
-
}
|
|
473
|
-
return false;
|
|
474
|
-
};
|
|
475
|
-
const isWrapperDiv = (div) => {
|
|
476
|
-
// If it directly contains inline content, it's NOT a wrapper
|
|
477
|
-
if (hasDirectInlineContent(div)) {
|
|
478
|
-
return false;
|
|
479
|
-
}
|
|
480
|
-
// Check if it's just empty space
|
|
481
|
-
if (!div.textContent?.trim())
|
|
482
|
-
return true;
|
|
483
|
-
// Check if it only contains other divs or block elements
|
|
484
|
-
const children = Array.from(div.children);
|
|
485
|
-
if (children.length === 0)
|
|
486
|
-
return true;
|
|
487
|
-
// Check if all children are block elements
|
|
488
|
-
const allBlockElements = children.every(child => {
|
|
489
|
-
const tag = child.tagName.toLowerCase();
|
|
490
|
-
return tag === 'div' || tag === 'p' || tag === 'h1' || tag === 'h2' ||
|
|
491
|
-
tag === 'h3' || tag === 'h4' || tag === 'h5' || tag === 'h6' ||
|
|
492
|
-
tag === 'ul' || tag === 'ol' || tag === 'pre' || tag === 'blockquote' ||
|
|
493
|
-
tag === 'figure';
|
|
494
|
-
});
|
|
495
|
-
if (allBlockElements)
|
|
496
|
-
return true;
|
|
497
|
-
// Check for common wrapper patterns
|
|
498
|
-
const className = div.className.toLowerCase();
|
|
499
|
-
const isWrapper = /(?:wrapper|container|layout|row|col|grid|flex|outer|inner|content-area)/i.test(className);
|
|
500
|
-
if (isWrapper)
|
|
501
|
-
return true;
|
|
502
|
-
// Check if it has excessive whitespace or empty text nodes
|
|
503
|
-
const textNodes = Array.from(div.childNodes).filter(node => node.nodeType === constants_1.NODE_TYPE.TEXT_NODE && node.textContent?.trim() // TEXT_NODE
|
|
504
|
-
);
|
|
505
|
-
if (textNodes.length === 0)
|
|
506
|
-
return true;
|
|
507
|
-
// Check if it's a div that only contains block elements
|
|
508
|
-
const hasOnlyBlockElements = children.length > 0 && !children.some(child => {
|
|
509
|
-
const tag = child.tagName.toLowerCase();
|
|
510
|
-
return constants_1.INLINE_ELEMENTS.has(tag);
|
|
511
|
-
});
|
|
512
|
-
if (hasOnlyBlockElements)
|
|
513
|
-
return true;
|
|
514
|
-
return false;
|
|
515
|
-
};
|
|
516
|
-
// Function to process a single div
|
|
517
|
-
const processDiv = (div) => {
|
|
518
|
-
// Skip processing if div has been removed or should be preserved
|
|
519
|
-
if (!div.isConnected || shouldPreserveElement(div))
|
|
520
|
-
return false;
|
|
521
|
-
// Case 1: Empty div or div with only whitespace
|
|
522
|
-
if (!div.hasChildNodes() || !div.textContent?.trim()) {
|
|
523
|
-
div.remove();
|
|
524
|
-
processedCount++;
|
|
525
|
-
return true;
|
|
526
|
-
}
|
|
527
|
-
// Case 2: Top-level div - be more aggressive
|
|
528
|
-
if (div.parentElement === element) {
|
|
529
|
-
const children = Array.from(div.children);
|
|
530
|
-
const hasOnlyBlockElements = children.length > 0 && !children.some(child => {
|
|
531
|
-
const tag = child.tagName.toLowerCase();
|
|
532
|
-
return constants_1.INLINE_ELEMENTS.has(tag);
|
|
533
|
-
});
|
|
534
|
-
if (hasOnlyBlockElements) {
|
|
535
|
-
const fragment = this.doc.createDocumentFragment();
|
|
536
|
-
while (div.firstChild) {
|
|
537
|
-
fragment.appendChild(div.firstChild);
|
|
538
|
-
}
|
|
539
|
-
div.replaceWith(fragment);
|
|
540
|
-
processedCount++;
|
|
541
|
-
return true;
|
|
542
|
-
}
|
|
543
|
-
}
|
|
544
|
-
// Case 3: Wrapper div - merge up aggressively
|
|
545
|
-
if (isWrapperDiv(div)) {
|
|
546
|
-
// Special case: if div only contains block elements, merge them up
|
|
547
|
-
const children = Array.from(div.children);
|
|
548
|
-
const onlyBlockElements = !children.some(child => {
|
|
549
|
-
const tag = child.tagName.toLowerCase();
|
|
550
|
-
return constants_1.INLINE_ELEMENTS.has(tag);
|
|
551
|
-
});
|
|
552
|
-
if (onlyBlockElements) {
|
|
553
|
-
const fragment = this.doc.createDocumentFragment();
|
|
554
|
-
while (div.firstChild) {
|
|
555
|
-
fragment.appendChild(div.firstChild);
|
|
556
|
-
}
|
|
557
|
-
div.replaceWith(fragment);
|
|
558
|
-
processedCount++;
|
|
559
|
-
return true;
|
|
560
|
-
}
|
|
561
|
-
// Otherwise handle as normal wrapper
|
|
562
|
-
const fragment = this.doc.createDocumentFragment();
|
|
563
|
-
while (div.firstChild) {
|
|
564
|
-
fragment.appendChild(div.firstChild);
|
|
565
|
-
}
|
|
566
|
-
div.replaceWith(fragment);
|
|
567
|
-
processedCount++;
|
|
568
|
-
return true;
|
|
569
|
-
}
|
|
570
|
-
// Case 4: Div only contains text and/or inline elements - convert to paragraph
|
|
571
|
-
const childNodes = Array.from(div.childNodes);
|
|
572
|
-
const hasOnlyInlineOrText = childNodes.length > 0 && childNodes.every(child => (child.nodeType === constants_1.NODE_TYPE.TEXT_NODE) ||
|
|
573
|
-
(child.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())));
|
|
574
|
-
if (hasOnlyInlineOrText && div.textContent?.trim()) { // Ensure there's actual content
|
|
575
|
-
const p = this.doc.createElement('p');
|
|
576
|
-
// Move all children (including inline tags like <font>) to the new <p>
|
|
577
|
-
while (div.firstChild) {
|
|
578
|
-
p.appendChild(div.firstChild);
|
|
579
|
-
}
|
|
580
|
-
div.replaceWith(p);
|
|
581
|
-
processedCount++;
|
|
582
|
-
return true;
|
|
583
|
-
}
|
|
584
|
-
// Case 5: Div has single child - unwrap only if child is block-level
|
|
585
|
-
if (div.children.length === 1) {
|
|
586
|
-
const child = div.firstElementChild;
|
|
587
|
-
const childTag = child.tagName.toLowerCase();
|
|
588
|
-
// Only unwrap if the single child is a block element and not preserved
|
|
589
|
-
if (constants_1.BLOCK_ELEMENTS.includes(childTag) && !shouldPreserveElement(child)) {
|
|
590
|
-
div.replaceWith(child);
|
|
591
|
-
processedCount++;
|
|
592
|
-
return true;
|
|
593
|
-
}
|
|
594
|
-
}
|
|
595
|
-
// Case 6: Deeply nested div - merge up
|
|
596
|
-
let nestingDepth = 0;
|
|
597
|
-
let parent = div.parentElement;
|
|
598
|
-
while (parent) {
|
|
599
|
-
if (parent.tagName.toLowerCase() === 'div') {
|
|
600
|
-
nestingDepth++;
|
|
601
|
-
}
|
|
602
|
-
parent = parent.parentElement;
|
|
603
|
-
}
|
|
604
|
-
// Only unwrap if nested AND does not contain direct inline content
|
|
605
|
-
if (nestingDepth > 0 && !hasDirectInlineContent(div)) {
|
|
606
|
-
const fragment = this.doc.createDocumentFragment();
|
|
607
|
-
while (div.firstChild) {
|
|
608
|
-
fragment.appendChild(div.firstChild);
|
|
609
|
-
}
|
|
610
|
-
div.replaceWith(fragment);
|
|
611
|
-
processedCount++;
|
|
612
|
-
return true;
|
|
613
|
-
}
|
|
614
|
-
return false;
|
|
615
|
-
};
|
|
616
|
-
// First pass: Process top-level divs
|
|
617
|
-
const processTopLevelDivs = () => {
|
|
618
|
-
const topDivs = Array.from(element.children).filter(el => el.tagName.toLowerCase() === 'div');
|
|
619
|
-
let modified = false;
|
|
620
|
-
topDivs.forEach(div => {
|
|
621
|
-
if (processDiv(div)) {
|
|
622
|
-
modified = true;
|
|
623
|
-
}
|
|
624
|
-
});
|
|
625
|
-
return modified;
|
|
626
|
-
};
|
|
627
|
-
// Second pass: Process remaining divs from deepest to shallowest
|
|
628
|
-
const processRemainingDivs = () => {
|
|
629
|
-
const allDivs = Array.from(element.getElementsByTagName('div'))
|
|
630
|
-
.sort((a, b) => {
|
|
631
|
-
// Count nesting depth
|
|
632
|
-
const getDepth = (el) => {
|
|
633
|
-
let depth = 0;
|
|
634
|
-
let parent = el.parentElement;
|
|
635
|
-
while (parent) {
|
|
636
|
-
if (parent.tagName.toLowerCase() === 'div')
|
|
637
|
-
depth++;
|
|
638
|
-
parent = parent.parentElement;
|
|
639
|
-
}
|
|
640
|
-
return depth;
|
|
641
|
-
};
|
|
642
|
-
return getDepth(b) - getDepth(a); // Process deepest first
|
|
643
|
-
});
|
|
644
|
-
let modified = false;
|
|
645
|
-
allDivs.forEach(div => {
|
|
646
|
-
if (processDiv(div)) {
|
|
647
|
-
modified = true;
|
|
648
|
-
}
|
|
649
|
-
});
|
|
650
|
-
return modified;
|
|
651
|
-
};
|
|
652
|
-
// Final cleanup pass - aggressively flatten remaining divs
|
|
653
|
-
const finalCleanup = () => {
|
|
654
|
-
const remainingDivs = Array.from(element.getElementsByTagName('div'));
|
|
655
|
-
let modified = false;
|
|
656
|
-
remainingDivs.forEach(div => {
|
|
657
|
-
// Only perform final cleanup/unwrap if the div is still connected,
|
|
658
|
-
// not preserved, and does not contain direct inline content.
|
|
659
|
-
if (div.isConnected && !shouldPreserveElement(div) && !hasDirectInlineContent(div)) {
|
|
660
|
-
const children = Array.from(div.children);
|
|
661
|
-
const onlyParagraphs = children.length > 0 && children.every(child => child.tagName.toLowerCase() === 'p');
|
|
662
|
-
// Unwrap if it only contains paragraphs OR is identified as a wrapper
|
|
663
|
-
if (onlyParagraphs || isWrapperDiv(div)) {
|
|
664
|
-
const fragment = this.doc.createDocumentFragment();
|
|
665
|
-
while (div.firstChild) {
|
|
666
|
-
fragment.appendChild(div.firstChild);
|
|
667
|
-
}
|
|
668
|
-
div.replaceWith(fragment);
|
|
669
|
-
processedCount++;
|
|
670
|
-
modified = true;
|
|
671
|
-
}
|
|
672
|
-
}
|
|
673
|
-
});
|
|
674
|
-
return modified;
|
|
675
|
-
};
|
|
676
|
-
// Execute all passes until no more changes
|
|
677
|
-
do {
|
|
678
|
-
keepProcessing = false;
|
|
679
|
-
if (processTopLevelDivs())
|
|
680
|
-
keepProcessing = true;
|
|
681
|
-
if (processRemainingDivs())
|
|
682
|
-
keepProcessing = true;
|
|
683
|
-
if (finalCleanup())
|
|
684
|
-
keepProcessing = true;
|
|
685
|
-
} while (keepProcessing);
|
|
686
|
-
const endTime = Date.now();
|
|
687
|
-
this._log('Flattened divs:', {
|
|
688
|
-
count: processedCount,
|
|
689
|
-
processingTime: `${(endTime - startTime).toFixed(2)}ms`
|
|
690
|
-
});
|
|
691
|
-
}
|
|
692
|
-
cleanContent(element, metadata) {
|
|
693
|
-
this.standardizeSpaces(element);
|
|
694
|
-
// Remove HTML comments
|
|
695
|
-
this.removeHtmlComments(element);
|
|
696
|
-
// Handle H1 elements - remove first one and convert others to H2
|
|
697
|
-
this.standardizeHeadings(element, metadata.title);
|
|
698
|
-
// Standardize footnotes and citations
|
|
699
|
-
(0, footnotes_1.standardizeFootnotes)(element);
|
|
700
|
-
// Handle lazy-loaded images
|
|
701
|
-
this.handleLazyImages(element);
|
|
702
|
-
// Convert embedded content to standard formats
|
|
703
|
-
this.standardizeElements(element);
|
|
704
|
-
// If not debug mode, do the full cleanup
|
|
705
|
-
if (!this.debug) {
|
|
706
|
-
// First pass of div flattening
|
|
707
|
-
this.flattenDivs(element);
|
|
708
|
-
// Strip unwanted attributes
|
|
709
|
-
this.stripUnwantedAttributes(element);
|
|
710
|
-
// Remove empty elements
|
|
711
|
-
this.removeEmptyElements(element);
|
|
712
|
-
// Remove trailing headings
|
|
713
|
-
this.removeTrailingHeadings(element);
|
|
714
|
-
// Final pass of div flattening after cleanup operations
|
|
715
|
-
this.flattenDivs(element);
|
|
716
|
-
// Standardize consecutive br elements
|
|
717
|
-
this.stripExtraBrElements(element);
|
|
718
|
-
// Clean up empty lines
|
|
719
|
-
this.removeEmptyLines(element);
|
|
720
|
-
}
|
|
721
|
-
else {
|
|
722
|
-
// In debug mode, still do basic cleanup but preserve structure
|
|
723
|
-
this.stripUnwantedAttributes(element);
|
|
724
|
-
this.removeEmptyElements(element);
|
|
725
|
-
this.removeTrailingHeadings(element);
|
|
726
|
-
this.stripExtraBrElements(element);
|
|
727
|
-
this._log('Debug mode: Skipping div flattening to preserve structure');
|
|
728
|
-
}
|
|
729
|
-
}
|
|
730
|
-
standardizeSpaces(element) {
|
|
731
|
-
const processNode = (node) => {
|
|
732
|
-
// Skip pre and code elements
|
|
733
|
-
if (node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) {
|
|
734
|
-
const tag = node.tagName.toLowerCase();
|
|
735
|
-
if (tag === 'pre' || tag === 'code') {
|
|
736
|
-
return;
|
|
737
|
-
}
|
|
738
|
-
}
|
|
739
|
-
// Process text nodes
|
|
740
|
-
if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) {
|
|
741
|
-
const text = node.textContent || '';
|
|
742
|
-
// Replace &nbsp; with regular spaces, except when it's a single &nbsp; between words
|
|
743
|
-
const newText = text.replace(/\xA0+/g, (match) => {
|
|
744
|
-
// If it's a single &nbsp; between word characters, preserve it
|
|
745
|
-
if (match.length === 1) {
|
|
746
|
-
const prev = node.previousSibling?.textContent?.slice(-1);
|
|
747
|
-
const next = node.nextSibling?.textContent?.charAt(0);
|
|
748
|
-
if (prev?.match(/\w/) && next?.match(/\w/)) {
|
|
749
|
-
return '\xA0';
|
|
750
|
-
}
|
|
751
|
-
}
|
|
752
|
-
return ' '.repeat(match.length);
|
|
753
|
-
});
|
|
754
|
-
if (newText !== text) {
|
|
755
|
-
node.textContent = newText;
|
|
756
|
-
}
|
|
757
|
-
}
|
|
758
|
-
// Process children recursively
|
|
759
|
-
if (node.hasChildNodes()) {
|
|
760
|
-
Array.from(node.childNodes).forEach(processNode);
|
|
761
|
-
}
|
|
762
|
-
};
|
|
763
|
-
processNode(element);
|
|
764
|
-
}
|
|
765
|
-
removeTrailingHeadings(element) {
|
|
766
|
-
let removedCount = 0;
|
|
767
|
-
const hasContentAfter = (el) => {
|
|
768
|
-
// Check if there's any meaningful content after this element
|
|
769
|
-
let nextContent = '';
|
|
770
|
-
let sibling = el.nextSibling;
|
|
771
|
-
// First check direct siblings
|
|
772
|
-
while (sibling) {
|
|
773
|
-
if (sibling.nodeType === constants_1.NODE_TYPE.TEXT_NODE) { // TEXT_NODE
|
|
774
|
-
nextContent += sibling.textContent || '';
|
|
775
|
-
}
|
|
776
|
-
else if (sibling.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) { // ELEMENT_NODE
|
|
777
|
-
// If we find an element sibling, check its content
|
|
778
|
-
nextContent += sibling.textContent || '';
|
|
779
|
-
}
|
|
780
|
-
sibling = sibling.nextSibling;
|
|
781
|
-
}
|
|
782
|
-
// If we found meaningful content at this level, return true
|
|
783
|
-
if (nextContent.trim()) {
|
|
784
|
-
return true;
|
|
785
|
-
}
|
|
786
|
-
// If no content found at this level and we have a parent,
|
|
787
|
-
// check for content after the parent
|
|
788
|
-
const parent = el.parentElement;
|
|
789
|
-
if (parent && parent !== element) {
|
|
790
|
-
return hasContentAfter(parent);
|
|
791
|
-
}
|
|
792
|
-
return false;
|
|
793
|
-
};
|
|
794
|
-
// Process all headings from bottom to top
|
|
795
|
-
const headings = Array.from(element.querySelectorAll('h1, h2, h3, h4, h5, h6'))
|
|
796
|
-
.reverse();
|
|
797
|
-
headings.forEach(heading => {
|
|
798
|
-
if (!hasContentAfter(heading)) {
|
|
799
|
-
heading.remove();
|
|
800
|
-
removedCount++;
|
|
801
|
-
}
|
|
802
|
-
else {
|
|
803
|
-
// Stop processing once we find a heading with content after it
|
|
804
|
-
return;
|
|
805
|
-
}
|
|
806
|
-
});
|
|
807
|
-
if (removedCount > 0) {
|
|
808
|
-
this._log('Removed trailing headings:', removedCount);
|
|
809
|
-
}
|
|
810
|
-
}
|
|
811
|
-
standardizeHeadings(element, title) {
|
|
812
|
-
const normalizeText = (text) => {
|
|
813
|
-
return text
|
|
814
|
-
.replace(/\u00A0/g, ' ') // Convert non-breaking spaces to regular spaces
|
|
815
|
-
.replace(/\s+/g, ' ') // Normalize all whitespace to single spaces
|
|
816
|
-
.trim()
|
|
817
|
-
.toLowerCase();
|
|
818
|
-
};
|
|
819
|
-
const h1s = element.getElementsByTagName('h1');
|
|
820
|
-
Array.from(h1s).forEach(h1 => {
|
|
821
|
-
const h2 = this.doc.createElement('h2');
|
|
822
|
-
h2.innerHTML = h1.innerHTML;
|
|
823
|
-
// Copy allowed attributes
|
|
824
|
-
Array.from(h1.attributes).forEach(attr => {
|
|
825
|
-
if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
|
|
826
|
-
h2.setAttribute(attr.name, attr.value);
|
|
827
|
-
}
|
|
828
|
-
});
|
|
829
|
-
h1.parentNode?.replaceChild(h2, h1);
|
|
830
|
-
});
|
|
831
|
-
// Remove first H2 if it matches title
|
|
832
|
-
const h2s = element.getElementsByTagName('h2');
|
|
833
|
-
if (h2s.length > 0) {
|
|
834
|
-
const firstH2 = h2s[0];
|
|
835
|
-
const firstH2Text = normalizeText(firstH2.textContent || '');
|
|
836
|
-
const normalizedTitle = normalizeText(title);
|
|
837
|
-
if (normalizedTitle && normalizedTitle === firstH2Text) {
|
|
838
|
-
firstH2.remove();
|
|
839
|
-
}
|
|
840
|
-
}
|
|
841
|
-
}
|
|
842
|
-
removeHtmlComments(element) {
|
|
843
|
-
let removedCount = 0;
|
|
844
|
-
// Get all elements and check their child nodes
|
|
845
|
-
const allElements = Array.from(element.getElementsByTagName('*'));
|
|
846
|
-
// Process each element's child nodes
|
|
847
|
-
allElements.forEach(el => {
|
|
848
|
-
const childNodes = Array.from(el.childNodes);
|
|
849
|
-
childNodes.forEach(node => {
|
|
850
|
-
if (node.nodeType === 8) { // 8 is the node type for comments
|
|
851
|
-
node.remove();
|
|
852
|
-
removedCount++;
|
|
853
|
-
}
|
|
854
|
-
});
|
|
855
|
-
});
|
|
856
|
-
this._log('Removed HTML comments:', removedCount);
|
|
857
|
-
}
|
|
858
|
-
stripUnwantedAttributes(element) {
|
|
859
|
-
let attributeCount = 0;
|
|
860
|
-
const processElement = (el) => {
|
|
861
|
-
// Skip SVG elements - preserve all their attributes
|
|
862
|
-
if (el.tagName.toLowerCase() === 'svg' || el.namespaceURI === 'http://www.w3.org/2000/svg') {
|
|
863
|
-
return;
|
|
864
|
-
}
|
|
865
|
-
const attributes = Array.from(el.attributes);
|
|
866
|
-
const tag = el.tagName.toLowerCase();
|
|
867
|
-
attributes.forEach(attr => {
|
|
868
|
-
const attrName = attr.name.toLowerCase();
|
|
869
|
-
const attrValue = attr.value;
|
|
870
|
-
// Special cases for preserving specific attributes
|
|
871
|
-
if (
|
|
872
|
-
// Preserve footnote IDs
|
|
873
|
-
(attrName === 'id' && (attrValue.startsWith('fnref:') || // Footnote reference
|
|
874
|
-
attrValue.startsWith('fn:') || // Footnote content
|
|
875
|
-
attrValue === 'footnotes' // Footnotes container
|
|
876
|
-
)) ||
|
|
877
|
-
// Preserve code block language classes and footnote backref class
|
|
878
|
-
(attrName === 'class' && ((tag === 'code' && attrValue.startsWith('language-')) ||
|
|
879
|
-
attrValue === 'footnote-backref'))) {
|
|
880
|
-
return;
|
|
881
|
-
}
|
|
882
|
-
// In debug mode, allow debug attributes and data- attributes
|
|
883
|
-
if (this.debug) {
|
|
884
|
-
if (!constants_1.ALLOWED_ATTRIBUTES.has(attrName) &&
|
|
885
|
-
!constants_1.ALLOWED_ATTRIBUTES_DEBUG.has(attrName) &&
|
|
886
|
-
!attrName.startsWith('data-')) {
|
|
887
|
-
el.removeAttribute(attr.name);
|
|
888
|
-
attributeCount++;
|
|
889
|
-
}
|
|
890
|
-
}
|
|
891
|
-
else {
|
|
892
|
-
// In normal mode, only allow standard attributes
|
|
893
|
-
if (!constants_1.ALLOWED_ATTRIBUTES.has(attrName)) {
|
|
894
|
-
el.removeAttribute(attr.name);
|
|
895
|
-
attributeCount++;
|
|
896
|
-
}
|
|
897
|
-
}
|
|
898
|
-
});
|
|
899
|
-
};
|
|
900
|
-
processElement(element);
|
|
901
|
-
element.querySelectorAll('*').forEach(processElement);
|
|
902
|
-
this._log('Stripped attributes:', attributeCount);
|
|
903
|
-
}
|
|
904
|
-
removeEmptyElements(element) {
|
|
905
|
-
let removedCount = 0;
|
|
906
|
-
let iterations = 0;
|
|
907
|
-
let keepRemoving = true;
|
|
908
|
-
while (keepRemoving) {
|
|
909
|
-
iterations++;
|
|
910
|
-
keepRemoving = false;
|
|
911
|
-
// Get all elements without children, working from deepest first
|
|
912
|
-
const emptyElements = Array.from(element.getElementsByTagName('*')).filter(el => {
|
|
913
|
-
if (constants_1.ALLOWED_EMPTY_ELEMENTS.has(el.tagName.toLowerCase())) {
|
|
914
|
-
return false;
|
|
915
|
-
}
|
|
916
|
-
// Check if element has only whitespace or
|
|
917
|
-
const textContent = el.textContent || '';
|
|
918
|
-
const hasOnlyWhitespace = textContent.trim().length === 0;
|
|
919
|
-
const hasNbsp = textContent.includes('\u00A0'); // Unicode non-breaking space
|
|
920
|
-
// Check if element has no meaningful children
|
|
921
|
-
const hasNoChildren = !el.hasChildNodes() ||
|
|
922
|
-
(Array.from(el.childNodes).every(node => {
|
|
923
|
-
if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) { // TEXT_NODE
|
|
924
|
-
const nodeText = node.textContent || '';
|
|
925
|
-
return nodeText.trim().length === 0 && !nodeText.includes('\u00A0');
|
|
926
|
-
}
|
|
927
|
-
return false;
|
|
928
|
-
}));
|
|
929
|
-
// Special case: Check for divs that only contain spans with commas
|
|
930
|
-
if (el.tagName.toLowerCase() === 'div') {
|
|
931
|
-
const children = Array.from(el.children);
|
|
932
|
-
const hasOnlyCommaSpans = children.length > 0 && children.every(child => {
|
|
933
|
-
if (child.tagName.toLowerCase() !== 'span')
|
|
934
|
-
return false;
|
|
935
|
-
const content = child.textContent?.trim() || '';
|
|
936
|
-
return content === ',' || content === '' || content === ' ';
|
|
937
|
-
});
|
|
938
|
-
if (hasOnlyCommaSpans)
|
|
939
|
-
return true;
|
|
940
|
-
}
|
|
941
|
-
return hasOnlyWhitespace && !hasNbsp && hasNoChildren;
|
|
942
|
-
});
|
|
943
|
-
if (emptyElements.length > 0) {
|
|
944
|
-
emptyElements.forEach(el => {
|
|
945
|
-
el.remove();
|
|
946
|
-
removedCount++;
|
|
947
|
-
});
|
|
948
|
-
keepRemoving = true;
|
|
949
|
-
}
|
|
950
|
-
}
|
|
951
|
-
this._log('Removed empty elements:', removedCount, 'iterations:', iterations);
|
|
952
|
-
}
|
|
953
|
-
stripExtraBrElements(element) {
|
|
954
|
-
let processedCount = 0;
|
|
955
|
-
const startTime = Date.now();
|
|
956
|
-
// Get all br elements directly
|
|
957
|
-
const brElements = Array.from(element.getElementsByTagName('br'));
|
|
958
|
-
// Keep track of consecutive br elements
|
|
959
|
-
let consecutiveBrs = [];
|
|
960
|
-
// Helper to process collected br elements
|
|
961
|
-
const processBrs = () => {
|
|
962
|
-
if (consecutiveBrs.length > 2) {
|
|
963
|
-
// Keep only two br elements
|
|
964
|
-
for (let i = 2; i < consecutiveBrs.length; i++) {
|
|
965
|
-
consecutiveBrs[i].remove();
|
|
966
|
-
processedCount++;
|
|
967
|
-
}
|
|
968
|
-
}
|
|
969
|
-
consecutiveBrs = [];
|
|
970
|
-
};
|
|
971
|
-
// Process all br elements
|
|
972
|
-
brElements.forEach(currentNode => {
|
|
973
|
-
// Check if this br is consecutive with previous ones
|
|
974
|
-
let isConsecutive = false;
|
|
975
|
-
if (consecutiveBrs.length > 0) {
|
|
976
|
-
const lastBr = consecutiveBrs[consecutiveBrs.length - 1];
|
|
977
|
-
let node = currentNode.previousSibling;
|
|
978
|
-
// Skip whitespace text nodes
|
|
979
|
-
while (node && node.nodeType === constants_1.NODE_TYPE.TEXT_NODE && !node.textContent?.trim()) {
|
|
980
|
-
node = node.previousSibling;
|
|
981
|
-
}
|
|
982
|
-
if (node === lastBr) {
|
|
983
|
-
isConsecutive = true;
|
|
984
|
-
}
|
|
985
|
-
}
|
|
986
|
-
if (isConsecutive) {
|
|
987
|
-
consecutiveBrs.push(currentNode);
|
|
988
|
-
}
|
|
989
|
-
else {
|
|
990
|
-
// Process any previously collected brs before starting new group
|
|
991
|
-
processBrs();
|
|
992
|
-
consecutiveBrs = [currentNode];
|
|
993
|
-
}
|
|
994
|
-
});
|
|
995
|
-
// Process any remaining br elements
|
|
996
|
-
processBrs();
|
|
997
|
-
const endTime = Date.now();
|
|
998
|
-
this._log('Standardized br elements:', {
|
|
999
|
-
removed: processedCount,
|
|
1000
|
-
processingTime: `${(endTime - startTime).toFixed(2)}ms`
|
|
1001
|
-
});
|
|
1002
|
-
}
|
|
1003
|
-
removeEmptyLines(element) {
|
|
1004
|
-
let removedCount = 0;
|
|
1005
|
-
const startTime = Date.now();
|
|
1006
|
-
// First pass: remove empty text nodes
|
|
1007
|
-
const removeEmptyTextNodes = (node) => {
|
|
1008
|
-
// Skip if inside pre or code
|
|
1009
|
-
if (node.nodeType === constants_1.NODE_TYPE.ELEMENT_NODE) {
|
|
1010
|
-
const tag = node.tagName.toLowerCase();
|
|
1011
|
-
if (tag === 'pre' || tag === 'code') {
|
|
1012
|
-
return;
|
|
1013
|
-
}
|
|
1014
|
-
}
|
|
1015
|
-
// Process children first (depth-first)
|
|
1016
|
-
const children = Array.from(node.childNodes);
|
|
1017
|
-
children.forEach(removeEmptyTextNodes);
|
|
1018
|
-
// Then handle this node
|
|
1019
|
-
if (node.nodeType === constants_1.NODE_TYPE.TEXT_NODE) {
|
|
1020
|
-
const text = node.textContent || '';
|
|
1021
|
-
// If it's completely empty or just special characters/whitespace, remove it
|
|
1022
|
-
if (!text || text.match(/^[\u200C\u200B\u200D\u200E\u200F\uFEFF\xA0\s]*$/)) {
|
|
1023
|
-
node.parentNode?.removeChild(node);
|
|
1024
|
-
removedCount++;
|
|
1025
|
-
}
|
|
1026
|
-
else {
|
|
1027
|
-
// Clean up the text content while preserving important spaces
|
|
1028
|
-
const newText = text
|
|
1029
|
-
.replace(/\n{3,}/g, '\n\n') // More than 2 newlines -> 2 newlines
|
|
1030
|
-
.replace(/^[\n\r\t]+/, '') // Remove leading newlines/tabs (preserve spaces)
|
|
1031
|
-
.replace(/[\n\r\t]+$/, '') // Remove trailing newlines/tabs (preserve spaces)
|
|
1032
|
-
.replace(/[ \t]*\n[ \t]*/g, '\n') // Remove spaces around newlines
|
|
1033
|
-
.replace(/[ \t]{3,}/g, ' ') // 3+ spaces -> 1 space
|
|
1034
|
-
.replace(/^[ ]+$/, ' ') // Multiple spaces between elements -> single space
|
|
1035
|
-
.replace(/\s+([,.!?:;])/g, '$1') // Remove spaces before punctuation
|
|
1036
|
-
// Clean up zero-width characters and multiple non-breaking spaces
|
|
1037
|
-
.replace(/[\u200C\u200B\u200D\u200E\u200F\uFEFF]+/g, '')
|
|
1038
|
-
.replace(/(?:\xA0){2,}/g, '\xA0'); // Multiple -> single
|
|
1039
|
-
if (newText !== text) {
|
|
1040
|
-
node.textContent = newText;
|
|
1041
|
-
removedCount += text.length - newText.length;
|
|
1042
|
-
}
|
|
1043
|
-
}
|
|
1044
|
-
}
|
|
1045
|
-
};
|
|
1046
|
-
// Second pass: clean up empty elements and normalize spacing
|
|
1047
|
-
const cleanupEmptyElements = (node) => {
|
|
1048
|
-
if (!isElement(node))
|
|
1049
|
-
return;
|
|
1050
|
-
// Skip pre and code elements
|
|
1051
|
-
const tag = node.tagName.toLowerCase();
|
|
1052
|
-
if (tag === 'pre' || tag === 'code') {
|
|
1053
|
-
return;
|
|
1054
|
-
}
|
|
1055
|
-
// Process children first (depth-first)
|
|
1056
|
-
Array.from(node.childNodes)
|
|
1057
|
-
.filter(isElement)
|
|
1058
|
-
.forEach(cleanupEmptyElements);
|
|
1059
|
-
// Then normalize this element's whitespace
|
|
1060
|
-
node.normalize(); // Combine adjacent text nodes
|
|
1061
|
-
// Special handling for block elements
|
|
1062
|
-
const isBlockElement = this.getComputedStyle(node)?.display === 'block';
|
|
1063
|
-
// Only remove empty text nodes at the start and end if they contain just newlines/tabs
|
|
1064
|
-
// For block elements, also remove spaces
|
|
1065
|
-
const startPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
|
|
1066
|
-
const endPattern = isBlockElement ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/ : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
|
|
1067
|
-
while (node.firstChild &&
|
|
1068
|
-
node.firstChild.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
|
|
1069
|
-
(node.firstChild.textContent || '').match(startPattern)) {
|
|
1070
|
-
node.removeChild(node.firstChild);
|
|
1071
|
-
removedCount++;
|
|
1072
|
-
}
|
|
1073
|
-
while (node.lastChild &&
|
|
1074
|
-
node.lastChild.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
|
|
1075
|
-
(node.lastChild.textContent || '').match(endPattern)) {
|
|
1076
|
-
node.removeChild(node.lastChild);
|
|
1077
|
-
removedCount++;
|
|
1078
|
-
}
|
|
1079
|
-
// Ensure there's a space between inline elements if needed
|
|
1080
|
-
if (!isBlockElement) {
|
|
1081
|
-
const children = Array.from(node.childNodes);
|
|
1082
|
-
for (let i = 0; i < children.length - 1; i++) {
|
|
1083
|
-
const current = children[i];
|
|
1084
|
-
const next = children[i + 1];
|
|
1085
|
-
// Only add space between elements or between element and text
|
|
1086
|
-
if (isElement(current) || isElement(next)) {
|
|
1087
|
-
// Don't add space if next content starts with punctuation
|
|
1088
|
-
const nextContent = next.textContent || '';
|
|
1089
|
-
const currentContent = current.textContent || '';
|
|
1090
|
-
if (!nextContent.match(/^[,.!?:;]/) &&
|
|
1091
|
-
!currentContent.match(/[,.!?:;]$/)) {
|
|
1092
|
-
// Check if there's already a space
|
|
1093
|
-
const hasSpace = (current.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
|
|
1094
|
-
(current.textContent || '').endsWith(' ')) ||
|
|
1095
|
-
(next.nodeType === constants_1.NODE_TYPE.TEXT_NODE &&
|
|
1096
|
-
(next.textContent || '').startsWith(' '));
|
|
1097
|
-
if (!hasSpace) {
|
|
1098
|
-
const space = this.doc.createTextNode(' ');
|
|
1099
|
-
node.insertBefore(space, next);
|
|
1100
|
-
}
|
|
1101
|
-
}
|
|
1102
|
-
}
|
|
1103
|
-
}
|
|
1104
|
-
}
|
|
1105
|
-
};
|
|
1106
|
-
// Run both passes
|
|
1107
|
-
removeEmptyTextNodes(element);
|
|
1108
|
-
cleanupEmptyElements(element);
|
|
1109
|
-
const endTime = Date.now();
|
|
1110
|
-
this._log('Removed empty lines:', {
|
|
1111
|
-
charactersRemoved: removedCount,
|
|
1112
|
-
processingTime: `${(endTime - startTime).toFixed(2)}ms`
|
|
1113
|
-
});
|
|
1114
|
-
}
|
|
1115
|
-
handleLazyImages(element) {
|
|
1116
|
-
let processedCount = 0;
|
|
1117
|
-
const lazyImages = element.querySelectorAll('img[data-src], img[data-srcset]');
|
|
1118
|
-
lazyImages.forEach(img => {
|
|
1119
|
-
// Check if element is an image by checking tag name and required properties
|
|
1120
|
-
if (img.tagName.toLowerCase() !== 'img' || !('src' in img) || !('srcset' in img)) {
|
|
1121
|
-
return;
|
|
1122
|
-
}
|
|
1123
|
-
// Handle data-src
|
|
1124
|
-
const dataSrc = img.getAttribute('data-src');
|
|
1125
|
-
if (dataSrc && !img.getAttribute('src')) {
|
|
1126
|
-
img.setAttribute('src', dataSrc);
|
|
1127
|
-
processedCount++;
|
|
1128
|
-
}
|
|
1129
|
-
// Handle data-srcset
|
|
1130
|
-
const dataSrcset = img.getAttribute('data-srcset');
|
|
1131
|
-
if (dataSrcset && !img.getAttribute('srcset')) {
|
|
1132
|
-
img.setAttribute('srcset', dataSrcset);
|
|
1133
|
-
processedCount++;
|
|
1134
|
-
}
|
|
1135
|
-
// Remove lazy loading related classes and attributes
|
|
1136
|
-
img.classList.remove('lazy', 'lazyload');
|
|
1137
|
-
img.removeAttribute('data-ll-status');
|
|
1138
|
-
img.removeAttribute('data-src');
|
|
1139
|
-
img.removeAttribute('data-srcset');
|
|
1140
|
-
});
|
|
1141
|
-
this._log('Processed lazy images:', processedCount);
|
|
1142
|
-
}
|
|
1143
|
-
standardizeElements(element) {
|
|
1144
|
-
let processedCount = 0;
|
|
1145
|
-
// Convert elements based on standardization rules
|
|
1146
|
-
ELEMENT_STANDARDIZATION_RULES.forEach(rule => {
|
|
1147
|
-
const elements = element.querySelectorAll(rule.selector);
|
|
1148
|
-
elements.forEach(el => {
|
|
1149
|
-
if (rule.transform) {
|
|
1150
|
-
// If there's a transform function, use it to create the new element
|
|
1151
|
-
const transformed = rule.transform(el, this.doc);
|
|
1152
|
-
el.replaceWith(transformed);
|
|
1153
|
-
processedCount++;
|
|
1154
|
-
}
|
|
1155
|
-
});
|
|
1156
|
-
});
|
|
1157
|
-
// Convert lite-youtube elements
|
|
1158
|
-
const liteYoutubeElements = element.querySelectorAll('lite-youtube');
|
|
1159
|
-
liteYoutubeElements.forEach(el => {
|
|
1160
|
-
const videoId = el.getAttribute('videoid');
|
|
1161
|
-
if (!videoId)
|
|
1162
|
-
return;
|
|
1163
|
-
const iframe = this.doc.createElement('iframe');
|
|
1164
|
-
iframe.width = '560';
|
|
1165
|
-
iframe.height = '315';
|
|
1166
|
-
iframe.src = `https://www.youtube.com/embed/${videoId}`;
|
|
1167
|
-
iframe.title = el.getAttribute('videotitle') || 'YouTube video player';
|
|
1168
|
-
iframe.frameBorder = '0';
|
|
1169
|
-
iframe.allow = 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share';
|
|
1170
|
-
iframe.setAttribute('allowfullscreen', '');
|
|
1171
|
-
el.replaceWith(iframe);
|
|
1172
|
-
processedCount++;
|
|
1173
|
-
});
|
|
1174
|
-
this._log('Converted embedded elements:', processedCount);
|
|
1175
|
-
}
|
|
1176
303
|
// Find small IMG and SVG elements
|
|
1177
304
|
findSmallImages(doc) {
|
|
1178
305
|
const MIN_DIMENSION = 33;
|
|
@@ -1184,21 +311,7 @@ class Defuddle {
|
|
|
1184
311
|
const elements = [
|
|
1185
312
|
...Array.from(doc.getElementsByTagName('img')),
|
|
1186
313
|
...Array.from(doc.getElementsByTagName('svg'))
|
|
1187
|
-
]
|
|
1188
|
-
// Skip lazy-loaded images that haven't been processed yet
|
|
1189
|
-
// and math images which may be small
|
|
1190
|
-
if (element.tagName.toLowerCase() === 'img') {
|
|
1191
|
-
const ignoredImage = element.classList.contains('lazy') ||
|
|
1192
|
-
element.classList.contains('lazyload') ||
|
|
1193
|
-
element.classList.contains('latex') ||
|
|
1194
|
-
element.hasAttribute('decoding') ||
|
|
1195
|
-
element.hasAttribute('data-src') ||
|
|
1196
|
-
element.hasAttribute('data-srcset') ||
|
|
1197
|
-
element.hasAttribute('loading');
|
|
1198
|
-
return !ignoredImage;
|
|
1199
|
-
}
|
|
1200
|
-
return true;
|
|
1201
|
-
});
|
|
314
|
+
];
|
|
1202
315
|
if (elements.length === 0) {
|
|
1203
316
|
return smallImages;
|
|
1204
317
|
}
|
|
@@ -1342,7 +455,7 @@ class Defuddle {
|
|
|
1342
455
|
const elements = doc.querySelectorAll(selector);
|
|
1343
456
|
elements.forEach(element => {
|
|
1344
457
|
// Base score from selector priority (earlier = higher)
|
|
1345
|
-
let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) *
|
|
458
|
+
let score = (constants_1.ENTRY_POINT_ELEMENTS.length - index) * 40;
|
|
1346
459
|
// Add score based on content analysis
|
|
1347
460
|
score += scoring_1.ContentScorer.scoreElement(element);
|
|
1348
461
|
candidates.push({ element, score });
|
|
@@ -1416,6 +529,9 @@ class Defuddle {
|
|
|
1416
529
|
}
|
|
1417
530
|
return parts.join(' > ');
|
|
1418
531
|
}
|
|
532
|
+
getComputedStyle(element) {
|
|
533
|
+
return (0, utils_1.getComputedStyle)(element);
|
|
534
|
+
}
|
|
1419
535
|
}
|
|
1420
536
|
exports.Defuddle = Defuddle;
|
|
1421
537
|
//# sourceMappingURL=defuddle.js.map
|