@mz1999/defuddle 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +371 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +145 -0
- package/dist/cli.js.map +1 -0
- package/dist/constants.d.ts +24 -0
- package/dist/constants.js +950 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +136 -0
- package/dist/defuddle.js +1816 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/code.d.ts +5 -0
- package/dist/elements/code.js +346 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/footnotes.d.ts +5 -0
- package/dist/elements/footnotes.js +619 -0
- package/dist/elements/footnotes.js.map +1 -0
- package/dist/elements/headings.d.ts +11 -0
- package/dist/elements/headings.js +100 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +877 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/elements/math.base.d.ts +9 -0
- package/dist/elements/math.base.js +195 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.core.d.ts +7 -0
- package/dist/elements/math.core.js +52 -0
- package/dist/elements/math.core.js.map +1 -0
- package/dist/elements/math.d.ts +2 -0
- package/dist/elements/math.full.d.ts +8 -0
- package/dist/elements/math.js +7 -0
- package/dist/elements/math.js.map +1 -0
- package/dist/extractor-registry.d.ts +16 -0
- package/dist/extractor-registry.js +140 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.d.ts +22 -0
- package/dist/extractors/_base.js +27 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.d.ts +9 -0
- package/dist/extractors/_conversation.js +78 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.d.ts +14 -0
- package/dist/extractors/chatgpt.js +138 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.d.ts +10 -0
- package/dist/extractors/claude.js +91 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/gemini.d.ts +14 -0
- package/dist/extractors/gemini.js +111 -0
- package/dist/extractors/gemini.js.map +1 -0
- package/dist/extractors/github.d.ts +20 -0
- package/dist/extractors/github.js +251 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/grok.d.ts +15 -0
- package/dist/extractors/grok.js +142 -0
- package/dist/extractors/grok.js.map +1 -0
- package/dist/extractors/hackernews.d.ts +21 -0
- package/dist/extractors/hackernews.js +155 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.d.ts +22 -0
- package/dist/extractors/reddit.js +197 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.d.ts +16 -0
- package/dist/extractors/twitter.js +204 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/x-article.d.ts +24 -0
- package/dist/extractors/x-article.js +267 -0
- package/dist/extractors/x-article.js.map +1 -0
- package/dist/extractors/x-oembed.d.ts +20 -0
- package/dist/extractors/x-oembed.js +350 -0
- package/dist/extractors/x-oembed.js.map +1 -0
- package/dist/extractors/youtube.d.ts +87 -0
- package/dist/extractors/youtube.js +869 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/fetch.d.ts +18 -0
- package/dist/fetch.js +265 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.full.d.ts +12 -0
- package/dist/index.full.js +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +30 -0
- package/dist/markdown.js +661 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.d.ts +25 -0
- package/dist/metadata.js +426 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +19 -0
- package/dist/node.js +78 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +31 -0
- package/dist/scoring.js +472 -0
- package/dist/scoring.js.map +1 -0
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +1101 -0
- package/dist/standardize.js.map +1 -0
- package/dist/types/extractors.d.ts +41 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +135 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +42 -0
- package/dist/utils/dom.js +104 -0
- package/dist/utils/dom.js.map +1 -0
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/dist/utils.d.ts +13 -0
- package/dist/utils.js +98 -0
- package/dist/utils.js.map +1 -0
- package/package.json +107 -0
|
@@ -0,0 +1,1101 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.standardizeContent = standardizeContent;
|
|
4
|
+
const constants_1 = require("./constants");
|
|
5
|
+
const math_1 = require("./elements/math");
|
|
6
|
+
const code_1 = require("./elements/code");
|
|
7
|
+
const headings_1 = require("./elements/headings");
|
|
8
|
+
const images_1 = require("./elements/images");
|
|
9
|
+
const utils_1 = require("./utils");
|
|
10
|
+
const dom_1 = require("./utils/dom");
|
|
11
|
+
// Module-level debug flag, set by standardizeContent for child functions
|
|
12
|
+
let _debug = false;
|
|
13
|
+
/**
 * Rules that rewrite non-semantic markup into standard HTML elements.
 *
 * The math, code block, heading and image rules come from their own modules
 * and are spread in first. Each rule declared here carries a CSS `selector`,
 * the name of the target `element`, and a `transform(el, doc)` callback that
 * builds and returns the node meant to stand in for `el`.
 * NOTE(review): the consumer of these rules is not visible in this part of
 * the file; confirm how the return value of `transform` is applied.
 */
const ELEMENT_STANDARDIZATION_RULES = [
    ...math_1.mathRules,
    ...code_1.codeBlockRules,
    ...headings_1.headingRules,
    ...images_1.imageRules,
    // Convert divs with paragraph role to actual paragraphs
    {
        selector: 'div[data-testid^="paragraph"], div[role="paragraph"]',
        element: 'p',
        transform: (el, doc) => {
            const p = doc.createElement('p');
            (0, dom_1.transferContent)(el, p);
            // Copy allowed attributes
            for (const attr of Array.from(el.attributes)) {
                if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
                    p.setAttribute(attr.name, attr.value);
                }
            }
            return p;
        }
    },
    // Convert divs with list roles to actual lists
    {
        selector: 'div[role="list"]',
        element: 'ul',
        // Builds <ol>/<ul> from role-based markup, recursing into nested lists.
        transform: (el, doc) => {
            const LIST_SELECTOR = 'div[role="list"]';
            const ITEM_SELECTOR = 'div[role="listitem"]';
            // True when the nearest role="list" ancestor of `node` is `listEl`,
            // i.e. the node is not part of a list nested deeper inside it.
            const belongsTo = (node, listEl) => node.parentElement?.closest(LIST_SELECTOR) === listEl;
            // Replace every remaining role="paragraph" div under `container` with a <p>.
            const convertParagraphs = (container) => {
                Array.from(container.querySelectorAll('div[role="paragraph"]')).forEach(div => {
                    const p = doc.createElement('p');
                    (0, dom_1.transferContent)(div, p);
                    div.replaceWith(p);
                });
            };
            const convertList = (listEl) => {
                // Only this list's own items. Items of nested lists are handled by
                // the recursive call; visiting them here as well would append
                // empty <li> elements once their content has been moved.
                const ownItems = Array.from(listEl.querySelectorAll(ITEM_SELECTOR))
                    .filter(item => belongsTo(item, listEl));
                // A first label such as "1)" marks the list as ordered
                const firstLabel = Array.from(listEl.querySelectorAll(`${ITEM_SELECTOR} .label`))
                    .find(labelEl => belongsTo(labelEl, listEl));
                const label = firstLabel?.textContent?.trim() || '';
                const list = doc.createElement(/^\d+\)/.test(label) ? 'ol' : 'ul');
                for (const item of ownItems) {
                    const li = doc.createElement('li');
                    const content = item.querySelector('.content');
                    if (content) {
                        // Snapshot the directly nested lists before mutating the tree
                        Array.from(content.querySelectorAll(LIST_SELECTOR))
                            .filter(nested => belongsTo(nested, listEl))
                            .forEach(nested => nested.replaceWith(convertList(nested)));
                        convertParagraphs(content);
                        (0, dom_1.transferContent)(content, li);
                    }
                    // An item without a .content child still yields an (empty) <li>
                    list.appendChild(li);
                }
                return list;
            };
            return convertList(el);
        }
    },
    {
        selector: 'div[role="listitem"]',
        element: 'li',
        // Custom handler for list item content
        transform: (el, doc) => {
            const content = el.querySelector('.content');
            if (!content)
                return el;
            // Convert any paragraph divs inside content
            Array.from(content.querySelectorAll('div[role="paragraph"]')).forEach(div => {
                const p = doc.createElement('p');
                (0, dom_1.transferContent)(div, p);
                div.replaceWith(p);
            });
            // NOTE(review): this returns the .content element itself rather than
            // a new <li>; confirm that is what the rule consumer expects.
            return content;
        }
    }
];
|
|
111
|
+
/**
 * Normalize extracted content in place.
 *
 * Always runs space, comment, heading, preformatted-code and element
 * standardization. With `debug` off it then runs the full cleanup (wrapper
 * flattening, attribute stripping, link unwrapping, removal of plugin and
 * empty elements, trailing headings, orphaned dividers, extra <br>s and
 * empty lines). With `debug` on only attribute stripping, trailing-heading
 * removal and <br> cleanup run, so the structure is preserved.
 *
 * @param {Element} element - Root of the content to clean; mutated in place.
 * @param {object} metadata - Only `metadata.title` is read here.
 * @param {Document} doc - Document used to create replacement nodes.
 * @param {boolean} [debug=false] - Also stored in the module-level debug flag.
 */
function standardizeContent(element, metadata, doc, debug = false) {
    _debug = debug;
    standardizeSpaces(element);
    // Remove HTML comments
    removeHtmlComments(element);
    // Convert H1s to H2s and drop a first heading that repeats the title
    standardizeHeadings(element, metadata.title, doc);
    // Wrap code elements with white-space: pre in <pre> before attribute stripping
    wrapPreformattedCode(element, doc);
    // Convert embedded content to standard formats
    standardizeElements(element, doc);
    if (debug) {
        // In debug mode, still do basic cleanup but preserve structure
        stripUnwantedAttributes(element, debug);
        removeTrailingHeadings(element);
        stripExtraBrElements(element);
        (0, utils_1.logDebug)(_debug, 'Debug mode: Skipping div flattening to preserve structure');
        return;
    }
    // First pass of div flattening
    flattenWrapperElements(element, doc);
    // Strip unwanted attributes
    stripUnwantedAttributes(element, debug);
    // Unwrap bare spans (no attributes remaining after stripping)
    unwrapBareSpans(element);
    // Unwrap links inside inline code — markdown can't render links in backtick code
    Array.from(element.querySelectorAll('code a')).forEach(unwrapElement);
    // Unwrap javascript: links — keep text, remove the link.
    // The scheme is matched in script rather than with a[href^="javascript:"]
    // because that selector is case-sensitive and would miss hrefs such as
    // "JavaScript:void(0)" or ones with leading whitespace.
    Array.from(element.querySelectorAll('a[href]'))
        .filter(link => /^\s*javascript:/i.test(link.getAttribute('href') || ''))
        .forEach(unwrapElement);
    // Unwrap anchor links that wrap headings (e.g. clickable section headers)
    Array.from(element.querySelectorAll('a[href^="#"]')).forEach(link => {
        if (link.querySelector('h1, h2, h3, h4, h5, h6')) {
            unwrapElement(link);
        }
    });
    // Remove heading anchor links (e.g. <h2>Title<a href="#title">#</a></h2>)
    (0, headings_1.removeHeadingAnchors)(element);
    // Remove obsolete plugin elements
    element.querySelectorAll('object, embed, applet').forEach(el => el.remove());
    // Remove empty elements
    removeEmptyElements(element);
    // Remove trailing headings
    removeTrailingHeadings(element);
    // Remove orphaned leading/trailing <hr> elements
    removeOrphanedDividers(element);
    // Final pass of div flattening after cleanup operations
    flattenWrapperElements(element, doc);
    // Standardize consecutive br elements
    stripExtraBrElements(element);
    // Clean up empty lines
    removeEmptyLines(element, doc);
}
|
|
166
|
+
/**
 * Wrap <code> elements that have white-space: pre (via inline style)
 * in a <pre> element, so they get treated as code blocks.
 *
 * Any value starting with "pre" (pre, pre-wrap, pre-line) qualifies. The
 * style text is matched case-insensitively because CSS property names and
 * keywords are ASCII case-insensitive ("WHITE-SPACE: PRE" is valid).
 *
 * @param {Element} element - Root whose descendant <code> elements are checked.
 * @param {Document} doc - Document used to create the <pre> wrappers.
 */
function wrapPreformattedCode(element, doc) {
    const preformattedStyle = /white-space\s*:\s*pre/i;
    for (const code of Array.from(element.querySelectorAll('code'))) {
        // Skip if already inside a <pre>
        if (code.closest('pre'))
            continue;
        // Only the inline style attribute is consulted, not computed styles
        if (!preformattedStyle.test(code.getAttribute('style') || ''))
            continue;
        // Put the <pre> where the <code> was, then move the <code> into it
        const pre = doc.createElement('pre');
        code.parentNode?.insertBefore(pre, code);
        pre.appendChild(code);
    }
}
|
|
186
|
+
/**
 * Replace non-breaking spaces (U+00A0) with regular spaces in every text
 * node under `element`, leaving the contents of <pre> and <code> untouched.
 *
 * @param {Node} element - Root of the subtree to process; mutated in place.
 */
function standardizeSpaces(element) {
    // Explicit work list instead of recursion; each node is handled independently
    const pending = [element];
    while (pending.length > 0) {
        const current = pending.pop();
        if ((0, utils_1.isElement)(current)) {
            const name = current.tagName.toLowerCase();
            // Preformatted content keeps its exact characters
            if (name === 'pre' || name === 'code') {
                continue;
            }
        }
        if ((0, utils_1.isTextNode)(current)) {
            const before = current.textContent || '';
            const after = before.replace(/\xA0/g, ' ');
            // Only write back when something actually changed
            if (after !== before) {
                current.textContent = after;
            }
        }
        if (current.hasChildNodes()) {
            for (const child of Array.from(current.childNodes)) {
                pending.push(child);
            }
        }
    }
}
|
|
211
|
+
/**
 * Remove headings at the end of the content that have no text after them.
 *
 * Headings are examined from last to first; removal stops at the first
 * heading that is followed by non-whitespace text.
 *
 * @param {Element} element - Content root; mutated in place.
 */
function removeTrailingHeadings(element) {
    // True when any following sibling of `start`, or of one of its ancestors
    // below `element`, contributes non-whitespace text.
    const hasContentAfter = (start) => {
        let current = start;
        for (;;) {
            let following = '';
            for (let sib = current.nextSibling; sib; sib = sib.nextSibling) {
                if ((0, utils_1.isTextNode)(sib) || (0, utils_1.isElement)(sib)) {
                    following += sib.textContent || '';
                }
            }
            if (following.trim()) {
                return true;
            }
            // Nothing at this level: climb one level, but never past the root
            const parent = current.parentElement;
            if (!parent || parent === element) {
                return false;
            }
            current = parent;
        }
    };
    const headings = Array.from(element.querySelectorAll('h1, h2, h3, h4, h5, h6'));
    let removedCount = 0;
    // Walk from the bottom of the document upwards
    for (let i = headings.length - 1; i >= 0; i--) {
        if (hasContentAfter(headings[i])) {
            break;
        }
        headings[i].remove();
        removedCount++;
    }
    if (removedCount > 0) {
        (0, utils_1.logDebug)(_debug, 'Removed trailing headings:', removedCount);
    }
}
|
|
257
|
+
/**
 * Remove <hr> elements that sit at the very start or very end of `element`,
 * ignoring whitespace-only text nodes when locating the edge.
 *
 * @param {Element} element - Content root; mutated in place.
 */
function removeOrphanedDividers(element) {
    const isBlankText = (n) => (0, utils_1.isTextNode)(n) && !(n.textContent || '').trim();
    // Repeatedly drop the <hr> found at one edge. `edgeProp` picks the edge
    // child and `stepProp` the direction used to skip blank text nodes.
    const stripEdge = (edgeProp, stepProp) => {
        for (;;) {
            let candidate = element[edgeProp];
            while (candidate && isBlankText(candidate)) {
                candidate = candidate[stepProp];
            }
            const isDivider = candidate &&
                (0, utils_1.isElement)(candidate) &&
                candidate.tagName.toLowerCase() === 'hr';
            if (!isDivider) {
                return;
            }
            candidate.remove();
        }
    };
    // Leading dividers first, then trailing ones
    stripEdge('firstChild', 'nextSibling');
    stripEdge('lastChild', 'previousSibling');
}
|
|
285
|
+
/**
 * Convert every <h1> under `element` to an <h2> (keeping allowed attributes),
 * then remove the first <h2> when its text equals the page title.
 *
 * Titles are compared after converting non-breaking spaces, collapsing
 * whitespace, trimming and lower-casing. A missing title (`undefined` or
 * `null`) is treated as empty, so nothing is removed and no error is thrown.
 *
 * @param {Element} element - Content root; mutated in place.
 * @param {string} [title] - Page title to compare the first heading against.
 * @param {Document} doc - Document used to create the replacement <h2>s.
 */
function standardizeHeadings(element, title, doc) {
    const normalizeText = (text) => {
        return text
            .replace(/\u00A0/g, ' ') // Convert non-breaking spaces to regular spaces
            .replace(/\s+/g, ' ') // Normalize all whitespace to single spaces
            .trim()
            .toLowerCase();
    };
    // Snapshot first: getElementsByTagName returns a live collection
    for (const h1 of Array.from(element.getElementsByTagName('h1'))) {
        const h2 = doc.createElement('h2');
        (0, dom_1.transferContent)(h1, h2);
        // Copy allowed attributes
        for (const attr of Array.from(h1.attributes)) {
            if (constants_1.ALLOWED_ATTRIBUTES.has(attr.name)) {
                h2.setAttribute(attr.name, attr.value);
            }
        }
        h1.parentNode?.replaceChild(h2, h1);
    }
    // Remove first H2 if it matches title
    const h2s = element.getElementsByTagName('h2');
    if (h2s.length === 0) {
        return;
    }
    // Metadata may come without a title; never call string methods on null/undefined
    const normalizedTitle = normalizeText(String(title ?? ''));
    const firstH2 = h2s[0];
    if (normalizedTitle && normalizedTitle === normalizeText(firstH2.textContent || '')) {
        firstH2.remove();
    }
}
|
|
316
|
+
/**
 * Remove every comment node found under `element`.
 *
 * @param {Element} element - Content root; its ownerDocument supplies the TreeWalker.
 */
function removeHtmlComments(element) {
    const SHOW_COMMENT = 128; // NodeFilter.SHOW_COMMENT
    const walker = element.ownerDocument.createTreeWalker(element, SHOW_COMMENT);
    // Collect first, remove afterwards, so the walk never runs over a tree
    // that is being modified.
    const found = [];
    while (walker.nextNode()) {
        found.push(walker.currentNode);
    }
    found.forEach(comment => {
        comment.parentNode?.removeChild(comment);
    });
    (0, utils_1.logDebug)(_debug, 'Removed HTML comments:', found.length);
}
|
|
331
|
+
/**
 * Remove attributes that are not on the allow-list from `element` and all
 * of its descendants. SVG elements keep every attribute.
 *
 * Always kept: footnote ids (`fnref:*`, `fn:*`, `footnotes`), `language-*`
 * classes on <code>, the `footnote-backref` class and `callout` /
 * `callout-*` classes. In debug mode the debug allow-list and all `data-*`
 * attributes are kept as well.
 *
 * @param {Element} element - Content root; mutated in place.
 * @param {boolean} debug - Enables the wider debug allow-list.
 */
function stripUnwantedAttributes(element, debug) {
    const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
    let attributeCount = 0;
    // Attributes kept regardless of the allow-lists
    const isAlwaysKept = (tag, name, value) => {
        if (name === 'id') {
            return value.startsWith('fnref:') || // Footnote reference
                value.startsWith('fn:') || // Footnote content
                value === 'footnotes'; // Footnotes container
        }
        if (name === 'class') {
            return (tag === 'code' && value.startsWith('language-')) ||
                value === 'footnote-backref' ||
                /^callout(?:-|$)/.test(value);
        }
        return false;
    };
    const isAllowed = (name) => {
        if (constants_1.ALLOWED_ATTRIBUTES.has(name)) {
            return true;
        }
        // Debug mode additionally allows debug attributes and data- attributes
        return Boolean(debug) &&
            (constants_1.ALLOWED_ATTRIBUTES_DEBUG.has(name) || name.startsWith('data-'));
    };
    const processElement = (el) => {
        const tag = el.tagName.toLowerCase();
        // Skip SVG elements - preserve all their attributes
        if (tag === 'svg' || el.namespaceURI === SVG_NAMESPACE) {
            return;
        }
        // Iterate over a snapshot because attributes are removed while looping
        for (const attr of Array.from(el.attributes)) {
            const name = attr.name.toLowerCase();
            if (isAlwaysKept(tag, name, attr.value) || isAllowed(name)) {
                continue;
            }
            el.removeAttribute(attr.name);
            attributeCount++;
        }
    };
    processElement(element);
    Array.from(element.querySelectorAll('*')).forEach(processElement);
    (0, utils_1.logDebug)(_debug, 'Stripped attributes:', attributeCount);
}
|
|
378
|
+
/**
 * Replace `el` with its own children: each child is moved in front of `el`
 * in its parent, in order, and `el` is then removed.
 *
 * A detached element (no parent) is left untouched. There is nowhere to
 * move its children, and looping on `el.firstChild` without moving anything
 * would never terminate.
 *
 * @param {Element} el - Element to unwrap.
 */
function unwrapElement(el) {
    const parent = el.parentNode;
    if (!parent) {
        return;
    }
    while (el.firstChild) {
        parent.insertBefore(el.firstChild, el);
    }
    el.remove();
}
|
|
384
|
+
/**
 * Replace every attribute-less <span> under `element` with its children,
 * then merge the adjacent text nodes this leaves behind.
 *
 * @param {Element} element - Content root; mutated in place.
 */
function unwrapBareSpans(element) {
    const spans = Array.from(element.querySelectorAll('span'));
    let unwrappedCount = 0;
    // Reverse document order handles inner spans before the spans around them
    for (let i = spans.length - 1; i >= 0; i--) {
        const span = spans[i];
        const host = span.parentNode;
        if (!host || span.attributes.length > 0) {
            continue;
        }
        // Hoist the children in front of the span, then drop the empty shell
        let child = span.firstChild;
        while (child) {
            host.insertBefore(child, span);
            child = span.firstChild;
        }
        span.remove();
        unwrappedCount++;
    }
    // Merge adjacent text nodes left behind in one pass
    if (unwrappedCount > 0) {
        element.normalize();
    }
    (0, utils_1.logDebug)(_debug, 'Unwrapped bare spans:', unwrappedCount);
}
|
|
409
|
+
/**
 * Remove elements under `element` that carry no content.
 *
 * An element is removed when it is not in ALLOWED_EMPTY_ELEMENTS and either
 *  - it is a DIV made up solely of <span>s that are empty or hold a single
 *    comma, with no other text directly inside the div, or
 *  - it has no visible text (a non-breaking space counts as visible) and no
 *    child nodes other than whitespace text.
 *
 * Elements are visited in reverse document order, so descendants are judged
 * before their ancestors and emptied parents are caught in the same pass.
 *
 * @param {Element} element - Content root; mutated in place.
 */
function removeEmptyElements(element) {
    let removedCount = 0;
    const hasVisibleText = (text) => text.trim().length > 0 || text.includes('\u00A0');
    // Special case: divs that only contain spans with commas
    const isCommaOnlyDiv = (el) => {
        if (el.tagName !== 'DIV' || el.children.length === 0) {
            return false;
        }
        const spansAreFiller = Array.from(el.children).every(child => {
            if (child.tagName !== 'SPAN') {
                return false;
            }
            const content = child.textContent?.trim() || '';
            return content === ',' || content === '';
        });
        if (!spansAreFiller) {
            return false;
        }
        // `children` lists elements only. Text sitting directly in the div is
        // real content, so a div such as `Hello<span>,</span> world` is kept.
        return Array.from(el.childNodes).every(node => !(0, utils_1.isTextNode)(node) || !(node.textContent || '').trim());
    };
    const isEmptyElement = (el) => {
        if (constants_1.ALLOWED_EMPTY_ELEMENTS.has(el.tagName.toLowerCase())) {
            return false;
        }
        if (isCommaOnlyDiv(el)) {
            return true;
        }
        if (hasVisibleText(el.textContent || '')) {
            return false;
        }
        // No visible text: empty unless a non-text child node remains
        return Array.from(el.childNodes).every(node => (0, utils_1.isTextNode)(node) && !hasVisibleText(node.textContent || ''));
    };
    const allElements = Array.from(element.querySelectorAll('*')).reverse();
    for (const el of allElements) {
        // Skip nodes that went away together with an already removed ancestor
        if (el.parentNode && isEmptyElement(el)) {
            el.remove();
            removedCount++;
        }
    }
    (0, utils_1.logDebug)(_debug, 'Removed empty elements:', removedCount);
}
|
|
463
|
+
/**
 * Limit every run of consecutive <br> elements to two.
 *
 * Two <br>s are consecutive when only whitespace text nodes separate them
 * as siblings. Runs are collected first and trimmed afterwards.
 *
 * @param {Element} element - Content root; mutated in place.
 */
function stripExtraBrElements(element) {
    const startTime = Date.now();
    const MAX_PER_RUN = 2;
    // Nearest previous sibling that is not a whitespace-only text node
    const precedingNonBlank = (node) => {
        let prev = node.previousSibling;
        while (prev && (0, utils_1.isTextNode)(prev) && !prev.textContent?.trim()) {
            prev = prev.previousSibling;
        }
        return prev;
    };
    // Group the <br>s, in document order, into runs of consecutive elements
    const runs = [];
    for (const br of Array.from(element.getElementsByTagName('br'))) {
        const currentRun = runs[runs.length - 1];
        if (currentRun && precedingNonBlank(br) === currentRun[currentRun.length - 1]) {
            currentRun.push(br);
        }
        else {
            runs.push([br]);
        }
    }
    // Keep the first two of each run and drop the rest
    let processedCount = 0;
    for (const run of runs) {
        for (const extra of run.slice(MAX_PER_RUN)) {
            extra.remove();
            processedCount++;
        }
    }
    const endTime = Date.now();
    (0, utils_1.logDebug)(_debug, 'Standardized br elements:', {
        removed: processedCount,
        processingTime: `${(endTime - startTime).toFixed(2)}ms`
    });
}
|
|
513
|
+
/**
 * Strip leading or trailing whitespace from the edge text node of `node`
 * and make sure a space exists just outside `node` on that side.
 *
 * @param {Node} node - Node whose first/last child text is trimmed.
 * @param {Document} doc - Document used to create the outside space.
 * @param {string} direction - 'leading' for the start; any other value is
 *   handled as the trailing side.
 * @returns {number} 1 when whitespace was moved, otherwise 0.
 */
function moveWhitespaceOutside(node, doc, direction) {
    const isLeading = direction === 'leading';
    const edge = isLeading ? node.firstChild : node.lastChild;
    if (!edge || !(0, utils_1.isTextNode)(edge)) {
        return 0;
    }
    const original = edge.textContent || '';
    const stripped = original.replace(isLeading ? /^\s+/ : /\s+$/, '');
    // Nothing to strip, or no parent to receive the space
    if (stripped === original || !node.parentNode) {
        return 0;
    }
    edge.textContent = stripped;
    // Ensure a space exists on the outside
    const outside = isLeading ? node.previousSibling : node.nextSibling;
    let spacePresent = false;
    if (outside && (0, utils_1.isTextNode)(outside)) {
        const outerText = outside.textContent || '';
        spacePresent = isLeading ? outerText.endsWith(' ') : outerText.startsWith(' ');
    }
    if (!spacePresent) {
        // Leading: insert before the node itself; trailing: before its next sibling
        const reference = isLeading ? node : node.nextSibling;
        node.parentNode.insertBefore(doc.createTextNode(' '), reference);
    }
    return 1;
}
|
|
533
|
+
function removeEmptyLines(element, doc) {
|
|
534
|
+
let removedCount = 0;
|
|
535
|
+
const startTime = Date.now();
|
|
536
|
+
// First pass: remove empty text nodes
|
|
537
|
+
const removeEmptyTextNodes = (node) => {
|
|
538
|
+
// Skip if inside pre or code
|
|
539
|
+
if ((0, utils_1.isElement)(node)) {
|
|
540
|
+
const tag = node.tagName.toLowerCase();
|
|
541
|
+
if (tag === 'pre' || tag === 'code') {
|
|
542
|
+
return;
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
// Process children first (depth-first)
|
|
546
|
+
const children = Array.from(node.childNodes);
|
|
547
|
+
children.forEach(removeEmptyTextNodes);
|
|
548
|
+
// Then handle this node
|
|
549
|
+
if ((0, utils_1.isTextNode)(node)) {
|
|
550
|
+
const text = node.textContent || '';
|
|
551
|
+
// If it's completely empty or just zero-width/invisible characters, remove it
|
|
552
|
+
// Preserve nodes with regular spaces or as they may separate words
|
|
553
|
+
if (!text || /^[\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/.test(text)) {
|
|
554
|
+
node.parentNode?.removeChild(node);
|
|
555
|
+
removedCount++;
|
|
556
|
+
}
|
|
557
|
+
else {
|
|
558
|
+
// Clean up the text content while preserving important spaces
|
|
559
|
+
// Collapse newlines to spaces (CSS white-space: normal behavior)
|
|
560
|
+
const newText = text
|
|
561
|
+
.replace(/[\n\r]+/g, ' ') // Newlines -> spaces
|
|
562
|
+
.replace(/\t+/g, ' ') // Tabs -> spaces
|
|
563
|
+
.replace(/ {2,}/g, ' ') // 2+ spaces -> 1 space
|
|
564
|
+
.replace(/^[ ]+$/, ' ') // Multiple spaces between elements -> single space
|
|
565
|
+
.replace(/\s+([,.!?:;])/g, '$1') // Remove spaces before punctuation
|
|
566
|
+
// Clean up zero-width characters (except ZWNJ \u200C used in Farsi) and multiple non-breaking spaces
|
|
567
|
+
.replace(/[\u200B\u200D\u200E\u200F\uFEFF]+/g, '')
|
|
568
|
+
.replace(/(?:\xA0){2,}/g, '\xA0'); // Multiple -> single
|
|
569
|
+
if (newText !== text) {
|
|
570
|
+
node.textContent = newText;
|
|
571
|
+
removedCount += text.length - newText.length;
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
};
|
|
576
|
+
// Second pass: clean up empty elements and normalize spacing
|
|
577
|
+
/**
 * Second cleanup pass: recursively tidies whitespace inside `node`.
 *
 * For every element that is not `<pre>`/`<code>` it
 *   1. processes element children first (depth-first),
 *   2. merges adjacent text nodes,
 *   3. drops whitespace-only text nodes at the start and end,
 *   4. for inline elements, hands leading/trailing whitespace to
 *      `moveWhitespaceOutside`,
 *   5. for non-block elements, inserts a single space between neighbouring
 *      children when one of them is an element and no space/punctuation
 *      already separates them.
 *
 * Updates the enclosing `removedCount` counter as a side effect.
 */
const cleanupEmptyElements = (node) => {
    if (!(0, utils_1.isElement)(node)) {
        return;
    }
    const tag = node.tagName.toLowerCase();
    // Preformatted content keeps its whitespace exactly as authored.
    if (tag === 'pre' || tag === 'code') {
        return;
    }
    // Depth-first: tidy every element child before touching this element.
    const elementChildren = Array.from(node.childNodes).filter(utils_1.isElement);
    for (const child of elementChildren) {
        cleanupEmptyElements(child);
    }
    // Combine adjacent text nodes so the edge checks below see whole runs.
    node.normalize();
    const isBlockElement = (0, utils_1.getComputedStyle)(node)?.display === 'block';
    // Block elements may also lose plain spaces and NBSPs at their edges;
    // other elements only lose line breaks, tabs and zero-width characters.
    const edgePattern = isBlockElement
        ? /^[\n\r\t \u200C\u200B\u200D\u200E\u200F\uFEFF\xA0]*$/
        : /^[\n\r\t\u200C\u200B\u200D\u200E\u200F\uFEFF]*$/;
    const isDisposableText = (candidate) => (0, utils_1.isTextNode)(candidate) &&
        edgePattern.test(candidate.textContent || '');
    for (let edge = node.firstChild; edge && isDisposableText(edge); edge = node.firstChild) {
        node.removeChild(edge);
        removedCount++;
    }
    for (let edge = node.lastChild; edge && isDisposableText(edge); edge = node.lastChild) {
        node.removeChild(edge);
        removedCount++;
    }
    // Everything below only applies to elements not computed as display:block.
    if (isBlockElement) {
        return;
    }
    // For inline elements, move leading/trailing spaces outside the element.
    if (constants_1.INLINE_ELEMENTS.has(tag) && node.parentNode) {
        removedCount += moveWhitespaceOutside(node, doc, 'leading');
        removedCount += moveWhitespaceOutside(node, doc, 'trailing');
    }
    // Ensure neighbouring children are separated by a space where needed.
    // Work on a snapshot so the inserted text nodes are not revisited.
    const siblings = Array.from(node.childNodes);
    siblings.slice(0, -1).forEach((current, index) => {
        const next = siblings[index + 1];
        // Two adjacent text nodes never need an inserted separator.
        if (!(0, utils_1.isElement)(current) && !(0, utils_1.isElement)(next)) {
            return;
        }
        const currentText = current.textContent || '';
        const nextText = next.textContent || '';
        // No space before punctuation or a closing bracket.
        if (/^[,.!?:;)\]]/.test(nextText)) {
            return;
        }
        // No space after punctuation or an opening bracket.
        if (/[,.!?:;(\[]\s*$/.test(currentText)) {
            return;
        }
        // No space when a text node already provides one at the boundary.
        const spaceAlreadyThere = ((0, utils_1.isTextNode)(current) && currentText.endsWith(' ')) ||
            ((0, utils_1.isTextNode)(next) && nextText.startsWith(' '));
        if (spaceAlreadyThere) {
            return;
        }
        node.insertBefore(doc.createTextNode(' '), next);
    });
};
|
|
644
|
+
// Run both passes
|
|
645
|
+
removeEmptyTextNodes(element);
|
|
646
|
+
cleanupEmptyElements(element);
|
|
647
|
+
const endTime = Date.now();
|
|
648
|
+
(0, utils_1.logDebug)(_debug, 'Removed empty lines:', {
|
|
649
|
+
charactersRemoved: removedCount,
|
|
650
|
+
processingTime: `${(endTime - startTime).toFixed(2)}ms`
|
|
651
|
+
});
|
|
652
|
+
}
|
|
653
|
+
/**
 * Rewrites site-specific or non-standard markup under `element` into plainer
 * equivalents, in place.
 *
 * Steps, in order:
 *   1. apply every rule in `ELEMENT_STANDARDIZATION_RULES` that has a `transform`,
 *   2. collapse arXiv LaTeXML equation tables into bare `<math>` elements,
 *   3. remove `span.ltx_note_outer` elements,
 *   4. replace `a.ltx_ref` links that contain a ref-tag span with their text,
 *   5. unwrap single-column tables without direct header cells,
 *   6. replace `<lite-youtube>` elements with a YouTube embed `<iframe>`,
 *   7. merge adjacent Verso code blocks.
 *
 * @param {Element} element - Root of the subtree to rewrite.
 * @param {Document} doc - Document used to create replacement nodes.
 * @returns {void}
 */
function standardizeElements(element, doc) {
    let processedCount = 0;
    // Convert elements based on standardization rules
    for (const rule of ELEMENT_STANDARDIZATION_RULES) {
        let elements;
        try {
            elements = element.querySelectorAll(rule.selector);
        }
        catch (e) {
            // Some selectors use :has() which isn't supported by jsdom/nwsapi.
            // Skip the rule gracefully in those environments, but leave a trace
            // in the debug log so a skipped rule is not invisible.
            (0, utils_1.logDebug)(_debug, 'Skipping unsupported standardization selector:', rule.selector);
            continue;
        }
        if (!rule.transform) {
            continue;
        }
        for (const el of Array.from(elements)) {
            // The transform builds the replacement node for this element.
            const transformed = rule.transform(el, doc);
            el.replaceWith(transformed);
            processedCount++;
        }
    }
    // arXiv LaTeXML: Convert equation tables to <math> elements before attribute stripping
    const equationTables = Array.from(element.querySelectorAll('table.ltx_equation, table.ltx_eqn_table, table.ltx_equationgroup'));
    for (const table of equationTables) {
        const mathElements = table.querySelectorAll('math');
        if (mathElements.length === 0) {
            continue;
        }
        // These table classes force block display regardless of the <math> attribute.
        const tableForcesBlock = table.classList.contains('ltx_equation') ||
            table.classList.contains('ltx_equationgroup');
        const fragment = doc.createDocumentFragment();
        for (const mathEl of Array.from(mathElements)) {
            // Extract LaTeX from alttext or annotation
            const alttext = mathEl.getAttribute('alttext');
            const annotation = mathEl.querySelector('annotation[encoding="application/x-tex"]');
            const latex = alttext || annotation?.textContent?.trim() || '';
            if (!latex) {
                continue;
            }
            const isBlock = mathEl.getAttribute('display') === 'block' || tableForcesBlock;
            const cleanMath = doc.createElement('math');
            cleanMath.setAttribute('xmlns', 'http://www.w3.org/1998/Math/MathML');
            cleanMath.setAttribute('display', isBlock ? 'block' : 'inline');
            cleanMath.setAttribute('data-latex', latex);
            cleanMath.textContent = latex;
            fragment.appendChild(cleanMath);
        }
        // Leave the table untouched when no usable LaTeX was found.
        if (fragment.childNodes.length > 0) {
            table.replaceWith(fragment);
            processedCount++;
        }
    }
    // arXiv LaTeXML: Remove hidden ltx_note_outer spans (CSS display:none on arxiv.org)
    // These contain duplicated footnote marks and "footnotemark:" text
    for (const outer of Array.from(element.querySelectorAll('span.ltx_note_outer'))) {
        outer.remove();
        processedCount++;
    }
    // arXiv LaTeXML: Unwrap ltx_ref_tag spans so cross-reference numbers are preserved
    // These spans (e.g. <span class="ltx_text ltx_ref_tag">1</span>) get stripped to bare
    // spans during attribute stripping, then unwrapped — but their parent <a> links get
    // removed by the exact selector `a[href^="#"][class*="ref" i]`. Fix by unwrapping the
    // link and keeping the text inline.
    for (const link of Array.from(element.querySelectorAll('a.ltx_ref'))) {
        const refTag = link.querySelector('span.ltx_ref_tag, span.ltx_text.ltx_ref_tag');
        if (refTag) {
            // Replace the link with just the text content
            link.replaceWith(doc.createTextNode(link.textContent || ''));
            processedCount++;
        }
    }
    // Unwrap single-column layout tables (used for styling/positioning, not data)
    for (const table of Array.from(element.querySelectorAll('table'))) {
        if (!table.parentNode) {
            continue;
        }
        const directCells = Array.from(table.querySelectorAll('td, th'))
            .filter(cell => (0, dom_1.isDirectTableChild)(cell, table));
        // Skip data tables that have direct header cells
        if (directCells.some(cell => cell.tagName === 'TH')) {
            continue;
        }
        const directRows = Array.from(table.querySelectorAll('tr'))
            .filter(row => (0, dom_1.isDirectTableChild)(row, table));
        if (directRows.length === 0) {
            continue;
        }
        // Check that every row has at most one direct cell
        const isSingleColumn = directRows.every(tr => directCells.filter(cell => cell.parentNode === tr).length <= 1);
        if (!isSingleColumn) {
            continue;
        }
        // Hoist the content of every cell, in document order, out of the table.
        const fragment = doc.createDocumentFragment();
        for (const cell of directCells) {
            while (cell.firstChild) {
                fragment.appendChild(cell.firstChild);
            }
        }
        table.replaceWith(fragment);
        processedCount++;
    }
    // Convert lite-youtube elements
    for (const el of Array.from(element.querySelectorAll('lite-youtube'))) {
        const videoId = el.getAttribute('videoid')?.trim();
        if (!videoId) {
            continue;
        }
        // The id comes from page markup, so encode it as a single path segment:
        // characters such as "?", "&", "#" or "/" must not be able to change the
        // path or query of the embed URL.
        let encodedId;
        try {
            encodedId = encodeURIComponent(videoId);
        }
        catch (e) {
            // encodeURIComponent throws URIError on lone surrogates; such an id
            // cannot be valid, so leave the element as it is.
            continue;
        }
        const iframe = doc.createElement('iframe');
        iframe.width = '560';
        iframe.height = '315';
        iframe.src = `https://www.youtube.com/embed/${encodedId}`;
        iframe.title = el.getAttribute('videotitle') || 'YouTube video player';
        iframe.frameBorder = '0';
        iframe.allow = 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share';
        iframe.setAttribute('allowfullscreen', '');
        el.replaceWith(iframe);
        processedCount++;
    }
    (0, utils_1.logDebug)(_debug, 'Converted embedded elements:', processedCount);
    // Verso (Lean docs) emits many adjacent command/output blocks.
    // Merge contiguous transformed blocks back into one readable block.
    mergeAdjacentVersoCodeBlocks(element);
}
|
|
775
|
+
/**
 * Merges runs of adjacent Verso Lean code blocks into a single block.
 *
 * A run is a sequence of sibling `<pre data-verso-code="true">` elements,
 * separated only by whitespace-only text nodes, each wrapping exactly one
 * `<code>` child with the same language (`lean` or `lean4`). The text of the
 * whole run is joined with newlines into the first block; the other blocks
 * and the whitespace *between* merged blocks are removed. Whitespace that
 * follows the last merged block is left in place.
 *
 * @param {Element} root - Subtree to search for Verso code blocks.
 * @returns {void}
 */
function mergeAdjacentVersoCodeBlocks(root) {
    // Returns the <code> child when `pre` has exactly one element child and
    // that child is a <code>; otherwise null.
    const getCodeNode = (pre) => {
        let code = null;
        for (const child of pre.children) {
            if (child.tagName.toLowerCase() !== 'code')
                return null;
            if (code)
                return null;
            code = child;
        }
        return code;
    };
    // Language from data-lang, falling back to a `language-xxx` class; '' if none.
    const getLanguage = (code) => {
        const dataLang = (code.getAttribute('data-lang') || '').toLowerCase();
        if (dataLang)
            return dataLang;
        const className = code.getAttribute('class') || '';
        const match = className.match(/(?:^|\s)language-([a-z0-9_+-]+)(?:\s|$)/i);
        return match?.[1]?.toLowerCase() || '';
    };
    const isVersoPre = (node) => (0, utils_1.isElement)(node) &&
        node.tagName.toLowerCase() === 'pre' &&
        node.getAttribute('data-verso-code') === 'true';
    const isBlankText = (node) => (0, utils_1.isTextNode)(node) && !(node.textContent || '').trim();
    // Only visit parents of verso code blocks, not every element in the tree
    const parents = new Set();
    for (const candidate of root.querySelectorAll('pre[data-verso-code="true"]')) {
        const parent = candidate.parentElement;
        if (parent)
            parents.add(parent);
    }
    for (const container of parents) {
        // Snapshot: the live child list changes as blocks are removed.
        const children = Array.from(container.childNodes);
        for (let i = 0; i < children.length; i++) {
            const startNode = children[i];
            if (!isVersoPre(startNode))
                continue;
            const startCode = getCodeNode(startNode);
            if (!startCode)
                continue;
            const language = getLanguage(startCode);
            if (language !== 'lean' && language !== 'lean4')
                continue;
            const run = [{ pre: startNode, code: startCode }];
            // Whitespace confirmed to sit between two merged blocks.
            const betweenWhitespace = [];
            // Whitespace seen since the last accepted block; it only becomes
            // "between" whitespace once another block joins the run.
            let pendingWhitespace = [];
            let j = i + 1;
            while (j < children.length) {
                const node = children[j];
                if (isBlankText(node)) {
                    pendingWhitespace.push(node);
                    j++;
                    continue;
                }
                if (!isVersoPre(node))
                    break;
                const code = getCodeNode(node);
                if (!code || getLanguage(code) !== language)
                    break;
                run.push({ pre: node, code });
                betweenWhitespace.push(...pendingWhitespace);
                pendingWhitespace = [];
                j++;
            }
            if (run.length <= 1)
                continue;
            const merged = run
                .map(({ code }) => (code.textContent || '').replace(/\r?\n$/, ''))
                .join('\n')
                .replace(/\n{3,}/g, '\n\n')
                .replace(/^\n+|\n+$/g, '');
            startCode.textContent = merged;
            for (let k = 1; k < run.length; k++) {
                run[k].pre.remove();
            }
            for (const node of betweenWhitespace) {
                node.parentNode?.removeChild(node);
            }
            // Continue scanning after the merged run. `j` is the node that ended
            // the run (or the end of the list); anything skipped between the last
            // merged block and `j` is whitespace and can never start a run.
            i = j - 1;
        }
    }
}
|
|
857
|
+
/**
 * Repeatedly removes or unwraps block elements under `element` that add
 * structure but no content, until a full round makes no further change.
 *
 * Each round runs three passes:
 *   1. direct children of `element` that are block elements,
 *   2. all elements matching `BLOCK_ELEMENTS_SELECTOR`, deepest first,
 *   3. a final sweep that unwraps elements containing only `<p>` children or
 *      that are non-preserved wrappers.
 *
 * @param {Element} element - Root of the subtree to flatten (not itself removed).
 * @param {Document} doc - Document used to create fragments and `<p>` elements.
 * @returns {void}
 */
function flattenWrapperElements(element, doc) {
    let processedCount = 0;
    const startTime = Date.now();
    // Class-name fragments that mark an element as semantically meaningful.
    const SEMANTIC_CLASS_PATTERN = /(?:article|main|content|footnote|reference|bibliography)/;
    const PRESERVED_ROLES = ['article', 'main', 'navigation', 'banner', 'contentinfo'];
    const tagOf = (el) => el.tagName.toLowerCase();
    const hasSemanticClass = (el) => {
        const className = (0, dom_1.getClassName)(el);
        return !!className && SEMANTIC_CLASS_PATTERN.test(className.toLowerCase());
    };
    // True when `el` directly contains non-blank text or an inline element.
    // This helps prevent unwrapping divs that visually act as paragraphs.
    const hasDirectInlineContent = (el) => {
        for (const child of el.childNodes) {
            if ((0, utils_1.isTextNode)(child) && child.textContent?.trim()) {
                return true;
            }
            if ((0, utils_1.isElement)(child) && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())) {
                return true;
            }
        }
        return false;
    };
    // True when `el` has element children and none of them is an inline element.
    const hasOnlyNonInlineChildren = (el) => {
        const children = Array.from(el.children);
        return children.length > 0 && !children.some(child => constants_1.INLINE_ELEMENTS.has(tagOf(child)));
    };
    const shouldPreserveElement = (el) => {
        if (constants_1.PRESERVE_ELEMENTS.has(tagOf(el)))
            return true;
        // Preserve callout structure (div.callout[data-callout] and children)
        if (el.getAttribute('data-callout') || el.closest?.('[data-callout]'))
            return true;
        // Semantic roles
        const role = el.getAttribute('role');
        if (role && PRESERVED_ROLES.includes(role))
            return true;
        // Semantic classes
        if (hasSemanticClass(el))
            return true;
        // An element is also kept when any direct child is itself worth preserving.
        return Array.from(el.children).some(child => constants_1.PRESERVE_ELEMENTS.has(tagOf(child)) ||
            child.getAttribute('role') === 'article' ||
            hasSemanticClass(child));
    };
    // An element is a wrapper exactly when it has no direct inline content.
    // The previous implementation followed this check with further heuristics
    // (empty text, block-only children, wrapper-like class names), but once the
    // element has no non-blank direct text node every remaining path returned
    // true, so those checks could never change the result.
    const isWrapperElement = (el) => !hasDirectInlineContent(el);
    // Replaces `el` with its own child nodes and counts the change.
    const unwrap = (el) => {
        const fragment = doc.createDocumentFragment();
        while (el.firstChild) {
            fragment.appendChild(el.firstChild);
        }
        el.replaceWith(fragment);
        processedCount++;
    };
    // Number of ancestors of `el` that are block elements.
    const blockDepth = (el) => {
        let depth = 0;
        for (let parent = el.parentElement; parent; parent = parent.parentElement) {
            if (constants_1.BLOCK_ELEMENTS_SET.has(tagOf(parent)))
                depth++;
        }
        return depth;
    };
    // Processes a single element; returns true when the DOM was changed.
    const processElement = (el) => {
        // Skip processing if element has been removed or should be preserved
        if (!el.parentNode || shouldPreserveElement(el))
            return false;
        // Case 1: Element is truly empty (no text content, no child elements) and not self-closing
        if (!constants_1.ALLOWED_EMPTY_ELEMENTS.has(tagOf(el)) && !el.children.length && !el.textContent?.trim()) {
            el.remove();
            processedCount++;
            return true;
        }
        // Case 2: Top-level element - be more aggressive
        if (el.parentElement === element && hasOnlyNonInlineChildren(el)) {
            unwrap(el);
            return true;
        }
        // Case 3: Wrapper element - merge up aggressively
        if (isWrapperElement(el)) {
            unwrap(el);
            return true;
        }
        // From here on `el` is known to have direct inline content.
        // Case 4: Element only contains text and/or inline elements - convert to paragraph
        const childNodes = Array.from(el.childNodes);
        const hasOnlyInlineOrText = childNodes.length > 0 && childNodes.every(child => (0, utils_1.isTextNode)(child) ||
            ((0, utils_1.isElement)(child) && constants_1.INLINE_ELEMENTS.has(child.nodeName.toLowerCase())));
        if (hasOnlyInlineOrText && el.textContent?.trim()) { // Ensure there's actual content
            const p = doc.createElement('p');
            // Move all children (including inline tags like <font>) to the new <p>
            while (el.firstChild) {
                p.appendChild(el.firstChild);
            }
            el.replaceWith(p);
            processedCount++;
            return true;
        }
        // Case 5: Element has single child - unwrap only if child is block-level
        if (el.children.length === 1) {
            const child = el.firstElementChild;
            // Replacing `el` with its only element child throws away every other
            // child node, so refuse when that would drop visible text.
            const hasLooseText = childNodes.some(node => (0, utils_1.isTextNode)(node) && node.textContent?.trim());
            if (!hasLooseText &&
                constants_1.BLOCK_ELEMENTS_SET.has(tagOf(child)) &&
                !shouldPreserveElement(child)) {
                el.replaceWith(child);
                processedCount++;
                return true;
            }
        }
        // The former "deeply nested" case required the element to have no direct
        // inline content, which Case 3 has already handled, so it could never run.
        return false;
    };
    // Runs processElement over `elements`; true when any call changed the DOM.
    const runPass = (elements) => {
        let modified = false;
        for (const el of elements) {
            if (processElement(el)) {
                modified = true;
            }
        }
        return modified;
    };
    // First pass: Process top-level wrapper elements
    const processTopLevelElements = () => runPass(Array.from(element.children)
        .filter(el => constants_1.BLOCK_ELEMENTS_SET.has(tagOf(el))));
    // Second pass: Process remaining wrapper elements from deepest to shallowest
    const processRemainingElements = () => {
        const allElements = Array.from(element.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR));
        // Compute each depth once instead of re-walking the ancestor chain on
        // every comparison made by the sort.
        const depths = new Map();
        for (const el of allElements) {
            depths.set(el, blockDepth(el));
        }
        allElements.sort((a, b) => depths.get(b) - depths.get(a)); // deepest first
        return runPass(allElements);
    };
    // Final cleanup pass - aggressively flatten remaining wrapper elements
    const finalCleanup = () => {
        let modified = false;
        for (const el of Array.from(element.querySelectorAll(constants_1.BLOCK_ELEMENTS_SELECTOR))) {
            const children = Array.from(el.children);
            const onlyParagraphs = children.length > 0 && children.every(child => tagOf(child) === 'p');
            // Unwrap if it only contains paragraphs OR is a non-preserved wrapper element
            if (onlyParagraphs || (!shouldPreserveElement(el) && isWrapperElement(el))) {
                unwrap(el);
                modified = true;
            }
        }
        return modified;
    };
    // Execute all passes until no more changes. Every pass runs in every
    // round, even when an earlier pass already reported a change.
    let keepProcessing;
    do {
        const topChanged = processTopLevelElements();
        const nestedChanged = processRemainingElements();
        const cleanupChanged = finalCleanup();
        keepProcessing = topChanged || nestedChanged || cleanupChanged;
    } while (keepProcessing);
    const endTime = Date.now();
    (0, utils_1.logDebug)(_debug, 'Flattened wrapper elements:', {
        count: processedCount,
        processingTime: `${(endTime - startTime).toFixed(2)}ms`
    });
}
|
|
1101
|
+
//# sourceMappingURL=standardize.js.map
|