@mz1999/defuddle 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +371 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +145 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/constants.d.ts +24 -0
  7. package/dist/constants.js +950 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/defuddle.d.ts +136 -0
  10. package/dist/defuddle.js +1816 -0
  11. package/dist/defuddle.js.map +1 -0
  12. package/dist/elements/callouts.d.ts +6 -0
  13. package/dist/elements/callouts.js +74 -0
  14. package/dist/elements/callouts.js.map +1 -0
  15. package/dist/elements/code.d.ts +5 -0
  16. package/dist/elements/code.js +346 -0
  17. package/dist/elements/code.js.map +1 -0
  18. package/dist/elements/footnotes.d.ts +5 -0
  19. package/dist/elements/footnotes.js +619 -0
  20. package/dist/elements/footnotes.js.map +1 -0
  21. package/dist/elements/headings.d.ts +11 -0
  22. package/dist/elements/headings.js +100 -0
  23. package/dist/elements/headings.js.map +1 -0
  24. package/dist/elements/images.d.ts +8 -0
  25. package/dist/elements/images.js +877 -0
  26. package/dist/elements/images.js.map +1 -0
  27. package/dist/elements/math.base.d.ts +9 -0
  28. package/dist/elements/math.base.js +195 -0
  29. package/dist/elements/math.base.js.map +1 -0
  30. package/dist/elements/math.core.d.ts +7 -0
  31. package/dist/elements/math.core.js +52 -0
  32. package/dist/elements/math.core.js.map +1 -0
  33. package/dist/elements/math.d.ts +2 -0
  34. package/dist/elements/math.full.d.ts +8 -0
  35. package/dist/elements/math.js +7 -0
  36. package/dist/elements/math.js.map +1 -0
  37. package/dist/extractor-registry.d.ts +16 -0
  38. package/dist/extractor-registry.js +140 -0
  39. package/dist/extractor-registry.js.map +1 -0
  40. package/dist/extractors/_base.d.ts +22 -0
  41. package/dist/extractors/_base.js +27 -0
  42. package/dist/extractors/_base.js.map +1 -0
  43. package/dist/extractors/_conversation.d.ts +9 -0
  44. package/dist/extractors/_conversation.js +78 -0
  45. package/dist/extractors/_conversation.js.map +1 -0
  46. package/dist/extractors/chatgpt.d.ts +14 -0
  47. package/dist/extractors/chatgpt.js +138 -0
  48. package/dist/extractors/chatgpt.js.map +1 -0
  49. package/dist/extractors/claude.d.ts +10 -0
  50. package/dist/extractors/claude.js +91 -0
  51. package/dist/extractors/claude.js.map +1 -0
  52. package/dist/extractors/gemini.d.ts +14 -0
  53. package/dist/extractors/gemini.js +111 -0
  54. package/dist/extractors/gemini.js.map +1 -0
  55. package/dist/extractors/github.d.ts +20 -0
  56. package/dist/extractors/github.js +251 -0
  57. package/dist/extractors/github.js.map +1 -0
  58. package/dist/extractors/grok.d.ts +15 -0
  59. package/dist/extractors/grok.js +142 -0
  60. package/dist/extractors/grok.js.map +1 -0
  61. package/dist/extractors/hackernews.d.ts +21 -0
  62. package/dist/extractors/hackernews.js +155 -0
  63. package/dist/extractors/hackernews.js.map +1 -0
  64. package/dist/extractors/reddit.d.ts +22 -0
  65. package/dist/extractors/reddit.js +197 -0
  66. package/dist/extractors/reddit.js.map +1 -0
  67. package/dist/extractors/twitter.d.ts +16 -0
  68. package/dist/extractors/twitter.js +204 -0
  69. package/dist/extractors/twitter.js.map +1 -0
  70. package/dist/extractors/x-article.d.ts +24 -0
  71. package/dist/extractors/x-article.js +267 -0
  72. package/dist/extractors/x-article.js.map +1 -0
  73. package/dist/extractors/x-oembed.d.ts +20 -0
  74. package/dist/extractors/x-oembed.js +350 -0
  75. package/dist/extractors/x-oembed.js.map +1 -0
  76. package/dist/extractors/youtube.d.ts +87 -0
  77. package/dist/extractors/youtube.js +869 -0
  78. package/dist/extractors/youtube.js.map +1 -0
  79. package/dist/fetch.d.ts +18 -0
  80. package/dist/fetch.js +265 -0
  81. package/dist/fetch.js.map +1 -0
  82. package/dist/index.d.ts +3 -0
  83. package/dist/index.full.d.ts +12 -0
  84. package/dist/index.full.js +1 -0
  85. package/dist/index.js +1 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/markdown.d.ts +30 -0
  88. package/dist/markdown.js +661 -0
  89. package/dist/markdown.js.map +1 -0
  90. package/dist/metadata.d.ts +25 -0
  91. package/dist/metadata.js +426 -0
  92. package/dist/metadata.js.map +1 -0
  93. package/dist/node.d.ts +19 -0
  94. package/dist/node.js +78 -0
  95. package/dist/node.js.map +1 -0
  96. package/dist/scoring.d.ts +31 -0
  97. package/dist/scoring.js +472 -0
  98. package/dist/scoring.js.map +1 -0
  99. package/dist/standardize.d.ts +2 -0
  100. package/dist/standardize.js +1101 -0
  101. package/dist/standardize.js.map +1 -0
  102. package/dist/types/extractors.d.ts +41 -0
  103. package/dist/types/extractors.js +3 -0
  104. package/dist/types/extractors.js.map +1 -0
  105. package/dist/types.d.ts +135 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/comments.d.ts +44 -0
  109. package/dist/utils/comments.js +103 -0
  110. package/dist/utils/comments.js.map +1 -0
  111. package/dist/utils/dom.d.ts +42 -0
  112. package/dist/utils/dom.js +104 -0
  113. package/dist/utils/dom.js.map +1 -0
  114. package/dist/utils/linkedom-compat.d.ts +5 -0
  115. package/dist/utils/linkedom-compat.js +23 -0
  116. package/dist/utils/linkedom-compat.js.map +1 -0
  117. package/dist/utils/transcript.d.ts +37 -0
  118. package/dist/utils/transcript.js +61 -0
  119. package/dist/utils/transcript.js.map +1 -0
  120. package/dist/utils.d.ts +13 -0
  121. package/dist/utils.js +98 -0
  122. package/dist/utils.js.map +1 -0
  123. package/package.json +107 -0
@@ -0,0 +1,661 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.isGenericElement = isGenericElement;
7
+ exports.asGenericElement = asGenericElement;
8
+ exports.createMarkdownContent = createMarkdownContent;
9
+ exports.toMarkdown = toMarkdown;
10
+ const turndown_1 = __importDefault(require("turndown"));
11
+ const utils_1 = require("./utils");
12
+ const dom_1 = require("./utils/dom");
13
/**
 * Duck-type check used throughout the Turndown rules.
 *
 * @param {*} node - Any value handed over by Turndown or a DOM query.
 * @returns {boolean} `true` when `node` is a non-null object that exposes a
 *   `getAttribute` member (own or inherited), `false` otherwise.
 */
function isGenericElement(node) {
    if (typeof node !== 'object' || node === null) {
        return false;
    }
    return 'getAttribute' in node;
}
16
/**
 * Identity helper: hands back the very same object it receives. No runtime
 * check or conversion is performed here.
 *
 * @param {*} node - Value to pass through.
 * @returns {*} The same `node` reference.
 */
function asGenericElement(node) {
    const element = node;
    return element;
}
19
// A srcset width descriptor such as `424w`, optionally followed by the comma
// that separates it from the next candidate.
const WIDTH_DESCRIPTOR_RE = /^(\d+)w,?$/;
// A srcset pixel-density descriptor such as `2x` or `1.5x`, optionally
// followed by a separating comma.
const DENSITY_DESCRIPTOR_RE = /^\d+(?:\.\d+)?x,?$/;
/**
 * Picks the URL to use for an image element.
 *
 * The `srcset` attribute is scanned token by token (split on whitespace, not
 * on commas, because CDN URLs such as Substack's may contain commas in the
 * path, e.g. `w_424,c_limit,f_webp`). Tokens collected before a width
 * descriptor form that candidate's URL; the candidate with the largest width
 * wins. Density descriptors only reset the collected tokens and never select
 * a candidate.
 *
 * @param {{getAttribute: function(string): (string|null)}} node - Image element.
 * @returns {string} The widest `srcset` candidate, else the `src` attribute,
 *   else an empty string.
 */
function getBestImageSrc(node) {
    const fallbackSrc = () => node.getAttribute('src') || '';
    const srcset = node.getAttribute('srcset');
    if (!srcset) {
        return fallbackSrc();
    }
    let winner = { url: '', width: 0 };
    let pending = [];
    srcset.trim().split(/\s+/).forEach((token) => {
        const widthHit = WIDTH_DESCRIPTOR_RE.exec(token);
        if (widthHit === null) {
            if (DENSITY_DESCRIPTOR_RE.test(token)) {
                // Density descriptor (e.g. 2x) — not used for selection
                pending = [];
            }
            else {
                pending.push(token);
            }
            return;
        }
        const width = parseInt(widthHit[1], 10);
        // Only build the URL when this candidate can actually win.
        const candidate = pending.length > 0 && width > winner.width
            ? pending.join(' ').replace(/^,\s*/, '')
            : '';
        if (candidate) {
            winner = { url: candidate, width };
        }
        pending = [];
    });
    return winner.url || fallbackSrc();
}
58
+ function createMarkdownContent(content, url) {
59
+ const footnotes = {};
60
+ const turndownService = new turndown_1.default({
61
+ headingStyle: 'atx',
62
+ hr: '---',
63
+ bulletListMarker: '-',
64
+ codeBlockStyle: 'fenced',
65
+ emDelimiter: '*',
66
+ preformattedCode: true,
67
+ });
68
// Tables: ArXiv equation tables become LaTeX, single-column layout tables are
// unwrapped, tables with colspan/rowspan stay as cleaned HTML, and everything
// else becomes a pipe table.
turndownService.addRule('table', {
    filter: 'table',
    /**
     * @param {string} content - Markdown Turndown already produced for the
     *   table's children; returned unchanged when nothing better can be built.
     * @param {*} node - The <table> node.
     * @returns {string} Markdown (or cleaned HTML) for the table.
     */
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        // Check if it's an ArXiv equation table
        if (node.classList?.contains('ltx_equation') || node.classList?.contains('ltx_eqn_table')) {
            return handleNestedEquations(node);
        }
        // Detect layout tables (used for styling/positioning, not data)
        const hasNestedTables = node.querySelector('table') !== null;
        const directCells = Array.from(node.querySelectorAll('td, th')).filter((el) => (0, dom_1.isDirectTableChild)(el, node));
        if (hasNestedTables || directCells.length <= 1) {
            const directRows = Array.from(node.querySelectorAll('tr')).filter((el) => (0, dom_1.isDirectTableChild)(el, node));
            const cellCounts = directRows.map((tr) => directCells.filter((cell) => cell.parentNode === tr).length);
            const isSingleColumn = directRows.length > 0
                && new Set(cellCounts).size === 1
                && cellCounts[0] <= 1;
            if (isSingleColumn) {
                // Layout table — extract content, don't convert to markdown table
                return '\n\n' + turndownService.turndown(directCells.map((cell) => (0, dom_1.serializeHTML)(cell)).join('')) + '\n\n';
            }
        }
        // Check if the table has colspan or rowspan
        const cells = Array.from(node.querySelectorAll('td, th'));
        const hasComplexStructure = cells.some(cell => isGenericElement(asGenericElement(cell)) && (cell.hasAttribute('colspan') || cell.hasAttribute('rowspan')));
        if (hasComplexStructure) {
            // Pipe tables cannot express spans, so keep (cleaned) HTML instead
            const cleanedTable = cleanupTableHTML(node);
            return '\n\n' + cleanedTable + '\n\n';
        }
        // Use node.rows/row.cells when available (browser/JSDOM), fall back to
        // querySelectorAll for environments like linkedom that lack these properties
        const tableEl = node;
        const rowElements = tableEl.rows && tableEl.rows.length > 0
            ? Array.from(tableEl.rows)
            : Array.from(node.querySelectorAll('tr')).filter((tr) => (0, dom_1.isDirectTableChild)(tr, node));
        // Keep each row as an array of cell strings so the column count can be
        // read from the cells themselves rather than re-parsed from the output.
        const rows = rowElements.map((row) => {
            const cellElements = row.cells && row.cells.length > 0
                ? Array.from(row.cells)
                : Array.from(row.querySelectorAll('td, th')).filter((cell) => cell.parentNode === row);
            return cellElements.map((cell) => turndownService.turndown((0, dom_1.serializeHTML)(cell))
                // A table row must stay on one line
                .replace(/\n/g, ' ')
                .trim()
                // Escape pipe characters so they are not read as column breaks
                .replace(/\|/g, '\\|'));
        });
        if (!rows.length)
            return content;
        const formatRow = (cellTexts) => `| ${cellTexts.join(' | ')} |`;
        // The delimiter row must have exactly one entry per header cell.
        // Counting '|' in the rendered header would also count the escaped
        // `\|` inside cell text and produce too many columns.
        const columnCount = Math.max(1, rows[0].length);
        const separatorRow = formatRow(Array(columnCount).fill('---'));
        const tableContent = [formatRow(rows[0]), separatorRow, ...rows.slice(1).map(formatRow)].join('\n');
        return `\n\n${tableContent}\n\n`;
    }
});
130
+ turndownService.remove(['style', 'script']);
131
+ // Keep iframes, video, audio, sup, and sub elements
132
+ // @ts-ignore
133
+ turndownService.keep(['iframe', 'video', 'audio', 'sup', 'sub', 'svg', 'math']);
134
+ turndownService.remove(['button']);
135
// <ul>/<ol> containers: trim what the items produced and terminate the list
// with a newline. Only a list that is not directly inside another <ul>/<ol>
// also gets a leading newline.
turndownService.addRule('list', {
    filter: ['ul', 'ol'],
    replacement: (content, node) => {
        const parentName = node.parentNode ? node.parentNode.nodeName : null;
        const isNested = parentName === 'UL' || parentName === 'OL';
        const body = content.trim();
        return isNested ? `${body}\n` : `\n${body}\n`;
    }
});
146
// Lists with tab indentation
turndownService.addRule('listItem', {
    filter: 'li',
    /**
     * Renders one <li> as a bullet, numbered, or task-list line, indented with
     * one tab per nesting level below the outermost list.
     *
     * @param {string} content - Markdown already produced for the item's children.
     * @param {*} node - The <li> node.
     * @param {{bulletListMarker: string}} options - Turndown options.
     * @returns {string} The Markdown line(s) for this item.
     */
    replacement: function (content, node, options) {
        if (!isGenericElement(node))
            return content;
        // Handle task list items
        const isTaskListItem = node.classList?.contains('task-list-item');
        const checkbox = node.querySelector('input[type="checkbox"]');
        let taskListMarker = '';
        if (isTaskListItem && checkbox && isGenericElement(checkbox)) {
            // Remove the checkbox from content since we'll add markdown checkbox
            content = content.replace(/<input[^>]*>/, '');
            // `checked` is a boolean attribute: `<input checked>` yields the
            // empty string from getAttribute, so test for presence, not truthiness.
            taskListMarker = checkbox.getAttribute('checked') != null ? '[x] ' : '[ ] ';
        }
        content = content
            // Remove trailing newlines
            .replace(/\n+$/, '')
            // Split into lines
            .split('\n')
            // Remove empty lines
            .filter(line => line.length > 0)
            // Add indentation to continued lines
            .join('\n\t');
        // Calculate the nesting level: count UL/OL ancestors reached through
        // an unbroken chain of UL/OL/LI elements.
        let level = 0;
        let currentParent = node.parentNode;
        while (currentParent && isGenericElement(currentParent)) {
            if (currentParent.nodeName === 'UL' || currentParent.nodeName === 'OL') {
                level++;
            }
            else if (currentParent.nodeName !== 'LI') {
                break;
            }
            currentParent = currentParent.parentNode;
        }
        // One tab per level below the outermost list; clamped so that
        // String.prototype.repeat never receives a negative count.
        const indent = '\t'.repeat(Math.max(0, level - 1));
        let marker = options.bulletListMarker;
        const parent = node.parentNode;
        if (parent && isGenericElement(parent) && parent.nodeName === 'OL') {
            const start = parent.getAttribute('start');
            const position = Array.from(parent.children || []).indexOf(node);
            const index = position === -1 ? 1 : position + 1;
            // Ignore a non-numeric `start` instead of emitting "NaN."
            const parsedStart = start ? Number(start) : NaN;
            const first = Number.isFinite(parsedStart) ? parsedStart : 1;
            marker = `${first + index - 1}.`;
        }
        const prefix = `${indent}${marker} `;
        return prefix + taskListMarker + content.trim() + (node.nextSibling && !/\n$/.test(content) ? '\n' : '');
    }
});
202
// <figure>: emit the image followed by its caption as a separate paragraph.
turndownService.addRule('figure', {
    filter: 'figure',
    /**
     * @param {string} content - Markdown already produced for the figure's
     *   children; returned unchanged when the figure holds no <img>.
     * @param {*} node - The <figure> node.
     * @returns {string} `![alt](src)` followed by the caption text.
     */
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        const img = node.querySelector('img');
        const figcaption = node.querySelector('figcaption');
        if (!img || !isGenericElement(img))
            return content;
        const alt = img.getAttribute('alt') || '';
        const src = getBestImageSrc(img);
        let caption = '';
        if (figcaption && isGenericElement(figcaption)) {
            const tagSpan = figcaption.querySelector('.ltx_tag_figure');
            const tagText = tagSpan && isGenericElement(tagSpan) ? tagSpan.textContent?.trim() : '';
            // Process the caption content, including math elements
            let captionContent = (0, dom_1.serializeHTML)(figcaption);
            const ownerDoc = node.ownerDocument;
            // [\s\S] instead of `.` so that <math> markup serialized across
            // several lines is still recognised and replaced by its LaTeX.
            captionContent = captionContent.replace(/<math[\s\S]*?>([\s\S]*?)<\/math>/g, (match, _inner, offset, string) => {
                let latex = '';
                if (ownerDoc) {
                    const fragment = (0, dom_1.parseHTML)(ownerDoc, match);
                    const mathElement = fragment.querySelector('math');
                    latex = mathElement && isGenericElement(mathElement) ? extractLatex(mathElement) : '';
                }
                // Pad the inline formula with a space unless it already
                // touches whitespace, a `$`, or the start/end of the caption.
                const prevChar = string[offset - 1] || '';
                const nextChar = string[offset + match.length] || '';
                const isStartOfLine = offset === 0 || /\s/.test(prevChar);
                const isEndOfLine = offset + match.length === string.length || /\s/.test(nextChar);
                const leftSpace = (!isStartOfLine && !/[\s$]/.test(prevChar)) ? ' ' : '';
                const rightSpace = (!isEndOfLine && !/[\s$]/.test(nextChar)) ? ' ' : '';
                return `${leftSpace}$${latex}$${rightSpace}`;
            });
            // Convert the processed caption content to markdown
            const captionMarkdown = turndownService.turndown(captionContent);
            // Combine tag and processed caption
            caption = `${tagText} ${captionMarkdown}`.trim();
        }
        // Links inside the caption are already valid Markdown at this point,
        // so no further rewriting is needed.
        return `![${alt}](${src})\n\n${caption}\n\n`;
    }
});
247
// Images: use the widest srcset candidate when there is one, rather than the
// (often smaller) fallback in src. An image without any usable URL is dropped.
turndownService.addRule('image', {
    filter: 'img',
    replacement: (content, node) => {
        if (!isGenericElement(node)) {
            return content;
        }
        const alt = node.getAttribute('alt') || '';
        const src = getBestImageSrc(node);
        const title = node.getAttribute('title') || '';
        if (!src) {
            return '';
        }
        const titleSuffix = title ? ` "${title}"` : '';
        return `![${alt}](${src}${titleSuffix})`;
    }
});
260
// Use Obsidian format for YouTube embeds and tweets
/**
 * Maps an embed `src` to the canonical URL used in the `![](url)` output.
 *
 * The host is matched at the start of the URL (optionally behind a scheme and
 * sub-domains). A plain substring test would also accept unrelated hosts that
 * merely end in "x.com" (e.g. netflix.com, dropbox.com), and the element
 * would then be replaced by its empty content instead of being handled by
 * the regular image/iframe handling.
 *
 * @param {string|null} rawSrc - Value of the element's `src` attribute.
 * @returns {string|null} Canonical YouTube/X URL, or `null` when `rawSrc` is
 *   not a recognised embed.
 */
const resolveEmbedUrl = (rawSrc) => {
    const src = (rawSrc || '').trim();
    if (!src) {
        return null;
    }
    const youtubeMatch = src.match(/^(?:(?:https?:)?\/\/)?(?:[a-z0-9-]+\.)*(?:youtube\.com|youtube-nocookie\.com|youtu\.be)\/(?:embed\/|watch\?v=)?([a-zA-Z0-9_-]+)/i);
    if (youtubeMatch && youtubeMatch[1]) {
        return `https://www.youtube.com/watch?v=${youtubeMatch[1]}`;
    }
    // Direct URL: /user/status/id
    const tweetDirectMatch = src.match(/^(?:(?:https?:)?\/\/)?(?:[a-z0-9-]+\.)*(?:twitter\.com|x\.com)\/([^/]+)\/status\/([0-9]+)/i);
    if (tweetDirectMatch) {
        return `https://x.com/${tweetDirectMatch[1]}/status/${tweetDirectMatch[2]}`;
    }
    // Platform embed: ?id=
    const tweetEmbedMatch = src.match(/^(?:(?:https?:)?\/\/)?(?:[a-z0-9-]+\.)*twitter\.com\/embed\/Tweet\.html\?.*?id=([0-9]+)/i);
    if (tweetEmbedMatch) {
        return `https://x.com/i/status/${tweetEmbedMatch[1]}`;
    }
    return null;
};
turndownService.addRule('embedToMarkdown', {
    // Claim only elements that the replacement can actually convert, so that
    // everything else is left to the other rules.
    filter: function (node) {
        if (!isGenericElement(node))
            return false;
        return resolveEmbedUrl(node.getAttribute('src')) !== null;
    },
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        const embedUrl = resolveEmbedUrl(node.getAttribute('src'));
        return embedUrl ? `\n![](${embedUrl})\n` : content;
    }
});
292
// <mark> becomes ==highlighted== text.
turndownService.addRule('highlight', {
    filter: 'mark',
    replacement: (content) => `==${content}==`
});
// <del>, <s> and <strike> become ~~struck-through~~ text.
turndownService.addRule('strikethrough', {
    filter: (node) => ['DEL', 'S', 'STRIKE'].includes(node.nodeName),
    replacement: (content) => `~~${content}~~`
});
306
// Links that wrap a heading plus other content (e.g. a whole card inside an
// <a>): emit the heading and the remaining content as normal blocks, then a
// separate "View original" link pointing at the href.
turndownService.addRule('complexLinkStructure', {
    filter: function (node) {
        return (node.nodeName === 'A' &&
            node.childNodes.length > 1 &&
            Array.from(node.childNodes).some(child => ['H1', 'H2', 'H3', 'H4', 'H5', 'H6'].includes(child.nodeName)));
    },
    /**
     * @param {string} content - Markdown already produced for the link's children.
     * @param {*} node - The <a> node. NOTE: its first heading is removed from
     *   the DOM while converting.
     * @returns {string} Heading, remaining content, and a trailing link.
     */
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        const href = node.getAttribute('href');
        const title = node.getAttribute('title');
        // Extract the heading — use outerHTML to preserve the heading tag
        const headingNode = node.querySelector('h1, h2, h3, h4, h5, h6');
        const headingContent = headingNode ? turndownService.turndown(headingNode.outerHTML) : '';
        // Remove the heading so it is not converted a second time below
        if (headingNode) {
            headingNode.remove();
        }
        // Convert the remaining content
        const remainingContent = turndownService.turndown((0, dom_1.serializeHTML)(node));
        // Construct the new markdown
        let markdown = `${headingContent}\n\n${remainingContent}\n\n`;
        if (href) {
            // A link title belongs inside the parentheses, after the
            // destination — the same form the image rule in this file emits.
            const titlePart = title ? ` "${title}"` : '';
            markdown += `[View original](${href}${titlePart})`;
        }
        return markdown;
    }
});
338
// ArXiv (LaTeXML) enumerations: <ol class="ltx_enumerate">. Each child is
// re-numbered from its position and converted on its own, after stripping a
// leading `<span class="ltx_tag ltx_tag_item">N.</span>` label.
turndownService.addRule('arXivEnumerate', {
    filter: (node) => {
        if (node.nodeName !== 'OL' || !isGenericElement(node)) {
            return false;
        }
        return node.classList?.contains('ltx_enumerate') ?? false;
    },
    replacement: (content, node) => {
        if (!isGenericElement(node)) {
            return content;
        }
        const renderItem = (item, position) => {
            if (!isGenericElement(item)) {
                return '';
            }
            const html = (0, dom_1.serializeHTML)(item) || '';
            const withoutLabel = html.replace(/^<span class="ltx_tag ltx_tag_item">\d+\.<\/span>\s*/, '');
            return `${position + 1}. ${turndownService.turndown(withoutLabel)}`;
        };
        const rendered = Array.from(node.children || []).map(renderItem);
        return `\n\n${rendered.join('\n\n')}\n\n`;
    }
});
355
// Footnote references: <sup id="fnref:N"> (or "fnref:N-M") becomes `[^N]`,
// where N is the part of the id between "fnref:" and the first "-".
turndownService.addRule('citations', {
    filter: (node) => {
        if (!isGenericElement(node)) {
            return false;
        }
        const id = node.getAttribute('id');
        return node.nodeName === 'SUP' && id !== null && id.startsWith('fnref:');
    },
    replacement: (content, node) => {
        if (!isGenericElement(node)) {
            return content;
        }
        const id = node.getAttribute('id');
        const isFootnoteRef = node.nodeName === 'SUP' && id !== null && id.startsWith('fnref:');
        if (!isFootnoteRef) {
            return content;
        }
        const [primaryNumber] = id.replace('fnref:', '').split('-');
        return `[^${primaryNumber}]`;
    }
});
374
// Footnotes list: an <ol> whose parent has id="footnotes". Each child is
// emitted as a `[^id]: text` definition.
turndownService.addRule('footnotesList', {
    filter: (node) => {
        if (!isGenericElement(node)) {
            return false;
        }
        const parentNode = node.parentNode;
        return (node.nodeName === 'OL' &&
            parentNode !== null &&
            isGenericElement(parentNode) &&
            parentNode.getAttribute('id') === 'footnotes');
    },
    replacement: (content, node) => {
        if (!isGenericElement(node)) {
            return content;
        }
        // Derive the footnote label from the item's id attribute:
        //   "fn:3"                -> "3"
        //   ".../cite_note-Foo"   -> "Foo"   (last "/"-separated segment)
        //   anything else         -> the id itself
        //   no id attribute       -> undefined
        const labelFromId = (liId) => {
            if (liId === null) {
                return undefined;
            }
            if (liId.startsWith('fn:')) {
                return liId.replace('fn:', '');
            }
            const citeNote = liId.split('/').pop()?.match(/cite_note-(.+)/);
            return citeNote ? citeNote[1] : liId;
        };
        const renderDefinition = (li) => {
            if (!isGenericElement(li)) {
                return '';
            }
            const id = labelFromId(li.getAttribute('id'));
            // Remove the leading sup element if its content matches the footnote id
            const supElement = li.querySelector('sup');
            if (supElement && isGenericElement(supElement) && supElement.textContent?.trim() === id) {
                supElement.remove();
            }
            // Convert after the removal above, then strip the trailing backlink arrow
            const body = turndownService.turndown((0, dom_1.serializeHTML)(li))
                .replace(/\s*↩︎$/, '')
                .trim();
            return `[^${id?.toLowerCase()}]: ${body}`;
        };
        const references = Array.from(node.children || []).map(renderDefinition);
        return `\n\n${references.join('\n\n')}\n\n`;
    }
});
417
// General removal rules for various website elements
turndownService.addRule('removals', {
    filter: (node) => {
        if (!isGenericElement(node)) {
            return false;
        }
        // Footnote backlinks: anything whose href contains "#fnref" ...
        const pointsToFootnoteRef = node.getAttribute('href')?.includes('#fnref');
        // ... or that carries the "footnote-backref" class.
        return Boolean(pointsToFootnoteRef) || Boolean(node.classList?.contains('footnote-backref'));
    },
    // Matching elements produce no output at all.
    replacement: () => ''
});
433
// Text nodes whose parent is a <td>: pass their content through unchanged.
turndownService.addRule('handleTextNodesInTables', {
    filter: (node) => {
        if (!(0, utils_1.isTextNode)(node)) {
            return false;
        }
        const parent = node.parentNode;
        return parent !== null && parent.nodeName === 'TD';
    },
    replacement: (content) => content
});
443
// <pre><code>…</code></pre> becomes a fenced code block.
turndownService.addRule('preformattedCode', {
    filter: (node) => {
        return node.nodeName === 'PRE';
    },
    /**
     * @param {string} content - Markdown already produced for the <pre>'s
     *   children; returned unchanged when there is no inner <code>.
     * @param {*} node - The <pre> node.
     * @returns {string} A fenced code block with an optional language tag.
     */
    replacement: (content, node) => {
        if (!isGenericElement(node))
            return content;
        const codeElement = node.querySelector('code');
        if (!codeElement || !isGenericElement(codeElement))
            return content;
        // Language: data-lang, data-language, a `language-xxx` class on the
        // <code>, then data-language on the <pre>.
        const language = codeElement.getAttribute('data-lang')
            || codeElement.getAttribute('data-language')
            || codeElement.getAttribute('class')?.match(/language-(\w+)/)?.[1]
            || node.getAttribute('data-language')
            || '';
        const code = (codeElement.textContent || '').trim();
        // The body of a fenced block is literal text: backslash escapes are
        // not interpreted there, so escaping backticks would leave stray
        // backslashes in the code. Instead, make the fence longer than any
        // backtick fence that opens a line inside the code.
        let fenceLength = 3;
        for (const [, run] of code.matchAll(/^ {0,3}(`{3,})/gm)) {
            fenceLength = Math.max(fenceLength, run.length + 1);
        }
        const fence = '`'.repeat(fenceLength);
        return `\n${fence}${language}\n${code}\n${fence}\n`;
    }
});
466
// MathML <math> elements and MediaWiki math wrappers/fallback images become
// `$…$` (inline) or `$$ … $$` (block) using the LaTeX found by extractLatex.
turndownService.addRule('math', {
    filter: (node) => {
        if (node.nodeName.toLowerCase() === 'math') {
            return true;
        }
        return isGenericElement(node) &&
            (node.classList?.contains('mwe-math-element') ||
                node.classList?.contains('mwe-math-fallback-image-inline') ||
                node.classList?.contains('mwe-math-fallback-image-display'));
    },
    replacement: (content, node) => {
        if (!isGenericElement(node)) {
            return content;
        }
        const latex = extractLatex(node).trim();
        // Math inside a table is always rendered inline
        const isInTable = typeof node.closest === 'function' ? node.closest('table') !== null : false;
        // True when the node sits in a `.mwe-math-element` wrapper whose
        // previous sibling is a <p> element.
        const wrapperFollowsParagraph = () => {
            const wrapper = node.parentNode;
            if (!wrapper || !isGenericElement(wrapper) || !wrapper.classList?.contains('mwe-math-element')) {
                return false;
            }
            const before = wrapper.previousSibling;
            return Boolean(before) && isGenericElement(before) && before.nodeName.toLowerCase() === 'p';
        };
        const isBlock = !isInTable && (node.getAttribute('display') === 'block' ||
            node.classList?.contains('mwe-math-fallback-image-display') ||
            wrapperFollowsParagraph());
        if (isBlock) {
            return `\n$$\n${latex}\n$$\n`;
        }
        // Inline math: pad with a space only when the adjacent element
        // sibling's text neither ends/starts with whitespace nor with `$`.
        const before = node.previousSibling;
        const after = node.nextSibling;
        const charBefore = before && isGenericElement(before) ? before.textContent?.slice(-1) || '' : '';
        const charAfter = after && isGenericElement(after) ? after.textContent?.[0] || '' : '';
        const atLineStart = !before || ((0, utils_1.isTextNode)(before) && before.textContent?.trim() === '');
        const atLineEnd = !after || ((0, utils_1.isTextNode)(after) && after.textContent?.trim() === '');
        const leftPad = (!atLineStart && charBefore && !/[\s$]/.test(charBefore)) ? ' ' : '';
        const rightPad = (!atLineEnd && charAfter && !/[\s$]/.test(charAfter)) ? ' ' : '';
        return `${leftPad}$${latex}$${rightPad}`;
    }
});
505
// Elements with class "math" or "katex": recover the LaTeX source and emit it
// as `$…$` or `$$ … $$`.
turndownService.addRule('katex', {
    filter: (node) => {
        if (!isGenericElement(node)) {
            return false;
        }
        return node.classList?.contains('math') || node.classList?.contains('katex');
    },
    replacement: (content, node) => {
        if (!isGenericElement(node)) {
            return content;
        }
        // LaTeX source candidates, tried in this order:
        // 1. the data-latex attribute
        // 2. the TeX annotation inside .katex-mathml
        // 3. the element's trimmed text content
        const fromAnnotation = () => {
            const annotation = node.querySelector('.katex-mathml annotation[encoding="application/x-tex"]');
            return annotation && isGenericElement(annotation) ? annotation.textContent || '' : '';
        };
        const fromText = () => node.textContent?.trim() || '';
        const latex = node.getAttribute('data-latex') || fromAnnotation() || fromText();
        // Inline when tagged `math-inline`, or when the embedded <math>
        // element is not display="block"; everything else is a block.
        const mathElement = node.querySelector('.katex-mathml math');
        const isInline = node.classList?.contains('math-inline') ||
            (mathElement && isGenericElement(mathElement) && mathElement.getAttribute('display') !== 'block');
        return isInline ? `$${latex}$` : `\n$$\n${latex}\n$$\n`;
    }
});
537
// All callout types (GitHub alerts, Bootstrap alerts, callout asides) are
// standardized to div.callout[data-callout] in callouts.ts; here they are
// rendered as `> [!type] Title` block quotes.
turndownService.addRule('callout', {
    filter: (node) => isGenericElement(node)
        && !!node.getAttribute('data-callout')
        && node.classList?.contains('callout'),
    replacement: (content, node) => {
        if (!isGenericElement(node)) {
            return content;
        }
        const type = node.getAttribute('data-callout') || 'note';
        // Title: text of .callout-title-inner, else the capitalized type
        const explicitTitle = node.querySelector('.callout-title-inner')?.textContent?.trim();
        const title = explicitTitle || type.charAt(0).toUpperCase() + type.slice(1);
        // Drop the title element from the DOM before converting the body,
        // so the title is not repeated inside the quoted content
        node.querySelector('.callout-title')?.remove();
        // Convert .callout-content when present, otherwise the whole node
        const source = node.querySelector('.callout-content') ?? node;
        const body = turndownService.turndown(source.innerHTML).trim();
        const quoted = body.split('\n').map((line) => `> ${line}`).join('\n');
        return `\n\n> [!${type}] ${title}\n${quoted}\n\n`;
    }
});
567
/**
 * Renders every `<math alttext="…">` found under `element` as LaTeX.
 *
 * An equation inside a `.ltx_eqn_inline` ancestor becomes `$…$`; any other
 * becomes a `$$` block. Results are joined with blank lines.
 *
 * @param {*} element - Container to search (an equation table).
 * @returns {string} The joined equations, or '' when none are found.
 */
function handleNestedEquations(element) {
    const equations = Array.from(element.querySelectorAll('math[alttext]'));
    if (equations.length === 0) {
        return '';
    }
    const rendered = [];
    for (const equation of equations) {
        const alttext = equation.getAttribute('alttext');
        if (!alttext) {
            rendered.push('');
            continue;
        }
        const latex = alttext.trim();
        const isInline = equation.closest('.ltx_eqn_inline') !== null;
        rendered.push(isInline ? `$${latex}$` : `\n$$\n${latex}\n$$`);
    }
    return rendered.join('\n\n');
}
581
/**
 * Produces the HTML used for tables that are kept as HTML in the output.
 *
 * Works on a deep clone, so the original DOM is not modified. Every attribute
 * that is not in the allow-list is removed from the table and its descendant
 * elements.
 *
 * @param {*} element - The <table> element.
 * @returns {string} The clone's outerHTML with `&amp;`, `&lt;` and `&gt;`
 *   decoded.
 */
function cleanupTableHTML(element) {
    const allowedAttributes = new Set(['src', 'href', 'style', 'align', 'width', 'height', 'rowspan', 'colspan', 'bgcolor', 'scope', 'valign', 'headers']);
    const cleanElement = (current) => {
        // Snapshot the attribute list first: it changes as attributes are removed.
        Array.from(current.attributes).forEach(attr => {
            if (!allowedAttributes.has(attr.name)) {
                current.removeAttribute(attr.name);
            }
        });
        current.childNodes.forEach(child => {
            if ((0, utils_1.isElement)(child)) {
                cleanElement(child);
            }
        });
    };
    // Create a clone of the table to avoid modifying the original DOM
    const tableClone = element.cloneNode(true);
    cleanElement(tableClone);
    // outerHTML encodes & as &amp;, which breaks LaTeX alignment
    // characters inside math delimiters. Decode common entities since
    // the output goes into markdown, not back through an HTML parser.
    // Decoding happens in a single pass: chained replaces would decode
    // `&amp;lt;` twice (first to `&lt;`, then to `<`).
    const decoded = { amp: '&', lt: '<', gt: '>' };
    return tableClone.outerHTML.replace(/&(amp|lt|gt);/g, (_entity, name) => decoded[name]);
}
606
/**
 * Reads the LaTeX source attached to an element.
 *
 * @param {{getAttribute: function(string): (string|null)}} element - Element
 *   to inspect.
 * @returns {string} The trimmed `data-latex` attribute when it is non-empty,
 *   otherwise the trimmed `alttext` attribute, otherwise ''.
 */
function extractLatex(element) {
    const dataLatex = element.getAttribute('data-latex');
    const altText = element.getAttribute('alttext');
    const source = dataLatex || altText;
    return source ? source.trim() : '';
}
618
+ try {
619
+ // Strip <wbr> tags — word break opportunity hints that are invisible in
620
+ // browsers but would insert unwanted spaces during Turndown conversion.
621
+ content = content.replace(/<wbr\s*\/?>/gi, '');
622
+ let markdown = turndownService.turndown(content);
623
+ // Remove the title from the beginning of the content if it exists
624
+ const titleMatch = markdown.match(/^# .+\n+/);
625
+ if (titleMatch) {
626
+ markdown = markdown.slice(titleMatch[0].length);
627
+ }
628
+ // Remove any empty links e.g. [](example.com) that remain, along with surrounding newlines
629
+ // But don't affect image links like ![](image.jpg)
630
+ markdown = markdown.replace(/\n*(?<!!)\[]\([^)]+\)\n*/g, '');
631
+ // Add a space between exclamation marks and image syntax ![
632
+ // e.g. "Yey!![IMG](url)" becomes "Yey! ![IMG](url)" to prevent
633
+ // the parser from misinterpreting the ! as part of the image markup.
634
+ // Also handles linked images: "Yey![![IMG](src)](href)"
635
+ markdown = markdown.replace(/!(?=!\[|\[!\[)/g, '! ');
636
+ // Remove any consecutive newlines more than two
637
+ markdown = markdown.replace(/\n{3,}/g, '\n\n');
638
+ // Append footnotes at the end of the document
639
+ if (Object.keys(footnotes).length > 0) {
640
+ markdown += '\n\n---\n\n';
641
+ for (const [id, content] of Object.entries(footnotes)) {
642
+ markdown += `[^${id}]: ${content}\n\n`;
643
+ }
644
+ }
645
+ return markdown.trim();
646
+ }
647
+ catch (error) {
648
+ console.error('Error converting HTML to Markdown:', error);
649
+ console.log('Problematic content:', content.substring(0, 1000) + '...');
650
+ return `Partial conversion completed with errors. Original HTML:\n\n${content}`;
651
+ }
652
+ }
653
/**
 * Adds a Markdown rendering to an extraction result, in place.
 *
 * @param {{content: string, contentMarkdown?: string}} result - Result object
 *   to update.
 * @param {{markdown?: boolean, separateMarkdown?: boolean}} options -
 *   `markdown` replaces `result.content` with Markdown; otherwise
 *   `separateMarkdown` stores the Markdown in `result.contentMarkdown`.
 *   With neither set, `result` is left untouched.
 * @param {string} url - Passed through to createMarkdownContent.
 * @returns {void}
 */
function toMarkdown(result, options, url) {
    const { markdown, separateMarkdown } = options;
    if (!markdown && !separateMarkdown) {
        return;
    }
    const converted = createMarkdownContent(result.content, url);
    if (markdown) {
        result.content = converted;
    }
    else {
        result.contentMarkdown = converted;
    }
}
661
+ //# sourceMappingURL=markdown.js.map