defuddle-cli 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/dist/dom/document.d.ts +3 -0
  2. package/dist/dom/document.js +49 -0
  3. package/dist/dom/elements.d.ts +2 -0
  4. package/dist/dom/elements.js +478 -0
  5. package/dist/dom/interfaces/elements/base.d.ts +14 -0
  6. package/dist/dom/interfaces/elements/base.js +46 -0
  7. package/dist/dom/interfaces/elements/form.d.ts +2 -0
  8. package/dist/dom/interfaces/elements/form.js +123 -0
  9. package/dist/dom/interfaces/elements/index.d.ts +2 -0
  10. package/dist/dom/interfaces/elements/index.js +22 -0
  11. package/dist/dom/interfaces/elements/interactive.d.ts +2 -0
  12. package/dist/dom/interfaces/elements/interactive.js +83 -0
  13. package/dist/dom/interfaces/elements/media.d.ts +2 -0
  14. package/dist/dom/interfaces/elements/media.js +43 -0
  15. package/dist/dom/interfaces/elements/table.d.ts +2 -0
  16. package/dist/dom/interfaces/elements/table.js +155 -0
  17. package/dist/dom/interfaces/elements/text.d.ts +2 -0
  18. package/dist/dom/interfaces/elements/text.js +57 -0
  19. package/dist/dom/interfaces/elements.d.ts +2 -0
  20. package/dist/dom/interfaces/elements.js +478 -0
  21. package/dist/dom/interfaces/range.d.ts +1 -1
  22. package/dist/dom/range.d.ts +2 -0
  23. package/dist/dom/range.js +87 -0
  24. package/dist/dom/setup/document.d.ts +3 -0
  25. package/dist/dom/setup/document.js +49 -0
  26. package/dist/dom/setup.d.ts +12 -9
  27. package/dist/dom/setup.js +148 -533
  28. package/dist/dom/types/setup.d.ts +10 -0
  29. package/dist/dom/types/setup.js +1 -0
  30. package/dist/index.js +9 -684
  31. package/package.json +3 -5
  32. package/src/index.ts +9 -772
  33. package/src/dom/interfaces/document.ts +0 -53
  34. package/src/dom/interfaces/range.ts +0 -120
  35. package/src/dom/interfaces/setup.ts +0 -196
  36. package/src/markdown.ts +0 -592
package/src/markdown.ts DELETED
@@ -1,592 +0,0 @@
1
- import TurndownService from 'turndown';
2
-
/**
 * Footnote definitions keyed by id. `createMarkdownContent` appends any
 * entries to the end of its output and then empties the map.
 * No code in this module adds entries, so the map is normally empty.
 */
const footnotes: { [key: string]: string } = {};

/**
 * Returns a backtick fence that cannot be closed by anything inside `code`:
 * at least three backticks, and always one more than the longest run of
 * backticks found in the code itself.
 */
function codeFenceFor(code: string): string {
	const runs = code.match(/`+/g) ?? [];
	const longestRun = runs.reduce((max, run) => Math.max(max, run.length), 0);
	return '`'.repeat(Math.max(3, longestRun + 1));
}

/**
 * Converts an ArXiv equation table into LaTeX blocks, using the `alttext`
 * attribute of every `<math>` descendant. Equations inside an
 * `.ltx_eqn_inline` ancestor become `$...$`; all others become `$$` blocks.
 * Returns an empty string when the table has no `math[alttext]` element.
 */
function handleNestedEquations(table: Element): string {
	const mathElements = table.querySelectorAll('math[alttext]');
	if (mathElements.length === 0) return '';

	return Array.from(mathElements).map(mathElement => {
		const alttext = mathElement.getAttribute('alttext');
		if (alttext) {
			// Check if it's an inline or block equation
			const isInline = mathElement.closest('.ltx_eqn_inline') !== null;
			return isInline ? `$${alttext.trim()}$` : `\n$$\n${alttext.trim()}\n$$`;
		}
		return '';
	}).join('\n\n');
}

/**
 * Serializes a table to HTML after stripping every attribute that is not in
 * the allow-list below. Works on a deep clone, so the original DOM is left
 * untouched.
 */
function cleanupTableHTML(table: HTMLTableElement): string {
	const allowedAttributes = ['src', 'href', 'style', 'align', 'width', 'height', 'rowspan', 'colspan', 'bgcolor', 'scope', 'valign', 'headers'];

	const cleanElement = (element: Element) => {
		Array.from(element.attributes).forEach(attr => {
			if (!allowedAttributes.includes(attr.name)) {
				element.removeAttribute(attr.name);
			}
		});

		element.childNodes.forEach(child => {
			if (child instanceof Element) {
				cleanElement(child);
			}
		});
	};

	// Create a clone of the table to avoid modifying the original DOM
	const tableClone = table.cloneNode(true) as HTMLTableElement;
	cleanElement(tableClone);

	return tableClone.outerHTML;
}

/**
 * Returns the LaTeX source of a `<math>` element: its `data-latex` attribute
 * if set, otherwise its `alttext` attribute. Returns an empty string for
 * non-math elements and for math elements that carry neither attribute
 * (the latter case is also logged).
 */
function extractLatex(element: Element): string {
	if (element.nodeName.toLowerCase() === 'math') {
		const latex = element.getAttribute('data-latex');
		const alttext = element.getAttribute('alttext');
		if (latex) {
			return latex.trim();
		} else if (alttext) {
			return alttext.trim();
		}
		console.log('No latex or alttext found for math element:', element);
	}
	return ''; // Return empty string for non-math elements
}

/**
 * Converts an HTML string to Markdown using Turndown plus a set of custom
 * rules (tables, lists, figures, embeds, highlights, footnotes, code blocks,
 * math, callouts).
 *
 * @param content HTML to convert.
 * @param url Accepted for interface compatibility; not read by this function.
 * @returns The trimmed Markdown. If conversion throws, the error is logged
 * and a fallback string containing the original HTML is returned instead.
 */
export function createMarkdownContent(content: string, url: string): string {
	const turndownService = new TurndownService({
		headingStyle: 'atx',
		hr: '---',
		bulletListMarker: '-',
		codeBlockStyle: 'fenced',
		emDelimiter: '*',
		preformattedCode: true,
	});

	turndownService.addRule('table', {
		filter: 'table',
		replacement: function(content, node) {
			if (!(node instanceof HTMLTableElement)) return content;

			// Check if it's an ArXiv equation table
			if (node.classList.contains('ltx_equation') || node.classList.contains('ltx_eqn_table')) {
				return handleNestedEquations(node);
			}

			// Tables using colspan/rowspan cannot be expressed as pipe tables,
			// so they are kept as cleaned-up HTML.
			const hasComplexStructure = Array.from(node.querySelectorAll('td, th')).some(cell =>
				cell.hasAttribute('colspan') || cell.hasAttribute('rowspan')
			);

			if (hasComplexStructure) {
				const cleanedTable = cleanupTableHTML(node);
				return '\n\n' + cleanedTable + '\n\n';
			}

			const tableRows = Array.from(node.rows);
			const [headerRow] = tableRows;
			// A table without rows has no header to build a separator from.
			if (!headerRow) return content;

			const rows = tableRows.map(row => {
				const cells = Array.from(row.cells).map(cell => {
					// Remove newlines and trim the content
					const cellContent = turndownService.turndown(cell.innerHTML)
						.replace(/\n/g, ' ')
						.trim();
					// Escape pipe characters
					return cellContent.replace(/\|/g, '\\|');
				});
				return `| ${cells.join(' | ')} |`;
			});

			// Count columns from the DOM rather than from the rendered row:
			// splitting the rendered text on '|' would also count the escaped
			// pipes inside cells.
			const columnCount = Math.max(1, headerRow.cells.length);
			const separatorRow = `| ${Array(columnCount).fill('---').join(' | ')} |`;

			// Combine all rows
			const tableContent = [rows[0], separatorRow, ...rows.slice(1)].join('\n');

			return `\n\n${tableContent}\n\n`;
		}
	});

	turndownService.remove(['style', 'script']);

	// Keep iframes, video, audio, sup, sub, svg and math elements as HTML.
	// NOTE(review): the suppression below is presumably needed because some of
	// these tag names are missing from the typed tag-name union — confirm.
	// @ts-ignore
	turndownService.keep(['iframe', 'video', 'audio', 'sup', 'sub', 'svg', 'math']);
	turndownService.remove(['button']);

	turndownService.addRule('list', {
		filter: ['ul', 'ol'],
		replacement: function (content: string, node: Node) {
			// Remove trailing newlines/spaces from content
			content = content.trim();

			// Add a newline before the list if it's a top-level list
			const isTopLevel = !(node.parentNode && (node.parentNode.nodeName === 'UL' || node.parentNode.nodeName === 'OL'));
			return (isTopLevel ? '\n' : '') + content + '\n';
		}
	});

	// Lists with tab indentation
	turndownService.addRule('listItem', {
		filter: 'li',
		replacement: function (content: string, node: Node, options: TurndownService.Options) {
			if (!(node instanceof HTMLElement)) return content;

			// Handle task list items
			const isTaskListItem = node.classList.contains('task-list-item');
			const checkbox = node.querySelector('input[type="checkbox"]') as HTMLInputElement | null;
			let taskListMarker = '';

			if (isTaskListItem && checkbox) {
				// Remove the checkbox from content since we'll add markdown checkbox
				content = content.replace(/<input[^>]*>/, '');
				taskListMarker = checkbox.checked ? '[x] ' : '[ ] ';
			}

			content = content
				// Remove trailing newlines
				.replace(/\n+$/, '')
				// Split into lines
				.split('\n')
				// Remove empty lines
				.filter(line => line.length > 0)
				// Add indentation to continued lines
				.join('\n\t');

			const parent = node.parentNode;

			// Nesting level = number of consecutive UL/OL ancestors directly
			// above this item.
			let level = 0;
			let currentParent = node.parentNode;
			while (currentParent && (currentParent.nodeName === 'UL' || currentParent.nodeName === 'OL')) {
				level++;
				currentParent = currentParent.parentNode;
			}

			// One tab per nesting level below the first; never negative.
			const indent = '\t'.repeat(Math.max(0, level - 1));
			let prefix = indent + options.bulletListMarker + ' ';

			if (parent instanceof HTMLOListElement) {
				// Number the item by its position, offset by the list's
				// `start` attribute when present.
				const start = parent.getAttribute('start');
				const index = Array.from(parent.children).indexOf(node) + 1;
				prefix = indent + (start ? Number(start) + index - 1 : index) + '. ';
			}

			return prefix + taskListMarker + content.trim() + (node.nextSibling && !/\n$/.test(content) ? '\n' : '');
		}
	});

	turndownService.addRule('figure', {
		filter: 'figure',
		replacement: function(content, node) {
			const figure = node as HTMLElement;
			const img = figure.querySelector('img');
			const figcaption = figure.querySelector('figcaption');

			if (!img) return content;

			const alt = img.getAttribute('alt') || '';
			const src = img.getAttribute('src') || '';
			let caption = '';

			if (figcaption) {
				const tagSpan = figcaption.querySelector('.ltx_tag_figure');
				const tagText = tagSpan ? tagSpan.textContent?.trim() : '';

				// Replace <math> elements in the caption with inline LaTeX.
				// [\s\S] is used so that elements spanning several lines match.
				let captionContent = figcaption.innerHTML;
				captionContent = captionContent.replace(/<math[^>]*>([\s\S]*?)<\/math>/g, (match, _mathContent, offset, string) => {
					const mathElement = new DOMParser().parseFromString(match, 'text/html').body.firstChild as Element;
					const latex = extractLatex(mathElement);
					const prevChar = string[offset - 1] || '';
					const nextChar = string[offset + match.length] || '';

					const isStartOfLine = offset === 0 || /\s/.test(prevChar);
					const isEndOfLine = offset + match.length === string.length || /\s/.test(nextChar);

					// Pad with a space only when the neighbour is neither
					// whitespace nor another `$`.
					const leftSpace = (!isStartOfLine && !/[\s$]/.test(prevChar)) ? ' ' : '';
					const rightSpace = (!isEndOfLine && !/[\s$]/.test(nextChar)) ? ' ' : '';

					return `${leftSpace}$${latex}$${rightSpace}`;
				});

				// Convert the processed caption content to markdown
				const captionMarkdown = turndownService.turndown(captionContent);

				// Combine tag and processed caption
				caption = `${tagText} ${captionMarkdown}`.trim();
			}

			return `![${alt}](${src})\n\n${caption}\n\n`;
		}
	});

	// Use Obsidian format for YouTube embeds and tweets
	turndownService.addRule('embedToMarkdown', {
		filter: function (node: Node): boolean {
			if (node instanceof HTMLIFrameElement) {
				const src = node.getAttribute('src');
				return !!src && (
					!!src.match(/(?:youtube\.com|youtu\.be)/) ||
					!!src.match(/(?:twitter\.com|x\.com)/)
				);
			}
			return false;
		},
		replacement: function (content: string, node: Node): string {
			if (node instanceof HTMLIFrameElement) {
				const src = node.getAttribute('src');
				if (src) {
					const youtubeMatch = src.match(/(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtu\.be)\/(?:embed\/|watch\?v=)?([a-zA-Z0-9_-]+)/);
					if (youtubeMatch && youtubeMatch[1]) {
						return `![](https://www.youtube.com/watch?v=${youtubeMatch[1]})`;
					}
					const tweetMatch = src.match(/(?:twitter\.com|x\.com)\/.*?(?:status|statuses)\/(\d+)/);
					if (tweetMatch && tweetMatch[1]) {
						return `![](https://x.com/i/status/${tweetMatch[1]})`;
					}
				}
			}
			return content;
		}
	});

	turndownService.addRule('highlight', {
		filter: 'mark',
		replacement: function(content) {
			return '==' + content + '==';
		}
	});

	turndownService.addRule('strikethrough', {
		filter: (node: Node) =>
			node.nodeName === 'DEL' ||
			node.nodeName === 'S' ||
			node.nodeName === 'STRIKE',
		replacement: function(content) {
			return '~~' + content + '~~';
		}
	});

	// Links that wrap a heading plus other content are split into the heading,
	// the remaining content, and a trailing "View original" link.
	turndownService.addRule('complexLinkStructure', {
		filter: function (node, options) {
			return (
				node.nodeName === 'A' &&
				node.childNodes.length > 1 &&
				Array.from(node.childNodes).some(child => ['H1', 'H2', 'H3', 'H4', 'H5', 'H6'].includes(child.nodeName))
			);
		},
		replacement: function (content, node, options) {
			if (!(node instanceof HTMLElement)) return content;
			const href = node.getAttribute('href');
			const title = node.getAttribute('title');

			// Extract the heading
			const headingNode = node.querySelector('h1, h2, h3, h4, h5, h6');
			const headingContent = headingNode ? turndownService.turndown(headingNode.innerHTML) : '';

			// Remove the heading from the node so it is not converted twice
			if (headingNode) {
				headingNode.remove();
			}

			// Convert the remaining content
			const remainingContent = turndownService.turndown(node.innerHTML);

			// Construct the new markdown
			let markdown = `${headingContent}\n\n${remainingContent}\n\n`;
			if (href) {
				markdown += `[View original](${href})`;
				if (title) {
					markdown += ` "${title}"`;
				}
			}

			return markdown;
		}
	});

	turndownService.addRule('arXivEnumerate', {
		filter: (node) => {
			return node.nodeName === 'OL' && node.classList.contains('ltx_enumerate');
		},
		replacement: function(content, node) {
			if (!(node instanceof HTMLElement)) return content;

			const items = Array.from(node.children).map((item, index) => {
				if (item instanceof HTMLElement) {
					// Drop the leading item-number tag; the list is renumbered here.
					const itemContent = item.innerHTML.replace(/^<span class="ltx_tag ltx_tag_item">\d+\.<\/span>\s*/, '');
					return `${index + 1}. ${turndownService.turndown(itemContent)}`;
				}
				return '';
			});

			return '\n\n' + items.join('\n\n') + '\n\n';
		}
	});

	turndownService.addRule('citations', {
		filter: (node: Node): boolean => {
			if (node instanceof Element) {
				return (
					(node.nodeName === 'SUP' && node.id.startsWith('fnref:'))
				);
			}
			return false;
		},
		replacement: (content, node) => {
			if (node instanceof HTMLElement) {
				if (node.nodeName === 'SUP' && node.id.startsWith('fnref:')) {
					const id = node.id.replace('fnref:', '');
					// Extract only the primary number before any hyphen
					const primaryNumber = id.split('-')[0];
					return `[^${primaryNumber}]`;
				}
			}
			return content;
		}
	});

	// Footnotes list
	turndownService.addRule('footnotesList', {
		filter: (node: Node): boolean => {
			if (node instanceof HTMLOListElement) {
				return (
					node.parentElement?.id === 'footnotes'
				);
			}
			return false;
		},
		replacement: (content, node) => {
			if (node instanceof HTMLElement) {
				const references = Array.from(node.children).map(li => {
					let id: string;
					if (li.id.startsWith('fn:')) {
						id = li.id.replace('fn:', '');
					} else {
						// Fall back to the suffix after "cite_note-" in the last
						// path segment of the id, or the whole id if absent.
						const match = li.id.split('/').pop()?.match(/cite_note-(.+)/);
						id = match ? match[1] : li.id;
					}

					// Remove the leading sup element if its content matches the footnote id
					const supElement = li.querySelector('sup');
					if (supElement && supElement.textContent?.trim() === id) {
						supElement.remove();
					}

					const referenceContent = turndownService.turndown(li.innerHTML);
					// Remove the backlink from the footnote content
					const cleanedContent = referenceContent.replace(/\s*↩︎$/, '').trim();
					return `[^${id.toLowerCase()}]: ${cleanedContent}`;
				});
				return '\n\n' + references.join('\n\n') + '\n\n';
			}
			return content;
		}
	});

	// General removal rules for various website elements
	turndownService.addRule('removals', {
		filter: function (node) {
			if (!(node instanceof HTMLElement)) return false;
			// Remove the Defuddle backlink from the footnote content
			if (node.getAttribute('href')?.includes('#fnref')) return true;
			if (node.classList.contains('footnote-backref')) return true;
			return false;
		},
		replacement: function (content, node) {
			return '';
		}
	});

	turndownService.addRule('handleTextNodesInTables', {
		filter: function (node: Node): boolean {
			return node.nodeType === Node.TEXT_NODE &&
				node.parentNode !== null &&
				node.parentNode.nodeName === 'TD';
		},
		replacement: function (content: string): string {
			return content;
		}
	});

	turndownService.addRule('preformattedCode', {
		filter: (node) => {
			return node.nodeName === 'PRE';
		},
		replacement: (content, node) => {
			if (!(node instanceof HTMLElement)) return content;

			const codeElement = node.querySelector('code');
			if (!codeElement) return content;

			const language = codeElement.getAttribute('data-lang') || '';
			const code = (codeElement.textContent || '').trim();

			// Fenced code is literal, so backticks must not be backslash-escaped
			// (the backslashes would show up in the rendered code). Instead the
			// fence is made longer than any backtick run inside the code.
			const fence = codeFenceFor(code);

			return `\n${fence}${language}\n${code}\n${fence}\n`;
		}
	});

	turndownService.addRule('math', {
		filter: (node) => {
			return node.nodeName.toLowerCase() === 'math' ||
				(node instanceof Element && node.classList &&
				(node.classList.contains('mwe-math-element') ||
				node.classList.contains('mwe-math-fallback-image-inline') ||
				node.classList.contains('mwe-math-fallback-image-display')));
		},
		replacement: (content, node) => {
			if (!(node instanceof Element)) return content;

			let latex = extractLatex(node);

			// Remove leading and trailing whitespace
			latex = latex.trim();

			// Math inside a table is always rendered inline
			const isInTable = node.closest('table') !== null;

			// Check if it's an inline or block math element
			if (!isInTable && (
				node.getAttribute('display') === 'block' ||
				node.classList.contains('mwe-math-fallback-image-display') ||
				(node.parentElement && node.parentElement.classList.contains('mwe-math-element') &&
				node.parentElement.previousElementSibling &&
				node.parentElement.previousElementSibling.nodeName.toLowerCase() === 'p')
			)) {
				return `\n$$\n${latex}\n$$\n`;
			} else {
				// For inline math, ensure there's a space before and after only if needed
				const prevNode = node.previousSibling;
				const nextNode = node.nextSibling;
				const prevChar = prevNode?.textContent?.slice(-1) || '';
				const nextChar = nextNode?.textContent?.[0] || '';

				const isStartOfLine = !prevNode || (prevNode.nodeType === Node.TEXT_NODE && prevNode.textContent?.trim() === '');
				const isEndOfLine = !nextNode || (nextNode.nodeType === Node.TEXT_NODE && nextNode.textContent?.trim() === '');

				const leftSpace = (!isStartOfLine && prevChar && !/[\s$]/.test(prevChar)) ? ' ' : '';
				const rightSpace = (!isEndOfLine && nextChar && !/[\s$]/.test(nextChar)) ? ' ' : '';

				return `${leftSpace}$${latex}$${rightSpace}`;
			}
		}
	});

	turndownService.addRule('katex', {
		filter: (node) => {
			return node instanceof HTMLElement &&
				(node.classList.contains('math') || node.classList.contains('katex'));
		},
		replacement: (content, node) => {
			if (!(node instanceof HTMLElement)) return content;

			// Try to find the original LaTeX content
			// 1. Check data-latex attribute
			let latex = node.getAttribute('data-latex');

			// 2. If no data-latex, try to get from .katex-mathml
			if (!latex) {
				const mathml = node.querySelector('.katex-mathml annotation[encoding="application/x-tex"]');
				latex = mathml?.textContent || '';
			}

			// 3. If still no content, use text content as fallback
			if (!latex) {
				latex = node.textContent?.trim() || '';
			}

			// Inline when flagged by class, or when the embedded <math> exists
			// and is not display="block"; everything else is a block formula.
			const mathElement = node.querySelector('.katex-mathml math');
			const isInline = node.classList.contains('math-inline') ||
				(mathElement && mathElement.getAttribute('display') !== 'block');

			if (isInline) {
				return `$${latex}$`;
			} else {
				return `\n$$\n${latex}\n$$\n`;
			}
		}
	});

	turndownService.addRule('callout', {
		filter: (node) => {
			return (
				node.nodeName.toLowerCase() === 'div' &&
				node.classList.contains('markdown-alert')
			);
		},
		replacement: (content, node) => {
			const element = node as HTMLElement;

			// Get alert type from the class (e.g., markdown-alert-note -> NOTE)
			const alertClasses = Array.from(element.classList);
			const typeClass = alertClasses.find(c => c.startsWith('markdown-alert-') && c !== 'markdown-alert');
			const type = typeClass ? typeClass.replace('markdown-alert-', '').toUpperCase() : 'NOTE';

			// Find the title element and content
			const titleElement = element.querySelector('.markdown-alert-title');
			const contentElement = element.querySelector('p:not(.markdown-alert-title)');

			// Extract content, removing the title from it if present
			let alertContent = content;
			if (titleElement && titleElement.textContent) {
				alertContent = contentElement?.textContent || content.replace(titleElement.textContent, '');
			}

			// Format as Obsidian callout
			return `\n> [!${type}]\n> ${alertContent.trim().replace(/\n/g, '\n> ')}\n`;
		}
	});

	try {
		let markdown = turndownService.turndown(content);

		// Remove the title from the beginning of the content if it exists
		const titleMatch = markdown.match(/^# .+\n+/);
		if (titleMatch) {
			markdown = markdown.slice(titleMatch[0].length);
		}

		// Remove any empty links e.g. [](example.com) that remain, along with surrounding newlines
		// But don't affect image links like ![](image.jpg)
		markdown = markdown.replace(/\n*(?<!!)\[]\([^)]+\)\n*/g, '');

		// Remove any consecutive newlines more than two
		markdown = markdown.replace(/\n{3,}/g, '\n\n');

		// Append footnotes at the end of the document
		if (Object.keys(footnotes).length > 0) {
			markdown += '\n\n---\n\n';
			for (const [id, content] of Object.entries(footnotes)) {
				markdown += `[^${id}]: ${content}\n\n`;
			}
		}

		// Clear the footnotes object for the next conversion
		Object.keys(footnotes).forEach(key => delete footnotes[key]);

		return markdown.trim();
	} catch (error) {
		console.error('Error converting HTML to Markdown:', error);
		console.log('Problematic content:', content.substring(0, 1000) + '...');
		return `Partial conversion completed with errors. Original HTML:\n\n${content}`;
	}
}