defuddle-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
/**
 * Converts an HTML string into Markdown.
 *
 * @param content - HTML markup to convert.
 * @param url - URL passed alongside the content; the bundled implementation
 *   accepts it but does not read it.
 * @returns The Markdown text. When conversion throws, the implementation
 *   returns a string starting with "Partial conversion completed with errors."
 *   followed by the original HTML.
 */
export declare function createMarkdownContent(content: string, url: string): string;
@@ -0,0 +1,511 @@
1
+ import TurndownService from 'turndown';
2
+ const footnotes = {};
3
+ export function createMarkdownContent(content, url) {
4
// Output conventions handed to Turndown: ATX headings, '---' rules,
// '-' bullets, fenced code blocks and '*' emphasis.
const converterOptions = {
    headingStyle: 'atx',
    hr: '---',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced',
    emDelimiter: '*',
    preformattedCode: true,
};
// Single converter instance shared by every rule registered below.
const turndownService = new TurndownService(converterOptions);
12
// Table rule. Three outcomes:
//   * ArXiv equation tables are rendered through handleNestedEquations,
//   * tables using colspan/rowspan are kept as cleaned-up HTML,
//   * every other table becomes a Markdown pipe table whose first row is
//     treated as the header.
turndownService.addRule('table', {
    filter: 'table',
    replacement: function (content, node) {
        if (!(node instanceof HTMLTableElement))
            return content;
        // Check if it's an ArXiv equation table
        if (node.classList.contains('ltx_equation') || node.classList.contains('ltx_eqn_table')) {
            return handleNestedEquations(node);
        }
        // Merged cells cannot be expressed in pipe-table syntax
        const hasComplexStructure = Array.from(node.querySelectorAll('td, th'))
            .some(cell => cell.hasAttribute('colspan') || cell.hasAttribute('rowspan'));
        if (hasComplexStructure) {
            return '\n\n' + cleanupTableHTML(node) + '\n\n';
        }
        const tableRows = Array.from(node.rows);
        // A table without rows has nothing to render. Indexing the first
        // rendered row used to throw here and abort the whole conversion.
        if (tableRows.length === 0)
            return '';
        const rows = tableRows.map(row => {
            const cells = Array.from(row.cells).map(cell => turndownService.turndown(cell.innerHTML)
                // Cell content must stay on one line
                .replace(/\n/g, ' ')
                .trim()
                // Escape pipe characters so they are not read as column separators
                .replace(/\|/g, '\\|'));
            return `| ${cells.join(' | ')} |`;
        });
        // Take the column count from the header row's cells. Counting '|' in
        // the rendered text over-counted whenever a cell held an escaped pipe.
        // A row with no cells still yields one column, as before.
        const columnCount = Math.max(1, tableRows[0].cells.length);
        const separatorRow = `| ${Array(columnCount).fill('---').join(' | ')} |`;
        const tableContent = [rows[0], separatorRow, ...rows.slice(1)].join('\n');
        return `\n\n${tableContent}\n\n`;
    }
});
48
// Elements whose content must never reach the Markdown output
const discardedTags = ['style', 'script'];
// Elements passed through as raw HTML: embedded media, super/subscript, SVG and MathML
const rawHtmlTags = ['iframe', 'video', 'audio', 'sup', 'sub', 'svg', 'math'];
turndownService.remove(discardedTags);
// @ts-ignore
turndownService.keep(rawHtmlTags);
turndownService.remove(['button']);
53
// List container rule: trims the rendered items and ends the list with one
// newline. A list that is not directly inside another UL/OL also gets a
// leading newline.
turndownService.addRule('list', {
    filter: ['ul', 'ol'],
    replacement: function (content, node) {
        const body = content.trim();
        const container = node.parentNode;
        const isNested = Boolean(container) && ['UL', 'OL'].includes(container.nodeName);
        const lead = isNested ? '' : '\n';
        return `${lead}${body}\n`;
    }
});
63
+ // Lists with tab indentation
64
// List item rule: renders bullets, ordered-list numbers and task-list
// checkboxes, indenting continuation lines with a tab.
turndownService.addRule('listItem', {
    filter: 'li',
    replacement: function (content, node, options) {
        if (!(node instanceof HTMLElement))
            return content;
        // Task list items: drop the rendered <input> and emit a Markdown checkbox
        let taskListMarker = '';
        const checkbox = node.querySelector('input[type="checkbox"]');
        if (node.classList.contains('task-list-item') && checkbox) {
            content = content.replace(/<input[^>]*>/, '');
            taskListMarker = checkbox.checked ? '[x] ' : '[ ] ';
        }
        content = content
            // Remove trailing newlines
            .replace(/\n+$/, '')
            .split('\n')
            // Remove empty lines
            .filter(line => line.length > 0)
            // Add indentation to continued lines
            .join('\n\t');
        const parent = node.parentNode;
        // Count the UL/OL elements that directly wrap one another above this item
        let level = 0;
        for (let ancestor = parent; ancestor && (ancestor.nodeName === 'UL' || ancestor.nodeName === 'OL'); ancestor = ancestor.parentNode) {
            level++;
        }
        // One tab per extra level; never negative
        const indent = '\t'.repeat(Math.max(0, level - 1));
        let marker = options.bulletListMarker + ' ';
        if (parent instanceof HTMLOListElement) {
            const position = Array.from(parent.children).indexOf(node) + 1;
            const startAttr = parent.getAttribute('start');
            const parsedStart = startAttr ? Number(startAttr) : NaN;
            // A missing or non-numeric `start` falls back to 1 instead of
            // producing a "NaN." prefix.
            const firstNumber = Number.isFinite(parsedStart) ? parsedStart : 1;
            marker = (firstNumber + position - 1) + '. ';
        }
        return indent + marker + taskListMarker + content.trim() + (node.nextSibling && !/\n$/.test(content) ? '\n' : '');
    }
});
107
// Figure rule: emits the figure's image followed by its caption. MathML in
// the caption is replaced by inline `$...$` LaTeX before the caption is
// converted to Markdown. Figures without an <img> fall back to the default
// rendering of their content.
turndownService.addRule('figure', {
    filter: 'figure',
    replacement: function (content, node) {
        const figure = node;
        const img = figure.querySelector('img');
        if (!img)
            return content;
        const alt = img.getAttribute('alt') || '';
        const src = img.getAttribute('src') || '';
        let caption = '';
        const figcaption = figure.querySelector('figcaption');
        if (figcaption) {
            const tagSpan = figcaption.querySelector('.ltx_tag_figure');
            // Always a string, so a missing text value is never rendered as "undefined"
            const tagText = tagSpan && tagSpan.textContent ? tagSpan.textContent.trim() : '';
            // [\s\S] lets the pattern cross line breaks; with `.` any <math>
            // element serialized over several lines was silently skipped.
            const captionHtml = figcaption.innerHTML.replace(/<math[\s\S]*?>([\s\S]*?)<\/math>/g, (match, _mathContent, offset, string) => {
                const mathElement = new DOMParser().parseFromString(match, 'text/html').body.firstChild;
                // Guard against a parse that yields no node
                const latex = mathElement ? extractLatex(mathElement) : '';
                const prevChar = string[offset - 1] || '';
                const nextChar = string[offset + match.length] || '';
                const isStartOfLine = offset === 0 || /\s/.test(prevChar);
                const isEndOfLine = offset + match.length === string.length || /\s/.test(nextChar);
                // Pad with a space only when the formula touches other text
                const leftSpace = (!isStartOfLine && !/[\s$]/.test(prevChar)) ? ' ' : '';
                const rightSpace = (!isEndOfLine && !/[\s$]/.test(nextChar)) ? ' ' : '';
                return `${leftSpace}$${latex}$${rightSpace}`;
            });
            // Combine the figure tag and the converted caption
            caption = `${tagText} ${turndownService.turndown(captionHtml)}`.trim();
        }
        return `![${alt}](${src})\n\n${caption}\n\n`;
    }
});
146
+ // Use Obsidian format for YouTube embeds and tweets
147
// Maps a YouTube or tweet iframe to the URL used for its Markdown embed.
// Returns null for anything else. The host must sit at a boundary (start of
// the string, after '//' or after '.'), so look-alike hosts such as
// "dropbox.com" are not mistaken for "x.com".
const resolveEmbedUrl = (node) => {
    if (!(node instanceof HTMLIFrameElement))
        return null;
    const src = node.getAttribute('src');
    if (!src)
        return null;
    const youtubeMatch = src.match(/(?:^|\/\/|\.)(?:youtube(?:-nocookie)?\.com|youtu\.be)\/(?:embed\/|watch\?v=)?([a-zA-Z0-9_-]+)/);
    if (youtubeMatch && youtubeMatch[1]) {
        return `https://www.youtube.com/watch?v=${youtubeMatch[1]}`;
    }
    const tweetMatch = src.match(/(?:^|\/\/|\.)(?:twitter\.com|x\.com)\/.*?(?:status|statuses)\/(\d+)/);
    if (tweetMatch && tweetMatch[1]) {
        return `https://x.com/i/status/${tweetMatch[1]}`;
    }
    return null;
};
// Embed rule: the filter claims an iframe only when a URL can actually be
// produced for it. Any other iframe is left to the remaining rules instead of
// being replaced by its (normally empty) content.
turndownService.addRule('embedToMarkdown', {
    filter: (node) => resolveEmbedUrl(node) !== null,
    replacement: (content, node) => {
        const embedUrl = resolveEmbedUrl(node);
        return embedUrl ? `![](${embedUrl})` : content;
    }
});
173
// <mark> becomes ==highlighted== text
turndownService.addRule('highlight', {
    filter: 'mark',
    replacement: (content) => `==${content}==`
});
179
// <del>, <s> and <strike> become ~~struck through~~ text
turndownService.addRule('strikethrough', {
    filter: (node) => ['DEL', 'S', 'STRIKE'].includes(node.nodeName),
    replacement: (content) => `~~${content}~~`
});
187
+ // Add a new custom rule for complex link structures
188
// Links that wrap a heading plus other content cannot be written as one
// inline Markdown link. They are unfolded into the heading text, the
// remaining content, and a trailing "View original" link.
turndownService.addRule('complexLinkStructure', {
    filter: function (node) {
        return (node.nodeName === 'A' &&
            node.childNodes.length > 1 &&
            Array.from(node.childNodes).some(child => /^H[1-6]$/.test(child.nodeName)));
    },
    replacement: function (content, node) {
        if (!(node instanceof HTMLElement))
            return content;
        const href = node.getAttribute('href');
        const title = node.getAttribute('title');
        // Work on a copy so the node being converted is left untouched
        const working = node.cloneNode(true);
        const headingNode = working.querySelector('h1, h2, h3, h4, h5, h6');
        let headingContent = '';
        if (headingNode) {
            headingContent = turndownService.turndown(headingNode.innerHTML);
            // Take the heading out so it is not repeated in the remaining content
            headingNode.remove();
        }
        const remainingContent = turndownService.turndown(working.innerHTML);
        let markdown = `${headingContent}\n\n${remainingContent}\n\n`;
        if (href) {
            // A link title belongs inside the parentheses: [text](url "title").
            // Quotes in the title are escaped so they cannot end it early.
            const titlePart = title ? ` "${title.replace(/"/g, '\\"')}"` : '';
            markdown += `[View original](${href}${titlePart})`;
        }
        return markdown;
    }
});
219
// Ordered lists with class `ltx_enumerate`: each child is renumbered by its
// position and the leading `ltx_tag_item` span is stripped from its HTML.
turndownService.addRule('arXivEnumerate', {
    filter: (node) => node.nodeName === 'OL' && node.classList.contains('ltx_enumerate'),
    replacement: function (content, node) {
        if (!(node instanceof HTMLElement))
            return content;
        const leadingTag = /^<span class="ltx_tag ltx_tag_item">\d+\.<\/span>\s*/;
        const entries = [];
        Array.from(node.children).forEach((child, position) => {
            if (child instanceof HTMLElement) {
                const html = child.innerHTML.replace(leadingTag, '');
                entries.push(`${position + 1}. ${turndownService.turndown(html)}`);
            }
            else {
                // Non-HTML children contribute an empty entry
                entries.push('');
            }
        });
        return '\n\n' + entries.join('\n\n') + '\n\n';
    }
});
236
// Drops elements hidden with an inline `display: none`.
turndownService.addRule('removeHiddenElements', {
    filter: function (node) {
        // `style` is only guaranteed on HTML/SVG elements. Other elements
        // (e.g. MathML under some DOM implementations) may not expose it,
        // and dereferencing it unguarded would throw. Treat those as visible.
        return node.style?.display === 'none';
    },
    replacement: function () {
        return '';
    }
});
244
// Footnote references: <sup id="fnref:N..."> becomes [^N], where N is the
// part of the id before the first hyphen.
turndownService.addRule('citations', {
    filter: (node) => node instanceof Element && node.nodeName === 'SUP' && node.id.startsWith('fnref:'),
    replacement: (content, node) => {
        const isFootnoteRef = node instanceof HTMLElement &&
            node.nodeName === 'SUP' &&
            node.id.startsWith('fnref:');
        if (!isFootnoteRef)
            return content;
        const [primaryNumber] = node.id.replace('fnref:', '').split('-');
        return `[^${primaryNumber}]`;
    }
});
263
+ // Footnotes list
264
// Footnote definitions: an <ol> whose parent has id "footnotes" is rendered
// as one `[^label]: text` line per child.
turndownService.addRule('footnotesList', {
    filter: (node) => node instanceof HTMLOListElement && node.parentElement?.id === 'footnotes',
    replacement: (content, node) => {
        if (!(node instanceof HTMLElement))
            return content;
        const definitions = [];
        for (const item of Array.from(node.children)) {
            // The label comes from "fn:<label>", else from "cite_note-<label>"
            // in the last path segment of the id, else the id itself.
            let label;
            if (item.id.startsWith('fn:')) {
                label = item.id.replace('fn:', '');
            }
            else {
                const citeMatch = item.id.split('/').pop()?.match(/cite_note-(.+)/);
                label = citeMatch ? citeMatch[1] : item.id;
            }
            // Remove the leading sup element if its content matches the footnote label
            const leadingSup = item.querySelector('sup');
            if (leadingSup && leadingSup.textContent?.trim() === label) {
                leadingSup.remove();
            }
            // Convert the item and strip the trailing backlink arrow
            const body = turndownService.turndown(item.innerHTML).replace(/\s*↩︎$/, '').trim();
            definitions.push(`[^${label.toLowerCase()}]: ${body}`);
        }
        return '\n\n' + definitions.join('\n\n') + '\n\n';
    }
});
297
+ // General removal rules for various website elements
298
// Removes footnote backlinks: elements whose href contains "#fnref" or that
// carry the `footnote-backref` class.
turndownService.addRule('removals', {
    filter: (node) => {
        if (!(node instanceof HTMLElement))
            return false;
        const pointsAtFootnoteRef = Boolean(node.getAttribute('href')?.includes('#fnref'));
        return pointsAtFootnoteRef || node.classList.contains('footnote-backref');
    },
    replacement: () => ''
});
313
// Text nodes that sit directly inside a <td> are passed through unchanged.
turndownService.addRule('handleTextNodesInTables', {
    filter: (node) => {
        if (node.nodeType !== Node.TEXT_NODE)
            return false;
        const owner = node.parentNode;
        return owner !== null && owner.nodeName === 'TD';
    },
    replacement: (content) => content
});
323
// <pre><code> becomes a fenced code block, tagged with the language from the
// code element's `data-lang` attribute when present. A <pre> without a
// <code> child keeps its default rendering.
turndownService.addRule('preformattedCode', {
    filter: (node) => node.nodeName === 'PRE',
    replacement: (content, node) => {
        if (!(node instanceof HTMLElement))
            return content;
        const codeElement = node.querySelector('code');
        if (!codeElement)
            return content;
        const language = codeElement.getAttribute('data-lang') || '';
        const code = (codeElement.textContent || '').trim();
        // Backslash escapes are not interpreted inside fenced code, so
        // rewriting "`" as "\`" changed the code itself. Leave the code
        // untouched and use a fence longer than any backtick run it contains,
        // so the block cannot be closed early.
        const longestRun = (code.match(/`+/g) || [])
            .reduce((longest, run) => Math.max(longest, run.length), 0);
        const fence = '`'.repeat(Math.max(3, longestRun + 1));
        return `\n${fence}${language}\n${code}\n${fence}\n`;
    }
});
342
// Classes that mark math markup in addition to <math> elements themselves
const wikiMathClasses = ['mwe-math-element', 'mwe-math-fallback-image-inline', 'mwe-math-fallback-image-display'];
// Math rule: renders <math> elements and the classes above as LaTeX, either
// as a `$$` block or as inline `$...$`.
turndownService.addRule('math', {
    filter: (node) => {
        if (node.nodeName.toLowerCase() === 'math')
            return true;
        return (node instanceof Element && node.classList &&
            wikiMathClasses.some(name => node.classList.contains(name)));
    },
    replacement: (content, node) => {
        if (!(node instanceof Element))
            return content;
        const latex = extractLatex(node).trim();
        // Block math: never inside a table, and either display="block", the
        // display fallback class, or a `mwe-math-element` wrapper whose
        // previous element sibling is a <p>.
        const rendersAsBlock = () => {
            if (node.closest('table') !== null)
                return false;
            if (node.getAttribute('display') === 'block')
                return true;
            if (node.classList.contains('mwe-math-fallback-image-display'))
                return true;
            const wrapper = node.parentElement;
            const before = wrapper && wrapper.classList.contains('mwe-math-element')
                ? wrapper.previousElementSibling
                : null;
            return Boolean(before) && before.nodeName.toLowerCase() === 'p';
        };
        if (rendersAsBlock()) {
            return `\n$$\n${latex}\n$$\n`;
        }
        // Inline math: pad with a space only where the formula touches a
        // neighbouring character that is neither whitespace nor '$'.
        const spacer = (sibling, edgeChar) => {
            const atBoundary = !sibling ||
                (sibling.nodeType === Node.TEXT_NODE && sibling.textContent?.trim() === '');
            return (!atBoundary && edgeChar && !/[\s$]/.test(edgeChar)) ? ' ' : '';
        };
        const prevNode = node.previousSibling;
        const nextNode = node.nextSibling;
        const leftSpace = spacer(prevNode, prevNode?.textContent?.slice(-1) || '');
        const rightSpace = spacer(nextNode, nextNode?.textContent?.[0] || '');
        return `${leftSpace}$${latex}$${rightSpace}`;
    }
});
380
// KaTeX rule: elements with class `math` or `katex` are replaced by their
// LaTeX source, inline or as a `$$` block.
turndownService.addRule('katex', {
    filter: (node) => node instanceof HTMLElement &&
        (node.classList.contains('math') || node.classList.contains('katex')),
    replacement: (content, node) => {
        if (!(node instanceof HTMLElement))
            return content;
        // LaTeX source, in order of preference: the data-latex attribute, the
        // TeX annotation inside .katex-mathml, then the trimmed text content.
        const latex = node.getAttribute('data-latex') ||
            node.querySelector('.katex-mathml annotation[encoding="application/x-tex"]')?.textContent ||
            node.textContent?.trim() ||
            '';
        // Inline when flagged with `math-inline`, or when the embedded <math>
        // element is not display="block"
        const mathElement = node.querySelector('.katex-mathml math');
        const isInline = node.classList.contains('math-inline') ||
            (mathElement && mathElement.getAttribute('display') !== 'block');
        if (!isInline) {
            return `\n$$\n${latex}\n$$\n`;
        }
        return `$${latex}$`;
    }
});
412
// Callout rule: <div class="markdown-alert markdown-alert-TYPE"> becomes a
// `> [!TYPE]` callout block.
turndownService.addRule('callout', {
    filter: (node) => node.nodeName.toLowerCase() === 'div' && node.classList.contains('markdown-alert'),
    replacement: (content, node) => {
        // The alert type is the suffix of the first `markdown-alert-*` class
        // (e.g. markdown-alert-note -> NOTE); NOTE when there is none
        const typePrefix = 'markdown-alert-';
        let type = 'NOTE';
        for (const className of Array.from(node.classList)) {
            if (className.startsWith(typePrefix)) {
                type = className.replace(typePrefix, '').toUpperCase();
                break;
            }
        }
        const titleElement = node.querySelector('.markdown-alert-title');
        const contentElement = node.querySelector('p:not(.markdown-alert-title)');
        // With a title present, prefer the text of the first non-title
        // paragraph; otherwise strip the title text from the converted content
        let alertContent = content;
        const titleText = titleElement ? titleElement.textContent : null;
        if (titleText) {
            alertContent = contentElement?.textContent || content.replace(titleText, '');
        }
        // Prefix every line of the body with the blockquote marker
        const quoted = alertContent.trim().replace(/\n/g, '\n> ');
        return `\n> [!${type}]\n> ${quoted}\n`;
    }
});
435
+ function handleNestedEquations(table) {
436
+ const mathElements = table.querySelectorAll('math[alttext]');
437
+ if (mathElements.length === 0)
438
+ return '';
439
+ return Array.from(mathElements).map(mathElement => {
440
+ const alttext = mathElement.getAttribute('alttext');
441
+ if (alttext) {
442
+ // Check if it's an inline or block equation
443
+ const isInline = mathElement.closest('.ltx_eqn_inline') !== null;
444
+ return isInline ? `$${alttext.trim()}$` : `\n$$\n${alttext.trim()}\n$$`;
445
+ }
446
+ return '';
447
+ }).join('\n\n');
448
+ }
449
+ function cleanupTableHTML(table) {
450
+ const allowedAttributes = ['src', 'href', 'style', 'align', 'width', 'height', 'rowspan', 'colspan', 'bgcolor', 'scope', 'valign', 'headers'];
451
+ const cleanElement = (element) => {
452
+ Array.from(element.attributes).forEach(attr => {
453
+ if (!allowedAttributes.includes(attr.name)) {
454
+ element.removeAttribute(attr.name);
455
+ }
456
+ });
457
+ element.childNodes.forEach(child => {
458
+ if (child instanceof Element) {
459
+ cleanElement(child);
460
+ }
461
+ });
462
+ };
463
+ // Create a clone of the table to avoid modifying the original DOM
464
+ const tableClone = table.cloneNode(true);
465
+ cleanElement(tableClone);
466
+ return tableClone.outerHTML;
467
+ }
468
+ function extractLatex(element) {
469
+ // Check if the element is a <math> element and has an alttext attribute
470
+ if (element.nodeName.toLowerCase() === 'math') {
471
+ let latex = element.getAttribute('data-latex');
472
+ let alttext = element.getAttribute('alttext');
473
+ if (latex) {
474
+ return latex.trim();
475
+ }
476
+ else if (alttext) {
477
+ return alttext.trim();
478
+ }
479
+ console.log('No latex or alttext found for math element:', element);
480
+ }
481
+ return ''; // Return empty string for non-math elements
482
+ }
483
+ try {
484
+ let markdown = turndownService.turndown(content);
485
+ // Remove the title from the beginning of the content if it exists
486
+ const titleMatch = markdown.match(/^# .+\n+/);
487
+ if (titleMatch) {
488
+ markdown = markdown.slice(titleMatch[0].length);
489
+ }
490
+ // Remove any empty links e.g. [](example.com) that remain, along with surrounding newlines
491
+ // But don't affect image links like ![](image.jpg)
492
+ markdown = markdown.replace(/\n*(?<!!)\[]\([^)]+\)\n*/g, '');
493
+ // Remove any consecutive newlines more than two
494
+ markdown = markdown.replace(/\n{3,}/g, '\n\n');
495
+ // Append footnotes at the end of the document
496
+ if (Object.keys(footnotes).length > 0) {
497
+ markdown += '\n\n---\n\n';
498
+ for (const [id, content] of Object.entries(footnotes)) {
499
+ markdown += `[^${id}]: ${content}\n\n`;
500
+ }
501
+ }
502
+ // Clear the footnotes object for the next conversion
503
+ Object.keys(footnotes).forEach(key => delete footnotes[key]);
504
+ return markdown.trim();
505
+ }
506
+ catch (error) {
507
+ console.error('Error converting HTML to Markdown:', error);
508
+ console.log('Problematic content:', content.substring(0, 1000) + '...');
509
+ return `Partial conversion completed with errors. Original HTML:\n\n${content}`;
510
+ }
511
+ }
package/package.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "name": "defuddle-cli",
3
+ "version": "0.1.0",
4
+ "description": "Command line interface for Defuddle - extract article content from web pages",
5
+ "main": "dist/index.js",
6
+ "bin": {
7
+ "defuddle": "dist/index.js"
8
+ },
9
+ "type": "module",
10
+ "scripts": {
11
+ "build": "tsc",
12
+ "dev": "tsc --watch",
13
+ "start": "node dist/index.js"
14
+ },
15
+ "keywords": [
16
+ "defuddle",
17
+ "cli"
18
+ ],
19
+ "author": "kepano",
20
+ "license": "MIT",
21
+ "dependencies": {
22
+ "chalk": "^5.3.0",
23
+ "commander": "^12.0.0",
24
+ "defuddle": "^0.3.4",
25
+ "jsdom": "^24.0.0",
26
+ "turndown": "^7.2.0"
27
+ },
28
+ "devDependencies": {
29
+ "@types/jsdom": "^21.1.6",
30
+ "@types/node": "^20.0.0",
31
+ "@types/turndown": "^5.0.5",
32
+ "typescript": "^5.3.3"
33
+ }
34
+ }