@mz1999/defuddle 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +371 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +145 -0
- package/dist/cli.js.map +1 -0
- package/dist/constants.d.ts +24 -0
- package/dist/constants.js +950 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +136 -0
- package/dist/defuddle.js +1816 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/code.d.ts +5 -0
- package/dist/elements/code.js +346 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/footnotes.d.ts +5 -0
- package/dist/elements/footnotes.js +619 -0
- package/dist/elements/footnotes.js.map +1 -0
- package/dist/elements/headings.d.ts +11 -0
- package/dist/elements/headings.js +100 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +877 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/elements/math.base.d.ts +9 -0
- package/dist/elements/math.base.js +195 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.core.d.ts +7 -0
- package/dist/elements/math.core.js +52 -0
- package/dist/elements/math.core.js.map +1 -0
- package/dist/elements/math.d.ts +2 -0
- package/dist/elements/math.full.d.ts +8 -0
- package/dist/elements/math.js +7 -0
- package/dist/elements/math.js.map +1 -0
- package/dist/extractor-registry.d.ts +16 -0
- package/dist/extractor-registry.js +140 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.d.ts +22 -0
- package/dist/extractors/_base.js +27 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.d.ts +9 -0
- package/dist/extractors/_conversation.js +78 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.d.ts +14 -0
- package/dist/extractors/chatgpt.js +138 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.d.ts +10 -0
- package/dist/extractors/claude.js +91 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/gemini.d.ts +14 -0
- package/dist/extractors/gemini.js +111 -0
- package/dist/extractors/gemini.js.map +1 -0
- package/dist/extractors/github.d.ts +20 -0
- package/dist/extractors/github.js +251 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/grok.d.ts +15 -0
- package/dist/extractors/grok.js +142 -0
- package/dist/extractors/grok.js.map +1 -0
- package/dist/extractors/hackernews.d.ts +21 -0
- package/dist/extractors/hackernews.js +155 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.d.ts +22 -0
- package/dist/extractors/reddit.js +197 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.d.ts +16 -0
- package/dist/extractors/twitter.js +204 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/x-article.d.ts +24 -0
- package/dist/extractors/x-article.js +267 -0
- package/dist/extractors/x-article.js.map +1 -0
- package/dist/extractors/x-oembed.d.ts +20 -0
- package/dist/extractors/x-oembed.js +350 -0
- package/dist/extractors/x-oembed.js.map +1 -0
- package/dist/extractors/youtube.d.ts +87 -0
- package/dist/extractors/youtube.js +869 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/fetch.d.ts +18 -0
- package/dist/fetch.js +265 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.full.d.ts +12 -0
- package/dist/index.full.js +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +30 -0
- package/dist/markdown.js +661 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.d.ts +25 -0
- package/dist/metadata.js +426 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +19 -0
- package/dist/node.js +78 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +31 -0
- package/dist/scoring.js +472 -0
- package/dist/scoring.js.map +1 -0
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +1101 -0
- package/dist/standardize.js.map +1 -0
- package/dist/types/extractors.d.ts +41 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +135 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +42 -0
- package/dist/utils/dom.js +104 -0
- package/dist/utils/dom.js.map +1 -0
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/dist/utils.d.ts +13 -0
- package/dist/utils.js +98 -0
- package/dist/utils.js.map +1 -0
- package/package.json +107 -0
package/dist/markdown.js
ADDED
|
@@ -0,0 +1,661 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.isGenericElement = isGenericElement;
|
|
7
|
+
exports.asGenericElement = asGenericElement;
|
|
8
|
+
exports.createMarkdownContent = createMarkdownContent;
|
|
9
|
+
exports.toMarkdown = toMarkdown;
|
|
10
|
+
const turndown_1 = __importDefault(require("turndown"));
|
|
11
|
+
const utils_1 = require("./utils");
|
|
12
|
+
const dom_1 = require("./utils/dom");
|
|
13
|
+
/**
 * Duck-typed element check.
 *
 * @param {*} node - Any value.
 * @returns {boolean} `true` when `node` is a non-null object that exposes a
 *   `getAttribute` property (own or inherited), `false` otherwise.
 */
function isGenericElement(node) {
    // Primitives, functions, undefined and null can never qualify.
    if (node === null || typeof node !== 'object') {
        return false;
    }
    return 'getAttribute' in node;
}
|
|
16
|
+
/**
 * Identity helper: hands back exactly the value it was given.
 * NOTE(review): presumably a compile-time cast in the original source that
 * survives only as a pass-through here — confirm against the upstream file.
 *
 * @param {*} node - Any value.
 * @returns {*} The same `node` reference, untouched.
 */
function asGenericElement(node) {
    const element = node;
    return element;
}
|
|
19
|
+
// Width descriptor such as `424w`, optionally followed by the comma that
// separates srcset candidates.
const WIDTH_DESCRIPTOR_RE = /^(\d+)w,?$/;
// Pixel-density descriptor such as `2x` or `1.5x`, optionally followed by the
// candidate-separating comma.
const DENSITY_DESCRIPTOR_RE = /^\d+(?:\.\d+)?x,?$/;
// Descriptor fused to the start of the next candidate's URL because no
// whitespace follows the separating comma (e.g. `100w,b.jpg`).
const FUSED_DESCRIPTOR_RE = /^(\d+w|\d+(?:\.\d+)?x),(.+)$/;
/**
 * Picks the image URL to use for an element.
 *
 * Scans the `srcset` attribute and returns the URL of the candidate with the
 * largest width (`Nw`) descriptor. Candidates that only carry a density
 * descriptor (`Nx`) are ignored for selection. When `srcset` is missing or
 * yields no width-described candidate, the `src` attribute is returned.
 *
 * @param {{getAttribute: function(string): (string|null)}} node - Element-like
 *   object; only `getAttribute` is used.
 * @returns {string} The chosen URL, or `''` when neither attribute provides one.
 */
function getBestImageSrc(node) {
    const srcset = node.getAttribute('srcset');
    if (srcset) {
        let bestUrl = '';
        let bestWidth = 0;
        let urlParts = [];
        // Tokenize by whitespace instead of splitting on commas, because CDN
        // image URLs (e.g. Substack) can contain commas in the URL path
        // (e.g. `w_424,c_limit,f_webp`). Any token matching `Nw` is a width
        // descriptor; the tokens collected before it form the URL.
        for (const rawToken of srcset.trim().split(/\s+/)) {
            let token = rawToken;
            let nextUrlStart = '';
            // `a.jpg 100w,b.jpg 200w` has no whitespace after the comma, so the
            // descriptor arrives glued to the next URL. Split it apart, but only
            // while a URL is pending so a URL is never mistaken for a descriptor.
            const fused = urlParts.length > 0 ? rawToken.match(FUSED_DESCRIPTOR_RE) : null;
            if (fused) {
                token = fused[1];
                nextUrlStart = fused[2];
            }
            const widthMatch = token.match(WIDTH_DESCRIPTOR_RE);
            if (widthMatch) {
                const width = Number.parseInt(widthMatch[1], 10);
                if (urlParts.length > 0 && width > bestWidth) {
                    // Drop a separator comma left at the front of the URL.
                    const url = urlParts.join(' ').replace(/^,\s*/, '');
                    if (url) {
                        bestWidth = width;
                        bestUrl = url;
                    }
                }
                urlParts = [];
            }
            else if (DENSITY_DESCRIPTOR_RE.test(token)) {
                // Density descriptor (e.g. 2x) — skip, not used for selection
                urlParts = [];
            }
            else {
                urlParts.push(token);
            }
            if (nextUrlStart) {
                urlParts.push(nextUrlStart);
            }
        }
        if (bestUrl) {
            return bestUrl;
        }
    }
    return node.getAttribute('src') || '';
}
|
|
58
|
+
function createMarkdownContent(content, url) {
|
|
59
|
+
const footnotes = {};
|
|
60
|
+
const turndownService = new turndown_1.default({
|
|
61
|
+
headingStyle: 'atx',
|
|
62
|
+
hr: '---',
|
|
63
|
+
bulletListMarker: '-',
|
|
64
|
+
codeBlockStyle: 'fenced',
|
|
65
|
+
emDelimiter: '*',
|
|
66
|
+
preformattedCode: true,
|
|
67
|
+
});
|
|
68
|
+
// Tables: equation tables, layout tables, complex tables (kept as HTML) and
// simple tables (converted to pipe tables).
turndownService.addRule('table', {
    filter: 'table',
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        // Check if it's an ArXiv equation table
        if (node.classList?.contains('ltx_equation') || node.classList?.contains('ltx_eqn_table')) {
            return handleNestedEquations(node);
        }
        // Detect layout tables (used for styling/positioning, not data)
        const hasNestedTables = node.querySelector('table') !== null;
        const directCells = Array.from(node.querySelectorAll('td, th')).filter((el) => (0, dom_1.isDirectTableChild)(el, node));
        if (hasNestedTables || directCells.length <= 1) {
            const directRows = Array.from(node.querySelectorAll('tr')).filter((el) => (0, dom_1.isDirectTableChild)(el, node));
            const cellCounts = directRows.map((tr) => directCells.filter((cell) => cell.parentNode === tr).length);
            // Every direct row holds the same number of cells, and that number is 0 or 1.
            const isSingleColumn = directRows.length > 0
                && new Set(cellCounts).size === 1
                && cellCounts[0] <= 1;
            if (isSingleColumn) {
                // Layout table — extract content, don't convert to markdown table
                return '\n\n' + turndownService.turndown(directCells.map((cell) => (0, dom_1.serializeHTML)(cell)).join('')) + '\n\n';
            }
        }
        // Tables using colspan or rowspan cannot be expressed as a pipe table;
        // emit cleaned-up HTML instead.
        const cells = Array.from(node.querySelectorAll('td, th'));
        const hasComplexStructure = cells.some(cell => isGenericElement(asGenericElement(cell)) && (cell.hasAttribute('colspan') || cell.hasAttribute('rowspan')));
        if (hasComplexStructure) {
            return '\n\n' + cleanupTableHTML(node) + '\n\n';
        }
        // Simple tables.
        // Use node.rows/row.cells when available (browser/JSDOM), fall back to
        // querySelectorAll for environments like linkedom that lack these properties
        const tableEl = node;
        const rowElements = tableEl.rows && tableEl.rows.length > 0
            ? Array.from(tableEl.rows)
            : Array.from(node.querySelectorAll('tr')).filter((tr) => (0, dom_1.isDirectTableChild)(tr, node));
        // One array of already-escaped cell strings per row.
        const rowCells = rowElements.map((row) => {
            const cellElements = row.cells && row.cells.length > 0
                ? Array.from(row.cells)
                : Array.from(row.querySelectorAll('td, th')).filter((cell) => cell.parentNode === row);
            return cellElements.map((cell) => {
                // Collapse to a single line, then escape pipes so they don't split the cell.
                const cellContent = turndownService.turndown((0, dom_1.serializeHTML)(cell))
                    .replace(/\n/g, ' ')
                    .trim();
                return cellContent.replace(/\|/g, '\\|');
            });
        });
        if (!rowCells.length)
            return content;
        const rows = rowCells.map((cellContents) => `| ${cellContents.join(' | ')} |`);
        // Size the separator from the actual header cell count. Counting '|' in
        // the rendered header line would also count escaped pipes inside cells
        // and produce too many columns. A cell-less header still gets one column.
        const columnCount = Math.max(1, rowCells[0].length);
        const separatorRow = `| ${Array(columnCount).fill('---').join(' | ')} |`;
        // Header, separator, then the remaining rows.
        const tableContent = [rows[0], separatorRow, ...rows.slice(1)].join('\n');
        return `\n\n${tableContent}\n\n`;
    }
});
|
|
130
|
+
turndownService.remove(['style', 'script']);
|
|
131
|
+
// Keep iframes, video, audio, sup, and sub elements
|
|
132
|
+
// @ts-ignore
|
|
133
|
+
turndownService.keep(['iframe', 'video', 'audio', 'sup', 'sub', 'svg', 'math']);
|
|
134
|
+
turndownService.remove(['button']);
|
|
135
|
+
turndownService.addRule('list', {
|
|
136
|
+
filter: ['ul', 'ol'],
|
|
137
|
+
replacement: function (content, node) {
|
|
138
|
+
// Remove trailing newlines/spaces from content
|
|
139
|
+
content = content.trim();
|
|
140
|
+
// Add a newline before the list if it's a top-level list
|
|
141
|
+
const element = node;
|
|
142
|
+
const isTopLevel = !(element.parentNode && (element.parentNode.nodeName === 'UL' || element.parentNode.nodeName === 'OL'));
|
|
143
|
+
return (isTopLevel ? '\n' : '') + content + '\n';
|
|
144
|
+
}
|
|
145
|
+
});
|
|
146
|
+
// Lists with tab indentation
|
|
147
|
+
// List items: tab-indented by nesting depth, numbered inside <ol>, with
// markdown checkboxes for task-list items.
turndownService.addRule('listItem', {
    filter: 'li',
    replacement: function (content, node, options) {
        if (!isGenericElement(node))
            return content;
        // Handle task list items
        const isTaskListItem = node.classList?.contains('task-list-item');
        const checkbox = node.querySelector('input[type="checkbox"]');
        let taskListMarker = '';
        if (isTaskListItem && checkbox && isGenericElement(checkbox)) {
            // Remove the checkbox from content since we'll add markdown checkbox
            content = content.replace(/<input[^>]*>/, '');
            // `checked` is a boolean attribute: when present its value is
            // usually the empty string, so test for presence, not truthiness.
            taskListMarker = checkbox.getAttribute('checked') != null ? '[x] ' : '[ ] ';
        }
        content = content
            // Remove trailing newlines
            .replace(/\n+$/, '')
            // Split into lines
            .split('\n')
            // Remove empty lines
            .filter(line => line.length > 0)
            // Add indentation to continued lines
            .join('\n\t');
        const parent = node.parentNode;
        // Calculate the nesting level: count UL/OL ancestors, walking up
        // through LI wrappers and stopping at the first other element.
        let level = 0;
        let currentParent = node.parentNode;
        while (currentParent && isGenericElement(currentParent)) {
            if (currentParent.nodeName === 'UL' || currentParent.nodeName === 'OL') {
                level++;
            }
            else if (currentParent.nodeName !== 'LI') {
                break;
            }
            currentParent = currentParent.parentNode;
        }
        // Add tab indentation based on nesting level, ensuring it's never negative
        const indent = '\t'.repeat(Math.max(0, level - 1));
        let prefix = indent + options.bulletListMarker + ' ';
        if (parent && isGenericElement(parent) && parent.nodeName === 'OL') {
            const start = parent.getAttribute('start');
            // 1-based position of this item among the list's element children.
            const position = Array.from(parent.children || []).indexOf(node);
            const index = position === -1 ? 1 : position + 1;
            prefix = indent + (start ? Number(start) + index - 1 : index) + '. ';
        }
        return prefix + taskListMarker + content.trim() + (node.nextSibling && !/\n$/.test(content) ? '\n' : '');
    }
});
|
|
202
|
+
// Figures: emit the figure's image followed by its caption.
turndownService.addRule('figure', {
    filter: 'figure',
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        const img = node.querySelector('img');
        const figcaption = node.querySelector('figcaption');
        // Figures without an image keep turndown's default output.
        if (!img || !isGenericElement(img))
            return content;
        const alt = img.getAttribute('alt') || '';
        const src = getBestImageSrc(img);
        let caption = '';
        if (figcaption && isGenericElement(figcaption)) {
            // Optional figure tag (a `.ltx_tag_figure` element) is prepended to the caption.
            const tagSpan = figcaption.querySelector('.ltx_tag_figure');
            const tagText = tagSpan && isGenericElement(tagSpan) ? tagSpan.textContent?.trim() : '';
            // Process the caption content, including math elements
            let captionContent = (0, dom_1.serializeHTML)(figcaption);
            const ownerDoc = node.ownerDocument;
            captionContent = captionContent.replace(/<math.*?>(.*?)<\/math>/g, (match, mathContent, offset, string) => {
                let latex = '';
                if (ownerDoc) {
                    // Re-parse the matched markup so extractLatex receives an element.
                    const fragment = (0, dom_1.parseHTML)(ownerDoc, match);
                    const mathElement = fragment.querySelector('math');
                    latex = mathElement && isGenericElement(mathElement) ? extractLatex(mathElement) : '';
                }
                // Pad with a space only when the neighbouring character is
                // neither whitespace nor another `$`.
                const prevChar = string[offset - 1] || '';
                const nextChar = string[offset + match.length] || '';
                const isStartOfLine = offset === 0 || /\s/.test(prevChar);
                const isEndOfLine = offset + match.length === string.length || /\s/.test(nextChar);
                const leftSpace = (!isStartOfLine && !/[\s$]/.test(prevChar)) ? ' ' : '';
                const rightSpace = (!isEndOfLine && !/[\s$]/.test(nextChar)) ? ' ' : '';
                return `${leftSpace}$${latex}$${rightSpace}`;
            });
            // Convert the processed caption content to markdown
            const captionMarkdown = turndownService.turndown(captionContent);
            // Combine tag and processed caption
            caption = `${tagText} ${captionMarkdown}`.trim();
        }
        // Markdown links in the caption are already in their final form, so no
        // further rewriting is needed. Emit the image first, then the caption.
        return `![${alt}](${src})\n\n${caption}\n\n`;
    }
});
|
|
247
|
+
// Prefer the highest-resolution image from srcset over the small fallback in src
|
|
248
|
+
// Images: markdown image syntax using the URL chosen by getBestImageSrc,
// with the optional title in quotes.
turndownService.addRule('image', {
    filter: 'img',
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        const alt = node.getAttribute('alt') || '';
        const src = getBestImageSrc(node);
        const title = node.getAttribute('title') || '';
        const titlePart = title ? ` "${title}"` : '';
        // An image with no resolvable URL produces no output.
        return src ? `![${alt}](${src}${titlePart})` : '';
    }
});
|
|
260
|
+
// Use Obsidian format for YouTube embeds and tweets
|
|
261
|
+
// Embeds: elements whose `src` points at YouTube or Twitter/X become
// `![](url)` embeds pointing at the canonical video or tweet URL.
turndownService.addRule('embedToMarkdown', {
    filter: function (node) {
        if (!isGenericElement(node))
            return false;
        const src = node.getAttribute('src');
        return !!src && (!!src.match(/(?:youtube\.com|youtube-nocookie\.com|youtu\.be)/) ||
            !!src.match(/(?:twitter\.com|x\.com)/));
    },
    replacement: function (content, node) {
        if (!isGenericElement(node))
            return content;
        const src = node.getAttribute('src');
        if (!src)
            return content;
        // YouTube: capture the video id from embed/, watch?v= or short URLs.
        const youtubeMatch = src.match(/(?:https?:\/\/)?(?:www\.)?(?:youtube\.com|youtube-nocookie\.com|youtu\.be)\/(?:embed\/|watch\?v=)?([a-zA-Z0-9_-]+)/);
        if (youtubeMatch && youtubeMatch[1]) {
            return `\n![](https://www.youtube.com/watch?v=${youtubeMatch[1]})\n`;
        }
        // Direct URL: /user/status/id
        const tweetDirectMatch = src.match(/(?:https?:\/\/)?(?:www\.)?(?:twitter\.com|x\.com)\/([^/]+)\/status\/([0-9]+)/);
        if (tweetDirectMatch) {
            return `\n![](https://x.com/${tweetDirectMatch[1]}/status/${tweetDirectMatch[2]})\n`;
        }
        // Platform embed: ?id= (only the tweet id is available, not the user)
        const tweetEmbedMatch = src.match(/(?:https?:\/\/)?(?:platform\.)?twitter\.com\/embed\/Tweet\.html\?.*?id=([0-9]+)/);
        if (tweetEmbedMatch) {
            return `\n![](https://x.com/i/status/${tweetEmbedMatch[1]})\n`;
        }
        // Matched the filter but none of the known URL shapes.
        return content;
    }
});
|
|
292
|
+
turndownService.addRule('highlight', {
|
|
293
|
+
filter: 'mark',
|
|
294
|
+
replacement: function (content) {
|
|
295
|
+
return '==' + content + '==';
|
|
296
|
+
}
|
|
297
|
+
});
|
|
298
|
+
turndownService.addRule('strikethrough', {
|
|
299
|
+
filter: (node) => node.nodeName === 'DEL' ||
|
|
300
|
+
node.nodeName === 'S' ||
|
|
301
|
+
node.nodeName === 'STRIKE',
|
|
302
|
+
replacement: function (content) {
|
|
303
|
+
return '~~' + content + '~~';
|
|
304
|
+
}
|
|
305
|
+
});
|
|
306
|
+
// Add a new custom rule for complex link structures
|
|
307
|
+
turndownService.addRule('complexLinkStructure', {
|
|
308
|
+
filter: function (node, options) {
|
|
309
|
+
return (node.nodeName === 'A' &&
|
|
310
|
+
node.childNodes.length > 1 &&
|
|
311
|
+
Array.from(node.childNodes).some(child => ['H1', 'H2', 'H3', 'H4', 'H5', 'H6'].includes(child.nodeName)));
|
|
312
|
+
},
|
|
313
|
+
replacement: function (content, node, options) {
|
|
314
|
+
if (!isGenericElement(node))
|
|
315
|
+
return content;
|
|
316
|
+
const href = node.getAttribute('href');
|
|
317
|
+
const title = node.getAttribute('title');
|
|
318
|
+
// Extract the heading — use outerHTML to preserve the heading tag
|
|
319
|
+
const headingNode = node.querySelector('h1, h2, h3, h4, h5, h6');
|
|
320
|
+
const headingContent = headingNode ? turndownService.turndown(headingNode.outerHTML) : '';
|
|
321
|
+
// Remove the heading from the content
|
|
322
|
+
if (headingNode) {
|
|
323
|
+
headingNode.remove();
|
|
324
|
+
}
|
|
325
|
+
// Convert the remaining content
|
|
326
|
+
const remainingContent = turndownService.turndown((0, dom_1.serializeHTML)(node));
|
|
327
|
+
// Construct the new markdown
|
|
328
|
+
let markdown = `${headingContent}\n\n${remainingContent}\n\n`;
|
|
329
|
+
if (href) {
|
|
330
|
+
markdown += `[View original](${href})`;
|
|
331
|
+
if (title) {
|
|
332
|
+
markdown += ` "${title}"`;
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
return markdown;
|
|
336
|
+
}
|
|
337
|
+
});
|
|
338
|
+
turndownService.addRule('arXivEnumerate', {
|
|
339
|
+
filter: (node) => {
|
|
340
|
+
return node.nodeName === 'OL' && isGenericElement(node) && (node.classList?.contains('ltx_enumerate') ?? false);
|
|
341
|
+
},
|
|
342
|
+
replacement: function (content, node) {
|
|
343
|
+
if (!isGenericElement(node))
|
|
344
|
+
return content;
|
|
345
|
+
const items = Array.from(node.children || []).map((item, index) => {
|
|
346
|
+
if (isGenericElement(item)) {
|
|
347
|
+
const itemContent = ((0, dom_1.serializeHTML)(item) || '').replace(/^<span class="ltx_tag ltx_tag_item">\d+\.<\/span>\s*/, '');
|
|
348
|
+
return `${index + 1}. ${turndownService.turndown(itemContent)}`;
|
|
349
|
+
}
|
|
350
|
+
return '';
|
|
351
|
+
});
|
|
352
|
+
return '\n\n' + items.join('\n\n') + '\n\n';
|
|
353
|
+
}
|
|
354
|
+
});
|
|
355
|
+
turndownService.addRule('citations', {
|
|
356
|
+
filter: (node) => {
|
|
357
|
+
if (isGenericElement(node)) {
|
|
358
|
+
const id = node.getAttribute('id');
|
|
359
|
+
return node.nodeName === 'SUP' && id !== null && id.startsWith('fnref:');
|
|
360
|
+
}
|
|
361
|
+
return false;
|
|
362
|
+
},
|
|
363
|
+
replacement: (content, node) => {
|
|
364
|
+
if (isGenericElement(node)) {
|
|
365
|
+
const id = node.getAttribute('id');
|
|
366
|
+
if (node.nodeName === 'SUP' && id !== null && id.startsWith('fnref:')) {
|
|
367
|
+
const primaryNumber = id.replace('fnref:', '').split('-')[0];
|
|
368
|
+
return `[^${primaryNumber}]`;
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
return content;
|
|
372
|
+
}
|
|
373
|
+
});
|
|
374
|
+
// Footnotes list
|
|
375
|
+
turndownService.addRule('footnotesList', {
|
|
376
|
+
filter: (node) => {
|
|
377
|
+
if (isGenericElement(node)) {
|
|
378
|
+
const parentNode = node.parentNode;
|
|
379
|
+
return (node.nodeName === 'OL' &&
|
|
380
|
+
parentNode !== null &&
|
|
381
|
+
isGenericElement(parentNode) &&
|
|
382
|
+
parentNode.getAttribute('id') === 'footnotes');
|
|
383
|
+
}
|
|
384
|
+
return false;
|
|
385
|
+
},
|
|
386
|
+
replacement: (content, node) => {
|
|
387
|
+
if (!isGenericElement(node))
|
|
388
|
+
return content;
|
|
389
|
+
const references = Array.from(node.children || []).map(li => {
|
|
390
|
+
let id;
|
|
391
|
+
if (isGenericElement(li)) {
|
|
392
|
+
const liId = li.getAttribute('id');
|
|
393
|
+
if (liId !== null) {
|
|
394
|
+
if (liId.startsWith('fn:')) {
|
|
395
|
+
id = liId.replace('fn:', '');
|
|
396
|
+
}
|
|
397
|
+
else {
|
|
398
|
+
const match = liId.split('/').pop()?.match(/cite_note-(.+)/);
|
|
399
|
+
id = match ? match[1] : liId;
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
// Remove the leading sup element if its content matches the footnote id
|
|
403
|
+
const supElement = li.querySelector('sup');
|
|
404
|
+
if (supElement && isGenericElement(supElement) && supElement.textContent?.trim() === id) {
|
|
405
|
+
supElement.remove();
|
|
406
|
+
}
|
|
407
|
+
const referenceContent = turndownService.turndown((0, dom_1.serializeHTML)(li));
|
|
408
|
+
// Remove the backlink from the footnote content
|
|
409
|
+
const cleanedContent = referenceContent.replace(/\s*↩︎$/, '').trim();
|
|
410
|
+
return `[^${id?.toLowerCase()}]: ${cleanedContent}`;
|
|
411
|
+
}
|
|
412
|
+
return '';
|
|
413
|
+
});
|
|
414
|
+
return '\n\n' + references.join('\n\n') + '\n\n';
|
|
415
|
+
}
|
|
416
|
+
});
|
|
417
|
+
// General removal rules for various website elements
|
|
418
|
+
turndownService.addRule('removals', {
|
|
419
|
+
filter: function (node) {
|
|
420
|
+
if (!isGenericElement(node))
|
|
421
|
+
return false;
|
|
422
|
+
// Remove the Defuddle backlink from the footnote content
|
|
423
|
+
if (node.getAttribute('href')?.includes('#fnref'))
|
|
424
|
+
return true;
|
|
425
|
+
if (node.classList?.contains('footnote-backref'))
|
|
426
|
+
return true;
|
|
427
|
+
return false;
|
|
428
|
+
},
|
|
429
|
+
replacement: function (content, node) {
|
|
430
|
+
return '';
|
|
431
|
+
}
|
|
432
|
+
});
|
|
433
|
+
turndownService.addRule('handleTextNodesInTables', {
|
|
434
|
+
filter: function (node) {
|
|
435
|
+
return (0, utils_1.isTextNode)(node) &&
|
|
436
|
+
node.parentNode !== null &&
|
|
437
|
+
node.parentNode.nodeName === 'TD';
|
|
438
|
+
},
|
|
439
|
+
replacement: function (content) {
|
|
440
|
+
return content;
|
|
441
|
+
}
|
|
442
|
+
});
|
|
443
|
+
turndownService.addRule('preformattedCode', {
|
|
444
|
+
filter: (node) => {
|
|
445
|
+
return node.nodeName === 'PRE';
|
|
446
|
+
},
|
|
447
|
+
replacement: (content, node) => {
|
|
448
|
+
if (!isGenericElement(node))
|
|
449
|
+
return content;
|
|
450
|
+
const codeElement = node.querySelector('code');
|
|
451
|
+
if (!codeElement || !isGenericElement(codeElement))
|
|
452
|
+
return content;
|
|
453
|
+
const language = codeElement.getAttribute('data-lang')
|
|
454
|
+
|| codeElement.getAttribute('data-language')
|
|
455
|
+
|| codeElement.getAttribute('class')?.match(/language-(\w+)/)?.[1]
|
|
456
|
+
|| node.getAttribute('data-language')
|
|
457
|
+
|| '';
|
|
458
|
+
const code = codeElement.textContent || '';
|
|
459
|
+
// Clean up the content and escape backticks
|
|
460
|
+
const cleanCode = code
|
|
461
|
+
.trim()
|
|
462
|
+
.replace(/`/g, '\\`');
|
|
463
|
+
return `\n\`\`\`${language}\n${cleanCode}\n\`\`\`\n`;
|
|
464
|
+
}
|
|
465
|
+
});
|
|
466
|
+
turndownService.addRule('math', {
|
|
467
|
+
filter: (node) => {
|
|
468
|
+
return node.nodeName.toLowerCase() === 'math' ||
|
|
469
|
+
(isGenericElement(node) &&
|
|
470
|
+
(node.classList?.contains('mwe-math-element') ||
|
|
471
|
+
node.classList?.contains('mwe-math-fallback-image-inline') ||
|
|
472
|
+
node.classList?.contains('mwe-math-fallback-image-display')));
|
|
473
|
+
},
|
|
474
|
+
replacement: (content, node) => {
|
|
475
|
+
if (!isGenericElement(node))
|
|
476
|
+
return content;
|
|
477
|
+
let latex = extractLatex(node);
|
|
478
|
+
// Remove leading and trailing whitespace
|
|
479
|
+
latex = latex.trim();
|
|
480
|
+
// Check if the math element is within a table
|
|
481
|
+
const isInTable = typeof node.closest === 'function' ? node.closest('table') !== null : false;
|
|
482
|
+
// Check if it's an inline or block math element
|
|
483
|
+
if (!isInTable && (node.getAttribute('display') === 'block' ||
|
|
484
|
+
node.classList?.contains('mwe-math-fallback-image-display') ||
|
|
485
|
+
(node.parentNode && isGenericElement(node.parentNode) &&
|
|
486
|
+
node.parentNode.classList?.contains('mwe-math-element') &&
|
|
487
|
+
node.parentNode.previousSibling && isGenericElement(node.parentNode.previousSibling) &&
|
|
488
|
+
node.parentNode.previousSibling.nodeName.toLowerCase() === 'p'))) {
|
|
489
|
+
return `\n$$\n${latex}\n$$\n`;
|
|
490
|
+
}
|
|
491
|
+
else {
|
|
492
|
+
// For inline math, ensure there's a space before and after only if needed
|
|
493
|
+
const prevNode = node.previousSibling;
|
|
494
|
+
const nextNode = node.nextSibling;
|
|
495
|
+
const prevChar = prevNode && isGenericElement(prevNode) ? prevNode.textContent?.slice(-1) || '' : '';
|
|
496
|
+
const nextChar = nextNode && isGenericElement(nextNode) ? nextNode.textContent?.[0] || '' : '';
|
|
497
|
+
const isStartOfLine = !prevNode || ((0, utils_1.isTextNode)(prevNode) && prevNode.textContent?.trim() === '');
|
|
498
|
+
const isEndOfLine = !nextNode || ((0, utils_1.isTextNode)(nextNode) && nextNode.textContent?.trim() === '');
|
|
499
|
+
const leftSpace = (!isStartOfLine && prevChar && !/[\s$]/.test(prevChar)) ? ' ' : '';
|
|
500
|
+
const rightSpace = (!isEndOfLine && nextChar && !/[\s$]/.test(nextChar)) ? ' ' : '';
|
|
501
|
+
return `${leftSpace}$${latex}$${rightSpace}`;
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
});
|
|
505
|
+
turndownService.addRule('katex', {
|
|
506
|
+
filter: (node) => {
|
|
507
|
+
return isGenericElement(node) &&
|
|
508
|
+
(node.classList?.contains('math') || node.classList?.contains('katex'));
|
|
509
|
+
},
|
|
510
|
+
replacement: (content, node) => {
|
|
511
|
+
if (!isGenericElement(node))
|
|
512
|
+
return content;
|
|
513
|
+
// Try to find the original LaTeX content
|
|
514
|
+
// 1. Check data-latex attribute
|
|
515
|
+
let latex = node.getAttribute('data-latex');
|
|
516
|
+
// 2. If no data-latex, try to get from .katex-mathml
|
|
517
|
+
if (!latex) {
|
|
518
|
+
const mathml = node.querySelector('.katex-mathml annotation[encoding="application/x-tex"]');
|
|
519
|
+
latex = mathml && isGenericElement(mathml) ? mathml.textContent || '' : '';
|
|
520
|
+
}
|
|
521
|
+
// 3. If still no content, use text content as fallback
|
|
522
|
+
if (!latex) {
|
|
523
|
+
latex = node.textContent?.trim() || '';
|
|
524
|
+
}
|
|
525
|
+
// Determine if it's an inline formula
|
|
526
|
+
const mathElement = node.querySelector('.katex-mathml math');
|
|
527
|
+
const isInline = node.classList?.contains('math-inline') ||
|
|
528
|
+
(mathElement && isGenericElement(mathElement) && mathElement.getAttribute('display') !== 'block');
|
|
529
|
+
if (isInline) {
|
|
530
|
+
return `$${latex}$`;
|
|
531
|
+
}
|
|
532
|
+
else {
|
|
533
|
+
return `\n$$\n${latex}\n$$\n`;
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
});
|
|
537
|
+
// Every callout flavour (GitHub alerts, Bootstrap alerts, callout asides) is
// standardized to div.callout[data-callout] in callouts.ts, so a single rule
// is enough to turn them into "> [!type] Title" blockquotes.
turndownService.addRule('callout', {
    // Match elements that carry both the `callout` class and a non-empty
    // `data-callout` attribute.
    filter: (node) => (isGenericElement(node) &&
        !!node.getAttribute('data-callout') &&
        node.classList?.contains('callout')),
    replacement: (content, node) => {
        if (!isGenericElement(node)) {
            return content;
        }
        const calloutType = node.getAttribute('data-callout') || 'note';
        // Read the title BEFORE the title element is detached below; fall back
        // to the capitalized callout type when there is no usable title text.
        const explicitTitle = node.querySelector('.callout-title-inner')?.textContent?.trim();
        const heading = explicitTitle || calloutType.charAt(0).toUpperCase() + calloutType.slice(1);
        // Detach the title so it is not repeated inside the converted body.
        node.querySelector('.callout-title')?.remove();
        // Convert the dedicated content wrapper when present, otherwise the
        // whole (now title-less) callout.
        const bodySource = node.querySelector('.callout-content') ?? node;
        const bodyMarkdown = turndownService.turndown(bodySource.innerHTML);
        // Prefix every line of the body so it stays inside the blockquote.
        const quotedBody = bodyMarkdown
            .trim()
            .split('\n')
            .map((bodyLine) => `> ${bodyLine}`)
            .join('\n');
        return `\n\n> [!${calloutType}] ${heading}\n${quotedBody}\n\n`;
    }
});
|
|
567
|
+
/**
 * Collect the LaTeX of every `math[alttext]` descendant of `element`.
 *
 * Equations that sit inside an `.ltx_eqn_inline` ancestor are emitted as
 * `$…$`; all others are emitted as a `$$` display block. Equations are
 * separated by a blank line.
 *
 * Math elements whose `alttext` is empty or whitespace-only are skipped:
 * emitting them would produce a bare `$$` (which opens a display block in
 * Markdown) or stray blank separators in the joined output.
 *
 * @param {Element} element - Container to search.
 * @returns {string} The joined equations, or '' when none have usable alttext.
 */
function handleNestedEquations(element) {
    const equations = [];
    for (const mathElement of Array.from(element.querySelectorAll('math[alttext]'))) {
        const latex = mathElement.getAttribute('alttext')?.trim();
        if (!latex) {
            continue;
        }
        // Inline vs. block is decided by the wrapper class, not by the math element itself.
        const isInline = mathElement.closest('.ltx_eqn_inline') !== null;
        equations.push(isInline ? `$${latex}$` : `\n$$\n${latex}\n$$`);
    }
    return equations.join('\n\n');
}
|
|
581
|
+
/**
 * Serialize a table as HTML with only an allow-list of attributes kept.
 *
 * The table is deep-cloned first so the original DOM is left untouched.
 * Every attribute not in the allow-list is removed from the clone and from
 * all of its element descendants.
 *
 * @param {Element} element - The table element to serialize.
 * @returns {string} The cleaned outer HTML with `&lt;`, `&gt;` and `&amp;` decoded.
 */
function cleanupTableHTML(element) {
    const allowedAttributes = new Set(['src', 'href', 'style', 'align', 'width', 'height', 'rowspan', 'colspan', 'bgcolor', 'scope', 'valign', 'headers']);
    const stripAttributes = (el) => {
        // Iterate over a copy: removing attributes mutates the live collection.
        for (const attr of Array.from(el.attributes)) {
            if (!allowedAttributes.has(attr.name)) {
                el.removeAttribute(attr.name);
            }
        }
        el.childNodes.forEach(child => {
            if ((0, utils_1.isElement)(child)) {
                stripAttributes(child);
            }
        });
    };
    // Create a clone of the table to avoid modifying the original DOM
    const tableClone = element.cloneNode(true);
    stripAttributes(tableClone);
    // outerHTML encodes & as &amp; (and < / > as &lt; / &gt;), which breaks
    // LaTeX alignment characters inside math delimiters. Decode these common
    // entities since the output goes into markdown, not back through an HTML
    // parser. &amp; is decoded LAST so that text such as "&amp;lt;" becomes
    // "&lt;" rather than being double-decoded into "<".
    return tableClone.outerHTML
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&amp;/g, '&');
}
|
|
606
|
+
/**
 * Read the LaTeX source attached to an element.
 *
 * A non-empty `data-latex` attribute takes precedence; otherwise a non-empty
 * `alttext` attribute is used. The chosen value is returned trimmed.
 *
 * @param {Element} element - Element to inspect.
 * @returns {string} The trimmed LaTeX, or '' when neither attribute has a value.
 */
function extractLatex(element) {
    const source = element.getAttribute('data-latex') || element.getAttribute('alttext');
    return source ? source.trim() : '';
}
|
|
618
|
+
try {
|
|
619
|
+
// Strip <wbr> tags — word break opportunity hints that are invisible in
|
|
620
|
+
// browsers but would insert unwanted spaces during Turndown conversion.
|
|
621
|
+
content = content.replace(/<wbr\s*\/?>/gi, '');
|
|
622
|
+
let markdown = turndownService.turndown(content);
|
|
623
|
+
// Remove the title from the beginning of the content if it exists
|
|
624
|
+
const titleMatch = markdown.match(/^# .+\n+/);
|
|
625
|
+
if (titleMatch) {
|
|
626
|
+
markdown = markdown.slice(titleMatch[0].length);
|
|
627
|
+
}
|
|
628
|
+
// Remove any empty links e.g. [](example.com) that remain, along with surrounding newlines
|
|
629
|
+
// But don't affect image links like 
|
|
630
|
+
markdown = markdown.replace(/\n*(?<!!)\[]\([^)]+\)\n*/g, '');
|
|
631
|
+
// Add a space between exclamation marks and image syntax " becomes "Yey! " to prevent
|
|
633
|
+
// the parser from misinterpreting the ! as part of the image markup.
|
|
634
|
+
// Also handles linked images: "Yey](href)"
|
|
635
|
+
markdown = markdown.replace(/!(?=!\[|\[!\[)/g, '! ');
|
|
636
|
+
// Remove any consecutive newlines more than two
|
|
637
|
+
markdown = markdown.replace(/\n{3,}/g, '\n\n');
|
|
638
|
+
// Append footnotes at the end of the document
|
|
639
|
+
if (Object.keys(footnotes).length > 0) {
|
|
640
|
+
markdown += '\n\n---\n\n';
|
|
641
|
+
for (const [id, content] of Object.entries(footnotes)) {
|
|
642
|
+
markdown += `[^${id}]: ${content}\n\n`;
|
|
643
|
+
}
|
|
644
|
+
}
|
|
645
|
+
return markdown.trim();
|
|
646
|
+
}
|
|
647
|
+
catch (error) {
|
|
648
|
+
console.error('Error converting HTML to Markdown:', error);
|
|
649
|
+
console.log('Problematic content:', content.substring(0, 1000) + '...');
|
|
650
|
+
return `Partial conversion completed with errors. Original HTML:\n\n${content}`;
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
/**
 * Apply Markdown conversion to an extraction result, in place.
 *
 * - `options.markdown`: `result.content` is replaced by its Markdown form.
 * - otherwise `options.separateMarkdown`: `result.content` is left as is and
 *   the Markdown form is stored in `result.contentMarkdown`.
 *
 * `markdown` wins when both options are set; with neither set the result is
 * left unchanged.
 *
 * @param {object} result - Extraction result; mutated in place.
 * @param {object} options - Options carrying the `markdown` / `separateMarkdown` flags.
 * @param {string} url - Passed through to `createMarkdownContent`.
 * @returns {void}
 */
function toMarkdown(result, options, url) {
    if (options.markdown) {
        result.content = createMarkdownContent(result.content, url);
        return;
    }
    if (options.separateMarkdown) {
        result.contentMarkdown = createMarkdownContent(result.content, url);
    }
}
|
|
661
|
+
//# sourceMappingURL=markdown.js.map
|