html-to-org 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-ASXHHGOB.cjs +294 -0
- package/dist/chunk-ZZ4VEN6R.js +294 -0
- package/dist/dom.cjs +6 -0
- package/dist/dom.d.cts +10 -0
- package/dist/dom.d.ts +10 -0
- package/dist/dom.js +6 -0
- package/dist/index.cjs +7 -343
- package/dist/index.d.cts +5 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +6 -317
- package/package.json +20 -8
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"use strict";Object.defineProperty(exports, "__esModule", {value: true}); function _optionalChain(ops) { let lastAccessLHS = undefined; let value = ops[0]; let i = 1; while (i < ops.length) { const op = ops[i]; const fn = ops[i + 1]; i += 2; if ((op === 'optionalAccess' || op === 'optionalCall') && value == null) { return undefined; } if (op === 'access' || op === 'optionalAccess') { lastAccessLHS = value; value = fn(value); } else if (op === 'call' || op === 'optionalCall') { value = fn((...args) => value.call(lastAccessLHS, ...args)); lastAccessLHS = undefined; } } return value; }// src/convert.ts
|
|
2
|
+
/**
 * Render a DOM subtree as Org-mode text.
 *
 * @param {*} node - Root node handed to convertNode.
 * @param {string} [baseUrl=""] - Stored on the conversion context as `baseUrl`.
 * @returns {string} Converted text with runs of three or more newlines squeezed
 *   to two and surrounding whitespace trimmed.
 */
function domToOrg(node, baseUrl = "") {
  const initialContext = { baseUrl, listDepth: 0, orderedIndex: [], indentWidth: 0 };
  const squeezed = convertNode(node, initialContext).replace(/\n{3,}/g, "\n\n");
  return squeezed.trim();
}
|
|
7
|
+
/**
 * Convert one DOM node to Org-mode text, dispatching on node type and tag name.
 *
 * Text nodes (nodeType 3) are whitespace-collapsed; anything that is not an
 * element (nodeType 1) yields "". Elements without a dedicated handler simply
 * contribute the conversion of their children.
 *
 * @param {*} node - DOM-like node.
 * @param {object} ctx - Conversion context passed through to the handlers.
 * @returns {string} Org-mode fragment for the node.
 */
function convertNode(node, ctx) {
  if (node.nodeType === 3) return collapseWhitespace(node.textContent || "");
  if (node.nodeType !== 1) return "";

  const tag = (node.tagName || "").toLowerCase();
  const oneOf = (...names) => names.includes(tag);

  // Non-content elements are dropped together with everything inside them.
  if (oneOf("script", "style", "noscript")) return "";

  if (oneOf("h1", "h2", "h3", "h4", "h5", "h6")) return convertHeading(node, tag, ctx);
  if (tag === "p") return convertParagraph(node, ctx);

  // Inline emphasis: each group shares one Org marker.
  if (oneOf("strong", "b")) return wrapInline("*", node, ctx);
  if (oneOf("em", "i")) return wrapInline("/", node, ctx);
  if (oneOf("u", "ins")) return wrapInline("_", node, ctx);
  if (oneOf("s", "del", "strike")) return wrapInline("+", node, ctx);
  if (oneOf("code", "mark")) return wrapInline("=", node, ctx);

  if (tag === "sup") return "^{" + convertChildren(node, ctx) + "}";
  if (tag === "sub") return "_{" + convertChildren(node, ctx) + "}";

  if (tag === "a") return convertLink(node, ctx);
  if (tag === "img") return convertImage(node, ctx);
  if (oneOf("ul", "ol")) return convertList(node, tag === "ol", ctx);
  if (tag === "pre") return convertPre(node, ctx);
  if (tag === "blockquote") return convertBlockquote(node, ctx);
  if (tag === "table") return convertTable(node, ctx);
  if (tag === "hr") return "\n\n-----\n\n";
  if (tag === "br") return "\n";

  // `li` and every unrecognised tag are transparent wrappers.
  return convertChildren(node, ctx);
}
|
|
69
|
+
/**
 * Convert every child of `node` and concatenate the results in document order.
 *
 * Whitespace-only text nodes that sit directly next to a block-level element are
 * skipped, so source formatting between blocks does not leak into the output.
 *
 * @param {*} node - Node whose `childNodes` are converted.
 * @param {object} ctx - Conversion context forwarded to convertNode.
 * @returns {string} Concatenated Org-mode text of the children.
 */
function convertChildren(node, ctx) {
  const isInterBlockGap = (child) => {
    if (child.nodeType !== 3) return false;
    if ((child.textContent || "").trim()) return false;
    return isBlockElement(child.previousSibling) || isBlockElement(child.nextSibling);
  };

  return Array.from(node.childNodes)
    .filter((child) => !isInterBlockGap(child))
    .map((child) => convertNode(child, ctx))
    .join("");
}
|
|
84
|
+
// Lowercase tag names that isBlockElement reports as block-level.
var BLOCK_TAGS = /* @__PURE__ */ new Set([
  "p", "div",
  "h1", "h2", "h3", "h4", "h5", "h6",
  "ul", "ol", "li",
  "pre", "blockquote", "table", "hr",
  "section", "article", "main", "header", "footer", "nav", "aside",
  "figure", "figcaption", "details", "summary"
]);

/**
 * Tell whether `node` is an element whose tag is listed in BLOCK_TAGS.
 *
 * @param {*} node - Candidate node; may be null or undefined.
 * @returns {boolean} True only for element nodes (nodeType 1) with a block tag.
 */
function isBlockElement(node) {
  const isElement = Boolean(node) && node.nodeType === 1;
  return isElement ? BLOCK_TAGS.has((node.tagName || "").toLowerCase()) : false;
}
|
|
116
|
+
/**
 * Convert an `<h1>`..`<h6>` element into an Org headline.
 *
 * An Org headline occupies exactly one line, so any line break produced by the
 * children (for example from `<br>`) is folded into a single space; otherwise
 * the remainder of the title would become body text. Headings with no visible
 * text produce nothing, matching how empty paragraphs are handled.
 *
 * @param {*} node - Heading element.
 * @param {string} tag - Lowercase tag name; its second character is the level.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string} Headline surrounded by blank lines, or "" when empty.
 */
function convertHeading(node, tag, ctx) {
  const level = Number.parseInt(tag[1], 10);
  const title = convertChildren(node, ctx).replace(/\s*\n\s*/g, " ").trim();
  if (!title) return "";
  return `\n\n${"*".repeat(level)} ${title}\n\n`;
}
|
|
125
|
+
/**
 * Convert a `<p>` element into a paragraph separated by blank lines.
 *
 * @param {*} node - Paragraph element.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string} Trimmed paragraph text wrapped in "\n\n", or "" when empty.
 */
function convertParagraph(node, ctx) {
  const body = convertChildren(node, ctx).trim();
  return body ? "\n\n" + body + "\n\n" : "";
}
|
|
134
|
+
/**
 * Convert a `<blockquote>` element into an Org QUOTE block.
 *
 * @param {*} node - Blockquote element.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string} `#+BEGIN_QUOTE` / `#+END_QUOTE` block surrounded by blank lines.
 */
function convertBlockquote(node, ctx) {
  const quoted = convertChildren(node, ctx).trim();
  // The two empty entries on each side produce the surrounding blank lines.
  return ["", "", "#+BEGIN_QUOTE", quoted, "#+END_QUOTE", "", ""].join("\n");
}
|
|
144
|
+
/**
 * Comma-quote lines that Org would otherwise interpret as structure inside a
 * literal block: lines starting (after optional indentation and any existing
 * commas) with `*` or `#+`. Without this, code containing `#+END_SRC` or a
 * line beginning with `*` terminates or breaks the generated block.
 *
 * @param {string} text - Raw block content.
 * @returns {string} Content that is safe to place between BEGIN/END markers.
 */
function escapeOrgBlockLines(text) {
  return text.replace(/^([ \t]*)(,*(?:\*|#\+))/gm, "$1,$2");
}

/**
 * Convert a `<pre>` element into an Org literal block.
 *
 * When the element contains a `<code>` descendant, a SRC block is emitted using
 * that descendant's text and the language reported by detectLanguage; otherwise
 * an EXAMPLE block is emitted with the element's own text. One trailing newline
 * is removed from the content, and structural-looking lines are comma-quoted.
 *
 * @param {*} node - Pre element; `querySelector` is optional.
 * @param {object} _ctx - Conversion context (unused).
 * @returns {string} Literal block surrounded by blank lines.
 */
function convertPre(node, _ctx) {
  const codeChild = typeof node.querySelector === "function" ? node.querySelector("code") : undefined;
  const source = codeChild || node;
  const body = escapeOrgBlockLines((source.textContent || "").replace(/\n$/, ""));

  if (codeChild) {
    const lang = detectLanguage(codeChild);
    const langSuffix = lang ? ` ${lang}` : "";
    return `\n\n#+BEGIN_SRC${langSuffix}\n${body}\n#+END_SRC\n\n`;
  }
  return `\n\n#+BEGIN_EXAMPLE\n${body}\n#+END_EXAMPLE\n\n`;
}
|
|
167
|
+
/**
 * Extract the language name from a `language-xxx` token in the node's `class`
 * attribute.
 *
 * @param {*} codeNode - Code element; `getAttribute` is optional.
 * @returns {string} The text after `language-`, or "" when no such token exists.
 */
function detectLanguage(codeNode) {
  const classAttr =
    _optionalChain([codeNode, 'access', (el) => el.getAttribute, 'optionalCall', (getAttr) => getAttr("class")]) || "";
  const [, language = ""] = classAttr.match(/(?:^|\s)language-(\S+)/) || [];
  return language;
}
|
|
172
|
+
/**
 * Convert a `<ul>` / `<ol>` element into an Org list.
 *
 * Only direct `<li>` element children become items. Within an item, direct
 * `<ul>` / `<ol>` children are rendered recursively one level deeper (indented
 * by the accumulated marker widths); all other children form the item text.
 *
 * @param {*} node - List element.
 * @param {boolean} ordered - True for numbered items ("1. "), false for "- ".
 * @param {object} ctx - Conversion context; reads `listDepth` and `indentWidth`.
 * @returns {string} List text; wrapped in blank lines only at depth 0.
 */
function convertList(node, ordered, ctx) {
  const leading = ctx.listDepth > 0 ? " ".repeat(ctx.indentWidth || 2) : "";
  const markerWidth = ordered ? 3 : 2;
  const tagOf = (el) => (el.tagName || "").toLowerCase();

  const lines = [];
  let ordinal = 0;
  for (const li of node.childNodes) {
    if (li.nodeType !== 1 || tagOf(li) !== "li") continue;
    ordinal += 1;

    let inlineText = "";
    const sublists = [];
    for (const part of li.childNodes) {
      const partTag = tagOf(part);
      if (partTag !== "ul" && partTag !== "ol") {
        inlineText += convertNode(part, ctx);
        continue;
      }
      const deeperCtx = {
        ...ctx,
        listDepth: ctx.listDepth + 1,
        indentWidth: (ctx.indentWidth || 0) + markerWidth
      };
      sublists.push(convertList(part, partTag === "ol", deeperCtx));
    }

    const bullet = ordered ? `${ordinal}. ` : "- ";
    const head = leading + bullet + inlineText.trim();
    lines.push(sublists.length > 0 ? [head, ...sublists].join("\n") : head);
  }

  const joined = lines.join("\n");
  return ctx.listDepth === 0 ? "\n\n" + joined + "\n\n" : joined;
}
|
|
210
|
+
/**
 * Wrap the converted children of `node` in an Org emphasis marker.
 *
 * Org only recognises emphasis when the markers touch non-whitespace text, so
 * leading and trailing whitespace of the content is moved outside the markers
 * (`<b>bold </b>` becomes `*bold* ` rather than `*bold *`). Content that is
 * empty or whitespace-only is returned unchanged, without markers.
 *
 * @param {string} marker - Emphasis character such as "*", "/", "_", "+", "=".
 * @param {*} node - Inline element whose children are converted.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string} Marked-up text with outer whitespace preserved.
 */
function wrapInline(marker, node, ctx) {
  const inner = convertChildren(node, ctx);
  const core = inner.trim();
  if (!core) return inner;

  // `core` starts at the first non-whitespace character, so indexOf finds the
  // exact split between leading whitespace and content.
  const start = inner.indexOf(core);
  const lead = inner.slice(0, start);
  const trail = inner.slice(start + core.length);
  return `${lead}${marker}${core}${marker}${trail}`;
}
|
|
215
|
+
/**
 * Convert an `<a>` element into an Org link.
 *
 * @param {*} node - Anchor element; `getAttribute` is optional.
 * @param {object} ctx - Conversion context; `baseUrl` is passed to resolveUrl.
 * @returns {string} `[[url][text]]`, `[[url]]` when the text is empty or equal
 *   to the URL, or just the text when there is no href.
 */
function convertLink(node, ctx) {
  const rawHref = _optionalChain([node, 'access', (el) => el.getAttribute, 'optionalCall', (getAttr) => getAttr("href")]);
  const target = resolveUrl(rawHref || "", ctx.baseUrl);
  const label = convertChildren(node, ctx).trim();
  if (!target) return label;

  const needsLabel = Boolean(label) && label !== target;
  return needsLabel ? `[[${target}][${label}]]` : `[[${target}]]`;
}
|
|
222
|
+
/**
 * Convert an `<img>` element into a bare Org link to its source.
 *
 * @param {*} node - Image element; `getAttribute` is optional.
 * @param {object} ctx - Conversion context; `baseUrl` is passed to resolveUrl.
 * @returns {string} `[[src]]`, or "" when the element has no usable `src`.
 */
function convertImage(node, ctx) {
  const rawSrc = _optionalChain([node, 'access', (el) => el.getAttribute, 'optionalCall', (getAttr) => getAttr("src")]);
  const location = resolveUrl(rawSrc || "", ctx.baseUrl);
  return location ? `[[${location}]]` : "";
}
|
|
227
|
+
/**
 * Convert a `<table>` element into an aligned Org table.
 *
 * Rows inside the first `<thead>` form the header and are followed by a rule
 * line. Every other `<tr>` under the table is a body row, in document order;
 * this includes rows of additional `<tbody>` sections and of `<tfoot>`, which
 * would be lost if only the first `<tbody>` were read. Short rows are padded
 * with empty cells.
 *
 * @param {*} node - Table element; `querySelector(All)` are optional.
 * @param {object} ctx - Conversion context forwarded to extractRowCells.
 * @returns {string} Table surrounded by blank lines, or "" when there are no cells.
 */
function convertTable(node, ctx) {
  const selectAll = (root, selector) =>
    typeof root.querySelectorAll === "function" ? Array.from(root.querySelectorAll(selector) || []) : [];

  const thead = typeof node.querySelector === "function" ? node.querySelector("thead") : null;
  const headerRows = thead ? selectAll(thead, "tr") : [];
  // Exclude by identity so a header row is never emitted a second time as body.
  const headerSet = new Set(headerRows);
  const bodyRows = selectAll(node, "tr").filter((tr) => !headerSet.has(tr));

  const rows = [...headerRows, ...bodyRows].map((tr) => extractRowCells(tr, ctx));
  const colCount = rows.reduce((widest, row) => Math.max(widest, row.length), 0);
  if (colCount === 0) return "";

  const colWidths = Array.from({ length: colCount }, (_, col) =>
    rows.reduce((widest, row) => Math.max(widest, (row[col] || "").length), 0)
  );
  const formatRow = (row) =>
    `| ${colWidths.map((width, col) => (row[col] || "").padEnd(width)).join(" | ")} |`;
  // Each column's rule spans the cell width plus its two padding spaces.
  const separatorRow = `|${colWidths.map((width) => "-".repeat(width + 2)).join("+")}|`;

  const lines = [];
  rows.forEach((row, index) => {
    lines.push(formatRow(row));
    if (index === headerRows.length - 1) lines.push(separatorRow);
  });
  return `\n\n${lines.join("\n")}\n\n`;
}
|
|
268
|
+
/**
 * Collect the text of each `<td>` / `<th>` that is a direct child of a row.
 *
 * Cell text is made safe for a single Org table row: line breaks (with the
 * whitespace around them) are folded into one space, and `|` is written as
 * `\vert{}` because a literal bar would start a new column.
 *
 * @param {*} tr - Table row element.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string[]} One sanitized string per cell, in document order.
 */
function extractRowCells(tr, ctx) {
  const cells = [];
  for (const cell of tr.childNodes) {
    const cellTag = (cell.tagName || "").toLowerCase();
    if (cellTag !== "td" && cellTag !== "th") continue;
    const text = convertChildren(cell, ctx)
      .trim()
      .replace(/\s*\n\s*/g, " ")
      .replace(/\|/g, "\\vert{}");
    cells.push(text);
  }
  return cells;
}
|
|
278
|
+
/**
 * Replace every run of whitespace characters with a single space.
 *
 * @param {string} text - Input text.
 * @returns {string} Text with whitespace runs collapsed (not trimmed).
 */
function collapseWhitespace(text) {
  // Splitting on whitespace runs and re-joining with one space collapses each run.
  return text.split(/\s+/).join(" ");
}
|
|
281
|
+
/**
 * Resolve `url` against `baseUrl`.
 *
 * @param {string} url - URL taken from the document; may be empty.
 * @param {string} baseUrl - Base to resolve against; may be empty.
 * @returns {string} "" for an empty url; the url unchanged when it starts with
 *   http://, https:// or mailto:, when there is no base, or when resolution
 *   fails; otherwise the resolved absolute URL.
 */
function resolveUrl(url, baseUrl) {
  if (!url) return "";
  const alreadyAbsolute = /^https?:\/\//.test(url) || url.startsWith("mailto:");
  if (alreadyAbsolute || !baseUrl) return url;

  let resolved = url;
  try {
    resolved = new URL(url, baseUrl).href;
  } catch (_err) {
    // Unparseable input: deliberately fall back to the URL as written.
  }
  return resolved;
}
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
exports.domToOrg = domToOrg;
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
// src/convert.ts
|
|
2
|
+
/**
 * Render a DOM subtree as Org-mode text.
 *
 * @param {*} node - Root node handed to convertNode.
 * @param {string} [baseUrl=""] - Stored on the conversion context as `baseUrl`.
 * @returns {string} Converted text with runs of three or more newlines squeezed
 *   to two and surrounding whitespace trimmed.
 */
function domToOrg(node, baseUrl = "") {
  const initialContext = { baseUrl, listDepth: 0, orderedIndex: [], indentWidth: 0 };
  const squeezed = convertNode(node, initialContext).replace(/\n{3,}/g, "\n\n");
  return squeezed.trim();
}
|
|
7
|
+
/**
 * Convert one DOM node to Org-mode text, dispatching on node type and tag name.
 *
 * Text nodes (nodeType 3) are whitespace-collapsed; anything that is not an
 * element (nodeType 1) yields "". Elements without a dedicated handler simply
 * contribute the conversion of their children.
 *
 * @param {*} node - DOM-like node.
 * @param {object} ctx - Conversion context passed through to the handlers.
 * @returns {string} Org-mode fragment for the node.
 */
function convertNode(node, ctx) {
  if (node.nodeType === 3) return collapseWhitespace(node.textContent || "");
  if (node.nodeType !== 1) return "";

  const tag = (node.tagName || "").toLowerCase();
  const oneOf = (...names) => names.includes(tag);

  // Non-content elements are dropped together with everything inside them.
  if (oneOf("script", "style", "noscript")) return "";

  if (oneOf("h1", "h2", "h3", "h4", "h5", "h6")) return convertHeading(node, tag, ctx);
  if (tag === "p") return convertParagraph(node, ctx);

  // Inline emphasis: each group shares one Org marker.
  if (oneOf("strong", "b")) return wrapInline("*", node, ctx);
  if (oneOf("em", "i")) return wrapInline("/", node, ctx);
  if (oneOf("u", "ins")) return wrapInline("_", node, ctx);
  if (oneOf("s", "del", "strike")) return wrapInline("+", node, ctx);
  if (oneOf("code", "mark")) return wrapInline("=", node, ctx);

  if (tag === "sup") return "^{" + convertChildren(node, ctx) + "}";
  if (tag === "sub") return "_{" + convertChildren(node, ctx) + "}";

  if (tag === "a") return convertLink(node, ctx);
  if (tag === "img") return convertImage(node, ctx);
  if (oneOf("ul", "ol")) return convertList(node, tag === "ol", ctx);
  if (tag === "pre") return convertPre(node, ctx);
  if (tag === "blockquote") return convertBlockquote(node, ctx);
  if (tag === "table") return convertTable(node, ctx);
  if (tag === "hr") return "\n\n-----\n\n";
  if (tag === "br") return "\n";

  // `li` and every unrecognised tag are transparent wrappers.
  return convertChildren(node, ctx);
}
|
|
69
|
+
/**
 * Convert every child of `node` and concatenate the results in document order.
 *
 * Whitespace-only text nodes that sit directly next to a block-level element are
 * skipped, so source formatting between blocks does not leak into the output.
 *
 * @param {*} node - Node whose `childNodes` are converted.
 * @param {object} ctx - Conversion context forwarded to convertNode.
 * @returns {string} Concatenated Org-mode text of the children.
 */
function convertChildren(node, ctx) {
  const isInterBlockGap = (child) => {
    if (child.nodeType !== 3) return false;
    if ((child.textContent || "").trim()) return false;
    return isBlockElement(child.previousSibling) || isBlockElement(child.nextSibling);
  };

  return Array.from(node.childNodes)
    .filter((child) => !isInterBlockGap(child))
    .map((child) => convertNode(child, ctx))
    .join("");
}
|
|
84
|
+
// Lowercase tag names that isBlockElement reports as block-level.
var BLOCK_TAGS = /* @__PURE__ */ new Set([
  "p", "div",
  "h1", "h2", "h3", "h4", "h5", "h6",
  "ul", "ol", "li",
  "pre", "blockquote", "table", "hr",
  "section", "article", "main", "header", "footer", "nav", "aside",
  "figure", "figcaption", "details", "summary"
]);

/**
 * Tell whether `node` is an element whose tag is listed in BLOCK_TAGS.
 *
 * @param {*} node - Candidate node; may be null or undefined.
 * @returns {boolean} True only for element nodes (nodeType 1) with a block tag.
 */
function isBlockElement(node) {
  const isElement = Boolean(node) && node.nodeType === 1;
  return isElement ? BLOCK_TAGS.has((node.tagName || "").toLowerCase()) : false;
}
|
|
116
|
+
/**
 * Convert an `<h1>`..`<h6>` element into an Org headline.
 *
 * An Org headline occupies exactly one line, so any line break produced by the
 * children (for example from `<br>`) is folded into a single space; otherwise
 * the remainder of the title would become body text. Headings with no visible
 * text produce nothing, matching how empty paragraphs are handled.
 *
 * @param {*} node - Heading element.
 * @param {string} tag - Lowercase tag name; its second character is the level.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string} Headline surrounded by blank lines, or "" when empty.
 */
function convertHeading(node, tag, ctx) {
  const level = Number.parseInt(tag[1], 10);
  const title = convertChildren(node, ctx).replace(/\s*\n\s*/g, " ").trim();
  if (!title) return "";
  return `\n\n${"*".repeat(level)} ${title}\n\n`;
}
|
|
125
|
+
/**
 * Convert a `<p>` element into a paragraph separated by blank lines.
 *
 * @param {*} node - Paragraph element.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string} Trimmed paragraph text wrapped in "\n\n", or "" when empty.
 */
function convertParagraph(node, ctx) {
  const body = convertChildren(node, ctx).trim();
  return body ? "\n\n" + body + "\n\n" : "";
}
|
|
134
|
+
/**
 * Convert a `<blockquote>` element into an Org QUOTE block.
 *
 * @param {*} node - Blockquote element.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string} `#+BEGIN_QUOTE` / `#+END_QUOTE` block surrounded by blank lines.
 */
function convertBlockquote(node, ctx) {
  const quoted = convertChildren(node, ctx).trim();
  // The two empty entries on each side produce the surrounding blank lines.
  return ["", "", "#+BEGIN_QUOTE", quoted, "#+END_QUOTE", "", ""].join("\n");
}
|
|
144
|
+
/**
 * Comma-quote lines that Org would otherwise interpret as structure inside a
 * literal block: lines starting (after optional indentation and any existing
 * commas) with `*` or `#+`. Without this, code containing `#+END_SRC` or a
 * line beginning with `*` terminates or breaks the generated block.
 *
 * @param {string} text - Raw block content.
 * @returns {string} Content that is safe to place between BEGIN/END markers.
 */
function escapeOrgBlockLines(text) {
  return text.replace(/^([ \t]*)(,*(?:\*|#\+))/gm, "$1,$2");
}

/**
 * Convert a `<pre>` element into an Org literal block.
 *
 * When the element contains a `<code>` descendant, a SRC block is emitted using
 * that descendant's text and the language reported by detectLanguage; otherwise
 * an EXAMPLE block is emitted with the element's own text. One trailing newline
 * is removed from the content, and structural-looking lines are comma-quoted.
 *
 * @param {*} node - Pre element; `querySelector` is optional.
 * @param {object} _ctx - Conversion context (unused).
 * @returns {string} Literal block surrounded by blank lines.
 */
function convertPre(node, _ctx) {
  const codeChild = node.querySelector?.("code");
  const source = codeChild || node;
  const body = escapeOrgBlockLines((source.textContent || "").replace(/\n$/, ""));

  if (codeChild) {
    const lang = detectLanguage(codeChild);
    const langSuffix = lang ? ` ${lang}` : "";
    return `\n\n#+BEGIN_SRC${langSuffix}\n${body}\n#+END_SRC\n\n`;
  }
  return `\n\n#+BEGIN_EXAMPLE\n${body}\n#+END_EXAMPLE\n\n`;
}
|
|
167
|
+
/**
 * Extract the language name from a `language-xxx` token in the node's `class`
 * attribute.
 *
 * @param {*} codeNode - Code element; `getAttribute` is optional.
 * @returns {string} The text after `language-`, or "" when no such token exists.
 */
function detectLanguage(codeNode) {
  const classAttr = codeNode.getAttribute?.("class") || "";
  const [, language = ""] = classAttr.match(/(?:^|\s)language-(\S+)/) || [];
  return language;
}
|
|
172
|
+
/**
 * Convert a `<ul>` / `<ol>` element into an Org list.
 *
 * Only direct `<li>` element children become items. Within an item, direct
 * `<ul>` / `<ol>` children are rendered recursively one level deeper (indented
 * by the accumulated marker widths); all other children form the item text.
 *
 * @param {*} node - List element.
 * @param {boolean} ordered - True for numbered items ("1. "), false for "- ".
 * @param {object} ctx - Conversion context; reads `listDepth` and `indentWidth`.
 * @returns {string} List text; wrapped in blank lines only at depth 0.
 */
function convertList(node, ordered, ctx) {
  const leading = ctx.listDepth > 0 ? " ".repeat(ctx.indentWidth || 2) : "";
  const markerWidth = ordered ? 3 : 2;
  const tagOf = (el) => (el.tagName || "").toLowerCase();

  const lines = [];
  let ordinal = 0;
  for (const li of node.childNodes) {
    if (li.nodeType !== 1 || tagOf(li) !== "li") continue;
    ordinal += 1;

    let inlineText = "";
    const sublists = [];
    for (const part of li.childNodes) {
      const partTag = tagOf(part);
      if (partTag !== "ul" && partTag !== "ol") {
        inlineText += convertNode(part, ctx);
        continue;
      }
      const deeperCtx = {
        ...ctx,
        listDepth: ctx.listDepth + 1,
        indentWidth: (ctx.indentWidth || 0) + markerWidth
      };
      sublists.push(convertList(part, partTag === "ol", deeperCtx));
    }

    const bullet = ordered ? `${ordinal}. ` : "- ";
    const head = leading + bullet + inlineText.trim();
    lines.push(sublists.length > 0 ? [head, ...sublists].join("\n") : head);
  }

  const joined = lines.join("\n");
  return ctx.listDepth === 0 ? "\n\n" + joined + "\n\n" : joined;
}
|
|
210
|
+
/**
 * Wrap the converted children of `node` in an Org emphasis marker.
 *
 * Org only recognises emphasis when the markers touch non-whitespace text, so
 * leading and trailing whitespace of the content is moved outside the markers
 * (`<b>bold </b>` becomes `*bold* ` rather than `*bold *`). Content that is
 * empty or whitespace-only is returned unchanged, without markers.
 *
 * @param {string} marker - Emphasis character such as "*", "/", "_", "+", "=".
 * @param {*} node - Inline element whose children are converted.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string} Marked-up text with outer whitespace preserved.
 */
function wrapInline(marker, node, ctx) {
  const inner = convertChildren(node, ctx);
  const core = inner.trim();
  if (!core) return inner;

  // `core` starts at the first non-whitespace character, so indexOf finds the
  // exact split between leading whitespace and content.
  const start = inner.indexOf(core);
  const lead = inner.slice(0, start);
  const trail = inner.slice(start + core.length);
  return `${lead}${marker}${core}${marker}${trail}`;
}
|
|
215
|
+
/**
 * Convert an `<a>` element into an Org link.
 *
 * @param {*} node - Anchor element; `getAttribute` is optional.
 * @param {object} ctx - Conversion context; `baseUrl` is passed to resolveUrl.
 * @returns {string} `[[url][text]]`, `[[url]]` when the text is empty or equal
 *   to the URL, or just the text when there is no href.
 */
function convertLink(node, ctx) {
  const target = resolveUrl(node.getAttribute?.("href") || "", ctx.baseUrl);
  const label = convertChildren(node, ctx).trim();
  if (!target) return label;

  const needsLabel = Boolean(label) && label !== target;
  return needsLabel ? `[[${target}][${label}]]` : `[[${target}]]`;
}
|
|
222
|
+
/**
 * Convert an `<img>` element into a bare Org link to its source.
 *
 * @param {*} node - Image element; `getAttribute` is optional.
 * @param {object} ctx - Conversion context; `baseUrl` is passed to resolveUrl.
 * @returns {string} `[[src]]`, or "" when the element has no usable `src`.
 */
function convertImage(node, ctx) {
  const location = resolveUrl(node.getAttribute?.("src") || "", ctx.baseUrl);
  return location ? `[[${location}]]` : "";
}
|
|
227
|
+
/**
 * Convert a `<table>` element into an aligned Org table.
 *
 * Rows inside the first `<thead>` form the header and are followed by a rule
 * line. Every other `<tr>` under the table is a body row, in document order;
 * this includes rows of additional `<tbody>` sections and of `<tfoot>`, which
 * would be lost if only the first `<tbody>` were read. Short rows are padded
 * with empty cells.
 *
 * @param {*} node - Table element; `querySelector(All)` are optional.
 * @param {object} ctx - Conversion context forwarded to extractRowCells.
 * @returns {string} Table surrounded by blank lines, or "" when there are no cells.
 */
function convertTable(node, ctx) {
  const selectAll = (root, selector) => Array.from(root.querySelectorAll?.(selector) || []);

  const thead = node.querySelector?.("thead");
  const headerRows = thead ? selectAll(thead, "tr") : [];
  // Exclude by identity so a header row is never emitted a second time as body.
  const headerSet = new Set(headerRows);
  const bodyRows = selectAll(node, "tr").filter((tr) => !headerSet.has(tr));

  const rows = [...headerRows, ...bodyRows].map((tr) => extractRowCells(tr, ctx));
  const colCount = rows.reduce((widest, row) => Math.max(widest, row.length), 0);
  if (colCount === 0) return "";

  const colWidths = Array.from({ length: colCount }, (_, col) =>
    rows.reduce((widest, row) => Math.max(widest, (row[col] || "").length), 0)
  );
  const formatRow = (row) =>
    `| ${colWidths.map((width, col) => (row[col] || "").padEnd(width)).join(" | ")} |`;
  // Each column's rule spans the cell width plus its two padding spaces.
  const separatorRow = `|${colWidths.map((width) => "-".repeat(width + 2)).join("+")}|`;

  const lines = [];
  rows.forEach((row, index) => {
    lines.push(formatRow(row));
    if (index === headerRows.length - 1) lines.push(separatorRow);
  });
  return `\n\n${lines.join("\n")}\n\n`;
}
|
|
268
|
+
/**
 * Collect the text of each `<td>` / `<th>` that is a direct child of a row.
 *
 * Cell text is made safe for a single Org table row: line breaks (with the
 * whitespace around them) are folded into one space, and `|` is written as
 * `\vert{}` because a literal bar would start a new column.
 *
 * @param {*} tr - Table row element.
 * @param {object} ctx - Conversion context forwarded to convertChildren.
 * @returns {string[]} One sanitized string per cell, in document order.
 */
function extractRowCells(tr, ctx) {
  const cells = [];
  for (const cell of tr.childNodes) {
    const cellTag = (cell.tagName || "").toLowerCase();
    if (cellTag !== "td" && cellTag !== "th") continue;
    const text = convertChildren(cell, ctx)
      .trim()
      .replace(/\s*\n\s*/g, " ")
      .replace(/\|/g, "\\vert{}");
    cells.push(text);
  }
  return cells;
}
|
|
278
|
+
/**
 * Replace every run of whitespace characters with a single space.
 *
 * @param {string} text - Input text.
 * @returns {string} Text with whitespace runs collapsed (not trimmed).
 */
function collapseWhitespace(text) {
  // Splitting on whitespace runs and re-joining with one space collapses each run.
  return text.split(/\s+/).join(" ");
}
|
|
281
|
+
/**
 * Resolve `url` against `baseUrl`.
 *
 * @param {string} url - URL taken from the document; may be empty.
 * @param {string} baseUrl - Base to resolve against; may be empty.
 * @returns {string} "" for an empty url; the url unchanged when it starts with
 *   http://, https:// or mailto:, when there is no base, or when resolution
 *   fails; otherwise the resolved absolute URL.
 */
function resolveUrl(url, baseUrl) {
  if (!url) return "";
  const alreadyAbsolute = /^https?:\/\//.test(url) || url.startsWith("mailto:");
  if (alreadyAbsolute || !baseUrl) return url;

  let resolved = url;
  try {
    resolved = new URL(url, baseUrl).href;
  } catch {
    // Unparseable input: deliberately fall back to the URL as written.
  }
  return resolved;
}
|
|
291
|
+
|
|
292
|
+
export {
|
|
293
|
+
domToOrg
|
|
294
|
+
};
|
package/dist/dom.cjs
ADDED
package/dist/dom.d.cts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Convert a DOM node to Org-mode format.
|
|
3
|
+
*
|
|
4
|
+
* @param node - A DOM node (Element, Document, or DocumentFragment)
|
|
5
|
+
* @param baseUrl - Base URL for resolving relative links
|
|
6
|
+
* @returns Org-mode formatted string
|
|
7
|
+
*/
|
|
8
|
+
declare function domToOrg(node: any, baseUrl?: string): string;
|
|
9
|
+
|
|
10
|
+
export { domToOrg };
|
package/dist/dom.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Convert a DOM node to Org-mode format.
|
|
3
|
+
*
|
|
4
|
+
* @param node - A DOM node (Element, Document, or DocumentFragment)
|
|
5
|
+
* @param baseUrl - Base URL for resolving relative links
|
|
6
|
+
* @returns Org-mode formatted string
|
|
7
|
+
*/
|
|
8
|
+
declare function domToOrg(node: any, baseUrl?: string): string;
|
|
9
|
+
|
|
10
|
+
export { domToOrg };
|
package/dist/dom.js
ADDED
package/dist/index.cjs
CHANGED
|
@@ -1,351 +1,15 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
|
|
3
|
-
var
|
|
4
|
-
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
-
var __export = (target, all) => {
|
|
7
|
-
for (var name in all)
|
|
8
|
-
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
-
};
|
|
10
|
-
var __copyProps = (to, from, except, desc) => {
|
|
11
|
-
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
-
for (let key of __getOwnPropNames(from))
|
|
13
|
-
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
-
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
-
}
|
|
16
|
-
return to;
|
|
17
|
-
};
|
|
18
|
-
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
1
|
+
"use strict";Object.defineProperty(exports, "__esModule", {value: true});
|
|
2
|
+
|
|
3
|
+
var _chunkASXHHGOBcjs = require('./chunk-ASXHHGOB.cjs');
|
|
19
4
|
|
|
20
5
|
// src/index.ts
|
|
21
|
-
var
|
|
22
|
-
__export(index_exports, {
|
|
23
|
-
htmlToOrg: () => htmlToOrg
|
|
24
|
-
});
|
|
25
|
-
module.exports = __toCommonJS(index_exports);
|
|
26
|
-
var import_linkedom = require("linkedom");
|
|
6
|
+
var _linkedom = require('linkedom');
|
|
27
7
|
function htmlToOrg(html, baseUrl = "") {
|
|
28
8
|
if (!html || !html.trim()) return "";
|
|
29
|
-
const { document } = (0,
|
|
30
|
-
|
|
31
|
-
const ctx = { baseUrl, listDepth: 0, orderedIndex: [], indentWidth: 0 };
|
|
32
|
-
const raw = convertNode(body, ctx);
|
|
33
|
-
return raw.replace(/\n{3,}/g, "\n\n").trim();
|
|
34
|
-
}
|
|
35
|
-
function convertNode(node, ctx) {
|
|
36
|
-
if (node.nodeType === 3) {
|
|
37
|
-
return collapseWhitespace(node.textContent || "");
|
|
38
|
-
}
|
|
39
|
-
if (node.nodeType !== 1) return "";
|
|
40
|
-
const tag = (node.tagName || "").toLowerCase();
|
|
41
|
-
if (tag === "script" || tag === "style" || tag === "noscript") return "";
|
|
42
|
-
switch (tag) {
|
|
43
|
-
case "h1":
|
|
44
|
-
case "h2":
|
|
45
|
-
case "h3":
|
|
46
|
-
case "h4":
|
|
47
|
-
case "h5":
|
|
48
|
-
case "h6":
|
|
49
|
-
return convertHeading(node, tag, ctx);
|
|
50
|
-
case "p":
|
|
51
|
-
return convertParagraph(node, ctx);
|
|
52
|
-
case "strong":
|
|
53
|
-
case "b":
|
|
54
|
-
return wrapInline("*", node, ctx);
|
|
55
|
-
case "em":
|
|
56
|
-
case "i":
|
|
57
|
-
return wrapInline("/", node, ctx);
|
|
58
|
-
case "u":
|
|
59
|
-
case "ins":
|
|
60
|
-
return wrapInline("_", node, ctx);
|
|
61
|
-
case "s":
|
|
62
|
-
case "del":
|
|
63
|
-
case "strike":
|
|
64
|
-
return wrapInline("+", node, ctx);
|
|
65
|
-
case "code":
|
|
66
|
-
return wrapInline("=", node, ctx);
|
|
67
|
-
case "mark":
|
|
68
|
-
return wrapInline("=", node, ctx);
|
|
69
|
-
case "sup":
|
|
70
|
-
return `^{${convertChildren(node, ctx)}}`;
|
|
71
|
-
case "sub":
|
|
72
|
-
return `_{${convertChildren(node, ctx)}}`;
|
|
73
|
-
case "a":
|
|
74
|
-
return convertLink(node, ctx);
|
|
75
|
-
case "img":
|
|
76
|
-
return convertImage(node, ctx);
|
|
77
|
-
case "ul":
|
|
78
|
-
return convertList(node, false, ctx);
|
|
79
|
-
case "ol":
|
|
80
|
-
return convertList(node, true, ctx);
|
|
81
|
-
case "li":
|
|
82
|
-
return convertChildren(node, ctx);
|
|
83
|
-
case "pre":
|
|
84
|
-
return convertPre(node, ctx);
|
|
85
|
-
case "blockquote":
|
|
86
|
-
return convertBlockquote(node, ctx);
|
|
87
|
-
case "table":
|
|
88
|
-
return convertTable(node, ctx);
|
|
89
|
-
case "hr":
|
|
90
|
-
return "\n\n-----\n\n";
|
|
91
|
-
case "br":
|
|
92
|
-
return "\n";
|
|
93
|
-
// Transparent wrappers — just recurse
|
|
94
|
-
case "div":
|
|
95
|
-
case "section":
|
|
96
|
-
case "article":
|
|
97
|
-
case "main":
|
|
98
|
-
case "header":
|
|
99
|
-
case "footer":
|
|
100
|
-
case "nav":
|
|
101
|
-
case "aside":
|
|
102
|
-
case "figure":
|
|
103
|
-
case "figcaption":
|
|
104
|
-
case "details":
|
|
105
|
-
case "summary":
|
|
106
|
-
case "span":
|
|
107
|
-
case "small":
|
|
108
|
-
case "time":
|
|
109
|
-
case "abbr":
|
|
110
|
-
case "thead":
|
|
111
|
-
case "tbody":
|
|
112
|
-
case "tfoot":
|
|
113
|
-
case "html":
|
|
114
|
-
case "body":
|
|
115
|
-
return convertChildren(node, ctx);
|
|
116
|
-
default:
|
|
117
|
-
return convertChildren(node, ctx);
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
function convertChildren(node, ctx) {
|
|
121
|
-
let out = "";
|
|
122
|
-
for (const child of node.childNodes) {
|
|
123
|
-
if (child.nodeType === 3) {
|
|
124
|
-
const text = child.textContent || "";
|
|
125
|
-
if (!text.trim()) {
|
|
126
|
-
const prev = child.previousSibling;
|
|
127
|
-
const next = child.nextSibling;
|
|
128
|
-
if (isBlockElement(prev) || isBlockElement(next)) {
|
|
129
|
-
continue;
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
}
|
|
133
|
-
out += convertNode(child, ctx);
|
|
134
|
-
}
|
|
135
|
-
return out;
|
|
136
|
-
}
|
|
137
|
-
var BLOCK_TAGS = /* @__PURE__ */ new Set([
|
|
138
|
-
"p",
|
|
139
|
-
"div",
|
|
140
|
-
"h1",
|
|
141
|
-
"h2",
|
|
142
|
-
"h3",
|
|
143
|
-
"h4",
|
|
144
|
-
"h5",
|
|
145
|
-
"h6",
|
|
146
|
-
"ul",
|
|
147
|
-
"ol",
|
|
148
|
-
"li",
|
|
149
|
-
"pre",
|
|
150
|
-
"blockquote",
|
|
151
|
-
"table",
|
|
152
|
-
"hr",
|
|
153
|
-
"section",
|
|
154
|
-
"article",
|
|
155
|
-
"main",
|
|
156
|
-
"header",
|
|
157
|
-
"footer",
|
|
158
|
-
"nav",
|
|
159
|
-
"aside",
|
|
160
|
-
"figure",
|
|
161
|
-
"figcaption",
|
|
162
|
-
"details",
|
|
163
|
-
"summary"
|
|
164
|
-
]);
|
|
165
|
-
function isBlockElement(node) {
|
|
166
|
-
if (!node || node.nodeType !== 1) return false;
|
|
167
|
-
return BLOCK_TAGS.has((node.tagName || "").toLowerCase());
|
|
168
|
-
}
|
|
169
|
-
function convertHeading(node, tag, ctx) {
|
|
170
|
-
const level = parseInt(tag[1], 10);
|
|
171
|
-
const stars = "*".repeat(level);
|
|
172
|
-
const text = convertChildren(node, ctx).trim();
|
|
173
|
-
return `
|
|
174
|
-
|
|
175
|
-
${stars} ${text}
|
|
176
|
-
|
|
177
|
-
`;
|
|
178
|
-
}
|
|
179
|
-
function convertParagraph(node, ctx) {
|
|
180
|
-
const text = convertChildren(node, ctx).trim();
|
|
181
|
-
if (!text) return "";
|
|
182
|
-
return `
|
|
183
|
-
|
|
184
|
-
${text}
|
|
185
|
-
|
|
186
|
-
`;
|
|
187
|
-
}
|
|
188
|
-
function convertBlockquote(node, ctx) {
|
|
189
|
-
const inner = convertChildren(node, ctx).trim();
|
|
190
|
-
return `
|
|
191
|
-
|
|
192
|
-
#+BEGIN_QUOTE
|
|
193
|
-
${inner}
|
|
194
|
-
#+END_QUOTE
|
|
195
|
-
|
|
196
|
-
`;
|
|
9
|
+
const { document } = _linkedom.parseHTML.call(void 0, `<!DOCTYPE html><html><body>${html}</body></html>`);
|
|
10
|
+
return _chunkASXHHGOBcjs.domToOrg.call(void 0, document.body, baseUrl);
|
|
197
11
|
}
|
|
198
|
-
function convertPre(node, _ctx) {
|
|
199
|
-
const codeChild = node.querySelector?.("code");
|
|
200
|
-
if (codeChild) {
|
|
201
|
-
const lang = detectLanguage(codeChild);
|
|
202
|
-
const code = (codeChild.textContent || "").replace(/\n$/, "");
|
|
203
|
-
const langSuffix = lang ? ` ${lang}` : "";
|
|
204
|
-
return `
|
|
205
12
|
|
|
206
|
-
#+BEGIN_SRC${langSuffix}
|
|
207
|
-
${code}
|
|
208
|
-
#+END_SRC
|
|
209
13
|
|
|
210
|
-
`;
|
|
211
|
-
}
|
|
212
|
-
const text = (node.textContent || "").replace(/\n$/, "");
|
|
213
|
-
return `
|
|
214
14
|
|
|
215
|
-
|
|
216
|
-
${text}
|
|
217
|
-
#+END_EXAMPLE
|
|
218
|
-
|
|
219
|
-
`;
|
|
220
|
-
}
|
|
221
|
-
function detectLanguage(codeNode) {
|
|
222
|
-
const cls = codeNode.getAttribute?.("class") || "";
|
|
223
|
-
const match = cls.match(/(?:^|\s)language-(\S+)/);
|
|
224
|
-
return match ? match[1] : "";
|
|
225
|
-
}
|
|
226
|
-
function convertList(node, ordered, ctx) {
|
|
227
|
-
const items = [];
|
|
228
|
-
let counter = 1;
|
|
229
|
-
const indent = ctx.listDepth > 0 ? " ".repeat(ctx.indentWidth || 2) : "";
|
|
230
|
-
const prefixWidth = ordered ? 3 : 2;
|
|
231
|
-
for (const child of node.childNodes) {
|
|
232
|
-
if (child.nodeType !== 1 || (child.tagName || "").toLowerCase() !== "li") continue;
|
|
233
|
-
const currentPrefix = ordered ? `${counter}. ` : "- ";
|
|
234
|
-
let textParts = [];
|
|
235
|
-
let nestedLists = [];
|
|
236
|
-
for (const liChild of child.childNodes) {
|
|
237
|
-
const liChildTag = (liChild.tagName || "").toLowerCase();
|
|
238
|
-
if (liChildTag === "ul" || liChildTag === "ol") {
|
|
239
|
-
const nestedCtx = {
|
|
240
|
-
...ctx,
|
|
241
|
-
listDepth: ctx.listDepth + 1,
|
|
242
|
-
indentWidth: (ctx.indentWidth || 0) + prefixWidth
|
|
243
|
-
};
|
|
244
|
-
nestedLists.push(convertList(liChild, liChildTag === "ol", nestedCtx));
|
|
245
|
-
} else {
|
|
246
|
-
textParts.push(convertNode(liChild, ctx));
|
|
247
|
-
}
|
|
248
|
-
}
|
|
249
|
-
const text = textParts.join("").trim();
|
|
250
|
-
let item = `${indent}${currentPrefix}${text}`;
|
|
251
|
-
if (nestedLists.length > 0) {
|
|
252
|
-
item += "\n" + nestedLists.join("\n");
|
|
253
|
-
}
|
|
254
|
-
items.push(item);
|
|
255
|
-
counter++;
|
|
256
|
-
}
|
|
257
|
-
const result = items.join("\n");
|
|
258
|
-
return ctx.listDepth === 0 ? `
|
|
259
|
-
|
|
260
|
-
${result}
|
|
261
|
-
|
|
262
|
-
` : result;
|
|
263
|
-
}
|
|
264
|
-
function wrapInline(marker, node, ctx) {
|
|
265
|
-
const inner = convertChildren(node, ctx);
|
|
266
|
-
if (!inner.trim()) return inner;
|
|
267
|
-
return `${marker}${inner}${marker}`;
|
|
268
|
-
}
|
|
269
|
-
function convertLink(node, ctx) {
|
|
270
|
-
const href = resolveUrl(node.getAttribute?.("href") || "", ctx.baseUrl);
|
|
271
|
-
const text = convertChildren(node, ctx).trim();
|
|
272
|
-
if (!href) return text;
|
|
273
|
-
if (!text || text === href) return `[[${href}]]`;
|
|
274
|
-
return `[[${href}][${text}]]`;
|
|
275
|
-
}
|
|
276
|
-
function convertImage(node, ctx) {
|
|
277
|
-
const src = resolveUrl(node.getAttribute?.("src") || "", ctx.baseUrl);
|
|
278
|
-
if (!src) return "";
|
|
279
|
-
return `[[${src}]]`;
|
|
280
|
-
}
|
|
281
|
-
function convertTable(node, ctx) {
|
|
282
|
-
const rows = [];
|
|
283
|
-
let headerRowCount = 0;
|
|
284
|
-
const thead = node.querySelector?.("thead");
|
|
285
|
-
if (thead) {
|
|
286
|
-
for (const tr of thead.querySelectorAll?.("tr") || []) {
|
|
287
|
-
const cells = extractRowCells(tr, ctx);
|
|
288
|
-
rows.push(cells);
|
|
289
|
-
headerRowCount++;
|
|
290
|
-
}
|
|
291
|
-
}
|
|
292
|
-
const tbody = node.querySelector?.("tbody");
|
|
293
|
-
const bodyContainer = tbody || node;
|
|
294
|
-
for (const tr of bodyContainer.querySelectorAll?.("tr") || []) {
|
|
295
|
-
if (thead && tr.parentNode === thead) continue;
|
|
296
|
-
const cells = extractRowCells(tr, ctx);
|
|
297
|
-
rows.push(cells);
|
|
298
|
-
}
|
|
299
|
-
if (rows.length === 0) return "";
|
|
300
|
-
const colCount = Math.max(...rows.map((r) => r.length));
|
|
301
|
-
const colWidths = new Array(colCount).fill(0);
|
|
302
|
-
for (const row of rows) {
|
|
303
|
-
for (let i = 0; i < colCount; i++) {
|
|
304
|
-
colWidths[i] = Math.max(colWidths[i], (row[i] || "").length);
|
|
305
|
-
}
|
|
306
|
-
}
|
|
307
|
-
const formatRow = (row) => {
|
|
308
|
-
const cells = [];
|
|
309
|
-
for (let i = 0; i < colCount; i++) {
|
|
310
|
-
cells.push((row[i] || "").padEnd(colWidths[i]));
|
|
311
|
-
}
|
|
312
|
-
return "| " + cells.join(" | ") + " |";
|
|
313
|
-
};
|
|
314
|
-
const sepParts = colWidths.map((w) => "-".repeat(w));
|
|
315
|
-
const separatorRow = "|" + sepParts.map((s) => `-${s}-`).join("+") + "|";
|
|
316
|
-
const lines = [];
|
|
317
|
-
for (let i = 0; i < rows.length; i++) {
|
|
318
|
-
lines.push(formatRow(rows[i]));
|
|
319
|
-
if (i === headerRowCount - 1 && headerRowCount > 0) {
|
|
320
|
-
lines.push(separatorRow);
|
|
321
|
-
}
|
|
322
|
-
}
|
|
323
|
-
return "\n\n" + lines.join("\n") + "\n\n";
|
|
324
|
-
}
|
|
325
|
-
function extractRowCells(tr, ctx) {
|
|
326
|
-
const cells = [];
|
|
327
|
-
for (const cell of tr.childNodes) {
|
|
328
|
-
const cellTag = (cell.tagName || "").toLowerCase();
|
|
329
|
-
if (cellTag === "td" || cellTag === "th") {
|
|
330
|
-
cells.push(convertChildren(cell, ctx).trim());
|
|
331
|
-
}
|
|
332
|
-
}
|
|
333
|
-
return cells;
|
|
334
|
-
}
|
|
335
|
-
function collapseWhitespace(text) {
|
|
336
|
-
return text.replace(/\s+/g, " ");
|
|
337
|
-
}
|
|
338
|
-
function resolveUrl(url, baseUrl) {
|
|
339
|
-
if (!url) return "";
|
|
340
|
-
if (/^https?:\/\//.test(url) || url.startsWith("mailto:")) return url;
|
|
341
|
-
if (!baseUrl) return url;
|
|
342
|
-
try {
|
|
343
|
-
return new URL(url, baseUrl).href;
|
|
344
|
-
} catch {
|
|
345
|
-
return url;
|
|
346
|
-
}
|
|
347
|
-
}
|
|
348
|
-
// Annotate the CommonJS export names for ESM import in node:
|
|
349
|
-
0 && (module.exports = {
|
|
350
|
-
htmlToOrg
|
|
351
|
-
});
|
|
15
|
+
exports.domToOrg = _chunkASXHHGOBcjs.domToOrg; exports.htmlToOrg = htmlToOrg;
|
package/dist/index.d.cts
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
|
+
export { domToOrg } from './dom.cjs';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* Convert HTML to Org-mode format.
|
|
3
5
|
*
|
|
6
|
+
* Uses linkedom for parsing. In browser environments, import `domToOrg`
|
|
7
|
+
* from `html-to-org/dom` to avoid bundling linkedom.
|
|
8
|
+
*
|
|
4
9
|
* @param html - HTML string to convert
|
|
5
10
|
* @param baseUrl - Base URL for resolving relative links
|
|
6
11
|
* @returns Org-mode formatted string
|
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
|
+
export { domToOrg } from './dom.js';
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* Convert HTML to Org-mode format.
|
|
3
5
|
*
|
|
6
|
+
* Uses linkedom for parsing. In browser environments, import `domToOrg`
|
|
7
|
+
* from `html-to-org/dom` to avoid bundling linkedom.
|
|
8
|
+
*
|
|
4
9
|
* @param html - HTML string to convert
|
|
5
10
|
* @param baseUrl - Base URL for resolving relative links
|
|
6
11
|
* @returns Org-mode formatted string
|
package/dist/index.js
CHANGED
|
@@ -1,326 +1,15 @@
|
|
|
1
|
+
import {
|
|
2
|
+
domToOrg
|
|
3
|
+
} from "./chunk-ZZ4VEN6R.js";
|
|
4
|
+
|
|
1
5
|
// src/index.ts
|
|
2
6
|
import { parseHTML } from "linkedom";
|
|
3
7
|
function htmlToOrg(html, baseUrl = "") {
|
|
4
8
|
if (!html || !html.trim()) return "";
|
|
5
9
|
const { document } = parseHTML(`<!DOCTYPE html><html><body>${html}</body></html>`);
|
|
6
|
-
const body = document.body;
|
|
7
|
-
const ctx = { baseUrl, listDepth: 0, orderedIndex: [], indentWidth: 0 };
|
|
8
|
-
const raw = convertNode(body, ctx);
|
|
9
|
-
return raw.replace(/\n{3,}/g, "\n\n").trim();
|
|
10
|
-
}
|
|
11
|
-
function convertNode(node, ctx) {
|
|
12
|
-
if (node.nodeType === 3) {
|
|
13
|
-
return collapseWhitespace(node.textContent || "");
|
|
14
|
-
}
|
|
15
|
-
if (node.nodeType !== 1) return "";
|
|
16
|
-
const tag = (node.tagName || "").toLowerCase();
|
|
17
|
-
if (tag === "script" || tag === "style" || tag === "noscript") return "";
|
|
18
|
-
switch (tag) {
|
|
19
|
-
case "h1":
|
|
20
|
-
case "h2":
|
|
21
|
-
case "h3":
|
|
22
|
-
case "h4":
|
|
23
|
-
case "h5":
|
|
24
|
-
case "h6":
|
|
25
|
-
return convertHeading(node, tag, ctx);
|
|
26
|
-
case "p":
|
|
27
|
-
return convertParagraph(node, ctx);
|
|
28
|
-
case "strong":
|
|
29
|
-
case "b":
|
|
30
|
-
return wrapInline("*", node, ctx);
|
|
31
|
-
case "em":
|
|
32
|
-
case "i":
|
|
33
|
-
return wrapInline("/", node, ctx);
|
|
34
|
-
case "u":
|
|
35
|
-
case "ins":
|
|
36
|
-
return wrapInline("_", node, ctx);
|
|
37
|
-
case "s":
|
|
38
|
-
case "del":
|
|
39
|
-
case "strike":
|
|
40
|
-
return wrapInline("+", node, ctx);
|
|
41
|
-
case "code":
|
|
42
|
-
return wrapInline("=", node, ctx);
|
|
43
|
-
case "mark":
|
|
44
|
-
return wrapInline("=", node, ctx);
|
|
45
|
-
case "sup":
|
|
46
|
-
return `^{${convertChildren(node, ctx)}}`;
|
|
47
|
-
case "sub":
|
|
48
|
-
return `_{${convertChildren(node, ctx)}}`;
|
|
49
|
-
case "a":
|
|
50
|
-
return convertLink(node, ctx);
|
|
51
|
-
case "img":
|
|
52
|
-
return convertImage(node, ctx);
|
|
53
|
-
case "ul":
|
|
54
|
-
return convertList(node, false, ctx);
|
|
55
|
-
case "ol":
|
|
56
|
-
return convertList(node, true, ctx);
|
|
57
|
-
case "li":
|
|
58
|
-
return convertChildren(node, ctx);
|
|
59
|
-
case "pre":
|
|
60
|
-
return convertPre(node, ctx);
|
|
61
|
-
case "blockquote":
|
|
62
|
-
return convertBlockquote(node, ctx);
|
|
63
|
-
case "table":
|
|
64
|
-
return convertTable(node, ctx);
|
|
65
|
-
case "hr":
|
|
66
|
-
return "\n\n-----\n\n";
|
|
67
|
-
case "br":
|
|
68
|
-
return "\n";
|
|
69
|
-
// Transparent wrappers — just recurse
|
|
70
|
-
case "div":
|
|
71
|
-
case "section":
|
|
72
|
-
case "article":
|
|
73
|
-
case "main":
|
|
74
|
-
case "header":
|
|
75
|
-
case "footer":
|
|
76
|
-
case "nav":
|
|
77
|
-
case "aside":
|
|
78
|
-
case "figure":
|
|
79
|
-
case "figcaption":
|
|
80
|
-
case "details":
|
|
81
|
-
case "summary":
|
|
82
|
-
case "span":
|
|
83
|
-
case "small":
|
|
84
|
-
case "time":
|
|
85
|
-
case "abbr":
|
|
86
|
-
case "thead":
|
|
87
|
-
case "tbody":
|
|
88
|
-
case "tfoot":
|
|
89
|
-
case "html":
|
|
90
|
-
case "body":
|
|
91
|
-
return convertChildren(node, ctx);
|
|
92
|
-
default:
|
|
93
|
-
return convertChildren(node, ctx);
|
|
94
|
-
}
|
|
95
|
-
}
|
|
96
|
-
function convertChildren(node, ctx) {
|
|
97
|
-
let out = "";
|
|
98
|
-
for (const child of node.childNodes) {
|
|
99
|
-
if (child.nodeType === 3) {
|
|
100
|
-
const text = child.textContent || "";
|
|
101
|
-
if (!text.trim()) {
|
|
102
|
-
const prev = child.previousSibling;
|
|
103
|
-
const next = child.nextSibling;
|
|
104
|
-
if (isBlockElement(prev) || isBlockElement(next)) {
|
|
105
|
-
continue;
|
|
106
|
-
}
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
out += convertNode(child, ctx);
|
|
110
|
-
}
|
|
111
|
-
return out;
|
|
112
|
-
}
|
|
113
|
-
var BLOCK_TAGS = /* @__PURE__ */ new Set([
|
|
114
|
-
"p",
|
|
115
|
-
"div",
|
|
116
|
-
"h1",
|
|
117
|
-
"h2",
|
|
118
|
-
"h3",
|
|
119
|
-
"h4",
|
|
120
|
-
"h5",
|
|
121
|
-
"h6",
|
|
122
|
-
"ul",
|
|
123
|
-
"ol",
|
|
124
|
-
"li",
|
|
125
|
-
"pre",
|
|
126
|
-
"blockquote",
|
|
127
|
-
"table",
|
|
128
|
-
"hr",
|
|
129
|
-
"section",
|
|
130
|
-
"article",
|
|
131
|
-
"main",
|
|
132
|
-
"header",
|
|
133
|
-
"footer",
|
|
134
|
-
"nav",
|
|
135
|
-
"aside",
|
|
136
|
-
"figure",
|
|
137
|
-
"figcaption",
|
|
138
|
-
"details",
|
|
139
|
-
"summary"
|
|
140
|
-
]);
|
|
141
|
-
function isBlockElement(node) {
|
|
142
|
-
if (!node || node.nodeType !== 1) return false;
|
|
143
|
-
return BLOCK_TAGS.has((node.tagName || "").toLowerCase());
|
|
144
|
-
}
|
|
145
|
-
function convertHeading(node, tag, ctx) {
|
|
146
|
-
const level = parseInt(tag[1], 10);
|
|
147
|
-
const stars = "*".repeat(level);
|
|
148
|
-
const text = convertChildren(node, ctx).trim();
|
|
149
|
-
return `
|
|
150
|
-
|
|
151
|
-
${stars} ${text}
|
|
152
|
-
|
|
153
|
-
`;
|
|
154
|
-
}
|
|
155
|
-
function convertParagraph(node, ctx) {
|
|
156
|
-
const text = convertChildren(node, ctx).trim();
|
|
157
|
-
if (!text) return "";
|
|
158
|
-
return `
|
|
159
|
-
|
|
160
|
-
${text}
|
|
161
|
-
|
|
162
|
-
`;
|
|
163
|
-
}
|
|
164
|
-
function convertBlockquote(node, ctx) {
|
|
165
|
-
const inner = convertChildren(node, ctx).trim();
|
|
166
|
-
return `
|
|
167
|
-
|
|
168
|
-
#+BEGIN_QUOTE
|
|
169
|
-
${inner}
|
|
170
|
-
#+END_QUOTE
|
|
171
|
-
|
|
172
|
-
`;
|
|
173
|
-
}
|
|
174
|
-
function convertPre(node, _ctx) {
|
|
175
|
-
const codeChild = node.querySelector?.("code");
|
|
176
|
-
if (codeChild) {
|
|
177
|
-
const lang = detectLanguage(codeChild);
|
|
178
|
-
const code = (codeChild.textContent || "").replace(/\n$/, "");
|
|
179
|
-
const langSuffix = lang ? ` ${lang}` : "";
|
|
180
|
-
return `
|
|
181
|
-
|
|
182
|
-
#+BEGIN_SRC${langSuffix}
|
|
183
|
-
${code}
|
|
184
|
-
#+END_SRC
|
|
185
|
-
|
|
186
|
-
`;
|
|
187
|
-
}
|
|
188
|
-
const text = (node.textContent || "").replace(/\n$/, "");
|
|
189
|
-
return `
|
|
190
|
-
|
|
191
|
-
#+BEGIN_EXAMPLE
|
|
192
|
-
${text}
|
|
193
|
-
#+END_EXAMPLE
|
|
194
|
-
|
|
195
|
-
`;
|
|
196
|
-
}
|
|
197
|
-
function detectLanguage(codeNode) {
|
|
198
|
-
const cls = codeNode.getAttribute?.("class") || "";
|
|
199
|
-
const match = cls.match(/(?:^|\s)language-(\S+)/);
|
|
200
|
-
return match ? match[1] : "";
|
|
201
|
-
}
|
|
202
|
-
function convertList(node, ordered, ctx) {
|
|
203
|
-
const items = [];
|
|
204
|
-
let counter = 1;
|
|
205
|
-
const indent = ctx.listDepth > 0 ? " ".repeat(ctx.indentWidth || 2) : "";
|
|
206
|
-
const prefixWidth = ordered ? 3 : 2;
|
|
207
|
-
for (const child of node.childNodes) {
|
|
208
|
-
if (child.nodeType !== 1 || (child.tagName || "").toLowerCase() !== "li") continue;
|
|
209
|
-
const currentPrefix = ordered ? `${counter}. ` : "- ";
|
|
210
|
-
let textParts = [];
|
|
211
|
-
let nestedLists = [];
|
|
212
|
-
for (const liChild of child.childNodes) {
|
|
213
|
-
const liChildTag = (liChild.tagName || "").toLowerCase();
|
|
214
|
-
if (liChildTag === "ul" || liChildTag === "ol") {
|
|
215
|
-
const nestedCtx = {
|
|
216
|
-
...ctx,
|
|
217
|
-
listDepth: ctx.listDepth + 1,
|
|
218
|
-
indentWidth: (ctx.indentWidth || 0) + prefixWidth
|
|
219
|
-
};
|
|
220
|
-
nestedLists.push(convertList(liChild, liChildTag === "ol", nestedCtx));
|
|
221
|
-
} else {
|
|
222
|
-
textParts.push(convertNode(liChild, ctx));
|
|
223
|
-
}
|
|
224
|
-
}
|
|
225
|
-
const text = textParts.join("").trim();
|
|
226
|
-
let item = `${indent}${currentPrefix}${text}`;
|
|
227
|
-
if (nestedLists.length > 0) {
|
|
228
|
-
item += "\n" + nestedLists.join("\n");
|
|
229
|
-
}
|
|
230
|
-
items.push(item);
|
|
231
|
-
counter++;
|
|
232
|
-
}
|
|
233
|
-
const result = items.join("\n");
|
|
234
|
-
return ctx.listDepth === 0 ? `
|
|
235
|
-
|
|
236
|
-
${result}
|
|
237
|
-
|
|
238
|
-
` : result;
|
|
239
|
-
}
|
|
240
|
-
function wrapInline(marker, node, ctx) {
|
|
241
|
-
const inner = convertChildren(node, ctx);
|
|
242
|
-
if (!inner.trim()) return inner;
|
|
243
|
-
return `${marker}${inner}${marker}`;
|
|
244
|
-
}
|
|
245
|
-
function convertLink(node, ctx) {
|
|
246
|
-
const href = resolveUrl(node.getAttribute?.("href") || "", ctx.baseUrl);
|
|
247
|
-
const text = convertChildren(node, ctx).trim();
|
|
248
|
-
if (!href) return text;
|
|
249
|
-
if (!text || text === href) return `[[${href}]]`;
|
|
250
|
-
return `[[${href}][${text}]]`;
|
|
251
|
-
}
|
|
252
|
-
function convertImage(node, ctx) {
|
|
253
|
-
const src = resolveUrl(node.getAttribute?.("src") || "", ctx.baseUrl);
|
|
254
|
-
if (!src) return "";
|
|
255
|
-
return `[[${src}]]`;
|
|
256
|
-
}
|
|
257
|
-
function convertTable(node, ctx) {
|
|
258
|
-
const rows = [];
|
|
259
|
-
let headerRowCount = 0;
|
|
260
|
-
const thead = node.querySelector?.("thead");
|
|
261
|
-
if (thead) {
|
|
262
|
-
for (const tr of thead.querySelectorAll?.("tr") || []) {
|
|
263
|
-
const cells = extractRowCells(tr, ctx);
|
|
264
|
-
rows.push(cells);
|
|
265
|
-
headerRowCount++;
|
|
266
|
-
}
|
|
267
|
-
}
|
|
268
|
-
const tbody = node.querySelector?.("tbody");
|
|
269
|
-
const bodyContainer = tbody || node;
|
|
270
|
-
for (const tr of bodyContainer.querySelectorAll?.("tr") || []) {
|
|
271
|
-
if (thead && tr.parentNode === thead) continue;
|
|
272
|
-
const cells = extractRowCells(tr, ctx);
|
|
273
|
-
rows.push(cells);
|
|
274
|
-
}
|
|
275
|
-
if (rows.length === 0) return "";
|
|
276
|
-
const colCount = Math.max(...rows.map((r) => r.length));
|
|
277
|
-
const colWidths = new Array(colCount).fill(0);
|
|
278
|
-
for (const row of rows) {
|
|
279
|
-
for (let i = 0; i < colCount; i++) {
|
|
280
|
-
colWidths[i] = Math.max(colWidths[i], (row[i] || "").length);
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
const formatRow = (row) => {
|
|
284
|
-
const cells = [];
|
|
285
|
-
for (let i = 0; i < colCount; i++) {
|
|
286
|
-
cells.push((row[i] || "").padEnd(colWidths[i]));
|
|
287
|
-
}
|
|
288
|
-
return "| " + cells.join(" | ") + " |";
|
|
289
|
-
};
|
|
290
|
-
const sepParts = colWidths.map((w) => "-".repeat(w));
|
|
291
|
-
const separatorRow = "|" + sepParts.map((s) => `-${s}-`).join("+") + "|";
|
|
292
|
-
const lines = [];
|
|
293
|
-
for (let i = 0; i < rows.length; i++) {
|
|
294
|
-
lines.push(formatRow(rows[i]));
|
|
295
|
-
if (i === headerRowCount - 1 && headerRowCount > 0) {
|
|
296
|
-
lines.push(separatorRow);
|
|
297
|
-
}
|
|
298
|
-
}
|
|
299
|
-
return "\n\n" + lines.join("\n") + "\n\n";
|
|
300
|
-
}
|
|
301
|
-
function extractRowCells(tr, ctx) {
|
|
302
|
-
const cells = [];
|
|
303
|
-
for (const cell of tr.childNodes) {
|
|
304
|
-
const cellTag = (cell.tagName || "").toLowerCase();
|
|
305
|
-
if (cellTag === "td" || cellTag === "th") {
|
|
306
|
-
cells.push(convertChildren(cell, ctx).trim());
|
|
307
|
-
}
|
|
308
|
-
}
|
|
309
|
-
return cells;
|
|
310
|
-
}
|
|
311
|
-
function collapseWhitespace(text) {
|
|
312
|
-
return text.replace(/\s+/g, " ");
|
|
313
|
-
}
|
|
314
|
-
function resolveUrl(url, baseUrl) {
|
|
315
|
-
if (!url) return "";
|
|
316
|
-
if (/^https?:\/\//.test(url) || url.startsWith("mailto:")) return url;
|
|
317
|
-
if (!baseUrl) return url;
|
|
318
|
-
try {
|
|
319
|
-
return new URL(url, baseUrl).href;
|
|
320
|
-
} catch {
|
|
321
|
-
return url;
|
|
322
|
-
}
|
|
10
|
+
return domToOrg(document.body, baseUrl);
|
|
323
11
|
}
|
|
324
12
|
export {
|
|
13
|
+
domToOrg,
|
|
325
14
|
htmlToOrg
|
|
326
15
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "html-to-org",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "Convert HTML to Org-mode format",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -16,11 +16,28 @@
|
|
|
16
16
|
"types": "./dist/index.d.cts",
|
|
17
17
|
"default": "./dist/index.cjs"
|
|
18
18
|
}
|
|
19
|
+
},
|
|
20
|
+
"./dom": {
|
|
21
|
+
"import": {
|
|
22
|
+
"types": "./dist/dom.d.ts",
|
|
23
|
+
"default": "./dist/dom.js"
|
|
24
|
+
},
|
|
25
|
+
"require": {
|
|
26
|
+
"types": "./dist/dom.d.cts",
|
|
27
|
+
"default": "./dist/dom.cjs"
|
|
28
|
+
}
|
|
19
29
|
}
|
|
20
30
|
},
|
|
21
31
|
"files": [
|
|
22
32
|
"dist"
|
|
23
33
|
],
|
|
34
|
+
"scripts": {
|
|
35
|
+
"build": "tsup",
|
|
36
|
+
"test": "vitest run",
|
|
37
|
+
"test:watch": "vitest",
|
|
38
|
+
"test:coverage": "vitest run --coverage",
|
|
39
|
+
"prepublishOnly": "pnpm build"
|
|
40
|
+
},
|
|
24
41
|
"keywords": [
|
|
25
42
|
"html",
|
|
26
43
|
"org-mode",
|
|
@@ -36,6 +53,7 @@
|
|
|
36
53
|
"url": "https://github.com/zzzhizhia/html-to-org.git"
|
|
37
54
|
},
|
|
38
55
|
"homepage": "https://github.com/zzzhizhia/html-to-org",
|
|
56
|
+
"packageManager": "pnpm@10.33.0",
|
|
39
57
|
"engines": {
|
|
40
58
|
"node": ">=20"
|
|
41
59
|
},
|
|
@@ -46,11 +64,5 @@
|
|
|
46
64
|
},
|
|
47
65
|
"dependencies": {
|
|
48
66
|
"linkedom": "^0.18.12"
|
|
49
|
-
},
|
|
50
|
-
"scripts": {
|
|
51
|
-
"build": "tsup",
|
|
52
|
-
"test": "vitest run",
|
|
53
|
-
"test:watch": "vitest",
|
|
54
|
-
"test:coverage": "vitest run --coverage"
|
|
55
67
|
}
|
|
56
|
-
}
|
|
68
|
+
}
|