html-to-org 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +100 -0
- package/dist/index.cjs +351 -0
- package/dist/index.d.cts +10 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +326 -0
- package/package.json +56 -0
package/README.md
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# html-to-org
|
|
2
|
+
|
|
3
|
+
Convert HTML to [Org-mode](https://orgmode.org/) format.
|
|
4
|
+
|
|
5
|
+
Plenty of HTML-to-Markdown converters exist, but there are no HTML-to-Org converters. This package fills that gap.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm install html-to-org
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```typescript
|
|
16
|
+
import { htmlToOrg } from 'html-to-org';
|
|
17
|
+
|
|
18
|
+
const org = htmlToOrg('<h1>Hello</h1><p>This is <strong>bold</strong> text.</p>');
|
|
19
|
+
// * Hello
|
|
20
|
+
//
|
|
21
|
+
// This is *bold* text.
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Pass a base URL to resolve relative links:
|
|
25
|
+
|
|
26
|
+
```typescript
|
|
27
|
+
htmlToOrg('<a href="/about">About</a>', 'https://example.com');
|
|
28
|
+
// [[https://example.com/about][About]]
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Conversion Table
|
|
32
|
+
|
|
33
|
+
| HTML | Org-mode |
|
|
34
|
+
|------|----------|
|
|
35
|
+
| `<h1>` – `<h6>` | `*` – `******` headings |
|
|
36
|
+
| `<strong>`, `<b>` | `*bold*` |
|
|
37
|
+
| `<em>`, `<i>` | `/italic/` |
|
|
38
|
+
| `<u>`, `<ins>` | `_underline_` |
|
|
39
|
+
| `<s>`, `<del>`, `<strike>` | `+strikethrough+` |
|
|
40
|
+
| `<code>`, `<mark>` | `=verbatim=` |
|
|
41
|
+
| `<sup>` | `^{superscript}` |
|
|
42
|
+
| `<sub>` | `_{subscript}` |
|
|
43
|
+
| `<a href="url">text</a>` | `[[url][text]]` |
|
|
44
|
+
| `<img src="url">` | `[[url]]` |
|
|
45
|
+
| `<ul>`, `<ol>` | `- item` / `1. item` (nested) |
|
|
46
|
+
| `<pre><code class="language-js">` | `#+BEGIN_SRC js` … `#+END_SRC` |
|
|
47
|
+
| `<pre>` | `#+BEGIN_EXAMPLE` … `#+END_EXAMPLE` |
|
|
48
|
+
| `<blockquote>` | `#+BEGIN_QUOTE` … `#+END_QUOTE` |
|
|
49
|
+
| `<table>` | `\| col1 \| col2 \|` with header separator |
|
|
50
|
+
| `<hr>` | `-----` |
|
|
51
|
+
| `<br>` | newline |
|
|
52
|
+
|
|
53
|
+
## Example
|
|
54
|
+
|
|
55
|
+
```html
|
|
56
|
+
<h1>My Article</h1>
|
|
57
|
+
<p>This is the <strong>introduction</strong> with a <a href="https://example.com">link</a>.</p>
|
|
58
|
+
<h2>Section One</h2>
|
|
59
|
+
<pre><code class="language-js">console.log("hello");</code></pre>
|
|
60
|
+
<ul>
|
|
61
|
+
<li>Item A</li>
|
|
62
|
+
<li>Item B</li>
|
|
63
|
+
</ul>
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
```org
|
|
67
|
+
* My Article
|
|
68
|
+
|
|
69
|
+
This is the *introduction* with a [[https://example.com][link]].
|
|
70
|
+
|
|
71
|
+
** Section One
|
|
72
|
+
|
|
73
|
+
#+BEGIN_SRC js
|
|
74
|
+
console.log("hello");
|
|
75
|
+
#+END_SRC
|
|
76
|
+
|
|
77
|
+
- Item A
|
|
78
|
+
- Item B
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## API
|
|
82
|
+
|
|
83
|
+
### `htmlToOrg(html: string, baseUrl?: string): string`
|
|
84
|
+
|
|
85
|
+
| Parameter | Type | Default | Description |
|
|
86
|
+
|-----------|------|---------|-------------|
|
|
87
|
+
| `html` | `string` | (required) | HTML string to convert |
|
|
88
|
+
| `baseUrl` | `string` | `''` | Base URL for resolving relative links and images |
|
|
89
|
+
|
|
90
|
+
Returns an Org-mode formatted string. Returns `''` for empty input.
|
|
91
|
+
|
|
92
|
+
## How It Works
|
|
93
|
+
|
|
94
|
+
Parses HTML into a DOM tree via [linkedom](https://github.com/WebReflection/linkedom) (no browser required), walks it recursively converting each node to Org syntax, then normalizes whitespace. No intermediate Markdown step.
|
|
95
|
+
|
|
96
|
+
Handles: HTML entities, `<script>`/`<style>` stripping, nested lists with proper indentation, code language detection from `class="language-xxx"`, table column alignment, relative URL resolution, CJK text and emoji.
|
|
97
|
+
|
|
98
|
+
## License
|
|
99
|
+
|
|
100
|
+
MIT
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
|
|
20
|
+
// src/index.ts
|
|
21
|
+
var index_exports = {};
|
|
22
|
+
__export(index_exports, {
|
|
23
|
+
htmlToOrg: () => htmlToOrg
|
|
24
|
+
});
|
|
25
|
+
module.exports = __toCommonJS(index_exports);
|
|
26
|
+
var import_linkedom = require("linkedom");
|
|
27
|
+
/**
 * Convert an HTML fragment to Org-mode text.
 *
 * The fragment is wrapped in a minimal document, parsed with linkedom, and the
 * resulting <body> is converted recursively. Runs of three or more newlines in
 * the result are reduced to a single blank line and the output is trimmed.
 *
 * @param {string} html - HTML string to convert.
 * @param {string} [baseUrl=""] - Base URL handed to link/image conversion.
 * @returns {string} Org-mode text, or "" when the input is empty or blank.
 */
function htmlToOrg(html, baseUrl = "") {
  if (!html) return "";
  if (!html.trim()) return "";

  const { parseHTML } = import_linkedom;
  const wrapped = `<!DOCTYPE html><html><body>${html}</body></html>`;
  const { document } = parseHTML(wrapped);

  const context = { baseUrl, listDepth: 0, orderedIndex: [], indentWidth: 0 };
  const converted = convertNode(document.body, context);
  return converted.replace(/\n{3,}/g, "\n\n").trim();
}
|
|
35
|
+
/** Inline tags rendered by wrapping their content in an Org emphasis marker. */
const INLINE_MARKER_BY_TAG = new Map([
  ["strong", "*"],
  ["b", "*"],
  ["em", "/"],
  ["i", "/"],
  ["u", "_"],
  ["ins", "_"],
  ["s", "+"],
  ["del", "+"],
  ["strike", "+"],
  ["code", "="],
  ["mark", "="],
]);

/**
 * Convert one DOM node to Org text, dispatching on node type and tag name.
 *
 * Text nodes (nodeType 3) have their whitespace collapsed; anything that is
 * neither text nor an element yields "". <script>, <style> and <noscript>
 * are dropped. Tags without a dedicated converter (including <li> and all
 * structural wrappers such as <div> or <section>) are transparent: only
 * their children are converted.
 *
 * @param {object} node - DOM node to convert.
 * @param {object} ctx - Conversion context passed through to the converters.
 * @returns {string} Org text for the node.
 */
function convertNode(node, ctx) {
  if (node.nodeType === 3) return collapseWhitespace(node.textContent || "");
  if (node.nodeType !== 1) return "";

  const tag = (node.tagName || "").toLowerCase();
  if (tag === "script" || tag === "style" || tag === "noscript") return "";

  if (/^h[1-6]$/.test(tag)) return convertHeading(node, tag, ctx);
  if (INLINE_MARKER_BY_TAG.has(tag)) {
    return wrapInline(INLINE_MARKER_BY_TAG.get(tag), node, ctx);
  }

  if (tag === "p") return convertParagraph(node, ctx);
  if (tag === "sup") return `^{${convertChildren(node, ctx)}}`;
  if (tag === "sub") return `_{${convertChildren(node, ctx)}}`;
  if (tag === "a") return convertLink(node, ctx);
  if (tag === "img") return convertImage(node, ctx);
  if (tag === "ul" || tag === "ol") return convertList(node, tag === "ol", ctx);
  if (tag === "pre") return convertPre(node, ctx);
  if (tag === "blockquote") return convertBlockquote(node, ctx);
  if (tag === "table") return convertTable(node, ctx);
  if (tag === "hr") return "\n\n-----\n\n";
  if (tag === "br") return "\n";

  // Transparent wrappers and unknown tags: just recurse.
  return convertChildren(node, ctx);
}
|
|
120
|
+
/**
 * Convert every child of `node` and concatenate the results in order.
 *
 * Whitespace-only text nodes that sit directly before or after a block-level
 * element are skipped, so source formatting between blocks does not leak
 * stray spaces into the output.
 *
 * @param {object} node - Parent node whose `childNodes` are converted.
 * @param {object} ctx - Conversion context passed to each child conversion.
 * @returns {string} Concatenated Org text of the children.
 */
function convertChildren(node, ctx) {
  const pieces = [];
  for (const child of node.childNodes) {
    const isBlankText = child.nodeType === 3 && !(child.textContent || "").trim();
    if (
      isBlankText &&
      (isBlockElement(child.previousSibling) || isBlockElement(child.nextSibling))
    ) {
      continue;
    }
    pieces.push(convertNode(child, ctx));
  }
  return pieces.join("");
}
|
|
137
|
+
/**
 * Lower-case tag names treated as block-level when deciding whether
 * whitespace-only text next to an element can be dropped.
 */
const BLOCK_TAGS = /* @__PURE__ */ new Set(
  [
    "p div",
    "h1 h2 h3 h4 h5 h6",
    "ul ol li",
    "pre blockquote table hr",
    "section article main header footer nav aside",
    "figure figcaption details summary",
  ].flatMap((group) => group.split(" ")),
);
|
|
165
|
+
/**
 * Tell whether `node` is an element whose tag is listed in BLOCK_TAGS.
 *
 * @param {object|null|undefined} node - Node to test; may be absent.
 * @returns {boolean} True only for element nodes (nodeType 1) with a block tag.
 */
function isBlockElement(node) {
  const isElement = Boolean(node) && node.nodeType === 1;
  return isElement ? BLOCK_TAGS.has((node.tagName || "").toLowerCase()) : false;
}
|
|
169
|
+
/**
 * Convert an <h1>–<h6> element to an Org headline.
 *
 * An Org headline occupies exactly one line, so any line break inside the
 * heading text (for example from <br>) is folded into a single space.
 * Headings without text produce no output, matching how empty paragraphs
 * are handled.
 *
 * @param {object} node - Heading element.
 * @param {string} tag - Lower-case tag name ("h1" … "h6"); the digit is the level.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string} The headline surrounded by blank lines, or "" when empty.
 */
function convertHeading(node, tag, ctx) {
  const level = Number.parseInt(tag[1], 10);
  const text = convertChildren(node, ctx)
    .replace(/\s*\n\s*/g, " ")
    .trim();
  if (!text) return "";
  return `\n\n${"*".repeat(level)} ${text}\n\n`;
}
|
|
179
|
+
/**
 * Convert a <p> element to an Org paragraph.
 *
 * @param {object} node - Paragraph element.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string} Trimmed paragraph text surrounded by blank lines, or ""
 *   when the paragraph has no text.
 */
function convertParagraph(node, ctx) {
  const body = convertChildren(node, ctx).trim();
  return body === "" ? "" : `\n\n${body}\n\n`;
}
|
|
188
|
+
/**
 * Convert a <blockquote> element to an Org QUOTE block.
 *
 * @param {object} node - Blockquote element.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string} The trimmed content between #+BEGIN_QUOTE and
 *   #+END_QUOTE, surrounded by blank lines.
 */
function convertBlockquote(node, ctx) {
  const quoted = convertChildren(node, ctx).trim();
  const lines = ["", "", "#+BEGIN_QUOTE", quoted, "#+END_QUOTE", "", ""];
  return lines.join("\n");
}
|
|
198
|
+
/**
 * Convert a <pre> element to an Org literal block.
 *
 * When the <pre> contains a <code> element, that element's text becomes a
 * SRC block, tagged with the language found by detectLanguage (if any).
 * Otherwise the text of the <pre> itself becomes an EXAMPLE block. One
 * trailing newline is removed from the text in both cases.
 *
 * Lines inside the block that start with `*` or `#+` are prefixed with a
 * comma, as Org requires for literal blocks; without this, code containing
 * e.g. `#+END_SRC` or `* item` would end the block early or be read as a
 * headline.
 *
 * @param {object} node - The <pre> element.
 * @param {object} _ctx - Conversion context (unused).
 * @returns {string} The block surrounded by blank lines.
 */
function convertPre(node, _ctx) {
  // Mirrors Org's own escaping: the comma goes after any leading indentation
  // and in front of commas already present before the `*` / `#+`.
  const escapeBlockLines = (text) =>
    text.replace(/^([ \t]*)(,*(?:\*|#\+))/gm, "$1,$2");

  const codeChild = node.querySelector?.("code");
  if (codeChild) {
    const lang = detectLanguage(codeChild);
    const code = escapeBlockLines((codeChild.textContent || "").replace(/\n$/, ""));
    const langSuffix = lang ? ` ${lang}` : "";
    return `\n\n#+BEGIN_SRC${langSuffix}\n${code}\n#+END_SRC\n\n`;
  }

  const text = escapeBlockLines((node.textContent || "").replace(/\n$/, ""));
  return `\n\n#+BEGIN_EXAMPLE\n${text}\n#+END_EXAMPLE\n\n`;
}
|
|
221
|
+
/**
 * Extract the language name from a `language-xxx` class on a <code> element.
 *
 * @param {object} codeNode - Element whose `class` attribute is inspected.
 * @returns {string} The text after `language-` in the first matching class
 *   token, or "" when there is none.
 */
function detectLanguage(codeNode) {
  const prefix = "language-";
  const classTokens = (codeNode.getAttribute?.("class") || "").split(/\s+/);
  const hit = classTokens.find(
    (token) => token.startsWith(prefix) && token.length > prefix.length,
  );
  return hit ? hit.slice(prefix.length) : "";
}
|
|
226
|
+
/**
 * Convert a <ul> or <ol> element to an Org plain list.
 *
 * Only direct <li> children become items. Within an item, direct <ul>/<ol>
 * children are rendered as nested lists after the item's own text; all other
 * children are converted and joined to form that text.
 *
 * Nested lists and continuation lines of a multi-line item are indented to
 * the column where the item's text starts (indent + actual bullet width, so
 * `10. ` indents by four). Org treats a less-indented line as the end of the
 * item, so continuation lines must not be left at column 0.
 *
 * @param {object} node - The list element.
 * @param {boolean} ordered - True for numbered items (`1. `), false for `- `.
 * @param {object} ctx - Conversion context; `listDepth` and `indentWidth`
 *   describe the nesting of this list.
 * @returns {string} The list; surrounded by blank lines only at depth 0.
 */
function convertList(node, ordered, ctx) {
  const items = [];
  let counter = 1;
  const indent = ctx.listDepth > 0 ? " ".repeat(ctx.indentWidth || 2) : "";

  for (const child of node.childNodes) {
    if (child.nodeType !== 1 || (child.tagName || "").toLowerCase() !== "li") continue;

    const bullet = ordered ? `${counter}. ` : "- ";
    // Column at which this item's text begins; everything belonging to the
    // item is aligned here.
    const bodyIndent = indent + " ".repeat(bullet.length);
    const textParts = [];
    const nestedLists = [];

    for (const liChild of child.childNodes) {
      const liChildTag = (liChild.tagName || "").toLowerCase();
      if (liChildTag === "ul" || liChildTag === "ol") {
        const nestedCtx = {
          ...ctx,
          listDepth: ctx.listDepth + 1,
          indentWidth: bodyIndent.length,
        };
        nestedLists.push(convertList(liChild, liChildTag === "ol", nestedCtx));
      } else {
        textParts.push(convertNode(liChild, ctx));
      }
    }

    // Indent every non-empty continuation line; blank lines stay empty.
    const text = textParts
      .join("")
      .trim()
      .replace(/\n(?=[^\n])/g, `\n${bodyIndent}`);

    let item = `${indent}${bullet}${text}`;
    if (nestedLists.length > 0) {
      item += `\n${nestedLists.join("\n")}`;
    }
    items.push(item);
    counter += 1;
  }

  const result = items.join("\n");
  return ctx.listDepth === 0 ? `\n\n${result}\n\n` : result;
}
|
|
264
|
+
/**
 * Wrap an inline element's content in an Org emphasis marker.
 *
 * Org only recognises emphasis when the opening marker is directly followed
 * by, and the closing marker directly preceded by, a non-whitespace
 * character. Leading and trailing whitespace of the content is therefore
 * kept outside the markers (`<b> x </b>` becomes ` *x* `, not `* x *`).
 *
 * @param {string} marker - Emphasis marker such as "*", "/", "_", "+" or "=".
 * @param {object} node - Inline element whose children are converted.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string} The marked-up content, or the unmarked content when it
 *   is empty or whitespace-only.
 */
function wrapInline(marker, node, ctx) {
  const inner = convertChildren(node, ctx);
  const core = inner.trim();
  if (!core) return inner;

  const leading = inner.slice(0, inner.length - inner.trimStart().length);
  const trailing = inner.slice(inner.trimEnd().length);
  return `${leading}${marker}${core}${marker}${trailing}`;
}
|
|
269
|
+
/**
 * Convert an <a> element to an Org link.
 *
 * @param {object} node - Anchor element; its `href` is resolved against
 *   `ctx.baseUrl`.
 * @param {object} ctx - Conversion context.
 * @returns {string} `[[url][text]]`; `[[url]]` when the text is empty or
 *   equal to the URL; just the text when there is no href.
 */
function convertLink(node, ctx) {
  const rawHref = node.getAttribute?.("href") || "";
  const target = resolveUrl(rawHref, ctx.baseUrl);
  const label = convertChildren(node, ctx).trim();

  if (!target) return label;
  const hasDistinctLabel = label !== "" && label !== target;
  return hasDistinctLabel ? `[[${target}][${label}]]` : `[[${target}]]`;
}
|
|
276
|
+
/**
 * Convert an <img> element to a bare Org link to its source.
 *
 * @param {object} node - Image element; its `src` is resolved against
 *   `ctx.baseUrl`.
 * @param {object} ctx - Conversion context.
 * @returns {string} `[[src]]`, or "" when the image has no source.
 */
function convertImage(node, ctx) {
  const source = resolveUrl(node.getAttribute?.("src") || "", ctx.baseUrl);
  return source ? `[[${source}]]` : "";
}
|
|
281
|
+
/**
 * Convert a <table> element to an Org table.
 *
 * Rows are gathered from the table's own structure only: <tr> elements that
 * are direct children of the table or of its <thead>, <tbody> and <tfoot>
 * sections. Every <tbody> contributes rows, <tfoot> rows are kept (emitted
 * last), and rows of a table nested inside a cell are not mixed into this
 * table. Header rows come first and are followed by a `|---+---|` rule.
 *
 * Cell text is made safe for a one-line table row: line breaks are folded
 * into spaces and a literal `|` is written as the Org entity `\vert{}`,
 * because a raw bar would start a new column.
 *
 * @param {object} node - The <table> element.
 * @param {object} ctx - Conversion context passed to cell conversion.
 * @returns {string} The aligned table surrounded by blank lines, or "" when
 *   the table has no rows.
 */
function convertTable(node, ctx) {
  const tagOf = (el) => (el.tagName || "").toLowerCase();
  const sanitizeCell = (cell) =>
    cell.replace(/\s*\n\s*/g, " ").replace(/\|/g, "\\vert{}");
  const readRow = (tr) => extractRowCells(tr, ctx).map(sanitizeCell);

  const headerRows = [];
  const bodyRows = [];
  const footerRows = [];
  const collectRows = (section, target) => {
    for (const child of section.childNodes) {
      if (tagOf(child) === "tr") target.push(readRow(child));
    }
  };

  for (const child of node.childNodes) {
    switch (tagOf(child)) {
      case "thead":
        collectRows(child, headerRows);
        break;
      case "tbody":
        collectRows(child, bodyRows);
        break;
      case "tfoot":
        collectRows(child, footerRows);
        break;
      case "tr":
        // Row placed directly in the table, without a section wrapper.
        bodyRows.push(readRow(child));
        break;
      default:
        break;
    }
  }

  const rows = [...headerRows, ...bodyRows, ...footerRows];
  if (rows.length === 0) return "";

  const colCount = Math.max(...rows.map((row) => row.length));
  const colWidths = Array.from({ length: colCount }, (_, i) =>
    Math.max(0, ...rows.map((row) => (row[i] || "").length)),
  );

  // Short rows are padded with empty cells so every line has colCount columns.
  const formatRow = (row) => {
    const cells = colWidths.map((width, i) => (row[i] || "").padEnd(width));
    return `| ${cells.join(" | ")} |`;
  };
  const separatorRow = `|${colWidths.map((width) => "-".repeat(width + 2)).join("+")}|`;

  const lines = [];
  rows.forEach((row, i) => {
    lines.push(formatRow(row));
    if (headerRows.length > 0 && i === headerRows.length - 1) {
      lines.push(separatorRow);
    }
  });
  return `\n\n${lines.join("\n")}\n\n`;
}
|
|
325
|
+
/**
 * Collect the converted, trimmed text of each cell in a table row.
 *
 * @param {object} tr - Row element; only its direct <td>/<th> children count.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string[]} One string per cell, in document order.
 */
function extractRowCells(tr, ctx) {
  const isCell = (child) => {
    const name = (child.tagName || "").toLowerCase();
    return name === "td" || name === "th";
  };
  return Array.from(tr.childNodes)
    .filter(isCell)
    .map((cell) => convertChildren(cell, ctx).trim());
}
|
|
335
|
+
/**
 * Replace every run of whitespace characters with a single space.
 *
 * @param {string} text - Text to normalise.
 * @returns {string} The text with whitespace runs collapsed (not trimmed).
 */
function collapseWhitespace(text) {
  return text.split(/\s+/).join(" ");
}
|
|
338
|
+
/**
 * Resolve a possibly relative URL against a base URL.
 *
 * @param {string} url - URL taken from an attribute; may be empty.
 * @param {string} baseUrl - Base to resolve against; may be empty.
 * @returns {string} "" for an empty URL; the URL unchanged when it starts
 *   with http://, https:// or mailto:, when no base is given, or when
 *   resolution fails; otherwise the resolved absolute URL.
 */
function resolveUrl(url, baseUrl) {
  if (!url) return "";

  const alreadyAbsolute = /^https?:\/\//.test(url) || url.startsWith("mailto:");
  if (alreadyAbsolute || !baseUrl) return url;

  let resolved = url;
  try {
    resolved = new URL(url, baseUrl).href;
  } catch {
    // Best effort: an unusable base leaves the URL as written.
  }
  return resolved;
}
|
|
348
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
349
|
+
0 && (module.exports = {
|
|
350
|
+
htmlToOrg
|
|
351
|
+
});
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Convert HTML to Org-mode format.
|
|
3
|
+
*
|
|
4
|
+
* @param html - HTML string to convert
|
|
5
|
+
* @param baseUrl - Base URL for resolving relative links and image sources (defaults to `''`)
|
|
6
|
+
* @returns Org-mode formatted string
|
|
7
|
+
*/
|
|
8
|
+
declare function htmlToOrg(html: string, baseUrl?: string): string;
|
|
9
|
+
|
|
10
|
+
export { htmlToOrg };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Convert HTML to Org-mode format.
|
|
3
|
+
*
|
|
4
|
+
* @param html - HTML string to convert
|
|
5
|
+
* @param baseUrl - Base URL for resolving relative links and image sources (defaults to `''`)
|
|
6
|
+
* @returns Org-mode formatted string
|
|
7
|
+
*/
|
|
8
|
+
declare function htmlToOrg(html: string, baseUrl?: string): string;
|
|
9
|
+
|
|
10
|
+
export { htmlToOrg };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
// src/index.ts
|
|
2
|
+
import { parseHTML } from "linkedom";
|
|
3
|
+
/**
 * Convert an HTML fragment to Org-mode text.
 *
 * The fragment is wrapped in a minimal document, parsed with linkedom, and the
 * resulting <body> is converted recursively. Runs of three or more newlines in
 * the result are reduced to a single blank line and the output is trimmed.
 *
 * @param {string} html - HTML string to convert.
 * @param {string} [baseUrl=""] - Base URL handed to link/image conversion.
 * @returns {string} Org-mode text, or "" when the input is empty or blank.
 */
function htmlToOrg(html, baseUrl = "") {
  if (!html) return "";
  if (!html.trim()) return "";

  const wrapped = `<!DOCTYPE html><html><body>${html}</body></html>`;
  const { document } = parseHTML(wrapped);

  const context = { baseUrl, listDepth: 0, orderedIndex: [], indentWidth: 0 };
  const converted = convertNode(document.body, context);
  return converted.replace(/\n{3,}/g, "\n\n").trim();
}
|
|
11
|
+
/** Inline tags rendered by wrapping their content in an Org emphasis marker. */
const INLINE_MARKER_BY_TAG = new Map([
  ["strong", "*"],
  ["b", "*"],
  ["em", "/"],
  ["i", "/"],
  ["u", "_"],
  ["ins", "_"],
  ["s", "+"],
  ["del", "+"],
  ["strike", "+"],
  ["code", "="],
  ["mark", "="],
]);

/**
 * Convert one DOM node to Org text, dispatching on node type and tag name.
 *
 * Text nodes (nodeType 3) have their whitespace collapsed; anything that is
 * neither text nor an element yields "". <script>, <style> and <noscript>
 * are dropped. Tags without a dedicated converter (including <li> and all
 * structural wrappers such as <div> or <section>) are transparent: only
 * their children are converted.
 *
 * @param {object} node - DOM node to convert.
 * @param {object} ctx - Conversion context passed through to the converters.
 * @returns {string} Org text for the node.
 */
function convertNode(node, ctx) {
  if (node.nodeType === 3) return collapseWhitespace(node.textContent || "");
  if (node.nodeType !== 1) return "";

  const tag = (node.tagName || "").toLowerCase();
  if (tag === "script" || tag === "style" || tag === "noscript") return "";

  if (/^h[1-6]$/.test(tag)) return convertHeading(node, tag, ctx);
  if (INLINE_MARKER_BY_TAG.has(tag)) {
    return wrapInline(INLINE_MARKER_BY_TAG.get(tag), node, ctx);
  }

  if (tag === "p") return convertParagraph(node, ctx);
  if (tag === "sup") return `^{${convertChildren(node, ctx)}}`;
  if (tag === "sub") return `_{${convertChildren(node, ctx)}}`;
  if (tag === "a") return convertLink(node, ctx);
  if (tag === "img") return convertImage(node, ctx);
  if (tag === "ul" || tag === "ol") return convertList(node, tag === "ol", ctx);
  if (tag === "pre") return convertPre(node, ctx);
  if (tag === "blockquote") return convertBlockquote(node, ctx);
  if (tag === "table") return convertTable(node, ctx);
  if (tag === "hr") return "\n\n-----\n\n";
  if (tag === "br") return "\n";

  // Transparent wrappers and unknown tags: just recurse.
  return convertChildren(node, ctx);
}
|
|
96
|
+
/**
 * Convert every child of `node` and concatenate the results in order.
 *
 * Whitespace-only text nodes that sit directly before or after a block-level
 * element are skipped, so source formatting between blocks does not leak
 * stray spaces into the output.
 *
 * @param {object} node - Parent node whose `childNodes` are converted.
 * @param {object} ctx - Conversion context passed to each child conversion.
 * @returns {string} Concatenated Org text of the children.
 */
function convertChildren(node, ctx) {
  const pieces = [];
  for (const child of node.childNodes) {
    const isBlankText = child.nodeType === 3 && !(child.textContent || "").trim();
    if (
      isBlankText &&
      (isBlockElement(child.previousSibling) || isBlockElement(child.nextSibling))
    ) {
      continue;
    }
    pieces.push(convertNode(child, ctx));
  }
  return pieces.join("");
}
|
|
113
|
+
/**
 * Lower-case tag names treated as block-level when deciding whether
 * whitespace-only text next to an element can be dropped.
 */
const BLOCK_TAGS = /* @__PURE__ */ new Set(
  [
    "p div",
    "h1 h2 h3 h4 h5 h6",
    "ul ol li",
    "pre blockquote table hr",
    "section article main header footer nav aside",
    "figure figcaption details summary",
  ].flatMap((group) => group.split(" ")),
);
|
|
141
|
+
/**
 * Tell whether `node` is an element whose tag is listed in BLOCK_TAGS.
 *
 * @param {object|null|undefined} node - Node to test; may be absent.
 * @returns {boolean} True only for element nodes (nodeType 1) with a block tag.
 */
function isBlockElement(node) {
  const isElement = Boolean(node) && node.nodeType === 1;
  return isElement ? BLOCK_TAGS.has((node.tagName || "").toLowerCase()) : false;
}
|
|
145
|
+
/**
 * Convert an <h1>–<h6> element to an Org headline.
 *
 * An Org headline occupies exactly one line, so any line break inside the
 * heading text (for example from <br>) is folded into a single space.
 * Headings without text produce no output, matching how empty paragraphs
 * are handled.
 *
 * @param {object} node - Heading element.
 * @param {string} tag - Lower-case tag name ("h1" … "h6"); the digit is the level.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string} The headline surrounded by blank lines, or "" when empty.
 */
function convertHeading(node, tag, ctx) {
  const level = Number.parseInt(tag[1], 10);
  const text = convertChildren(node, ctx)
    .replace(/\s*\n\s*/g, " ")
    .trim();
  if (!text) return "";
  return `\n\n${"*".repeat(level)} ${text}\n\n`;
}
|
|
155
|
+
/**
 * Convert a <p> element to an Org paragraph.
 *
 * @param {object} node - Paragraph element.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string} Trimmed paragraph text surrounded by blank lines, or ""
 *   when the paragraph has no text.
 */
function convertParagraph(node, ctx) {
  const body = convertChildren(node, ctx).trim();
  return body === "" ? "" : `\n\n${body}\n\n`;
}
|
|
164
|
+
/**
 * Convert a <blockquote> element to an Org QUOTE block.
 *
 * @param {object} node - Blockquote element.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string} The trimmed content between #+BEGIN_QUOTE and
 *   #+END_QUOTE, surrounded by blank lines.
 */
function convertBlockquote(node, ctx) {
  const quoted = convertChildren(node, ctx).trim();
  const lines = ["", "", "#+BEGIN_QUOTE", quoted, "#+END_QUOTE", "", ""];
  return lines.join("\n");
}
|
|
174
|
+
/**
 * Convert a <pre> element to an Org literal block.
 *
 * When the <pre> contains a <code> element, that element's text becomes a
 * SRC block, tagged with the language found by detectLanguage (if any).
 * Otherwise the text of the <pre> itself becomes an EXAMPLE block. One
 * trailing newline is removed from the text in both cases.
 *
 * Lines inside the block that start with `*` or `#+` are prefixed with a
 * comma, as Org requires for literal blocks; without this, code containing
 * e.g. `#+END_SRC` or `* item` would end the block early or be read as a
 * headline.
 *
 * @param {object} node - The <pre> element.
 * @param {object} _ctx - Conversion context (unused).
 * @returns {string} The block surrounded by blank lines.
 */
function convertPre(node, _ctx) {
  // Mirrors Org's own escaping: the comma goes after any leading indentation
  // and in front of commas already present before the `*` / `#+`.
  const escapeBlockLines = (text) =>
    text.replace(/^([ \t]*)(,*(?:\*|#\+))/gm, "$1,$2");

  const codeChild = node.querySelector?.("code");
  if (codeChild) {
    const lang = detectLanguage(codeChild);
    const code = escapeBlockLines((codeChild.textContent || "").replace(/\n$/, ""));
    const langSuffix = lang ? ` ${lang}` : "";
    return `\n\n#+BEGIN_SRC${langSuffix}\n${code}\n#+END_SRC\n\n`;
  }

  const text = escapeBlockLines((node.textContent || "").replace(/\n$/, ""));
  return `\n\n#+BEGIN_EXAMPLE\n${text}\n#+END_EXAMPLE\n\n`;
}
|
|
197
|
+
/**
 * Extract the language name from a `language-xxx` class on a <code> element.
 *
 * @param {object} codeNode - Element whose `class` attribute is inspected.
 * @returns {string} The text after `language-` in the first matching class
 *   token, or "" when there is none.
 */
function detectLanguage(codeNode) {
  const prefix = "language-";
  const classTokens = (codeNode.getAttribute?.("class") || "").split(/\s+/);
  const hit = classTokens.find(
    (token) => token.startsWith(prefix) && token.length > prefix.length,
  );
  return hit ? hit.slice(prefix.length) : "";
}
|
|
202
|
+
/**
 * Convert a <ul> or <ol> element to an Org plain list.
 *
 * Only direct <li> children become items. Within an item, direct <ul>/<ol>
 * children are rendered as nested lists after the item's own text; all other
 * children are converted and joined to form that text.
 *
 * Nested lists and continuation lines of a multi-line item are indented to
 * the column where the item's text starts (indent + actual bullet width, so
 * `10. ` indents by four). Org treats a less-indented line as the end of the
 * item, so continuation lines must not be left at column 0.
 *
 * @param {object} node - The list element.
 * @param {boolean} ordered - True for numbered items (`1. `), false for `- `.
 * @param {object} ctx - Conversion context; `listDepth` and `indentWidth`
 *   describe the nesting of this list.
 * @returns {string} The list; surrounded by blank lines only at depth 0.
 */
function convertList(node, ordered, ctx) {
  const items = [];
  let counter = 1;
  const indent = ctx.listDepth > 0 ? " ".repeat(ctx.indentWidth || 2) : "";

  for (const child of node.childNodes) {
    if (child.nodeType !== 1 || (child.tagName || "").toLowerCase() !== "li") continue;

    const bullet = ordered ? `${counter}. ` : "- ";
    // Column at which this item's text begins; everything belonging to the
    // item is aligned here.
    const bodyIndent = indent + " ".repeat(bullet.length);
    const textParts = [];
    const nestedLists = [];

    for (const liChild of child.childNodes) {
      const liChildTag = (liChild.tagName || "").toLowerCase();
      if (liChildTag === "ul" || liChildTag === "ol") {
        const nestedCtx = {
          ...ctx,
          listDepth: ctx.listDepth + 1,
          indentWidth: bodyIndent.length,
        };
        nestedLists.push(convertList(liChild, liChildTag === "ol", nestedCtx));
      } else {
        textParts.push(convertNode(liChild, ctx));
      }
    }

    // Indent every non-empty continuation line; blank lines stay empty.
    const text = textParts
      .join("")
      .trim()
      .replace(/\n(?=[^\n])/g, `\n${bodyIndent}`);

    let item = `${indent}${bullet}${text}`;
    if (nestedLists.length > 0) {
      item += `\n${nestedLists.join("\n")}`;
    }
    items.push(item);
    counter += 1;
  }

  const result = items.join("\n");
  return ctx.listDepth === 0 ? `\n\n${result}\n\n` : result;
}
|
|
240
|
+
/**
 * Wrap an inline element's content in an Org emphasis marker.
 *
 * Org only recognises emphasis when the opening marker is directly followed
 * by, and the closing marker directly preceded by, a non-whitespace
 * character. Leading and trailing whitespace of the content is therefore
 * kept outside the markers (`<b> x </b>` becomes ` *x* `, not `* x *`).
 *
 * @param {string} marker - Emphasis marker such as "*", "/", "_", "+" or "=".
 * @param {object} node - Inline element whose children are converted.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string} The marked-up content, or the unmarked content when it
 *   is empty or whitespace-only.
 */
function wrapInline(marker, node, ctx) {
  const inner = convertChildren(node, ctx);
  const core = inner.trim();
  if (!core) return inner;

  const leading = inner.slice(0, inner.length - inner.trimStart().length);
  const trailing = inner.slice(inner.trimEnd().length);
  return `${leading}${marker}${core}${marker}${trailing}`;
}
|
|
245
|
+
/**
 * Convert an <a> element to an Org link.
 *
 * @param {object} node - Anchor element; its `href` is resolved against
 *   `ctx.baseUrl`.
 * @param {object} ctx - Conversion context.
 * @returns {string} `[[url][text]]`; `[[url]]` when the text is empty or
 *   equal to the URL; just the text when there is no href.
 */
function convertLink(node, ctx) {
  const rawHref = node.getAttribute?.("href") || "";
  const target = resolveUrl(rawHref, ctx.baseUrl);
  const label = convertChildren(node, ctx).trim();

  if (!target) return label;
  const hasDistinctLabel = label !== "" && label !== target;
  return hasDistinctLabel ? `[[${target}][${label}]]` : `[[${target}]]`;
}
|
|
252
|
+
/**
 * Convert an <img> element to a bare Org link to its source.
 *
 * @param {object} node - Image element; its `src` is resolved against
 *   `ctx.baseUrl`.
 * @param {object} ctx - Conversion context.
 * @returns {string} `[[src]]`, or "" when the image has no source.
 */
function convertImage(node, ctx) {
  const source = resolveUrl(node.getAttribute?.("src") || "", ctx.baseUrl);
  return source ? `[[${source}]]` : "";
}
|
|
257
|
+
/**
 * Convert a <table> element to an Org table.
 *
 * Rows are gathered from the table's own structure only: <tr> elements that
 * are direct children of the table or of its <thead>, <tbody> and <tfoot>
 * sections. Every <tbody> contributes rows, <tfoot> rows are kept (emitted
 * last), and rows of a table nested inside a cell are not mixed into this
 * table. Header rows come first and are followed by a `|---+---|` rule.
 *
 * Cell text is made safe for a one-line table row: line breaks are folded
 * into spaces and a literal `|` is written as the Org entity `\vert{}`,
 * because a raw bar would start a new column.
 *
 * @param {object} node - The <table> element.
 * @param {object} ctx - Conversion context passed to cell conversion.
 * @returns {string} The aligned table surrounded by blank lines, or "" when
 *   the table has no rows.
 */
function convertTable(node, ctx) {
  const tagOf = (el) => (el.tagName || "").toLowerCase();
  const sanitizeCell = (cell) =>
    cell.replace(/\s*\n\s*/g, " ").replace(/\|/g, "\\vert{}");
  const readRow = (tr) => extractRowCells(tr, ctx).map(sanitizeCell);

  const headerRows = [];
  const bodyRows = [];
  const footerRows = [];
  const collectRows = (section, target) => {
    for (const child of section.childNodes) {
      if (tagOf(child) === "tr") target.push(readRow(child));
    }
  };

  for (const child of node.childNodes) {
    switch (tagOf(child)) {
      case "thead":
        collectRows(child, headerRows);
        break;
      case "tbody":
        collectRows(child, bodyRows);
        break;
      case "tfoot":
        collectRows(child, footerRows);
        break;
      case "tr":
        // Row placed directly in the table, without a section wrapper.
        bodyRows.push(readRow(child));
        break;
      default:
        break;
    }
  }

  const rows = [...headerRows, ...bodyRows, ...footerRows];
  if (rows.length === 0) return "";

  const colCount = Math.max(...rows.map((row) => row.length));
  const colWidths = Array.from({ length: colCount }, (_, i) =>
    Math.max(0, ...rows.map((row) => (row[i] || "").length)),
  );

  // Short rows are padded with empty cells so every line has colCount columns.
  const formatRow = (row) => {
    const cells = colWidths.map((width, i) => (row[i] || "").padEnd(width));
    return `| ${cells.join(" | ")} |`;
  };
  const separatorRow = `|${colWidths.map((width) => "-".repeat(width + 2)).join("+")}|`;

  const lines = [];
  rows.forEach((row, i) => {
    lines.push(formatRow(row));
    if (headerRows.length > 0 && i === headerRows.length - 1) {
      lines.push(separatorRow);
    }
  });
  return `\n\n${lines.join("\n")}\n\n`;
}
|
|
301
|
+
/**
 * Collect the converted, trimmed text of each cell in a table row.
 *
 * @param {object} tr - Row element; only its direct <td>/<th> children count.
 * @param {object} ctx - Conversion context passed to child conversion.
 * @returns {string[]} One string per cell, in document order.
 */
function extractRowCells(tr, ctx) {
  const isCell = (child) => {
    const name = (child.tagName || "").toLowerCase();
    return name === "td" || name === "th";
  };
  return Array.from(tr.childNodes)
    .filter(isCell)
    .map((cell) => convertChildren(cell, ctx).trim());
}
|
|
311
|
+
/**
 * Replace every run of whitespace characters with a single space.
 *
 * @param {string} text - Text to normalise.
 * @returns {string} The text with whitespace runs collapsed (not trimmed).
 */
function collapseWhitespace(text) {
  return text.split(/\s+/).join(" ");
}
|
|
314
|
+
/**
 * Resolve a possibly relative URL against a base URL.
 *
 * @param {string} url - URL taken from an attribute; may be empty.
 * @param {string} baseUrl - Base to resolve against; may be empty.
 * @returns {string} "" for an empty URL; the URL unchanged when it starts
 *   with http://, https:// or mailto:, when no base is given, or when
 *   resolution fails; otherwise the resolved absolute URL.
 */
function resolveUrl(url, baseUrl) {
  if (!url) return "";

  const alreadyAbsolute = /^https?:\/\//.test(url) || url.startsWith("mailto:");
  if (alreadyAbsolute || !baseUrl) return url;

  let resolved = url;
  try {
    resolved = new URL(url, baseUrl).href;
  } catch {
    // Best effort: an unusable base leaves the URL as written.
  }
  return resolved;
}
|
|
324
|
+
export {
|
|
325
|
+
htmlToOrg
|
|
326
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "html-to-org",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Convert HTML to Org-mode format",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.cjs",
|
|
7
|
+
"module": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"import": {
|
|
12
|
+
"types": "./dist/index.d.ts",
|
|
13
|
+
"default": "./dist/index.js"
|
|
14
|
+
},
|
|
15
|
+
"require": {
|
|
16
|
+
"types": "./dist/index.d.cts",
|
|
17
|
+
"default": "./dist/index.cjs"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"files": [
|
|
22
|
+
"dist"
|
|
23
|
+
],
|
|
24
|
+
"keywords": [
|
|
25
|
+
"html",
|
|
26
|
+
"org-mode",
|
|
27
|
+
"org",
|
|
28
|
+
"converter",
|
|
29
|
+
"emacs",
|
|
30
|
+
"orgmode"
|
|
31
|
+
],
|
|
32
|
+
"author": "zzzhizhia",
|
|
33
|
+
"license": "MIT",
|
|
34
|
+
"repository": {
|
|
35
|
+
"type": "git",
|
|
36
|
+
"url": "https://github.com/zzzhizhia/html-to-org.git"
|
|
37
|
+
},
|
|
38
|
+
"homepage": "https://github.com/zzzhizhia/html-to-org",
|
|
39
|
+
"engines": {
|
|
40
|
+
"node": ">=20"
|
|
41
|
+
},
|
|
42
|
+
"devDependencies": {
|
|
43
|
+
"tsup": "^8.0.0",
|
|
44
|
+
"typescript": "^5.5.4",
|
|
45
|
+
"vitest": "^4.0.18"
|
|
46
|
+
},
|
|
47
|
+
"dependencies": {
|
|
48
|
+
"linkedom": "^0.18.12"
|
|
49
|
+
},
|
|
50
|
+
"scripts": {
|
|
51
|
+
"build": "tsup",
|
|
52
|
+
"test": "vitest run",
|
|
53
|
+
"test:watch": "vitest",
|
|
54
|
+
"test:coverage": "vitest run --coverage"
|
|
55
|
+
}
|
|
56
|
+
}
|