html-to-markdown-wasm 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +331 -0
- package/dist/README.md +159 -0
- package/package.json +46 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright 2024-2025 Na'aman Hirschfeld
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
# html-to-markdown-wasm
|
|
2
|
+
|
|
3
|
+
Universal HTML to Markdown converter using WebAssembly.
|
|
4
|
+
|
|
5
|
+
Runs anywhere: Node.js, Deno, Bun, browsers, and edge runtimes.
|
|
6
|
+
|
|
7
|
+
[npm package](https://www.npmjs.com/package/html-to-markdown-wasm)
|
|
8
|
+
[MIT License](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
9
|
+
|
|
10
|
+
## Performance
|
|
11
|
+
|
|
12
|
+
Universal WebAssembly bindings with **excellent performance** across all JavaScript runtimes.
|
|
13
|
+
|
|
14
|
+
### Benchmark Results (Apple M4)
|
|
15
|
+
|
|
16
|
+
| Document Type | ops/sec | Notes |
|
|
17
|
+
| -------------------------- | ---------- | ------------------ |
|
|
18
|
+
| **Small (5 paragraphs)** | **70,300** | Simple documents |
|
|
19
|
+
| **Medium (25 paragraphs)** | **15,282** | Nested formatting |
|
|
20
|
+
| **Large (100 paragraphs)** | **3,836** | Complex structures |
|
|
21
|
+
| **Tables (20 tables)** | **3,748** | Table processing |
|
|
22
|
+
| **Lists (500 items)** | **1,391** | Nested lists |
|
|
23
|
+
| **Wikipedia (129KB)** | **1,022** | Real-world content |
|
|
24
|
+
| **Wikipedia (653KB)** | **147** | Large documents |
|
|
25
|
+
|
|
26
|
+
**Average: ~15,536 ops/sec** across varied workloads.
|
|
27
|
+
|
|
28
|
+
### Comparison
|
|
29
|
+
|
|
30
|
+
- **vs Native NAPI**: ~1.17× slower (WASM has minimal overhead)
|
|
31
|
+
- **vs Python**: ~6.3× faster (no FFI overhead)
|
|
32
|
+
- **Best for**: Universal deployment (browsers, Deno, edge runtimes, cross-platform apps)
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
### npm / Yarn / pnpm
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
npm install html-to-markdown-wasm
|
|
40
|
+
# or
|
|
41
|
+
yarn add html-to-markdown-wasm
|
|
42
|
+
# or
|
|
43
|
+
pnpm add html-to-markdown-wasm
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Deno
|
|
47
|
+
|
|
48
|
+
```typescript
|
|
49
|
+
// Via npm specifier
|
|
50
|
+
import { convert } from "npm:html-to-markdown-wasm";
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Usage
|
|
54
|
+
|
|
55
|
+
### Node.js
|
|
56
|
+
|
|
57
|
+
```javascript
|
|
58
|
+
// CommonJS
|
|
59
|
+
const { convert } = require('html-to-markdown-wasm/dist-node');
|
|
60
|
+
|
|
61
|
+
const markdown = convert('<h1>Hello World</h1>');
|
|
62
|
+
console.log(markdown);
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
```javascript
|
|
66
|
+
// ESM
|
|
67
|
+
import { convert } from 'html-to-markdown-wasm/dist-node';
|
|
68
|
+
|
|
69
|
+
const html = '<h1>Hello</h1><p>World</p>';
|
|
70
|
+
const markdown = convert(html, {
|
|
71
|
+
headingStyle: 'atx',
|
|
72
|
+
codeBlockStyle: 'backticks',
|
|
73
|
+
});
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Deno
|
|
77
|
+
|
|
78
|
+
```typescript
|
|
79
|
+
import { convert } from "npm:html-to-markdown-wasm";
|
|
80
|
+
|
|
81
|
+
const html = await Deno.readTextFile("input.html");
|
|
82
|
+
|
|
83
|
+
const markdown = convert(html, {
|
|
84
|
+
headingStyle: "atx",
|
|
85
|
+
listIndentWidth: 2,
|
|
86
|
+
bullets: "-"
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
await Deno.writeTextFile("output.md", markdown);
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Bun
|
|
93
|
+
|
|
94
|
+
```typescript
|
|
95
|
+
import { convert } from 'html-to-markdown-wasm';
|
|
96
|
+
|
|
97
|
+
const markdown = convert('<h1>Fast conversion</h1>', {
|
|
98
|
+
headingStyle: 'atx',
|
|
99
|
+
wrap: true,
|
|
100
|
+
wrapWidth: 80
|
|
101
|
+
});
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
> **Note:** For Bun, consider using [html-to-markdown-node](https://www.npmjs.com/package/html-to-markdown-node) for better performance with native bindings (~1.17× faster, per the comparison above).
|
|
105
|
+
|
|
106
|
+
### Browser (ESM)
|
|
107
|
+
|
|
108
|
+
```html
|
|
109
|
+
<!DOCTYPE html>
|
|
110
|
+
<html>
|
|
111
|
+
<head>
|
|
112
|
+
<title>HTML to Markdown</title>
|
|
113
|
+
</head>
|
|
114
|
+
<body>
|
|
115
|
+
<script type="module">
|
|
116
|
+
import init, { convert } from 'https://unpkg.com/html-to-markdown-wasm/dist-web/html_to_markdown_wasm.js';
|
|
117
|
+
|
|
118
|
+
// Initialize WASM module
|
|
119
|
+
await init();
|
|
120
|
+
|
|
121
|
+
const html = '<h1>Hello World</h1><p>This runs in the <strong>browser</strong>!</p>';
|
|
122
|
+
const markdown = convert(html, { headingStyle: 'atx' });
|
|
123
|
+
|
|
124
|
+
console.log(markdown);
|
|
125
|
+
document.body.innerHTML = `<pre>${markdown}</pre>`;
|
|
126
|
+
</script>
|
|
127
|
+
</body>
|
|
128
|
+
</html>
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Vite / Webpack / Bundlers
|
|
132
|
+
|
|
133
|
+
```typescript
|
|
134
|
+
import { convert } from 'html-to-markdown-wasm';
|
|
135
|
+
|
|
136
|
+
const markdown = convert('<h1>Hello</h1>', {
|
|
137
|
+
headingStyle: 'atx',
|
|
138
|
+
codeBlockStyle: 'backticks'
|
|
139
|
+
});
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Cloudflare Workers
|
|
143
|
+
|
|
144
|
+
```typescript
|
|
145
|
+
import { convert } from 'html-to-markdown-wasm';
|
|
146
|
+
|
|
147
|
+
export default {
|
|
148
|
+
async fetch(request: Request): Promise<Response> {
|
|
149
|
+
const html = await request.text();
|
|
150
|
+
const markdown = convert(html, { headingStyle: 'atx' });
|
|
151
|
+
|
|
152
|
+
return new Response(markdown, {
|
|
153
|
+
headers: { 'Content-Type': 'text/markdown' }
|
|
154
|
+
});
|
|
155
|
+
}
|
|
156
|
+
};
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## TypeScript
|
|
160
|
+
|
|
161
|
+
Full TypeScript support with type definitions:
|
|
162
|
+
|
|
163
|
+
```typescript
|
|
164
|
+
import {
|
|
165
|
+
convert,
|
|
166
|
+
convertWithInlineImages,
|
|
167
|
+
WasmInlineImageConfig,
|
|
168
|
+
type WasmConversionOptions
|
|
169
|
+
} from 'html-to-markdown-wasm';
|
|
170
|
+
|
|
171
|
+
const options: WasmConversionOptions = {
|
|
172
|
+
headingStyle: 'atx',
|
|
173
|
+
codeBlockStyle: 'backticks',
|
|
174
|
+
listIndentWidth: 2,
|
|
175
|
+
wrap: true,
|
|
176
|
+
wrapWidth: 80
|
|
177
|
+
};
|
|
178
|
+
|
|
179
|
+
const markdown = convert('<h1>Hello</h1>', options);
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## Inline Images
|
|
183
|
+
|
|
184
|
+
Extract and decode inline images (data URIs, SVG):
|
|
185
|
+
|
|
186
|
+
```typescript
|
|
187
|
+
import { convertWithInlineImages, WasmInlineImageConfig } from 'html-to-markdown-wasm';
|
|
188
|
+
|
|
189
|
+
const html = '<img src="data:image/png;base64,..." alt="Logo">';
|
|
190
|
+
|
|
191
|
+
const config = new WasmInlineImageConfig(5 * 1024 * 1024); // 5MB max
|
|
192
|
+
config.inferDimensions = true;
|
|
193
|
+
config.filenamePrefix = 'img_';
|
|
194
|
+
config.captureSvg = true;
|
|
195
|
+
|
|
196
|
+
const result = convertWithInlineImages(html, null, config);
|
|
197
|
+
|
|
198
|
+
console.log(result.markdown);
|
|
199
|
+
console.log(`Extracted ${result.inlineImages.length} images`);
|
|
200
|
+
|
|
201
|
+
for (const img of result.inlineImages) {
|
|
202
|
+
console.log(`${img.filename}: ${img.format}, ${img.data.length} bytes`);
|
|
203
|
+
// img.data is a Uint8Array - save to file or upload
|
|
204
|
+
}
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Build Targets
|
|
208
|
+
|
|
209
|
+
Three build targets are provided for different environments:
|
|
210
|
+
|
|
211
|
+
| Target | Path | Use Case |
|
|
212
|
+
| ----------- | --------------------------------- | ------------------------------ |
|
|
213
|
+
| **Bundler** | `html-to-markdown-wasm` | Webpack, Vite, Rollup, esbuild |
|
|
214
|
+
| **Node.js** | `html-to-markdown-wasm/dist-node` | Node.js, Bun (CommonJS/ESM) |
|
|
215
|
+
| **Web** | `html-to-markdown-wasm/dist-web` | Direct browser ESM imports |
|
|
216
|
+
|
|
217
|
+
## Runtime Compatibility
|
|
218
|
+
|
|
219
|
+
| Runtime | Support | Package |
|
|
220
|
+
| ------------------------- | ---------------------------- | -------------- |
|
|
221
|
+
| ✅ **Node.js** 18+ | Full support | `dist-node` |
|
|
222
|
+
| ✅ **Deno** | Full support | npm: specifier |
|
|
223
|
+
| ✅ **Bun** | Full support (prefer native) | Default export |
|
|
224
|
+
| ✅ **Browsers** | Full support | `dist-web` |
|
|
225
|
+
| ✅ **Cloudflare Workers** | Full support | Default export |
|
|
226
|
+
| ✅ **Deno Deploy** | Full support | npm: specifier |
|
|
227
|
+
|
|
228
|
+
## When to Use
|
|
229
|
+
|
|
230
|
+
Choose `html-to-markdown-wasm` when:
|
|
231
|
+
|
|
232
|
+
- 🌐 Running in browsers or edge runtimes
|
|
233
|
+
- 🦕 Using Deno
|
|
234
|
+
- ☁️ Deploying to Cloudflare Workers, Deno Deploy
|
|
235
|
+
- 📦 Building universal libraries
|
|
236
|
+
- 🔄 Need consistent behavior across all platforms
|
|
237
|
+
|
|
238
|
+
Use [html-to-markdown-node](https://www.npmjs.com/package/html-to-markdown-node) for:
|
|
239
|
+
|
|
240
|
+
- ⚡ Maximum performance in Node.js/Bun (~1.17× faster)
|
|
241
|
+
- 🖥️ Server-side only applications
|
|
242
|
+
|
|
243
|
+
## Configuration Options
|
|
244
|
+
|
|
245
|
+
See the [TypeScript definitions](./dist-node/html_to_markdown_wasm.d.ts) for all available options:
|
|
246
|
+
|
|
247
|
+
- Heading styles (atx, underlined, atxClosed)
|
|
248
|
+
- Code block styles (indented, backticks, tildes)
|
|
249
|
+
- List formatting (indent width, bullet characters)
|
|
250
|
+
- Text escaping and formatting
|
|
251
|
+
- Preprocessing for web scraping
|
|
252
|
+
- hOCR table extraction
|
|
253
|
+
- And more...
|
|
254
|
+
|
|
255
|
+
## Examples
|
|
256
|
+
|
|
257
|
+
### Deno Web Server
|
|
258
|
+
|
|
259
|
+
```typescript
|
|
260
|
+
import { convert } from "npm:html-to-markdown-wasm";
|
|
261
|
+
|
|
262
|
+
Deno.serve(async (req) => {
|
|
263
|
+
const url = new URL(req.url);
|
|
264
|
+
|
|
265
|
+
if (url.pathname === "/convert" && req.method === "POST") {
|
|
266
|
+
const html = await req.text();
|
|
267
|
+
const markdown = convert(html, { headingStyle: "atx" });
|
|
268
|
+
|
|
269
|
+
return new Response(markdown, {
|
|
270
|
+
headers: { "Content-Type": "text/markdown" }
|
|
271
|
+
});
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
return new Response("Not found", { status: 404 });
|
|
275
|
+
});
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### Browser File Conversion
|
|
279
|
+
|
|
280
|
+
```html
|
|
281
|
+
<input type="file" id="htmlFile" accept=".html">
|
|
282
|
+
<button onclick="convertFile()">Convert to Markdown</button>
|
|
283
|
+
<pre id="output"></pre>
|
|
284
|
+
|
|
285
|
+
<script type="module">
|
|
286
|
+
import init, { convert } from 'https://unpkg.com/html-to-markdown-wasm/dist-web/html_to_markdown_wasm.js';
|
|
287
|
+
|
|
288
|
+
await init();
|
|
289
|
+
|
|
290
|
+
window.convertFile = async () => {
|
|
291
|
+
const file = document.getElementById('htmlFile').files[0];
|
|
292
|
+
const html = await file.text();
|
|
293
|
+
const markdown = convert(html, { headingStyle: 'atx' });
|
|
294
|
+
document.getElementById('output').textContent = markdown;
|
|
295
|
+
};
|
|
296
|
+
</script>
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
### Web Scraping (Deno)
|
|
300
|
+
|
|
301
|
+
```typescript
|
|
302
|
+
import { convert } from "npm:html-to-markdown-wasm";
|
|
303
|
+
|
|
304
|
+
const response = await fetch("https://example.com");
|
|
305
|
+
const html = await response.text();
|
|
306
|
+
|
|
307
|
+
const markdown = convert(html, {
|
|
308
|
+
preprocessing: {
|
|
309
|
+
enabled: true,
|
|
310
|
+
preset: "aggressive",
|
|
311
|
+
removeNavigation: true,
|
|
312
|
+
removeForms: true
|
|
313
|
+
},
|
|
314
|
+
headingStyle: "atx",
|
|
315
|
+
codeBlockStyle: "backticks"
|
|
316
|
+
});
|
|
317
|
+
|
|
318
|
+
console.log(markdown);
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
## Links
|
|
322
|
+
|
|
323
|
+
- [GitHub Repository](https://github.com/Goldziher/html-to-markdown)
|
|
324
|
+
- [Full Documentation](https://github.com/Goldziher/html-to-markdown/blob/main/README.md)
|
|
325
|
+
- [Native Node Package](https://www.npmjs.com/package/html-to-markdown-node)
|
|
326
|
+
- [Python Package](https://pypi.org/project/html-to-markdown/)
|
|
327
|
+
- [Rust Crate](https://crates.io/crates/html-to-markdown-rs)
|
|
328
|
+
|
|
329
|
+
## License
|
|
330
|
+
|
|
331
|
+
MIT
|
package/dist/README.md
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# html-to-markdown
|
|
2
|
+
|
|
3
|
+
High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
|
|
4
|
+
|
|
5
|
+
[PyPI package](https://pypi.org/project/html-to-markdown/)
|
|
6
|
+
[npm package](https://www.npmjs.com/package/html-to-markdown)
|
|
7
|
+
[Rust crate](https://crates.io/crates/html-to-markdown-rs)
|
|
8
|
+
[](https://pypi.org/project/html-to-markdown/)
|
|
9
|
+
[MIT License](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
10
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
|
11
|
+
|
|
12
|
+
## Documentation
|
|
13
|
+
|
|
14
|
+
- **JavaScript/TypeScript guides**:
|
|
15
|
+
- Node.js/Bun (native) – [Node.js README](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-node)
|
|
16
|
+
- WebAssembly (universal) – [WASM README](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-wasm)
|
|
17
|
+
- **Python guide** – [Python README](https://github.com/Goldziher/html-to-markdown/blob/main/README_PYPI.md)
|
|
18
|
+
- **Rust guide** – [Rust README](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown)
|
|
19
|
+
- **Contributing** – [CONTRIBUTING.md](https://github.com/Goldziher/html-to-markdown/blob/main/CONTRIBUTING.md) ⭐ Start here!
|
|
20
|
+
- **Changelog** – [CHANGELOG.md](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
| Target | Command |
|
|
25
|
+
| --------------------------- | ------------------------------------------------------------------------- |
|
|
26
|
+
| **Node.js/Bun** (native) | `npm install html-to-markdown-node` |
|
|
27
|
+
| **WebAssembly** (universal) | `npm install html-to-markdown-wasm` |
|
|
28
|
+
| **Deno** | `import { convert } from "npm:html-to-markdown-wasm"` |
|
|
29
|
+
| **Python** (bindings + CLI) | `pip install html-to-markdown` |
|
|
30
|
+
| **Rust** crate | `cargo add html-to-markdown-rs` |
|
|
31
|
+
| Rust CLI | `cargo install html-to-markdown-cli` |
|
|
32
|
+
| Homebrew CLI | `brew tap goldziher/tap`<br>`brew install html-to-markdown` |
|
|
33
|
+
| Releases | [GitHub Releases](https://github.com/Goldziher/html-to-markdown/releases) |
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
### JavaScript/TypeScript
|
|
38
|
+
|
|
39
|
+
**Node.js / Bun (Native - Fastest):**
|
|
40
|
+
|
|
41
|
+
```typescript
|
|
42
|
+
import { convert } from 'html-to-markdown-node';
|
|
43
|
+
|
|
44
|
+
const html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>';
|
|
45
|
+
const markdown = convert(html, {
|
|
46
|
+
headingStyle: 'Atx',
|
|
47
|
+
codeBlockStyle: 'Backticks',
|
|
48
|
+
wrap: true,
|
|
49
|
+
});
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**Deno / Browsers / Edge (Universal):**
|
|
53
|
+
|
|
54
|
+
```typescript
|
|
55
|
+
import { convert } from "npm:html-to-markdown-wasm"; // Deno
|
|
56
|
+
// or: import { convert } from 'html-to-markdown-wasm'; // Bundlers
|
|
57
|
+
|
|
58
|
+
const markdown = convert(html, {
|
|
59
|
+
headingStyle: 'atx',
|
|
60
|
+
listIndentWidth: 2,
|
|
61
|
+
});
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**Performance:** Native bindings average ~18k ops/sec, WASM averages ~16k ops/sec (benchmarked on complex real-world documents).
|
|
65
|
+
|
|
66
|
+
See the JavaScript guides for full API documentation:
|
|
67
|
+
|
|
68
|
+
- [Node.js/Bun guide](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-node)
|
|
69
|
+
- [WebAssembly guide](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-wasm)
|
|
70
|
+
|
|
71
|
+
### CLI
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# Convert a file
|
|
75
|
+
html-to-markdown input.html > output.md
|
|
76
|
+
|
|
77
|
+
# Stream from stdin
|
|
78
|
+
curl https://example.com | html-to-markdown > output.md
|
|
79
|
+
|
|
80
|
+
# Apply options
|
|
81
|
+
html-to-markdown --heading-style atx --list-indent-width 2 input.html
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Python (v2 API)
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from html_to_markdown import convert, convert_with_inline_images, InlineImageConfig
|
|
88
|
+
|
|
89
|
+
html = "<h1>Hello</h1><p>Rust ❤️ Markdown</p>"
|
|
90
|
+
markdown = convert(html)
|
|
91
|
+
|
|
92
|
+
markdown, inline_images, warnings = convert_with_inline_images(
|
|
93
|
+
'<img src="data:image/png;base64,...==" alt="Pixel">',
|
|
94
|
+
image_config=InlineImageConfig(max_decoded_size_bytes=1024, infer_dimensions=True),
|
|
95
|
+
)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Rust
|
|
99
|
+
|
|
100
|
+
```rust
|
|
101
|
+
use html_to_markdown_rs::{convert, ConversionOptions, HeadingStyle};
|
|
102
|
+
|
|
103
|
+
let html = "<h1>Welcome</h1><p>Fast conversion</p>";
|
|
104
|
+
let markdown = convert(html, None)?;
|
|
105
|
+
|
|
106
|
+
let options = ConversionOptions {
|
|
107
|
+
heading_style: HeadingStyle::Atx,
|
|
108
|
+
..Default::default()
|
|
109
|
+
};
|
|
110
|
+
let markdown = convert(html, Some(options))?;
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
See the language-specific READMEs for complete configuration, hOCR workflows, and inline image extraction.
|
|
114
|
+
|
|
115
|
+
## Performance
|
|
116
|
+
|
|
117
|
+
Benchmarked on Apple M4 with complex real-world documents (Wikipedia articles, tables, lists):
|
|
118
|
+
|
|
119
|
+
### Operations per Second (higher is better)
|
|
120
|
+
|
|
121
|
+
| Document Type | Node.js (NAPI) | WASM | Python (PyO3) | Speedup (Node vs Python) |
|
|
122
|
+
| -------------------------- | -------------- | ------ | ------------- | ------------------------ |
|
|
123
|
+
| **Small (5 paragraphs)** | 86,233 | 70,300 | 8,443 | **10.2×** |
|
|
124
|
+
| **Medium (25 paragraphs)** | 18,979 | 15,282 | 1,846 | **10.3×** |
|
|
125
|
+
| **Large (100 paragraphs)** | 4,907 | 3,836 | 438 | **11.2×** |
|
|
126
|
+
| **Tables (complex)** | 5,003 | 3,748 | 4,829 | 1.0× |
|
|
127
|
+
| **Lists (nested)** | 1,819 | 1,391 | 1,165 | **1.6×** |
|
|
128
|
+
| **Wikipedia (129KB)** | 1,125 | 1,022 | - | - |
|
|
129
|
+
| **Wikipedia (653KB)** | 156 | 147 | - | - |
|
|
130
|
+
|
|
131
|
+
### Average Performance Summary
|
|
132
|
+
|
|
133
|
+
| Implementation | Avg ops/sec | vs WASM | vs Python | Best For |
|
|
134
|
+
| --------------------- | ---------------- | ------------ | --------------- | --------------------------------- |
|
|
135
|
+
| **Node.js (NAPI-RS)** | **18,162** | 1.17× faster | **7.4× faster** | Maximum throughput in Node.js/Bun |
|
|
136
|
+
| **WebAssembly** | **15,536** | baseline | **6.3× faster** | Universal (Deno, browsers, edge) |
|
|
137
|
+
| **Python (PyO3)** | **2,465** | 6.3× slower | baseline | Python ecosystem integration |
|
|
138
|
+
| **Rust CLI/Binary** | **150-210 MB/s** | - | - | Standalone processing |
|
|
139
|
+
|
|
140
|
+
### Key Insights
|
|
141
|
+
|
|
142
|
+
- **JavaScript bindings are fastest**: Native Node.js bindings achieve ~18k ops/sec average, with WASM close behind at ~16k ops/sec
|
|
143
|
+
- **Python is 6-10× slower**: Despite using the same Rust core, PyO3 FFI overhead significantly impacts Python performance
|
|
144
|
+
- **Small documents**: Both JS implementations reach 70-90k ops/sec on simple HTML
|
|
145
|
+
- **Large documents**: Performance gap widens with complexity
|
|
146
|
+
|
|
147
|
+
**Note on Python performance**: The current Python bindings have optimization opportunities. The v2 API with direct `convert()` calls performs best; avoid the v1 compatibility layer for performance-critical applications.
|
|
148
|
+
|
|
149
|
+
## Compatibility (v1 → v2)
|
|
150
|
+
|
|
151
|
+
- V2’s Rust core sustains **150–210 MB/s** throughput, making it 60–80× faster than V1, which averaged **≈ 2.5 MB/s** in its Python/BeautifulSoup implementation.
|
|
152
|
+
- The Python package offers a compatibility shim in `html_to_markdown.v1_compat` (`convert_to_markdown`, `convert_to_markdown_stream`, `markdownify`). Details and keyword mappings live in [Python README](https://github.com/Goldziher/html-to-markdown/blob/main/README_PYPI.md#v1-compatibility).
|
|
153
|
+
- CLI flag changes, option renames, and other breaking updates are summarised in [CHANGELOG](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md#breaking-changes).
|
|
154
|
+
|
|
155
|
+
## Community
|
|
156
|
+
|
|
157
|
+
- Chat with us on [Discord](https://discord.gg/pXxagNK2zN)
|
|
158
|
+
- Explore the broader [Kreuzberg](https://kreuzberg.dev) document-processing ecosystem
|
|
159
|
+
- Sponsor development via [GitHub Sponsors](https://github.com/sponsors/Goldziher)
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "html-to-markdown-wasm",
|
|
3
|
+
"version": "2.3.1",
|
|
4
|
+
"description": "High-performance HTML to Markdown converter - WebAssembly bindings",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"repository": "https://github.com/Goldziher/html-to-markdown",
|
|
8
|
+
"license": "MIT",
|
|
9
|
+
"keywords": [
|
|
10
|
+
"html",
|
|
11
|
+
"markdown",
|
|
12
|
+
"converter",
|
|
13
|
+
"rust",
|
|
14
|
+
"wasm",
|
|
15
|
+
"webassembly"
|
|
16
|
+
],
|
|
17
|
+
"files": [
|
|
18
|
+
"dist",
|
|
19
|
+
"README.md"
|
|
20
|
+
],
|
|
21
|
+
"devDependencies": {
|
|
22
|
+
"@types/node": "^24.7.2",
|
|
23
|
+
"tinybench": "^5.0.1",
|
|
24
|
+
"tsx": "^4.20.6",
|
|
25
|
+
"vitest": "^3.0.0",
|
|
26
|
+
"wasm-pack": "^0.13.1"
|
|
27
|
+
},
|
|
28
|
+
"publishConfig": {
|
|
29
|
+
"registry": "https://registry.npmjs.org/",
|
|
30
|
+
"access": "public"
|
|
31
|
+
},
|
|
32
|
+
"dependencies": {
|
|
33
|
+
"up": "^1.0.2"
|
|
34
|
+
},
|
|
35
|
+
"scripts": {
|
|
36
|
+
"build": "wasm-pack build --target bundler --out-dir dist",
|
|
37
|
+
"build:nodejs": "wasm-pack build --target nodejs --out-dir dist-node",
|
|
38
|
+
"build:web": "wasm-pack build --target web --out-dir dist-web",
|
|
39
|
+
"build:all": "pnpm run build && pnpm run build:nodejs && pnpm run build:web",
|
|
40
|
+
"test": "vitest run",
|
|
41
|
+
"test:watch": "vitest",
|
|
42
|
+
"test:wasm-pack": "wasm-pack test --headless --chrome",
|
|
43
|
+
"bench": "tsx benchmark.ts",
|
|
44
|
+
"clean": "rm -rf dist dist-node dist-web node_modules pkg"
|
|
45
|
+
}
|
|
46
|
+
}
|