@kreuzberg/html-to-markdown-node 2.19.0-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +350 -0
- package/html-to-markdown-node.darwin-arm64.node +0 -0
- package/index.d.ts +455 -0
- package/index.js +608 -0
- package/package.json +83 -0
package/README.md
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
# @kreuzberg/html-to-markdown-node
|
|
2
|
+
|
|
3
|
+
> **npm package:** `@kreuzberg/html-to-markdown-node` (this README).
|
|
4
|
+
> Use [`@kreuzberg/html-to-markdown-wasm`](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm) for the portable WASM build.
|
|
5
|
+
|
|
6
|
+
Native Node.js and Bun bindings for html-to-markdown using NAPI-RS v3.
|
|
7
|
+
|
|
8
|
+
Built on the shared Rust engine that powers the Python wheels, Ruby gem, PHP extension, WebAssembly package, and CLI – ensuring identical Markdown output across every language target.
|
|
9
|
+
|
|
10
|
+
High-performance HTML to Markdown conversion using native Rust code compiled to platform-specific binaries.
|
|
11
|
+
|
|
12
|
+
[crates.io](https://crates.io/crates/html-to-markdown-rs)
|
|
13
|
+
[npm (Node)](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node)
|
|
14
|
+
[npm (WASM)](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm)
|
|
15
|
+
[PyPI](https://pypi.org/project/html-to-markdown/)
|
|
16
|
+
[Packagist](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
17
|
+
[RubyGems](https://rubygems.org/gems/html-to-markdown)
|
|
18
|
+
[NuGet](https://www.nuget.org/packages/Goldziher.HtmlToMarkdown/)
|
|
19
|
+
[Maven Central](https://central.sonatype.com/artifact/io.github.goldziher/html-to-markdown)
|
|
20
|
+
[Go Reference](https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown)
|
|
21
|
+
[License: MIT](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE)
|
|
22
|
+
|
|
23
|
+
## Performance
|
|
24
|
+
|
|
25
|
+
Native NAPI-RS bindings deliver **the fastest HTML to Markdown conversion** available in JavaScript.
|
|
26
|
+
|
|
27
|
+
### Benchmark Results (Apple M4)
|
|
28
|
+
|
|
29
|
+
| Document Type | ops/sec | Notes |
|
|
30
|
+
| -------------------------- | ---------- | ------------------ |
|
|
31
|
+
| **Small (5 paragraphs)** | **86,233** | Simple documents |
|
|
32
|
+
| **Medium (25 paragraphs)** | **18,979** | Nested formatting |
|
|
33
|
+
| **Large (100 paragraphs)** | **4,907** | Complex structures |
|
|
34
|
+
| **Tables (20 tables)** | **5,003** | Table processing |
|
|
35
|
+
| **Lists (500 items)** | **1,819** | Nested lists |
|
|
36
|
+
| **Wikipedia (129KB)** | **1,125** | Real-world content |
|
|
37
|
+
| **Wikipedia (653KB)** | **156** | Large documents |
|
|
38
|
+
|
|
39
|
+
**Average: ~16,889 ops/sec** (arithmetic mean of the seven workloads above).
|
|
40
|
+
|
|
41
|
+
### Comparison
|
|
42
|
+
|
|
43
|
+
- **vs WASM**: ~1.17× faster (native has zero startup time, direct memory access)
|
|
44
|
+
- **vs Python**: ~7.4× faster (avoids FFI overhead)
|
|
45
|
+
- **Best for**: Node.js and Bun server-side applications requiring maximum throughput
|
|
46
|
+
|
|
47
|
+
### Benchmark Fixtures (Apple M4)
|
|
48
|
+
|
|
49
|
+
The shared benchmark harness lives in `tools/benchmark-harness`. Node keeps pace with the Rust CLI across the board:
|
|
50
|
+
|
|
51
|
+
| Document | Size | ops/sec (Node) |
|
|
52
|
+
| ---------------------- | ------ | -------------- |
|
|
53
|
+
| Lists (Timeline) | 129 KB | 3,137 |
|
|
54
|
+
| Tables (Countries) | 360 KB | 932 |
|
|
55
|
+
| Medium (Python) | 657 KB | 460 |
|
|
56
|
+
| Large (Rust) | 567 KB | 554 |
|
|
57
|
+
| Small (Intro) | 463 KB | 627 |
|
|
58
|
+
| hOCR German PDF | 44 KB | 8,724 |
|
|
59
|
+
| hOCR Invoice | 4 KB | 96,138 |
|
|
60
|
+
| hOCR Embedded Tables | 37 KB | 9,591 |
|
|
61
|
+
|
|
62
|
+
> Run `task bench:harness -- --frameworks node` to regenerate these numbers.
|
|
63
|
+
|
|
64
|
+
## Installation
|
|
65
|
+
|
|
66
|
+
### Node.js
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
npm install @kreuzberg/html-to-markdown-node
|
|
70
|
+
# or
|
|
71
|
+
yarn add @kreuzberg/html-to-markdown-node
|
|
72
|
+
# or
|
|
73
|
+
pnpm add @kreuzberg/html-to-markdown-node
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Bun
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
bun add @kreuzberg/html-to-markdown-node
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Usage
|
|
83
|
+
|
|
84
|
+
### Basic Conversion
|
|
85
|
+
|
|
86
|
+
```javascript
|
|
87
|
+
import { convert } from '@kreuzberg/html-to-markdown-node';
|
|
88
|
+
|
|
89
|
+
const html = '<h1>Hello World</h1><p>This is <strong>fast</strong>!</p>';
|
|
90
|
+
const markdown = convert(html);
|
|
91
|
+
console.log(markdown);
|
|
92
|
+
// # Hello World
|
|
93
|
+
//
|
|
94
|
+
// This is **fast**!
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### With Options
|
|
98
|
+
|
|
99
|
+
```typescript
|
|
100
|
+
import { convert } from '@kreuzberg/html-to-markdown-node';
|
|
101
|
+
|
|
102
|
+
const markdown = convert(html, {
|
|
103
|
+
headingStyle: 'Atx',
|
|
104
|
+
codeBlockStyle: 'Backticks',
|
|
105
|
+
listIndentWidth: 2,
|
|
106
|
+
bullets: '-',
|
|
107
|
+
wrap: true,
|
|
108
|
+
wrapWidth: 80
|
|
109
|
+
});
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Preserve Complex HTML (NEW in v2.5)
|
|
113
|
+
|
|
114
|
+
```typescript
|
|
115
|
+
import { convert } from '@kreuzberg/html-to-markdown-node';
|
|
116
|
+
|
|
117
|
+
const html = `
|
|
118
|
+
<h1>Report</h1>
|
|
119
|
+
<table>
|
|
120
|
+
<tr><th>Name</th><th>Value</th></tr>
|
|
121
|
+
<tr><td>Foo</td><td>Bar</td></tr>
|
|
122
|
+
</table>
|
|
123
|
+
`;
|
|
124
|
+
|
|
125
|
+
const markdown = convert(html, {
|
|
126
|
+
preserveTags: ['table'] // Keep tables as HTML
|
|
127
|
+
});
|
|
128
|
+
// # Report
|
|
129
|
+
//
|
|
130
|
+
// <table>
|
|
131
|
+
// <tr><th>Name</th><th>Value</th></tr>
|
|
132
|
+
// <tr><td>Foo</td><td>Bar</td></tr>
|
|
133
|
+
// </table>
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## TypeScript
|
|
137
|
+
|
|
138
|
+
Full TypeScript definitions included:
|
|
139
|
+
|
|
140
|
+
```typescript
|
|
141
|
+
import { convert, convertWithInlineImages, type JsConversionOptions } from '@kreuzberg/html-to-markdown-node';
|
|
142
|
+
|
|
143
|
+
const options: JsConversionOptions = {
|
|
144
|
+
headingStyle: 'Atx',
|
|
145
|
+
codeBlockStyle: 'Backticks',
|
|
146
|
+
listIndentWidth: 2,
|
|
147
|
+
bullets: '-',
|
|
148
|
+
wrap: true,
|
|
149
|
+
wrapWidth: 80
|
|
150
|
+
};
|
|
151
|
+
|
|
152
|
+
const markdown = convert('<h1>Hello</h1>', options);
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Reusing Parsed Options
|
|
156
|
+
|
|
157
|
+
Avoid re-parsing the same options object on every call (benchmarks, tight render loops) by creating a reusable handle:
|
|
158
|
+
|
|
159
|
+
```ts
|
|
160
|
+
import {
|
|
161
|
+
createConversionOptionsHandle,
|
|
162
|
+
convertWithOptionsHandle,
|
|
163
|
+
} from '@kreuzberg/html-to-markdown-node';
|
|
164
|
+
|
|
165
|
+
const handle = createConversionOptionsHandle({ hocrSpatialTables: false });
|
|
166
|
+
const markdown = convertWithOptionsHandle('<h1>Handles</h1>', handle);
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Zero-Copy Buffer Input
|
|
170
|
+
|
|
171
|
+
Skip the intermediate UTF-16 string allocation by feeding `Buffer`/`Uint8Array` inputs directly—handy for benchmark harnesses or when you already have raw bytes:
|
|
172
|
+
|
|
173
|
+
```ts
|
|
174
|
+
import {
|
|
175
|
+
convertBuffer,
|
|
176
|
+
convertInlineImagesBuffer,
|
|
177
|
+
convertBufferWithOptionsHandle,
|
|
178
|
+
createConversionOptionsHandle,
|
|
179
|
+
} from '@kreuzberg/html-to-markdown-node';
|
|
180
|
+
import { readFileSync } from 'node:fs';
|
|
181
|
+
|
|
182
|
+
const html = readFileSync('fixtures/lists.html'); // Buffer
|
|
183
|
+
const markdown = convertBuffer(html);
|
|
184
|
+
|
|
185
|
+
const handle = createConversionOptionsHandle({ headingStyle: 'Atx' });
|
|
186
|
+
const markdownFromHandle = convertBufferWithOptionsHandle(html, handle);
|
|
187
|
+
|
|
188
|
+
// Inline images work too:
|
|
189
|
+
const extraction = convertInlineImagesBuffer(html, null, {
|
|
190
|
+
maxDecodedSizeBytes: 5 * 1024 * 1024,
|
|
191
|
+
});
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Inline Images
|
|
195
|
+
|
|
196
|
+
Extract and decode inline images (data URIs, SVG):
|
|
197
|
+
|
|
198
|
+
```typescript
|
|
199
|
+
import { convertWithInlineImages } from '@kreuzberg/html-to-markdown-node';
|
|
200
|
+
|
|
201
|
+
const html = '<img src="..." alt="Logo">';
|
|
202
|
+
|
|
203
|
+
const result = convertWithInlineImages(html, null, {
|
|
204
|
+
maxDecodedSizeBytes: 5 * 1024 * 1024, // 5MB
|
|
205
|
+
inferDimensions: true,
|
|
206
|
+
filenamePrefix: 'img_',
|
|
207
|
+
captureSvg: true
|
|
208
|
+
});
|
|
209
|
+
|
|
210
|
+
console.log(result.markdown);
|
|
211
|
+
console.log(`Extracted ${result.inlineImages.length} images`);
|
|
212
|
+
|
|
213
|
+
for (const img of result.inlineImages) {
|
|
214
|
+
console.log(`${img.filename}: ${img.format}, ${img.data.length} bytes`);
|
|
215
|
+
// Save image data to disk
|
|
216
|
+
require('fs').writeFileSync(img.filename, img.data);
|
|
217
|
+
}
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## Supported Platforms
|
|
221
|
+
|
|
222
|
+
Pre-built native binaries are provided for:
|
|
223
|
+
|
|
224
|
+
| Platform | Architectures |
|
|
225
|
+
| ----------- | --------------------------------------------------- |
|
|
226
|
+
| **macOS** | x64 (Intel), ARM64 (Apple Silicon) |
|
|
227
|
+
| **Linux** | x64 (glibc/musl), ARM64 (glibc/musl), ARMv7 (glibc) |
|
|
228
|
+
| **Windows** | x64, ARM64 |
|
|
229
|
+
|
|
230
|
+
### Runtime Compatibility
|
|
231
|
+
|
|
232
|
+
- ✅ **Node.js** 18+ (LTS)
|
|
233
|
+
- ✅ **Bun** 1.0+ (full NAPI-RS support)
|
|
234
|
+
- ❌ **Deno** (use [@kreuzberg/html-to-markdown-wasm](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm) instead)
|
|
235
|
+
|
|
236
|
+
## When to Use
|
|
237
|
+
|
|
238
|
+
Choose `@kreuzberg/html-to-markdown-node` when:
|
|
239
|
+
|
|
240
|
+
- ✅ Running in Node.js or Bun
|
|
241
|
+
- ✅ Maximum performance is required
|
|
242
|
+
- ✅ Server-side conversion at scale
|
|
243
|
+
|
|
244
|
+
Use [`@kreuzberg/html-to-markdown-wasm`](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm) for:
|
|
245
|
+
|
|
246
|
+
- 🌐 Browser/client-side conversion
|
|
247
|
+
- 🦕 Deno runtime
|
|
248
|
+
- ☁️ Edge runtimes (Cloudflare Workers, Deno Deploy)
|
|
249
|
+
- 📦 Universal packages
|
|
250
|
+
|
|
251
|
+
Other runtimes:
|
|
252
|
+
|
|
253
|
+
- 🐍 Python: [`html-to-markdown`](https://pypi.org/project/html-to-markdown/)
|
|
254
|
+
- 💎 Ruby: [`html-to-markdown`](https://rubygems.org/gems/html-to-markdown)
|
|
255
|
+
- 🐘 PHP: [`goldziher/html-to-markdown`](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
256
|
+
- 🌐 WebAssembly: [`@kreuzberg/html-to-markdown-wasm`](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm)
|
|
257
|
+
|
|
258
|
+
## Configuration Options
|
|
259
|
+
|
|
260
|
+
See [ConversionOptions](https://github.com/kreuzberg-dev/html-to-markdown/tree/main/crates/html-to-markdown-node#types) for all available options including:
|
|
261
|
+
|
|
262
|
+
- Heading styles (ATX, underlined, ATX closed)
|
|
263
|
+
- Code block styles (indented, backticks, tildes)
|
|
264
|
+
- List formatting (indent width, bullet characters)
|
|
265
|
+
- Text escaping and formatting
|
|
266
|
+
- Tag preservation (`preserveTags`) and stripping (`stripTags`)
|
|
267
|
+
- Preprocessing for web scraping
|
|
268
|
+
- hOCR table extraction
|
|
269
|
+
- And more...
|
|
270
|
+
|
|
271
|
+
## Examples
|
|
272
|
+
|
|
273
|
+
### Preserving HTML Tags
|
|
274
|
+
|
|
275
|
+
Keep specific HTML tags in their original form instead of converting to Markdown:
|
|
276
|
+
|
|
277
|
+
```typescript
|
|
278
|
+
import { convert } from '@kreuzberg/html-to-markdown-node';
|
|
279
|
+
|
|
280
|
+
const html = `
|
|
281
|
+
<p>Before table</p>
|
|
282
|
+
<table class="data">
|
|
283
|
+
<tr><th>Name</th><th>Value</th></tr>
|
|
284
|
+
<tr><td>Item 1</td><td>100</td></tr>
|
|
285
|
+
</table>
|
|
286
|
+
<p>After table</p>
|
|
287
|
+
`;
|
|
288
|
+
|
|
289
|
+
const markdown = convert(html, {
|
|
290
|
+
preserveTags: ['table']
|
|
291
|
+
});
|
|
292
|
+
|
|
293
|
+
// Result includes the table as HTML:
|
|
294
|
+
// "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
Combine with `stripTags` for fine-grained control:
|
|
298
|
+
|
|
299
|
+
```typescript
|
|
300
|
+
const markdown = convert(html, {
|
|
301
|
+
preserveTags: ['table', 'form'], // Keep these as HTML
|
|
302
|
+
stripTags: ['script', 'style'] // Remove these entirely
|
|
303
|
+
});
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
### Web Scraping
|
|
307
|
+
|
|
308
|
+
```javascript
|
|
309
|
+
const { convert } = require('@kreuzberg/html-to-markdown-node');
|
|
310
|
+
|
|
311
|
+
const scrapedHtml = await fetch('https://example.com').then(r => r.text());
|
|
312
|
+
|
|
313
|
+
const markdown = convert(scrapedHtml, {
|
|
314
|
+
preprocessing: {
|
|
315
|
+
enabled: true,
|
|
316
|
+
preset: 'Aggressive',
|
|
317
|
+
removeNavigation: true,
|
|
318
|
+
removeForms: true
|
|
319
|
+
},
|
|
320
|
+
headingStyle: 'Atx',
|
|
321
|
+
codeBlockStyle: 'Backticks'
|
|
322
|
+
});
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
### hOCR Document Processing
|
|
326
|
+
|
|
327
|
+
```javascript
|
|
328
|
+
const { convert } = require('@kreuzberg/html-to-markdown-node');
|
|
329
|
+
const fs = require('fs');
|
|
330
|
+
|
|
331
|
+
// OCR output from Tesseract in hOCR format
|
|
332
|
+
const hocrHtml = fs.readFileSync('scan.hocr', 'utf8');
|
|
333
|
+
|
|
334
|
+
// Automatically detects hOCR and reconstructs tables
|
|
335
|
+
const markdown = convert(hocrHtml, {
|
|
336
|
+
hocrSpatialTables: true // Enable spatial table reconstruction
|
|
337
|
+
});
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
## Links
|
|
341
|
+
|
|
342
|
+
- [GitHub Repository](https://github.com/kreuzberg-dev/html-to-markdown)
|
|
343
|
+
- [Full Documentation](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/README.md)
|
|
344
|
+
- [WASM Package](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm)
|
|
345
|
+
- [Python Package](https://pypi.org/project/html-to-markdown/)
|
|
346
|
+
- [Rust Crate](https://crates.io/crates/html-to-markdown-rs)
|
|
347
|
+
|
|
348
|
+
## License
|
|
349
|
+
|
|
350
|
+
MIT
|
|
Binary file
|