html-to-markdown-wasm 2.5.5 → 2.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/README.md +14 -7
- package/dist/html_to_markdown_wasm.d.ts +27 -27
- package/dist/html_to_markdown_wasm_bg.js +121 -120
- package/dist/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist/html_to_markdown_wasm_bg.wasm.d.ts +15 -15
- package/dist/package.json +1 -1
- package/dist-node/README.md +203 -0
- package/dist-node/html_to_markdown_wasm.d.ts +96 -0
- package/dist-node/html_to_markdown_wasm.js +972 -0
- package/dist-node/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-node/html_to_markdown_wasm_bg.wasm.d.ts +32 -0
- package/dist-node/package.json +20 -0
- package/dist-web/README.md +203 -0
- package/dist-web/html_to_markdown_wasm.d.ts +152 -0
- package/dist-web/html_to_markdown_wasm.js +1016 -0
- package/dist-web/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-web/html_to_markdown_wasm_bg.wasm.d.ts +32 -0
- package/dist-web/package.json +24 -0
- package/package.json +38 -18
- package/LICENSE +0 -21
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# html-to-markdown
|
|
2
|
+
|
|
3
|
+
High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, PHP extension, Ruby gem, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
|
|
4
|
+
|
|
5
|
+
[Crates.io](https://crates.io/crates/html-to-markdown-rs)
|
|
6
|
+
[npm (html-to-markdown-node)](https://www.npmjs.com/package/html-to-markdown-node)
|
|
7
|
+
[npm (html-to-markdown-wasm)](https://www.npmjs.com/package/html-to-markdown-wasm)
|
|
8
|
+
[npm (html-to-markdown)](https://www.npmjs.com/package/html-to-markdown)
|
|
9
|
+
[PyPI](https://pypi.org/project/html-to-markdown/)
|
|
10
|
+
[Packagist](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
11
|
+
[RubyGems](https://rubygems.org/gems/html-to-markdown)
|
|
12
|
+
[License](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
|
|
13
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## 🎮 **[Try the Live Demo →](https://goldziher.github.io/html-to-markdown/)**
|
|
18
|
+
|
|
19
|
+
Experience WebAssembly-powered HTML to Markdown conversion instantly in your browser. No installation needed!
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Why html-to-markdown?
|
|
24
|
+
|
|
25
|
+
- **Blazing Fast**: Rust-powered core delivers 10-80× faster conversion than pure Python alternatives
|
|
26
|
+
- **Universal**: Works everywhere - Node.js, Bun, Deno, browsers, Python, Rust, and standalone CLI
|
|
27
|
+
- **Smart Conversion**: Handles complex documents including nested tables, code blocks, task lists, and hOCR OCR output
|
|
28
|
+
- **Highly Configurable**: Control heading styles, code block fences, list formatting, whitespace handling, and HTML sanitization
|
|
29
|
+
- **Tag Preservation**: Keep specific HTML tags unconverted when markdown isn't expressive enough
|
|
30
|
+
- **Secure by Default**: Built-in HTML sanitization prevents malicious content
|
|
31
|
+
- **Consistent Output**: Identical markdown rendering across all language bindings
|
|
32
|
+
|
|
33
|
+
## Documentation
|
|
34
|
+
|
|
35
|
+
- **JavaScript/TypeScript guides**:
|
|
36
|
+
- Node.js/Bun (native) – [Node.js README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-node/README.md)
|
|
37
|
+
- WebAssembly (universal) – [WASM README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-wasm/README.md)
|
|
38
|
+
- TypeScript wrapper – [TypeScript README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/typescript/README.md)
|
|
39
|
+
- **Python guide** – [Python README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/python/README.md)
|
|
40
|
+
- **PHP guides**:
|
|
41
|
+
- PHP wrapper package – [PHP README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php/README.md)
|
|
42
|
+
- PHP extension (PIE) – [Extension README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php-ext/README.md)
|
|
43
|
+
- **Ruby guide** – [Ruby README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-rb/README.md)
|
|
44
|
+
- **Rust guide** – [Rust README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown/README.md)
|
|
45
|
+
- **Contributing** – [CONTRIBUTING.md](https://github.com/Goldziher/html-to-markdown/blob/main/CONTRIBUTING.md) ⭐ Start here!
|
|
46
|
+
- **Changelog** – [CHANGELOG.md](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
| Target | Command |
|
|
51
|
+
| --------------------------- | ------------------------------------------------------------------------- |
|
|
52
|
+
| **Node.js/Bun** (native) | `npm install html-to-markdown-node` |
|
|
53
|
+
| **WebAssembly** (universal) | `npm install html-to-markdown-wasm` |
|
|
54
|
+
| **Deno** | `import { convert } from "npm:html-to-markdown-wasm"` |
|
|
55
|
+
| **Python** (bindings + CLI) | `pip install html-to-markdown` |
|
|
56
|
+
| **PHP** (extension + helpers) | `pie install goldziher/html-to-markdown`<br>`composer require html-to-markdown/extension` |
|
|
57
|
+
| **Ruby** gem | `bundle add html-to-markdown` or `gem install html-to-markdown` |
|
|
58
|
+
| **Rust** crate | `cargo add html-to-markdown-rs` |
|
|
59
|
+
| Rust CLI | `cargo install html-to-markdown-cli` |
|
|
60
|
+
| Homebrew CLI | `brew tap goldziher/tap`<br>`brew install html-to-markdown` |
|
|
61
|
+
| Releases | [GitHub Releases](https://github.com/Goldziher/html-to-markdown/releases) |
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
### JavaScript/TypeScript
|
|
66
|
+
|
|
67
|
+
**Node.js / Bun (Native - Fastest):**
|
|
68
|
+
|
|
69
|
+
```typescript
|
|
70
|
+
import { convert } from 'html-to-markdown-node';
|
|
71
|
+
|
|
72
|
+
const html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>';
|
|
73
|
+
const markdown = convert(html, {
|
|
74
|
+
headingStyle: 'Atx',
|
|
75
|
+
codeBlockStyle: 'Backticks',
|
|
76
|
+
wrap: true,
|
|
77
|
+
preserveTags: ['table'], // NEW in v2.5: Keep complex HTML as-is
|
|
78
|
+
});
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Deno / Browsers / Edge (Universal):**
|
|
82
|
+
|
|
83
|
+
```typescript
|
|
84
|
+
import { convert } from "npm:html-to-markdown-wasm"; // Deno
|
|
85
|
+
// or: import { convert } from 'html-to-markdown-wasm'; // Bundlers
|
|
86
|
+
|
|
87
|
+
const markdown = convert(html, {
|
|
88
|
+
headingStyle: 'atx',
|
|
89
|
+
listIndentWidth: 2,
|
|
90
|
+
});
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
**Performance:** Native bindings average ~18k ops/sec, WASM averages ~16k ops/sec (benchmarked on complex real-world documents).
|
|
94
|
+
|
|
95
|
+
See the JavaScript guides for full API documentation:
|
|
96
|
+
|
|
97
|
+
- [Node.js/Bun guide](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-node)
|
|
98
|
+
- [WebAssembly guide](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-wasm)
|
|
99
|
+
|
|
100
|
+
### CLI
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
# Convert a file
|
|
104
|
+
html-to-markdown input.html > output.md
|
|
105
|
+
|
|
106
|
+
# Stream from stdin
|
|
107
|
+
curl https://example.com | html-to-markdown > output.md
|
|
108
|
+
|
|
109
|
+
# Apply options
|
|
110
|
+
html-to-markdown --heading-style atx --list-indent-width 2 input.html
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Python (v2 API)
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from html_to_markdown import convert, convert_with_inline_images, InlineImageConfig
|
|
117
|
+
|
|
118
|
+
html = "<h1>Hello</h1><p>Rust ❤️ Markdown</p>"
|
|
119
|
+
markdown = convert(html)
|
|
120
|
+
|
|
121
|
+
markdown, inline_images, warnings = convert_with_inline_images(
|
|
122
|
+
'<img src="data:image/png;base64,...==" alt="Pixel">',
|
|
123
|
+
image_config=InlineImageConfig(max_decoded_size_bytes=1024, infer_dimensions=True),
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Rust
|
|
128
|
+
|
|
129
|
+
```rust
|
|
130
|
+
use html_to_markdown_rs::{convert, ConversionOptions, HeadingStyle};
|
|
131
|
+
|
|
132
|
+
let html = "<h1>Welcome</h1><p>Fast conversion</p>";
|
|
133
|
+
let markdown = convert(html, None)?;
|
|
134
|
+
|
|
135
|
+
let options = ConversionOptions {
|
|
136
|
+
heading_style: HeadingStyle::Atx,
|
|
137
|
+
..Default::default()
|
|
138
|
+
};
|
|
139
|
+
let markdown = convert(html, Some(options))?;
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
See the language-specific READMEs for complete configuration, hOCR workflows, and inline image extraction.
|
|
143
|
+
|
|
144
|
+
## Performance
|
|
145
|
+
|
|
146
|
+
Benchmarked on Apple M4 with complex real-world documents (Wikipedia articles, tables, lists):
|
|
147
|
+
|
|
148
|
+
### Operations per Second (higher is better)
|
|
149
|
+
|
|
150
|
+
| Document Type | Node.js (NAPI) | WASM | Python (PyO3) | Speedup (Node vs Python) |
|
|
151
|
+
| -------------------------- | -------------- | ------ | ------------- | ------------------------ |
|
|
152
|
+
| **Small (5 paragraphs)** | 86,233 | 70,300 | 8,443 | **10.2×** |
|
|
153
|
+
| **Medium (25 paragraphs)** | 18,979 | 15,282 | 1,846 | **10.3×** |
|
|
154
|
+
| **Large (100 paragraphs)** | 4,907 | 3,836 | 438 | **11.2×** |
|
|
155
|
+
| **Tables (complex)** | 5,003 | 3,748 | 4,829 | 1.0× |
|
|
156
|
+
| **Lists (nested)** | 1,819 | 1,391 | 1,165 | **1.6×** |
|
|
157
|
+
| **Wikipedia (129KB)** | 1,125 | 1,022 | - | - |
|
|
158
|
+
| **Wikipedia (653KB)** | 156 | 147 | - | - |
|
|
159
|
+
|
|
160
|
+
### Average Performance Summary
|
|
161
|
+
|
|
162
|
+
| Implementation | Avg ops/sec | vs WASM | vs Python | Best For |
|
|
163
|
+
| --------------------- | ---------------- | ------------ | --------------- | --------------------------------- |
|
|
164
|
+
| **Node.js (NAPI-RS)** | **18,162** | 1.17× faster | **7.4× faster** | Maximum throughput in Node.js/Bun |
|
|
165
|
+
| **WebAssembly** | **15,536** | baseline | **6.3× faster** | Universal (Deno, browsers, edge) |
|
|
166
|
+
| **Python (PyO3)** | **2,465** | 6.3× slower | baseline | Python ecosystem integration |
|
|
167
|
+
| **Rust CLI/Binary** | **150-210 MB/s** | - | - | Standalone processing |
|
|
168
|
+
|
|
169
|
+
### Key Insights
|
|
170
|
+
|
|
171
|
+
- **JavaScript bindings are fastest**: Native Node.js bindings achieve ~18k ops/sec average, with WASM close behind at ~16k ops/sec
|
|
172
|
+
- **Python is 6-10× slower**: Despite using the same Rust core, PyO3 FFI overhead significantly impacts Python performance
|
|
173
|
+
- **Small documents**: Both JS implementations reach 70-90k ops/sec on simple HTML
|
|
174
|
+
- **Large documents**: Performance gap widens with complexity
|
|
175
|
+
|
|
176
|
+
**Note on Python performance**: The current Python bindings have optimization opportunities. The v2 API with direct `convert()` calls performs best; avoid the v1 compatibility layer for performance-critical applications.
|
|
177
|
+
|
|
178
|
+
## Compatibility (v1 → v2)
|
|
179
|
+
|
|
180
|
+
- V2’s Rust core sustains **150–210 MB/s** throughput, whereas V1 averaged **≈ 2.5 MB/s** in its Python/BeautifulSoup implementation – making V2 roughly 60–80× faster.
|
|
181
|
+
- The Python package offers a compatibility shim in `html_to_markdown.v1_compat` (`convert_to_markdown`, `convert_to_markdown_stream`, `markdownify`). Details and keyword mappings live in [Python README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/python/README.md#v1-compatibility).
|
|
182
|
+
- CLI flag changes, option renames, and other breaking updates are summarised in [CHANGELOG](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md#breaking-changes).
|
|
183
|
+
|
|
184
|
+
## Community
|
|
185
|
+
|
|
186
|
+
- Chat with us on [Discord](https://discord.gg/pXxagNK2zN)
|
|
187
|
+
- Explore the broader [Kreuzberg](https://kreuzberg.dev) document-processing ecosystem
|
|
188
|
+
- Sponsor development via [GitHub Sponsors](https://github.com/sponsors/Goldziher)
|
|
189
|
+
### Ruby
|
|
190
|
+
|
|
191
|
+
```ruby
|
|
192
|
+
require 'html_to_markdown'
|
|
193
|
+
|
|
194
|
+
html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>'
|
|
195
|
+
markdown = HtmlToMarkdown.convert(html, heading_style: :atx, wrap: true)
|
|
196
|
+
|
|
197
|
+
puts markdown
|
|
198
|
+
# # Hello
|
|
199
|
+
#
|
|
200
|
+
# Rust ❤️ Markdown
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
See the language-specific READMEs for complete configuration, hOCR workflows, and inline image extraction.
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/* tslint:disable */
|
|
2
|
+
/* eslint-disable */
|
|
3
|
+
/**
|
|
4
|
+
* Convert HTML to Markdown while collecting inline images
|
|
5
|
+
*
|
|
6
|
+
* # Arguments
|
|
7
|
+
*
|
|
8
|
+
* * `html` - The HTML string to convert
|
|
9
|
+
* * `options` - Optional conversion options (as a JavaScript object)
|
|
10
|
+
* * `image_config` - Configuration for inline image extraction
|
|
11
|
+
*
|
|
12
|
+
* # Example
|
|
13
|
+
*
|
|
14
|
+
* ```javascript
|
|
15
|
+
* import { convertWithInlineImages, WasmInlineImageConfig } from 'html-to-markdown-wasm';
|
|
16
|
+
*
|
|
17
|
+
* const html = '<img src="data:image/png;base64,..." alt="test">';
|
|
18
|
+
* const config = new WasmInlineImageConfig(1024 * 1024);
|
|
19
|
+
* config.inferDimensions = true;
|
|
20
|
+
*
|
|
21
|
+
* const result = convertWithInlineImages(html, null, config);
|
|
22
|
+
* console.log(result.markdown);
|
|
23
|
+
* console.log(result.inlineImages.length);
|
|
24
|
+
* ```
|
|
25
|
+
*/
|
|
26
|
+
export function convertWithInlineImages(html: string, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
27
|
+
/**
|
|
28
|
+
* Convert HTML to Markdown
|
|
29
|
+
*
|
|
30
|
+
* # Arguments
|
|
31
|
+
*
|
|
32
|
+
* * `html` - The HTML string to convert
|
|
33
|
+
* * `options` - Optional conversion options (as a JavaScript object)
|
|
34
|
+
*
|
|
35
|
+
* # Example
|
|
36
|
+
*
|
|
37
|
+
* ```javascript
|
|
38
|
+
* import { convert } from 'html-to-markdown-wasm';
|
|
39
|
+
*
|
|
40
|
+
* const html = '<h1>Hello World</h1>';
|
|
41
|
+
* const markdown = convert(html);
|
|
42
|
+
* console.log(markdown); // # Hello World
|
|
43
|
+
* ```
|
|
44
|
+
*/
|
|
45
|
+
export function convert(html: string, options: any): string;
|
|
46
|
+
/**
|
|
47
|
+
* Initialize panic hook for better error messages in the browser
|
|
48
|
+
*/
|
|
49
|
+
export function init(): void;
|
|
50
|
+
/**
|
|
51
|
+
* Result of HTML extraction with inline images
|
|
52
|
+
*/
|
|
53
|
+
export class WasmHtmlExtraction {
|
|
54
|
+
private constructor();
|
|
55
|
+
free(): void;
|
|
56
|
+
[Symbol.dispose](): void;
|
|
57
|
+
readonly inlineImages: WasmInlineImage[];
|
|
58
|
+
readonly markdown: string;
|
|
59
|
+
readonly warnings: WasmInlineImageWarning[];
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* Inline image data
|
|
63
|
+
*/
|
|
64
|
+
export class WasmInlineImage {
|
|
65
|
+
private constructor();
|
|
66
|
+
free(): void;
|
|
67
|
+
[Symbol.dispose](): void;
|
|
68
|
+
readonly attributes: any;
|
|
69
|
+
readonly dimensions: Uint32Array | undefined;
|
|
70
|
+
readonly description: string | undefined;
|
|
71
|
+
readonly data: Uint8Array;
|
|
72
|
+
readonly format: string;
|
|
73
|
+
readonly source: string;
|
|
74
|
+
readonly filename: string | undefined;
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Inline image configuration
|
|
78
|
+
*/
|
|
79
|
+
export class WasmInlineImageConfig {
|
|
80
|
+
free(): void;
|
|
81
|
+
[Symbol.dispose](): void;
|
|
82
|
+
constructor(max_decoded_size_bytes?: number | null);
|
|
83
|
+
set captureSvg(value: boolean);
|
|
84
|
+
set filenamePrefix(value: string | null | undefined);
|
|
85
|
+
set inferDimensions(value: boolean);
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Warning about inline image processing
|
|
89
|
+
*/
|
|
90
|
+
export class WasmInlineImageWarning {
|
|
91
|
+
private constructor();
|
|
92
|
+
free(): void;
|
|
93
|
+
[Symbol.dispose](): void;
|
|
94
|
+
readonly index: number;
|
|
95
|
+
readonly message: string;
|
|
96
|
+
}
|