html-to-markdown-wasm 2.8.0 → 2.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/README.md +25 -12
- package/dist/html_to_markdown_wasm.d.ts +5 -5
- package/dist/html_to_markdown_wasm_bg.js +108 -108
- package/dist/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist/package.json +1 -1
- package/dist-node/README.md +25 -12
- package/dist-node/html_to_markdown_wasm.d.ts +5 -5
- package/dist-node/html_to_markdown_wasm.js +108 -108
- package/dist-node/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-node/package.json +1 -1
- package/dist-web/README.md +25 -12
- package/dist-web/html_to_markdown_wasm.d.ts +5 -5
- package/dist-web/html_to_markdown_wasm.js +108 -108
- package/dist-web/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-web/package.json +1 -1
- package/package.json +1 -1
package/dist/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# html-to-markdown
|
|
2
2
|
|
|
3
|
-
High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, PHP extension, Ruby gem, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
|
|
3
|
+
High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, PHP extension, Ruby gem, Elixir Rustler NIF, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
|
|
4
4
|
|
|
5
5
|
[](https://crates.io/crates/html-to-markdown)
|
|
6
6
|
[](https://www.npmjs.com/package/html-to-markdown-node)
|
|
@@ -8,6 +8,7 @@ High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rus
|
|
|
8
8
|
[](https://pypi.org/project/html-to-markdown/)
|
|
9
9
|
[](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
10
10
|
[](https://rubygems.org/gems/html-to-markdown)
|
|
11
|
+
[](https://hex.pm/packages/html_to_markdown)
|
|
11
12
|
[](https://www.nuget.org/packages/HtmlToMarkdown/)
|
|
12
13
|
[](https://central.sonatype.com/artifact/io.github.goldziher/html-to-markdown)
|
|
13
14
|
[](https://pkg.go.dev/github.com/Goldziher/html-to-markdown/packages/go/htmltomarkdown)
|
|
@@ -43,6 +44,7 @@ Experience WebAssembly-powered HTML to Markdown conversion instantly in your bro
|
|
|
43
44
|
- PHP wrapper package – [PHP README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php/README.md)
|
|
44
45
|
- PHP extension (PIE) – [Extension README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php-ext/README.md)
|
|
45
46
|
- **Ruby guide** – [Ruby README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md)
|
|
47
|
+
- **Elixir guide** – [Elixir README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/elixir/README.md)
|
|
46
48
|
- **Rust guide** – [Rust README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown/README.md)
|
|
47
49
|
- **Contributing** – [CONTRIBUTING.md](https://github.com/Goldziher/html-to-markdown/blob/main/CONTRIBUTING.md) ⭐ Start here!
|
|
48
50
|
- **Changelog** – [CHANGELOG.md](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
|
|
@@ -57,6 +59,7 @@ Experience WebAssembly-powered HTML to Markdown conversion instantly in your bro
|
|
|
57
59
|
| **Python** (bindings + CLI) | `pip install html-to-markdown` |
|
|
58
60
|
| **PHP** (extension + helpers) | `pie install goldziher/html-to-markdown`<br>`composer require html-to-markdown/extension` |
|
|
59
61
|
| **Ruby** gem | `bundle add html-to-markdown` or `gem install html-to-markdown` |
|
|
62
|
+
| **Elixir** (Rustler NIF) | `{:html_to_markdown, "~> 2.8"}` |
|
|
60
63
|
| **Rust** crate | `cargo add html-to-markdown-rs` |
|
|
61
64
|
| Rust CLI | `cargo install html-to-markdown-cli` |
|
|
62
65
|
| Homebrew CLI | `brew tap goldziher/tap`<br>`brew install html-to-markdown` |
|
|
@@ -126,6 +129,15 @@ markdown, inline_images, warnings = convert_with_inline_images(
|
|
|
126
129
|
)
|
|
127
130
|
```
|
|
128
131
|
|
|
132
|
+
### Elixir
|
|
133
|
+
|
|
134
|
+
```elixir
|
|
135
|
+
{:ok, markdown} = HtmlToMarkdown.convert("<h1>Hello</h1>")
|
|
136
|
+
|
|
137
|
+
# Keyword options are supported (internally mapped to the Rust ConversionOptions struct)
|
|
138
|
+
HtmlToMarkdown.convert!("<p>Wrap me</p>", wrap: true, wrap_width: 32, preprocessing: %{enabled: true})
|
|
139
|
+
```
|
|
140
|
+
|
|
129
141
|
### Rust
|
|
130
142
|
|
|
131
143
|
```rust
|
|
@@ -178,6 +190,7 @@ Derived directly from `tools/runtime-bench/results/latest.json` (Apple M4, share
|
|
|
178
190
|
- **Rust now leads throughput**: the fused preprocessing + `parse_owned` pathway pushes the CLI to ~1.7 k ops/sec on the 129 KB lists page and ~31 k ops/sec on the HOCR invoice fixture.
|
|
179
191
|
- **Node.js trails by only a few percent** after the buffer/handle work—~1.3 k ops/sec on the lists fixture and 27 k ops/sec on HOCR invoices without any UTF-16 copies.
|
|
180
192
|
- **Python remains competitive** but now sits below Node/Rust (~4.0 k average ops/sec); stick to the v2 API to avoid the deprecated compatibility shim.
|
|
193
|
+
- **Elixir matches the Rust core** because the Rustler NIF executes the same `ConversionOptions` pipeline—benchmarks land between 170–1,460 ops/sec on the Wikipedia fixtures and >20 k ops/sec on micro HOCR payloads.
|
|
181
194
|
- **PHP and WASM stay in the 35–70 MB/s band**, which is plenty for Composer queues or edge runtimes as long as the extension/module is built ahead of time.
|
|
182
195
|
- **Rust CLI results now mirror the bindings**, since `task bench:bindings` runs the harness with `cargo run --release` by default—profile there, then push optimizations down into each FFI layer.
|
|
183
196
|
|
|
@@ -185,20 +198,20 @@ Derived directly from `tools/runtime-bench/results/latest.json` (Apple M4, share
|
|
|
185
198
|
|
|
186
199
|
Measured on Apple M4 using the fixture-driven runtime harness in `tools/runtime-bench` (`task bench:bindings`). Every binding consumes the exact same HTML fixtures and hOCR samples from `test_documents/`:
|
|
187
200
|
|
|
188
|
-
| Document | Size | Ruby ops/sec | PHP ops/sec | Python ops/sec | Node ops/sec | WASM ops/sec | Rust ops/sec |
|
|
189
|
-
| ------------------- | -------- | ------------ | ----------- | -------------- | ------------ | ------------ | ------------ |
|
|
190
|
-
| Lists (Timeline) | 129 KB | 1,349 | 533 | 1,405 | 1,308 | 882 | **1,700** |
|
|
191
|
-
| Tables (Countries) | 360 KB | 326 | 118 | 352 | 331 | 242 | **416** |
|
|
192
|
-
| Medium (Python) | 657 KB | 157 | 59 | 158 | 150 | 121 | **190** |
|
|
193
|
-
| Large (Rust) | 567 KB | 174 | 65 | 183 | 163 | 124 | **220** |
|
|
194
|
-
| Small (Intro) | 463 KB | 214 | 83 | 223 | 208 | 163 | **258** |
|
|
195
|
-
| HOCR German PDF | 44 KB | 2,936 | 1,007 | **2,991** | 2,944 | 1,637 | 2,760 |
|
|
196
|
-
| HOCR Invoice | 4 KB | 25,740 | 8,781 | 23,500 | 27,326 | 7,775 | **31,345** |
|
|
197
|
-
| HOCR Embedded Tables| 37 KB | 3,328 | 1,194 | 3,464 | **3,475** | 1,667 | 3,080 |
|
|
201
|
+
| Document | Size | Ruby ops/sec | PHP ops/sec | Python ops/sec | Node ops/sec | WASM ops/sec | Elixir ops/sec | Rust ops/sec |
|
|
202
|
+
| ------------------- | -------- | ------------ | ----------- | -------------- | ------------ | ------------ | -------------- | ------------ |
|
|
203
|
+
| Lists (Timeline) | 129 KB | 1,349 | 533 | 1,405 | 1,308 | 882 | 1,463 | **1,700** |
|
|
204
|
+
| Tables (Countries) | 360 KB | 326 | 118 | 352 | 331 | 242 | 357 | **416** |
|
|
205
|
+
| Medium (Python) | 657 KB | 157 | 59 | 158 | 150 | 121 | 171 | **190** |
|
|
206
|
+
| Large (Rust) | 567 KB | 174 | 65 | 183 | 163 | 124 | 174 | **220** |
|
|
207
|
+
| Small (Intro) | 463 KB | 214 | 83 | 223 | 208 | 163 | 247 | **258** |
|
|
208
|
+
| HOCR German PDF | 44 KB | 2,936 | 1,007 | **2,991** | 2,944 | 1,637 | 3,113 | 2,760 |
|
|
209
|
+
| HOCR Invoice | 4 KB | 25,740 | 8,781 | 23,500 | 27,326 | 7,775 | 20,424 | **31,345** |
|
|
210
|
+
| HOCR Embedded Tables| 37 KB | 3,328 | 1,194 | 3,464 | **3,475** | 1,667 | 3,366 | 3,080 |
|
|
198
211
|
|
|
199
212
|
The harness shells out to each runtime’s lightweight benchmark driver (`packages/*/bin/benchmark.*`, `crates/*/bin/benchmark.ts`), feeds fixtures defined in `tools/runtime-bench/fixtures/*.toml`, and writes machine-readable JSON reports (`tools/runtime-bench/results/latest.json`) for regression tracking. Add new languages or scenarios by extending those fixture files and drivers.
|
|
200
213
|
|
|
201
|
-
Use `task bench:bindings` to regenerate throughput numbers across all bindings or `task bench:bindings:profile` to capture CPU/memory samples while the benchmarks run. To focus on specific languages or fixtures, pass `--language` / `--fixture` directly to `cargo run --manifest-path tools/runtime-bench/Cargo.toml -- …`.
|
|
214
|
+
Use `task bench:bindings` to regenerate throughput numbers across all bindings or `task bench:bindings:profile` to capture CPU/memory samples while the benchmarks run. To focus on specific languages or fixtures, pass `--language` / `--fixture` either through the task (for example, `task bench:bindings -- --language elixir`) or directly to `cargo run --manifest-path tools/runtime-bench/Cargo.toml -- …`.
|
|
202
215
|
|
|
203
216
|
Need a call-stack view of the Rust core? Run `task flamegraph:rust` (or call the harness with `--language rust --flamegraph path.svg`) to profile a fixture and dump a ready-to-inspect flamegraph in `tools/runtime-bench/results/`.
|
|
204
217
|
|
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
/* tslint:disable */
|
|
2
2
|
/* eslint-disable */
|
|
3
|
+
export function createConversionOptionsHandle(options: any): WasmConversionOptionsHandle;
|
|
4
|
+
export function convertWithInlineImages(html: string, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
5
|
+
export function convertBytesWithOptionsHandle(html: Uint8Array, handle: WasmConversionOptionsHandle): string;
|
|
6
|
+
export function convertWithOptionsHandle(html: string, handle: WasmConversionOptionsHandle): string;
|
|
3
7
|
export function convertBytes(html: Uint8Array, options: any): string;
|
|
4
8
|
/**
|
|
5
9
|
* Convert HTML to Markdown
|
|
@@ -20,15 +24,11 @@ export function convertBytes(html: Uint8Array, options: any): string;
|
|
|
20
24
|
* ```
|
|
21
25
|
*/
|
|
22
26
|
export function convert(html: string, options: any): string;
|
|
23
|
-
export function
|
|
27
|
+
export function convertBytesWithInlineImages(html: Uint8Array, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
24
28
|
/**
|
|
25
29
|
* Initialize panic hook for better error messages in the browser
|
|
26
30
|
*/
|
|
27
31
|
export function init(): void;
|
|
28
|
-
export function convertBytesWithInlineImages(html: Uint8Array, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
29
|
-
export function createConversionOptionsHandle(options: any): WasmConversionOptionsHandle;
|
|
30
|
-
export function convertBytesWithOptionsHandle(html: Uint8Array, handle: WasmConversionOptionsHandle): string;
|
|
31
|
-
export function convertWithInlineImages(html: string, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
32
32
|
export class WasmConversionOptionsHandle {
|
|
33
33
|
free(): void;
|
|
34
34
|
[Symbol.dispose](): void;
|
|
@@ -232,88 +232,90 @@ function getArrayJsValueFromWasm0(ptr, len) {
|
|
|
232
232
|
return result;
|
|
233
233
|
}
|
|
234
234
|
/**
|
|
235
|
-
* @param {Uint8Array} html
|
|
236
235
|
* @param {any} options
|
|
237
|
-
* @returns {
|
|
236
|
+
* @returns {WasmConversionOptionsHandle}
|
|
238
237
|
*/
|
|
239
|
-
export function
|
|
240
|
-
let deferred2_0;
|
|
241
|
-
let deferred2_1;
|
|
238
|
+
export function createConversionOptionsHandle(options) {
|
|
242
239
|
try {
|
|
243
240
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
244
|
-
wasm.
|
|
241
|
+
wasm.createConversionOptionsHandle(retptr, addHeapObject(options));
|
|
245
242
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
246
243
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
247
244
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
var len1 = r1;
|
|
251
|
-
if (r3) {
|
|
252
|
-
ptr1 = 0; len1 = 0;
|
|
253
|
-
throw takeObject(r2);
|
|
245
|
+
if (r2) {
|
|
246
|
+
throw takeObject(r1);
|
|
254
247
|
}
|
|
255
|
-
|
|
256
|
-
deferred2_1 = len1;
|
|
257
|
-
return getStringFromWasm0(ptr1, len1);
|
|
248
|
+
return WasmConversionOptionsHandle.__wrap(r0);
|
|
258
249
|
} finally {
|
|
259
250
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
260
|
-
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
261
251
|
}
|
|
262
252
|
}
|
|
263
253
|
|
|
254
|
+
function _assertClass(instance, klass) {
|
|
255
|
+
if (!(instance instanceof klass)) {
|
|
256
|
+
throw new Error(`expected instance of ${klass.name}`);
|
|
257
|
+
}
|
|
258
|
+
}
|
|
264
259
|
/**
|
|
265
|
-
* Convert HTML to Markdown
|
|
266
|
-
*
|
|
267
|
-
* # Arguments
|
|
268
|
-
*
|
|
269
|
-
* * `html` - The HTML string to convert
|
|
270
|
-
* * `options` - Optional conversion options (as a JavaScript object)
|
|
271
|
-
*
|
|
272
|
-
* # Example
|
|
273
|
-
*
|
|
274
|
-
* ```javascript
|
|
275
|
-
* import { convert } from 'html-to-markdown-wasm';
|
|
276
|
-
*
|
|
277
|
-
* const html = '<h1>Hello World</h1>';
|
|
278
|
-
* const markdown = convert(html);
|
|
279
|
-
* console.log(markdown); // # Hello World
|
|
280
|
-
* ```
|
|
281
260
|
* @param {string} html
|
|
282
261
|
* @param {any} options
|
|
283
|
-
* @
|
|
262
|
+
* @param {WasmInlineImageConfig | null} [image_config]
|
|
263
|
+
* @returns {WasmHtmlExtraction}
|
|
284
264
|
*/
|
|
285
|
-
export function
|
|
286
|
-
let deferred3_0;
|
|
287
|
-
let deferred3_1;
|
|
265
|
+
export function convertWithInlineImages(html, options, image_config) {
|
|
288
266
|
try {
|
|
289
267
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
290
268
|
const ptr0 = passStringToWasm0(html, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
291
269
|
const len0 = WASM_VECTOR_LEN;
|
|
292
|
-
|
|
270
|
+
let ptr1 = 0;
|
|
271
|
+
if (!isLikeNone(image_config)) {
|
|
272
|
+
_assertClass(image_config, WasmInlineImageConfig);
|
|
273
|
+
ptr1 = image_config.__destroy_into_raw();
|
|
274
|
+
}
|
|
275
|
+
wasm.convertWithInlineImages(retptr, ptr0, len0, addHeapObject(options), ptr1);
|
|
276
|
+
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
277
|
+
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
278
|
+
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
279
|
+
if (r2) {
|
|
280
|
+
throw takeObject(r1);
|
|
281
|
+
}
|
|
282
|
+
return WasmHtmlExtraction.__wrap(r0);
|
|
283
|
+
} finally {
|
|
284
|
+
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
/**
|
|
289
|
+
* @param {Uint8Array} html
|
|
290
|
+
* @param {WasmConversionOptionsHandle} handle
|
|
291
|
+
* @returns {string}
|
|
292
|
+
*/
|
|
293
|
+
export function convertBytesWithOptionsHandle(html, handle) {
|
|
294
|
+
let deferred2_0;
|
|
295
|
+
let deferred2_1;
|
|
296
|
+
try {
|
|
297
|
+
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
298
|
+
_assertClass(handle, WasmConversionOptionsHandle);
|
|
299
|
+
wasm.convertBytesWithOptionsHandle(retptr, addHeapObject(html), handle.__wbg_ptr);
|
|
293
300
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
294
301
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
295
302
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
296
303
|
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
297
|
-
var
|
|
298
|
-
var
|
|
304
|
+
var ptr1 = r0;
|
|
305
|
+
var len1 = r1;
|
|
299
306
|
if (r3) {
|
|
300
|
-
|
|
307
|
+
ptr1 = 0; len1 = 0;
|
|
301
308
|
throw takeObject(r2);
|
|
302
309
|
}
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
return getStringFromWasm0(
|
|
310
|
+
deferred2_0 = ptr1;
|
|
311
|
+
deferred2_1 = len1;
|
|
312
|
+
return getStringFromWasm0(ptr1, len1);
|
|
306
313
|
} finally {
|
|
307
314
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
308
|
-
wasm.__wbindgen_export4(
|
|
315
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
309
316
|
}
|
|
310
317
|
}
|
|
311
318
|
|
|
312
|
-
function _assertClass(instance, klass) {
|
|
313
|
-
if (!(instance instanceof klass)) {
|
|
314
|
-
throw new Error(`expected instance of ${klass.name}`);
|
|
315
|
-
}
|
|
316
|
-
}
|
|
317
319
|
/**
|
|
318
320
|
* @param {string} html
|
|
319
321
|
* @param {WasmConversionOptionsHandle} handle
|
|
@@ -347,108 +349,99 @@ export function convertWithOptionsHandle(html, handle) {
|
|
|
347
349
|
}
|
|
348
350
|
}
|
|
349
351
|
|
|
350
|
-
/**
|
|
351
|
-
* Initialize panic hook for better error messages in the browser
|
|
352
|
-
*/
|
|
353
|
-
export function init() {
|
|
354
|
-
wasm.init();
|
|
355
|
-
}
|
|
356
|
-
|
|
357
352
|
/**
|
|
358
353
|
* @param {Uint8Array} html
|
|
359
354
|
* @param {any} options
|
|
360
|
-
* @
|
|
361
|
-
* @returns {WasmHtmlExtraction}
|
|
355
|
+
* @returns {string}
|
|
362
356
|
*/
|
|
363
|
-
export function
|
|
357
|
+
export function convertBytes(html, options) {
|
|
358
|
+
let deferred2_0;
|
|
359
|
+
let deferred2_1;
|
|
364
360
|
try {
|
|
365
361
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
366
|
-
|
|
367
|
-
if (!isLikeNone(image_config)) {
|
|
368
|
-
_assertClass(image_config, WasmInlineImageConfig);
|
|
369
|
-
ptr0 = image_config.__destroy_into_raw();
|
|
370
|
-
}
|
|
371
|
-
wasm.convertBytesWithInlineImages(retptr, addHeapObject(html), addHeapObject(options), ptr0);
|
|
362
|
+
wasm.convertBytes(retptr, addHeapObject(html), addHeapObject(options));
|
|
372
363
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
373
364
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
374
365
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
375
|
-
|
|
376
|
-
|
|
366
|
+
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
367
|
+
var ptr1 = r0;
|
|
368
|
+
var len1 = r1;
|
|
369
|
+
if (r3) {
|
|
370
|
+
ptr1 = 0; len1 = 0;
|
|
371
|
+
throw takeObject(r2);
|
|
377
372
|
}
|
|
378
|
-
|
|
373
|
+
deferred2_0 = ptr1;
|
|
374
|
+
deferred2_1 = len1;
|
|
375
|
+
return getStringFromWasm0(ptr1, len1);
|
|
379
376
|
} finally {
|
|
380
377
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
378
|
+
wasm.__wbindgen_export4(deferred2_0, deferred2_1, 1);
|
|
381
379
|
}
|
|
382
380
|
}
|
|
383
381
|
|
|
384
382
|
/**
|
|
383
|
+
* Convert HTML to Markdown
|
|
384
|
+
*
|
|
385
|
+
* # Arguments
|
|
386
|
+
*
|
|
387
|
+
* * `html` - The HTML string to convert
|
|
388
|
+
* * `options` - Optional conversion options (as a JavaScript object)
|
|
389
|
+
*
|
|
390
|
+
* # Example
|
|
391
|
+
*
|
|
392
|
+
* ```javascript
|
|
393
|
+
* import { convert } from 'html-to-markdown-wasm';
|
|
394
|
+
*
|
|
395
|
+
* const html = '<h1>Hello World</h1>';
|
|
396
|
+
* const markdown = convert(html);
|
|
397
|
+
* console.log(markdown); // # Hello World
|
|
398
|
+
* ```
|
|
399
|
+
* @param {string} html
|
|
385
400
|
* @param {any} options
|
|
386
|
-
* @returns {WasmConversionOptionsHandle}
|
|
387
|
-
*/
|
|
388
|
-
export function createConversionOptionsHandle(options) {
|
|
389
|
-
try {
|
|
390
|
-
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
391
|
-
wasm.createConversionOptionsHandle(retptr, addHeapObject(options));
|
|
392
|
-
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
393
|
-
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
394
|
-
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
395
|
-
if (r2) {
|
|
396
|
-
throw takeObject(r1);
|
|
397
|
-
}
|
|
398
|
-
return WasmConversionOptionsHandle.__wrap(r0);
|
|
399
|
-
} finally {
|
|
400
|
-
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
401
|
-
}
|
|
402
|
-
}
|
|
403
|
-
|
|
404
|
-
/**
|
|
405
|
-
* @param {Uint8Array} html
|
|
406
|
-
* @param {WasmConversionOptionsHandle} handle
|
|
407
401
|
* @returns {string}
|
|
408
402
|
*/
|
|
409
|
-
export function
|
|
410
|
-
let
|
|
411
|
-
let
|
|
403
|
+
export function convert(html, options) {
|
|
404
|
+
let deferred3_0;
|
|
405
|
+
let deferred3_1;
|
|
412
406
|
try {
|
|
413
407
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
414
|
-
|
|
415
|
-
|
|
408
|
+
const ptr0 = passStringToWasm0(html, wasm.__wbindgen_export, wasm.__wbindgen_export2);
|
|
409
|
+
const len0 = WASM_VECTOR_LEN;
|
|
410
|
+
wasm.convert(retptr, ptr0, len0, addHeapObject(options));
|
|
416
411
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
417
412
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
418
413
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
419
414
|
var r3 = getDataViewMemory0().getInt32(retptr + 4 * 3, true);
|
|
420
|
-
var
|
|
421
|
-
var
|
|
415
|
+
var ptr2 = r0;
|
|
416
|
+
var len2 = r1;
|
|
422
417
|
if (r3) {
|
|
423
|
-
|
|
418
|
+
ptr2 = 0; len2 = 0;
|
|
424
419
|
throw takeObject(r2);
|
|
425
420
|
}
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
return getStringFromWasm0(
|
|
421
|
+
deferred3_0 = ptr2;
|
|
422
|
+
deferred3_1 = len2;
|
|
423
|
+
return getStringFromWasm0(ptr2, len2);
|
|
429
424
|
} finally {
|
|
430
425
|
wasm.__wbindgen_add_to_stack_pointer(16);
|
|
431
|
-
wasm.__wbindgen_export4(
|
|
426
|
+
wasm.__wbindgen_export4(deferred3_0, deferred3_1, 1);
|
|
432
427
|
}
|
|
433
428
|
}
|
|
434
429
|
|
|
435
430
|
/**
|
|
436
|
-
* @param {
|
|
431
|
+
* @param {Uint8Array} html
|
|
437
432
|
* @param {any} options
|
|
438
433
|
* @param {WasmInlineImageConfig | null} [image_config]
|
|
439
434
|
* @returns {WasmHtmlExtraction}
|
|
440
435
|
*/
|
|
441
|
-
export function
|
|
436
|
+
export function convertBytesWithInlineImages(html, options, image_config) {
|
|
442
437
|
try {
|
|
443
438
|
const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
|
|
444
|
-
|
|
445
|
-
const len0 = WASM_VECTOR_LEN;
|
|
446
|
-
let ptr1 = 0;
|
|
439
|
+
let ptr0 = 0;
|
|
447
440
|
if (!isLikeNone(image_config)) {
|
|
448
441
|
_assertClass(image_config, WasmInlineImageConfig);
|
|
449
|
-
|
|
442
|
+
ptr0 = image_config.__destroy_into_raw();
|
|
450
443
|
}
|
|
451
|
-
wasm.
|
|
444
|
+
wasm.convertBytesWithInlineImages(retptr, addHeapObject(html), addHeapObject(options), ptr0);
|
|
452
445
|
var r0 = getDataViewMemory0().getInt32(retptr + 4 * 0, true);
|
|
453
446
|
var r1 = getDataViewMemory0().getInt32(retptr + 4 * 1, true);
|
|
454
447
|
var r2 = getDataViewMemory0().getInt32(retptr + 4 * 2, true);
|
|
@@ -461,6 +454,13 @@ export function convertWithInlineImages(html, options, image_config) {
|
|
|
461
454
|
}
|
|
462
455
|
}
|
|
463
456
|
|
|
457
|
+
/**
|
|
458
|
+
* Initialize panic hook for better error messages in the browser
|
|
459
|
+
*/
|
|
460
|
+
export function init() {
|
|
461
|
+
wasm.init();
|
|
462
|
+
}
|
|
463
|
+
|
|
464
464
|
const WasmConversionOptionsHandleFinalization = (typeof FinalizationRegistry === 'undefined')
|
|
465
465
|
? { register: () => {}, unregister: () => {} }
|
|
466
466
|
: new FinalizationRegistry(ptr => wasm.__wbg_wasmconversionoptionshandle_free(ptr >>> 0, 1));
|
|
Binary file
|
package/dist/package.json
CHANGED
package/dist-node/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# html-to-markdown
|
|
2
2
|
|
|
3
|
-
High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, PHP extension, Ruby gem, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
|
|
3
|
+
High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, PHP extension, Ruby gem, Elixir Rustler NIF, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
|
|
4
4
|
|
|
5
5
|
[](https://crates.io/crates/html-to-markdown)
|
|
6
6
|
[](https://www.npmjs.com/package/html-to-markdown-node)
|
|
@@ -8,6 +8,7 @@ High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rus
|
|
|
8
8
|
[](https://pypi.org/project/html-to-markdown/)
|
|
9
9
|
[](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
10
10
|
[](https://rubygems.org/gems/html-to-markdown)
|
|
11
|
+
[](https://hex.pm/packages/html_to_markdown)
|
|
11
12
|
[](https://www.nuget.org/packages/HtmlToMarkdown/)
|
|
12
13
|
[](https://central.sonatype.com/artifact/io.github.goldziher/html-to-markdown)
|
|
13
14
|
[](https://pkg.go.dev/github.com/Goldziher/html-to-markdown/packages/go/htmltomarkdown)
|
|
@@ -43,6 +44,7 @@ Experience WebAssembly-powered HTML to Markdown conversion instantly in your bro
|
|
|
43
44
|
- PHP wrapper package – [PHP README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php/README.md)
|
|
44
45
|
- PHP extension (PIE) – [Extension README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php-ext/README.md)
|
|
45
46
|
- **Ruby guide** – [Ruby README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/ruby/README.md)
|
|
47
|
+
- **Elixir guide** – [Elixir README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/elixir/README.md)
|
|
46
48
|
- **Rust guide** – [Rust README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown/README.md)
|
|
47
49
|
- **Contributing** – [CONTRIBUTING.md](https://github.com/Goldziher/html-to-markdown/blob/main/CONTRIBUTING.md) ⭐ Start here!
|
|
48
50
|
- **Changelog** – [CHANGELOG.md](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
|
|
@@ -57,6 +59,7 @@ Experience WebAssembly-powered HTML to Markdown conversion instantly in your bro
|
|
|
57
59
|
| **Python** (bindings + CLI) | `pip install html-to-markdown` |
|
|
58
60
|
| **PHP** (extension + helpers) | `pie install goldziher/html-to-markdown`<br>`composer require html-to-markdown/extension` |
|
|
59
61
|
| **Ruby** gem | `bundle add html-to-markdown` or `gem install html-to-markdown` |
|
|
62
|
+
| **Elixir** (Rustler NIF) | `{:html_to_markdown, "~> 2.8"}` |
|
|
60
63
|
| **Rust** crate | `cargo add html-to-markdown-rs` |
|
|
61
64
|
| Rust CLI | `cargo install html-to-markdown-cli` |
|
|
62
65
|
| Homebrew CLI | `brew tap goldziher/tap`<br>`brew install html-to-markdown` |
|
|
@@ -126,6 +129,15 @@ markdown, inline_images, warnings = convert_with_inline_images(
|
|
|
126
129
|
)
|
|
127
130
|
```
|
|
128
131
|
|
|
132
|
+
### Elixir
|
|
133
|
+
|
|
134
|
+
```elixir
|
|
135
|
+
{:ok, markdown} = HtmlToMarkdown.convert("<h1>Hello</h1>")
|
|
136
|
+
|
|
137
|
+
# Keyword options are supported (internally mapped to the Rust ConversionOptions struct)
|
|
138
|
+
HtmlToMarkdown.convert!("<p>Wrap me</p>", wrap: true, wrap_width: 32, preprocessing: %{enabled: true})
|
|
139
|
+
```
|
|
140
|
+
|
|
129
141
|
### Rust
|
|
130
142
|
|
|
131
143
|
```rust
|
|
@@ -178,6 +190,7 @@ Derived directly from `tools/runtime-bench/results/latest.json` (Apple M4, share
|
|
|
178
190
|
- **Rust now leads throughput**: the fused preprocessing + `parse_owned` pathway pushes the CLI to ~1.7 k ops/sec on the 129 KB lists page and ~31 k ops/sec on the HOCR invoice fixture.
|
|
179
191
|
- **Node.js trails by only a few percent** after the buffer/handle work—~1.3 k ops/sec on the lists fixture and 27 k ops/sec on HOCR invoices without any UTF-16 copies.
|
|
180
192
|
- **Python remains competitive** but now sits below Node/Rust (~4.0 k average ops/sec); stick to the v2 API to avoid the deprecated compatibility shim.
|
|
193
|
+
- **Elixir matches the Rust core** because the Rustler NIF executes the same `ConversionOptions` pipeline—benchmarks land between 170–1,460 ops/sec on the Wikipedia fixtures and >20 k ops/sec on micro HOCR payloads.
|
|
181
194
|
- **PHP and WASM stay in the 35–70 MB/s band**, which is plenty for Composer queues or edge runtimes as long as the extension/module is built ahead of time.
|
|
182
195
|
- **Rust CLI results now mirror the bindings**, since `task bench:bindings` runs the harness with `cargo run --release` by default—profile there, then push optimizations down into each FFI layer.
|
|
183
196
|
|
|
@@ -185,20 +198,20 @@ Derived directly from `tools/runtime-bench/results/latest.json` (Apple M4, share
|
|
|
185
198
|
|
|
186
199
|
Measured on Apple M4 using the fixture-driven runtime harness in `tools/runtime-bench` (`task bench:bindings`). Every binding consumes the exact same HTML fixtures and hOCR samples from `test_documents/`:
|
|
187
200
|
|
|
188
|
-
| Document | Size | Ruby ops/sec | PHP ops/sec | Python ops/sec | Node ops/sec | WASM ops/sec | Rust ops/sec |
|
|
189
|
-
| ------------------- | -------- | ------------ | ----------- | -------------- | ------------ | ------------ | ------------ |
|
|
190
|
-
| Lists (Timeline) | 129 KB | 1,349 | 533 | 1,405 | 1,308 | 882 | **1,700** |
|
|
191
|
-
| Tables (Countries) | 360 KB | 326 | 118 | 352 | 331 | 242 | **416** |
|
|
192
|
-
| Medium (Python) | 657 KB | 157 | 59 | 158 | 150 | 121 | **190** |
|
|
193
|
-
| Large (Rust) | 567 KB | 174 | 65 | 183 | 163 | 124 | **220** |
|
|
194
|
-
| Small (Intro) | 463 KB | 214 | 83 | 223 | 208 | 163 | **258** |
|
|
195
|
-
| HOCR German PDF | 44 KB | 2,936 | 1,007 | **2,991** | 2,944 | 1,637 | 2,760 |
|
|
196
|
-
| HOCR Invoice | 4 KB | 25,740 | 8,781 | 23,500 | 27,326 | 7,775 | **31,345** |
|
|
197
|
-
| HOCR Embedded Tables| 37 KB | 3,328 | 1,194 | 3,464 | **3,475** | 1,667 | 3,080 |
|
|
201
|
+
| Document | Size | Ruby ops/sec | PHP ops/sec | Python ops/sec | Node ops/sec | WASM ops/sec | Elixir ops/sec | Rust ops/sec |
|
|
202
|
+
| ------------------- | -------- | ------------ | ----------- | -------------- | ------------ | ------------ | -------------- | ------------ |
|
|
203
|
+
| Lists (Timeline) | 129 KB | 1,349 | 533 | 1,405 | 1,308 | 882 | 1,463 | **1,700** |
|
|
204
|
+
| Tables (Countries) | 360 KB | 326 | 118 | 352 | 331 | 242 | 357 | **416** |
|
|
205
|
+
| Medium (Python) | 657 KB | 157 | 59 | 158 | 150 | 121 | 171 | **190** |
|
|
206
|
+
| Large (Rust) | 567 KB | 174 | 65 | 183 | 163 | 124 | 174 | **220** |
|
|
207
|
+
| Small (Intro) | 463 KB | 214 | 83 | 223 | 208 | 163 | 247 | **258** |
|
|
208
|
+
| HOCR German PDF | 44 KB | 2,936 | 1,007 | **2,991** | 2,944 | 1,637 | 3,113 | 2,760 |
|
|
209
|
+
| HOCR Invoice | 4 KB | 25,740 | 8,781 | 23,500 | 27,326 | 7,775 | 20,424 | **31,345** |
|
|
210
|
+
| HOCR Embedded Tables| 37 KB | 3,328 | 1,194 | 3,464 | **3,475** | 1,667 | 3,366 | 3,080 |
|
|
198
211
|
|
|
199
212
|
The harness shells out to each runtime’s lightweight benchmark driver (`packages/*/bin/benchmark.*`, `crates/*/bin/benchmark.ts`), feeds fixtures defined in `tools/runtime-bench/fixtures/*.toml`, and writes machine-readable JSON reports (`tools/runtime-bench/results/latest.json`) for regression tracking. Add new languages or scenarios by extending those fixture files and drivers.
|
|
200
213
|
|
|
201
|
-
Use `task bench:bindings` to regenerate throughput numbers across all bindings or `task bench:bindings:profile` to capture CPU/memory samples while the benchmarks run. To focus on specific languages or fixtures, pass `--language` / `--fixture` directly to `cargo run --manifest-path tools/runtime-bench/Cargo.toml -- …`.
|
|
214
|
+
Use `task bench:bindings` to regenerate throughput numbers across all bindings or `task bench:bindings:profile` to capture CPU/memory samples while the benchmarks run. To focus on specific languages or fixtures, pass `--language` / `--fixture` either through the task (for example, `task bench:bindings -- --language elixir`) or directly to `cargo run --manifest-path tools/runtime-bench/Cargo.toml -- …`.
|
|
202
215
|
|
|
203
216
|
Need a call-stack view of the Rust core? Run `task flamegraph:rust` (or call the harness with `--language rust --flamegraph path.svg`) to profile a fixture and dump a ready-to-inspect flamegraph in `tools/runtime-bench/results/`.
|
|
204
217
|
|
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
/* tslint:disable */
|
|
2
2
|
/* eslint-disable */
|
|
3
|
+
export function createConversionOptionsHandle(options: any): WasmConversionOptionsHandle;
|
|
4
|
+
export function convertWithInlineImages(html: string, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
5
|
+
export function convertBytesWithOptionsHandle(html: Uint8Array, handle: WasmConversionOptionsHandle): string;
|
|
6
|
+
export function convertWithOptionsHandle(html: string, handle: WasmConversionOptionsHandle): string;
|
|
3
7
|
export function convertBytes(html: Uint8Array, options: any): string;
|
|
4
8
|
/**
|
|
5
9
|
* Convert HTML to Markdown
|
|
@@ -20,15 +24,11 @@ export function convertBytes(html: Uint8Array, options: any): string;
|
|
|
20
24
|
* ```
|
|
21
25
|
*/
|
|
22
26
|
export function convert(html: string, options: any): string;
|
|
23
|
-
export function
|
|
27
|
+
export function convertBytesWithInlineImages(html: Uint8Array, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
24
28
|
/**
|
|
25
29
|
* Initialize panic hook for better error messages in the browser
|
|
26
30
|
*/
|
|
27
31
|
export function init(): void;
|
|
28
|
-
export function convertBytesWithInlineImages(html: Uint8Array, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
29
|
-
export function createConversionOptionsHandle(options: any): WasmConversionOptionsHandle;
|
|
30
|
-
export function convertBytesWithOptionsHandle(html: Uint8Array, handle: WasmConversionOptionsHandle): string;
|
|
31
|
-
export function convertWithInlineImages(html: string, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
32
32
|
export class WasmConversionOptionsHandle {
|
|
33
33
|
free(): void;
|
|
34
34
|
[Symbol.dispose](): void;
|