html-to-markdown-wasm 2.15.0 → 2.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +1 -1
- package/dist/LICENSE +21 -0
- package/dist/README.md +16 -57
- package/dist/html_to_markdown_wasm_bg.js +0 -5
- package/dist/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist/package.json +2 -1
- package/dist-node/LICENSE +21 -0
- package/dist-node/README.md +16 -57
- package/dist-node/html_to_markdown_wasm.js +0 -5
- package/dist-node/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-node/package.json +2 -1
- package/dist-web/LICENSE +21 -0
- package/dist-web/README.md +16 -57
- package/dist-web/html_to_markdown_wasm.js +0 -4
- package/dist-web/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-web/package.json +2 -1
- package/package.json +1 -1
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright 2024-2025 Na'aman Hirschfeld
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -46,7 +46,7 @@ Universal WebAssembly bindings with **excellent performance** across all JavaScr
|
|
|
46
46
|
|
|
47
47
|
### Benchmark Fixtures (Apple M4)
|
|
48
48
|
|
|
49
|
-
Numbers captured via
|
|
49
|
+
Numbers captured via the shared fixture harness in `tools/benchmark-harness`:
|
|
50
50
|
|
|
51
51
|
| Document | Size | ops/sec (WASM) |
|
|
52
52
|
| ---------------------- | ------ | -------------- |
|
package/dist/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright 2024-2025 Na'aman Hirschfeld
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/dist/README.md
CHANGED
|
@@ -99,7 +99,7 @@ const markdown = convert(html, {
|
|
|
99
99
|
});
|
|
100
100
|
```
|
|
101
101
|
|
|
102
|
-
**Performance:** The shared fixture harness
|
|
102
|
+
**Performance:** The shared fixture harness now lives in `tools/benchmark-harness` and is used to track Rust + binding throughput over time.
|
|
103
103
|
|
|
104
104
|
See the JavaScript guides for full API documentation:
|
|
105
105
|
|
|
@@ -568,65 +568,24 @@ See the language-specific READMEs for complete configuration, hOCR workflows, an
|
|
|
568
568
|
|
|
569
569
|
## Performance
|
|
570
570
|
|
|
571
|
-
Benchmarked on Apple M4
|
|
571
|
+
Benchmarked on Apple M4 using the shared fixture harness in `tools/benchmark-harness` (latest consolidated run: `20409971461`).
|
|
572
572
|
|
|
573
|
-
###
|
|
573
|
+
### Comparative Throughput (Median Across Fixtures)
|
|
574
574
|
|
|
575
|
-
|
|
575
|
+
| Runtime | Median ops/sec | Median throughput (MB/s) | Peak memory (MB) | Successes |
|
|
576
|
+
| ------- | -------------- | ------------------------ | ---------------- | --------- |
|
|
577
|
+
| Rust | 1,060.3 | 116.4 | 171.3 | 56/56 |
|
|
578
|
+
| Go | 1,496.3 | 131.1 | 22.9 | 16/16 |
|
|
579
|
+
| Ruby | 2,155.5 | 300.4 | 280.3 | 48/48 |
|
|
580
|
+
| PHP | 2,357.7 | 308.0 | 223.5 | 48/48 |
|
|
581
|
+
| Elixir | 1,564.1 | 269.1 | 384.7 | 48/48 |
|
|
582
|
+
| C# | 1,234.2 | 272.4 | 187.8 | 16/16 |
|
|
583
|
+
| Java | 1,298.7 | 167.1 | 527.2 | 16/16 |
|
|
584
|
+
| WASM | 1,485.8 | 157.6 | 95.3 | 48/48 |
|
|
585
|
+
| Node.js (NAPI) | 2,054.2 | 306.5 | 95.4 | 48/48 |
|
|
586
|
+
| Python (PyO3) | 3,120.3 | 307.5 | 83.5 | 48/48 |
|
|
576
587
|
|
|
577
|
-
|
|
578
|
-
| ---------------------- | -------------- | ---- | ------------- | ------------------------ |
|
|
579
|
-
| **Lists (Timeline)** | 1,308 | 882 | 1,405 | **0.9×** |
|
|
580
|
-
| **Tables (Countries)** | 331 | 242 | 352 | **0.9×** |
|
|
581
|
-
| **Medium (Python)** | 150 | 121 | 158 | **1.0×** |
|
|
582
|
-
| **Large (Rust)** | 163 | 124 | 183 | **0.9×** |
|
|
583
|
-
| **Small (Intro)** | 208 | 163 | 223 | **0.9×** |
|
|
584
|
-
| **HOCR German PDF** | 2,944 | 1,637| 2,991 | **1.0×** |
|
|
585
|
-
| **HOCR Invoice** | 27,326 | 7,775| 23,500 | **1.2×** |
|
|
586
|
-
| **HOCR Tables** | 3,475 | 1,667| 3,464 | **1.0×** |
|
|
587
|
-
|
|
588
|
-
### Average Performance Summary
|
|
589
|
-
|
|
590
|
-
| Implementation | Avg ops/sec (fixtures) | vs Python | Notes |
|
|
591
|
-
| --------------------- | ---------------------- | --------- | ----- |
|
|
592
|
-
| **Rust CLI/Binary** | **4,996** | **1.2× faster** | Preprocessing now stays in one pass + reuses `parse_owned`, so the CLI leads every fixture |
|
|
593
|
-
| **Node.js (NAPI-RS)** | **4,488** | 1.0× | Buffer/handle combo keeps Node within ~10 % of the Rust core while serving JS runtimes |
|
|
594
|
-
| **Ruby (magnus)** | **4,278** | 0.9× | Still extremely fast; ~25 k ops/sec on HOCR invoices without extra work |
|
|
595
|
-
| **Python (PyO3)** | **4,034** | baseline | Release-mode harness plus handle reuse keep it competitive, but it now trails Node/Rust |
|
|
596
|
-
| **WebAssembly** | **1,576** | 0.4× | Portable option for Deno/browsers/edge using the new byte APIs |
|
|
597
|
-
| **PHP (ext)** | **1,480** | 0.4× | Composer extension holds steady at 35–70 MB/s once the PIE build is installed |
|
|
598
|
-
|
|
599
|
-
### Key Insights
|
|
600
|
-
|
|
601
|
-
- **Rust now leads throughput**: the fused preprocessing + `parse_owned` pathway pushes the CLI to ~1.7 k ops/sec on the 129 KB lists page and ~31 k ops/sec on the HOCR invoice fixture.
|
|
602
|
-
- **Node.js trails by only a few percent** after the buffer/handle work—~1.3 k ops/sec on the lists fixture and 27 k ops/sec on HOCR invoices without any UTF-16 copies.
|
|
603
|
-
- **Python remains competitive** but now sits below Node/Rust (~4.0 k average ops/sec); stick to the v2 API to avoid the deprecated compatibility shim.
|
|
604
|
-
- **Elixir matches the Rust core** because the Rustler NIF executes the same `ConversionOptions` pipeline—benchmarks land between 170–1,460 ops/sec on the Wikipedia fixtures and >20 k ops/sec on micro HOCR payloads.
|
|
605
|
-
- **PHP and WASM stay in the 35–70 MB/s band**, which is plenty for Composer queues or edge runtimes as long as the extension/module is built ahead of time.
|
|
606
|
-
- **Rust CLI results now mirror the bindings**, since `task bench:bindings` runs the harness with `cargo run --release` by default—profile there, then push optimizations down into each FFI layer.
|
|
607
|
-
|
|
608
|
-
### Runtime Benchmarks (PHP / Ruby / Python / Node / WASM)
|
|
609
|
-
|
|
610
|
-
Measured on Apple M4 using the fixture-driven runtime harness in `tools/runtime-bench` (`task bench:bindings`). Every binding consumes the exact same HTML fixtures and hOCR samples from `test_documents/`:
|
|
611
|
-
|
|
612
|
-
| Document | Size | Ruby ops/sec | PHP ops/sec | Python ops/sec | Node ops/sec | WASM ops/sec | Elixir ops/sec | Rust ops/sec |
|
|
613
|
-
| ------------------- | -------- | ------------ | ----------- | -------------- | ------------ | ------------ | -------------- | ------------ |
|
|
614
|
-
| Lists (Timeline) | 129 KB | 1,349 | 533 | 1,405 | 1,308 | 882 | 1,463 | **1,700** |
|
|
615
|
-
| Tables (Countries) | 360 KB | 326 | 118 | 352 | 331 | 242 | 357 | **416** |
|
|
616
|
-
| Medium (Python) | 657 KB | 157 | 59 | 158 | 150 | 121 | 171 | **190** |
|
|
617
|
-
| Large (Rust) | 567 KB | 174 | 65 | 183 | 163 | 124 | 174 | **220** |
|
|
618
|
-
| Small (Intro) | 463 KB | 214 | 83 | 223 | 208 | 163 | 247 | **258** |
|
|
619
|
-
| HOCR German PDF | 44 KB | 2,936 | 1,007 | **2,991** | 2,944 | 1,637 | 3,113 | 2,760 |
|
|
620
|
-
| HOCR Invoice | 4 KB | 25,740 | 8,781 | 23,500 | 27,326 | 7,775 | 20,424 | **31,345** |
|
|
621
|
-
| HOCR Embedded Tables| 37 KB | 3,328 | 1,194 | 3,464 | **3,475** | 1,667 | 3,366 | 3,080 |
|
|
622
|
-
|
|
623
|
-
The harness shells out to each runtime’s lightweight benchmark driver (`packages/*/bin/benchmark.*`, `crates/*/bin/benchmark.ts`), feeds fixtures defined in `tools/runtime-bench/fixtures/*.toml`, and writes machine-readable JSON reports (`tools/runtime-bench/results/latest.json`) for regression tracking. Add new languages or scenarios by extending those fixture files and drivers.
|
|
624
|
-
|
|
625
|
-
Use `task bench:bindings` to regenerate throughput numbers across all bindings or `task bench:bindings:profile` to capture CPU/memory samples while the benchmarks run. To focus on specific languages or fixtures (for example, `task bench:bindings -- --language elixir`), pass `--language` / `--fixture` directly to `cargo run --manifest-path tools/runtime-bench/Cargo.toml -- …`.
|
|
626
|
-
|
|
627
|
-
Need a call-stack view of the Rust core? Run `task flamegraph:rust` (or call the harness with `--language rust --flamegraph path.svg`) to profile a fixture and dump a ready-to-inspect flamegraph in `tools/runtime-bench/results/`.
|
|
628
|
-
|
|
629
|
-
**Note on Python performance**: The current Python bindings have optimization opportunities. The v2 API with direct `convert()` calls performs best; avoid the v1 compatibility layer for performance-critical applications.
|
|
588
|
+
Use `task bench:harness` to regenerate throughput numbers across the bindings, `task bench:harness:memory` for CPU/memory samples, and `task bench:harness:rust` for flamegraphs.
|
|
630
589
|
|
|
631
590
|
## Compatibility (v1 → v2)
|
|
632
591
|
|
package/dist/html_to_markdown_wasm_bg.js
CHANGED
|
@@ -1147,11 +1147,6 @@ export function __wbg_done_62ea16af4ce34b24(arg0) {
|
|
|
1147
1147
|
return ret;
|
|
1148
1148
|
};
|
|
1149
1149
|
|
|
1150
|
-
export function __wbg_entries_83c79938054e065f(arg0) {
|
|
1151
|
-
const ret = Object.entries(getObject(arg0));
|
|
1152
|
-
return addHeapObject(ret);
|
|
1153
|
-
};
|
|
1154
|
-
|
|
1155
1150
|
export function __wbg_error_7534b8e9a36f1ab4(arg0, arg1) {
|
|
1156
1151
|
let deferred0_0;
|
|
1157
1152
|
let deferred0_1;
|
package/dist/html_to_markdown_wasm_bg.wasm
CHANGED
|
Binary file
|
package/dist/package.json
CHANGED
package/dist-node/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright 2024-2025 Na'aman Hirschfeld
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/dist-node/README.md
CHANGED
|
@@ -99,7 +99,7 @@ const markdown = convert(html, {
|
|
|
99
99
|
});
|
|
100
100
|
```
|
|
101
101
|
|
|
102
|
-
**Performance:** The shared fixture harness
|
|
102
|
+
**Performance:** The shared fixture harness now lives in `tools/benchmark-harness` and is used to track Rust + binding throughput over time.
|
|
103
103
|
|
|
104
104
|
See the JavaScript guides for full API documentation:
|
|
105
105
|
|
|
@@ -568,65 +568,24 @@ See the language-specific READMEs for complete configuration, hOCR workflows, an
|
|
|
568
568
|
|
|
569
569
|
## Performance
|
|
570
570
|
|
|
571
|
-
Benchmarked on Apple M4
|
|
571
|
+
Benchmarked on Apple M4 using the shared fixture harness in `tools/benchmark-harness` (latest consolidated run: `20409971461`).
|
|
572
572
|
|
|
573
|
-
###
|
|
573
|
+
### Comparative Throughput (Median Across Fixtures)
|
|
574
574
|
|
|
575
|
-
|
|
575
|
+
| Runtime | Median ops/sec | Median throughput (MB/s) | Peak memory (MB) | Successes |
|
|
576
|
+
| ------- | -------------- | ------------------------ | ---------------- | --------- |
|
|
577
|
+
| Rust | 1,060.3 | 116.4 | 171.3 | 56/56 |
|
|
578
|
+
| Go | 1,496.3 | 131.1 | 22.9 | 16/16 |
|
|
579
|
+
| Ruby | 2,155.5 | 300.4 | 280.3 | 48/48 |
|
|
580
|
+
| PHP | 2,357.7 | 308.0 | 223.5 | 48/48 |
|
|
581
|
+
| Elixir | 1,564.1 | 269.1 | 384.7 | 48/48 |
|
|
582
|
+
| C# | 1,234.2 | 272.4 | 187.8 | 16/16 |
|
|
583
|
+
| Java | 1,298.7 | 167.1 | 527.2 | 16/16 |
|
|
584
|
+
| WASM | 1,485.8 | 157.6 | 95.3 | 48/48 |
|
|
585
|
+
| Node.js (NAPI) | 2,054.2 | 306.5 | 95.4 | 48/48 |
|
|
586
|
+
| Python (PyO3) | 3,120.3 | 307.5 | 83.5 | 48/48 |
|
|
576
587
|
|
|
577
|
-
|
|
578
|
-
| ---------------------- | -------------- | ---- | ------------- | ------------------------ |
|
|
579
|
-
| **Lists (Timeline)** | 1,308 | 882 | 1,405 | **0.9×** |
|
|
580
|
-
| **Tables (Countries)** | 331 | 242 | 352 | **0.9×** |
|
|
581
|
-
| **Medium (Python)** | 150 | 121 | 158 | **1.0×** |
|
|
582
|
-
| **Large (Rust)** | 163 | 124 | 183 | **0.9×** |
|
|
583
|
-
| **Small (Intro)** | 208 | 163 | 223 | **0.9×** |
|
|
584
|
-
| **HOCR German PDF** | 2,944 | 1,637| 2,991 | **1.0×** |
|
|
585
|
-
| **HOCR Invoice** | 27,326 | 7,775| 23,500 | **1.2×** |
|
|
586
|
-
| **HOCR Tables** | 3,475 | 1,667| 3,464 | **1.0×** |
|
|
587
|
-
|
|
588
|
-
### Average Performance Summary
|
|
589
|
-
|
|
590
|
-
| Implementation | Avg ops/sec (fixtures) | vs Python | Notes |
|
|
591
|
-
| --------------------- | ---------------------- | --------- | ----- |
|
|
592
|
-
| **Rust CLI/Binary** | **4,996** | **1.2× faster** | Preprocessing now stays in one pass + reuses `parse_owned`, so the CLI leads every fixture |
|
|
593
|
-
| **Node.js (NAPI-RS)** | **4,488** | 1.0× | Buffer/handle combo keeps Node within ~10 % of the Rust core while serving JS runtimes |
|
|
594
|
-
| **Ruby (magnus)** | **4,278** | 0.9× | Still extremely fast; ~25 k ops/sec on HOCR invoices without extra work |
|
|
595
|
-
| **Python (PyO3)** | **4,034** | baseline | Release-mode harness plus handle reuse keep it competitive, but it now trails Node/Rust |
|
|
596
|
-
| **WebAssembly** | **1,576** | 0.4× | Portable option for Deno/browsers/edge using the new byte APIs |
|
|
597
|
-
| **PHP (ext)** | **1,480** | 0.4× | Composer extension holds steady at 35–70 MB/s once the PIE build is installed |
|
|
598
|
-
|
|
599
|
-
### Key Insights
|
|
600
|
-
|
|
601
|
-
- **Rust now leads throughput**: the fused preprocessing + `parse_owned` pathway pushes the CLI to ~1.7 k ops/sec on the 129 KB lists page and ~31 k ops/sec on the HOCR invoice fixture.
|
|
602
|
-
- **Node.js trails by only a few percent** after the buffer/handle work—~1.3 k ops/sec on the lists fixture and 27 k ops/sec on HOCR invoices without any UTF-16 copies.
|
|
603
|
-
- **Python remains competitive** but now sits below Node/Rust (~4.0 k average ops/sec); stick to the v2 API to avoid the deprecated compatibility shim.
|
|
604
|
-
- **Elixir matches the Rust core** because the Rustler NIF executes the same `ConversionOptions` pipeline—benchmarks land between 170–1,460 ops/sec on the Wikipedia fixtures and >20 k ops/sec on micro HOCR payloads.
|
|
605
|
-
- **PHP and WASM stay in the 35–70 MB/s band**, which is plenty for Composer queues or edge runtimes as long as the extension/module is built ahead of time.
|
|
606
|
-
- **Rust CLI results now mirror the bindings**, since `task bench:bindings` runs the harness with `cargo run --release` by default—profile there, then push optimizations down into each FFI layer.
|
|
607
|
-
|
|
608
|
-
### Runtime Benchmarks (PHP / Ruby / Python / Node / WASM)
|
|
609
|
-
|
|
610
|
-
Measured on Apple M4 using the fixture-driven runtime harness in `tools/runtime-bench` (`task bench:bindings`). Every binding consumes the exact same HTML fixtures and hOCR samples from `test_documents/`:
|
|
611
|
-
|
|
612
|
-
| Document | Size | Ruby ops/sec | PHP ops/sec | Python ops/sec | Node ops/sec | WASM ops/sec | Elixir ops/sec | Rust ops/sec |
|
|
613
|
-
| ------------------- | -------- | ------------ | ----------- | -------------- | ------------ | ------------ | -------------- | ------------ |
|
|
614
|
-
| Lists (Timeline) | 129 KB | 1,349 | 533 | 1,405 | 1,308 | 882 | 1,463 | **1,700** |
|
|
615
|
-
| Tables (Countries) | 360 KB | 326 | 118 | 352 | 331 | 242 | 357 | **416** |
|
|
616
|
-
| Medium (Python) | 657 KB | 157 | 59 | 158 | 150 | 121 | 171 | **190** |
|
|
617
|
-
| Large (Rust) | 567 KB | 174 | 65 | 183 | 163 | 124 | 174 | **220** |
|
|
618
|
-
| Small (Intro) | 463 KB | 214 | 83 | 223 | 208 | 163 | 247 | **258** |
|
|
619
|
-
| HOCR German PDF | 44 KB | 2,936 | 1,007 | **2,991** | 2,944 | 1,637 | 3,113 | 2,760 |
|
|
620
|
-
| HOCR Invoice | 4 KB | 25,740 | 8,781 | 23,500 | 27,326 | 7,775 | 20,424 | **31,345** |
|
|
621
|
-
| HOCR Embedded Tables| 37 KB | 3,328 | 1,194 | 3,464 | **3,475** | 1,667 | 3,366 | 3,080 |
|
|
622
|
-
|
|
623
|
-
The harness shells out to each runtime’s lightweight benchmark driver (`packages/*/bin/benchmark.*`, `crates/*/bin/benchmark.ts`), feeds fixtures defined in `tools/runtime-bench/fixtures/*.toml`, and writes machine-readable JSON reports (`tools/runtime-bench/results/latest.json`) for regression tracking. Add new languages or scenarios by extending those fixture files and drivers.
|
|
624
|
-
|
|
625
|
-
Use `task bench:bindings` to regenerate throughput numbers across all bindings or `task bench:bindings:profile` to capture CPU/memory samples while the benchmarks run. To focus on specific languages or fixtures (for example, `task bench:bindings -- --language elixir`), pass `--language` / `--fixture` directly to `cargo run --manifest-path tools/runtime-bench/Cargo.toml -- …`.
|
|
626
|
-
|
|
627
|
-
Need a call-stack view of the Rust core? Run `task flamegraph:rust` (or call the harness with `--language rust --flamegraph path.svg`) to profile a fixture and dump a ready-to-inspect flamegraph in `tools/runtime-bench/results/`.
|
|
628
|
-
|
|
629
|
-
**Note on Python performance**: The current Python bindings have optimization opportunities. The v2 API with direct `convert()` calls performs best; avoid the v1 compatibility layer for performance-critical applications.
|
|
588
|
+
Use `task bench:harness` to regenerate throughput numbers across the bindings, `task bench:harness:memory` for CPU/memory samples, and `task bench:harness:rust` for flamegraphs.
|
|
630
589
|
|
|
631
590
|
## Compatibility (v1 → v2)
|
|
632
591
|
|
package/dist-node/html_to_markdown_wasm.js
CHANGED
|
@@ -1154,11 +1154,6 @@ exports.__wbg_done_62ea16af4ce34b24 = function(arg0) {
|
|
|
1154
1154
|
return ret;
|
|
1155
1155
|
};
|
|
1156
1156
|
|
|
1157
|
-
exports.__wbg_entries_83c79938054e065f = function(arg0) {
|
|
1158
|
-
const ret = Object.entries(getObject(arg0));
|
|
1159
|
-
return addHeapObject(ret);
|
|
1160
|
-
};
|
|
1161
|
-
|
|
1162
1157
|
exports.__wbg_error_7534b8e9a36f1ab4 = function(arg0, arg1) {
|
|
1163
1158
|
let deferred0_0;
|
|
1164
1159
|
let deferred0_1;
|
package/dist-node/html_to_markdown_wasm_bg.wasm
CHANGED
|
Binary file
|
package/dist-node/package.json
CHANGED
package/dist-web/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright 2024-2025 Na'aman Hirschfeld
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/dist-web/README.md
CHANGED
|
@@ -99,7 +99,7 @@ const markdown = convert(html, {
|
|
|
99
99
|
});
|
|
100
100
|
```
|
|
101
101
|
|
|
102
|
-
**Performance:** The shared fixture harness
|
|
102
|
+
**Performance:** The shared fixture harness now lives in `tools/benchmark-harness` and is used to track Rust + binding throughput over time.
|
|
103
103
|
|
|
104
104
|
See the JavaScript guides for full API documentation:
|
|
105
105
|
|
|
@@ -568,65 +568,24 @@ See the language-specific READMEs for complete configuration, hOCR workflows, an
|
|
|
568
568
|
|
|
569
569
|
## Performance
|
|
570
570
|
|
|
571
|
-
Benchmarked on Apple M4
|
|
571
|
+
Benchmarked on Apple M4 using the shared fixture harness in `tools/benchmark-harness` (latest consolidated run: `20409971461`).
|
|
572
572
|
|
|
573
|
-
###
|
|
573
|
+
### Comparative Throughput (Median Across Fixtures)
|
|
574
574
|
|
|
575
|
-
|
|
575
|
+
| Runtime | Median ops/sec | Median throughput (MB/s) | Peak memory (MB) | Successes |
|
|
576
|
+
| ------- | -------------- | ------------------------ | ---------------- | --------- |
|
|
577
|
+
| Rust | 1,060.3 | 116.4 | 171.3 | 56/56 |
|
|
578
|
+
| Go | 1,496.3 | 131.1 | 22.9 | 16/16 |
|
|
579
|
+
| Ruby | 2,155.5 | 300.4 | 280.3 | 48/48 |
|
|
580
|
+
| PHP | 2,357.7 | 308.0 | 223.5 | 48/48 |
|
|
581
|
+
| Elixir | 1,564.1 | 269.1 | 384.7 | 48/48 |
|
|
582
|
+
| C# | 1,234.2 | 272.4 | 187.8 | 16/16 |
|
|
583
|
+
| Java | 1,298.7 | 167.1 | 527.2 | 16/16 |
|
|
584
|
+
| WASM | 1,485.8 | 157.6 | 95.3 | 48/48 |
|
|
585
|
+
| Node.js (NAPI) | 2,054.2 | 306.5 | 95.4 | 48/48 |
|
|
586
|
+
| Python (PyO3) | 3,120.3 | 307.5 | 83.5 | 48/48 |
|
|
576
587
|
|
|
577
|
-
|
|
578
|
-
| ---------------------- | -------------- | ---- | ------------- | ------------------------ |
|
|
579
|
-
| **Lists (Timeline)** | 1,308 | 882 | 1,405 | **0.9×** |
|
|
580
|
-
| **Tables (Countries)** | 331 | 242 | 352 | **0.9×** |
|
|
581
|
-
| **Medium (Python)** | 150 | 121 | 158 | **1.0×** |
|
|
582
|
-
| **Large (Rust)** | 163 | 124 | 183 | **0.9×** |
|
|
583
|
-
| **Small (Intro)** | 208 | 163 | 223 | **0.9×** |
|
|
584
|
-
| **HOCR German PDF** | 2,944 | 1,637| 2,991 | **1.0×** |
|
|
585
|
-
| **HOCR Invoice** | 27,326 | 7,775| 23,500 | **1.2×** |
|
|
586
|
-
| **HOCR Tables** | 3,475 | 1,667| 3,464 | **1.0×** |
|
|
587
|
-
|
|
588
|
-
### Average Performance Summary
|
|
589
|
-
|
|
590
|
-
| Implementation | Avg ops/sec (fixtures) | vs Python | Notes |
|
|
591
|
-
| --------------------- | ---------------------- | --------- | ----- |
|
|
592
|
-
| **Rust CLI/Binary** | **4,996** | **1.2× faster** | Preprocessing now stays in one pass + reuses `parse_owned`, so the CLI leads every fixture |
|
|
593
|
-
| **Node.js (NAPI-RS)** | **4,488** | 1.0× | Buffer/handle combo keeps Node within ~10 % of the Rust core while serving JS runtimes |
|
|
594
|
-
| **Ruby (magnus)** | **4,278** | 0.9× | Still extremely fast; ~25 k ops/sec on HOCR invoices without extra work |
|
|
595
|
-
| **Python (PyO3)** | **4,034** | baseline | Release-mode harness plus handle reuse keep it competitive, but it now trails Node/Rust |
|
|
596
|
-
| **WebAssembly** | **1,576** | 0.4× | Portable option for Deno/browsers/edge using the new byte APIs |
|
|
597
|
-
| **PHP (ext)** | **1,480** | 0.4× | Composer extension holds steady at 35–70 MB/s once the PIE build is installed |
|
|
598
|
-
|
|
599
|
-
### Key Insights
|
|
600
|
-
|
|
601
|
-
- **Rust now leads throughput**: the fused preprocessing + `parse_owned` pathway pushes the CLI to ~1.7 k ops/sec on the 129 KB lists page and ~31 k ops/sec on the HOCR invoice fixture.
|
|
602
|
-
- **Node.js trails by only a few percent** after the buffer/handle work—~1.3 k ops/sec on the lists fixture and 27 k ops/sec on HOCR invoices without any UTF-16 copies.
|
|
603
|
-
- **Python remains competitive** but now sits below Node/Rust (~4.0 k average ops/sec); stick to the v2 API to avoid the deprecated compatibility shim.
|
|
604
|
-
- **Elixir matches the Rust core** because the Rustler NIF executes the same `ConversionOptions` pipeline—benchmarks land between 170–1,460 ops/sec on the Wikipedia fixtures and >20 k ops/sec on micro HOCR payloads.
|
|
605
|
-
- **PHP and WASM stay in the 35–70 MB/s band**, which is plenty for Composer queues or edge runtimes as long as the extension/module is built ahead of time.
|
|
606
|
-
- **Rust CLI results now mirror the bindings**, since `task bench:bindings` runs the harness with `cargo run --release` by default—profile there, then push optimizations down into each FFI layer.
|
|
607
|
-
|
|
608
|
-
### Runtime Benchmarks (PHP / Ruby / Python / Node / WASM)
|
|
609
|
-
|
|
610
|
-
Measured on Apple M4 using the fixture-driven runtime harness in `tools/runtime-bench` (`task bench:bindings`). Every binding consumes the exact same HTML fixtures and hOCR samples from `test_documents/`:
|
|
611
|
-
|
|
612
|
-
| Document | Size | Ruby ops/sec | PHP ops/sec | Python ops/sec | Node ops/sec | WASM ops/sec | Elixir ops/sec | Rust ops/sec |
|
|
613
|
-
| ------------------- | -------- | ------------ | ----------- | -------------- | ------------ | ------------ | -------------- | ------------ |
|
|
614
|
-
| Lists (Timeline) | 129 KB | 1,349 | 533 | 1,405 | 1,308 | 882 | 1,463 | **1,700** |
|
|
615
|
-
| Tables (Countries) | 360 KB | 326 | 118 | 352 | 331 | 242 | 357 | **416** |
|
|
616
|
-
| Medium (Python) | 657 KB | 157 | 59 | 158 | 150 | 121 | 171 | **190** |
|
|
617
|
-
| Large (Rust) | 567 KB | 174 | 65 | 183 | 163 | 124 | 174 | **220** |
|
|
618
|
-
| Small (Intro) | 463 KB | 214 | 83 | 223 | 208 | 163 | 247 | **258** |
|
|
619
|
-
| HOCR German PDF | 44 KB | 2,936 | 1,007 | **2,991** | 2,944 | 1,637 | 3,113 | 2,760 |
|
|
620
|
-
| HOCR Invoice | 4 KB | 25,740 | 8,781 | 23,500 | 27,326 | 7,775 | 20,424 | **31,345** |
|
|
621
|
-
| HOCR Embedded Tables| 37 KB | 3,328 | 1,194 | 3,464 | **3,475** | 1,667 | 3,366 | 3,080 |
|
|
622
|
-
|
|
623
|
-
The harness shells out to each runtime’s lightweight benchmark driver (`packages/*/bin/benchmark.*`, `crates/*/bin/benchmark.ts`), feeds fixtures defined in `tools/runtime-bench/fixtures/*.toml`, and writes machine-readable JSON reports (`tools/runtime-bench/results/latest.json`) for regression tracking. Add new languages or scenarios by extending those fixture files and drivers.
|
|
624
|
-
|
|
625
|
-
Use `task bench:bindings` to regenerate throughput numbers across all bindings or `task bench:bindings:profile` to capture CPU/memory samples while the benchmarks run. To focus on specific languages or fixtures (for example, `task bench:bindings -- --language elixir`), pass `--language` / `--fixture` directly to `cargo run --manifest-path tools/runtime-bench/Cargo.toml -- …`.
|
|
626
|
-
|
|
627
|
-
Need a call-stack view of the Rust core? Run `task flamegraph:rust` (or call the harness with `--language rust --flamegraph path.svg`) to profile a fixture and dump a ready-to-inspect flamegraph in `tools/runtime-bench/results/`.
|
|
628
|
-
|
|
629
|
-
**Note on Python performance**: The current Python bindings have optimization opportunities. The v2 API with direct `convert()` calls performs best; avoid the v1 compatibility layer for performance-critical applications.
|
|
588
|
+
Use `task bench:harness` to regenerate throughput numbers across the bindings, `task bench:harness:memory` for CPU/memory samples, and `task bench:harness:rust` for flamegraphs.
|
|
630
589
|
|
|
631
590
|
## Compatibility (v1 → v2)
|
|
632
591
|
|
package/dist-web/html_to_markdown_wasm.js
CHANGED
|
@@ -1158,10 +1158,6 @@ function __wbg_get_imports() {
|
|
|
1158
1158
|
const ret = getObject(arg0).done;
|
|
1159
1159
|
return ret;
|
|
1160
1160
|
};
|
|
1161
|
-
imports.wbg.__wbg_entries_83c79938054e065f = function(arg0) {
|
|
1162
|
-
const ret = Object.entries(getObject(arg0));
|
|
1163
|
-
return addHeapObject(ret);
|
|
1164
|
-
};
|
|
1165
1161
|
imports.wbg.__wbg_error_7534b8e9a36f1ab4 = function(arg0, arg1) {
|
|
1166
1162
|
let deferred0_0;
|
|
1167
1163
|
let deferred0_1;
|
package/dist-web/html_to_markdown_wasm_bg.wasm
CHANGED
|
Binary file
|
package/dist-web/package.json
CHANGED
package/package.json
CHANGED