@kreuzberg/html-to-markdown-wasm 2.19.0-rc.1 → 2.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +56 -2
- package/package.json +66 -66
- package/dist/LICENSE +0 -21
- package/dist/README.md +0 -202
- package/dist/html_to_markdown_wasm.d.ts +0 -200
- package/dist/html_to_markdown_wasm.js +0 -116
- package/dist/html_to_markdown_wasm_bg.js +0 -1355
- package/dist/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist/html_to_markdown_wasm_bg.wasm.d.ts +0 -55
- package/dist/package.json +0 -27
- package/dist-node/LICENSE +0 -21
- package/dist-node/README.md +0 -202
- package/dist-node/html_to_markdown_wasm.d.ts +0 -197
- package/dist-node/html_to_markdown_wasm.js +0 -1369
- package/dist-node/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-node/html_to_markdown_wasm_bg.wasm.d.ts +0 -55
- package/dist-node/package.json +0 -21
- package/dist-web/LICENSE +0 -21
- package/dist-web/README.md +0 -202
- package/dist-web/html_to_markdown_wasm.d.ts +0 -277
- package/dist-web/html_to_markdown_wasm.js +0 -1395
- package/dist-web/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-web/html_to_markdown_wasm_bg.wasm.d.ts +0 -55
- package/dist-web/package.json +0 -25
package/README.md
CHANGED
|
@@ -15,11 +15,65 @@ Runs anywhere: Node.js, Deno, Bun, browsers, and edge runtimes.
|
|
|
15
15
|
[](https://pypi.org/project/html-to-markdown/)
|
|
16
16
|
[](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
17
17
|
[](https://rubygems.org/gems/html-to-markdown)
|
|
18
|
-
[](https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/)
|
|
19
|
+
[](https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown)
|
|
20
20
|
[](https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown)
|
|
21
21
|
[](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE)
|
|
22
22
|
|
|
23
|
+
## Migration Guide (v2.18.x → v2.19.0)
|
|
24
|
+
|
|
25
|
+
> **⚠️ BREAKING CHANGE: Package Namespace Update**
|
|
26
|
+
>
|
|
27
|
+
> In v2.19.0, the npm package namespace changed from `html-to-markdown-wasm` to `@kreuzberg/html-to-markdown-wasm` to reflect the new Kreuzberg.dev organization.
|
|
28
|
+
|
|
29
|
+
### Install Updated Package
|
|
30
|
+
|
|
31
|
+
**Before (v2.18.x):**
|
|
32
|
+
```bash
|
|
33
|
+
npm install html-to-markdown-wasm
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**After (v2.19.0+):**
|
|
37
|
+
```bash
|
|
38
|
+
npm install @kreuzberg/html-to-markdown-wasm
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Update Import Statements
|
|
42
|
+
|
|
43
|
+
**Before:**
|
|
44
|
+
```typescript
|
|
45
|
+
import { convert } from 'html-to-markdown-wasm';
|
|
46
|
+
// or
|
|
47
|
+
import { convert } from "npm:html-to-markdown-wasm"; // Deno
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
**After:**
|
|
51
|
+
```typescript
|
|
52
|
+
import { convert } from '@kreuzberg/html-to-markdown-wasm';
|
|
53
|
+
// or
|
|
54
|
+
import { convert } from "npm:@kreuzberg/html-to-markdown-wasm"; // Deno
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Update Browser ESM Imports
|
|
58
|
+
|
|
59
|
+
**Before:**
|
|
60
|
+
```javascript
|
|
61
|
+
import init, { convert } from 'https://unpkg.com/html-to-markdown-wasm/dist-web/html_to_markdown_wasm.js';
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**After:**
|
|
65
|
+
```javascript
|
|
66
|
+
import init, { convert } from 'https://unpkg.com/@kreuzberg/html-to-markdown-wasm/dist-web/html_to_markdown_wasm.js';
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Summary of Changes
|
|
70
|
+
|
|
71
|
+
- Package renamed from `html-to-markdown-wasm` to `@kreuzberg/html-to-markdown-wasm`
|
|
72
|
+
- All APIs remain identical
|
|
73
|
+
- No code changes are required beyond updating the package name and imports
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
23
77
|
## Performance
|
|
24
78
|
|
|
25
79
|
Universal WebAssembly bindings with **excellent performance** across all JavaScript runtimes.
|
package/package.json
CHANGED
|
@@ -1,68 +1,68 @@
|
|
|
1
1
|
{
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
2
|
+
"name": "@kreuzberg/html-to-markdown-wasm",
|
|
3
|
+
"version": "2.19.1",
|
|
4
|
+
"description": "High-performance HTML to Markdown converter - WebAssembly bindings",
|
|
5
|
+
"main": "dist/html_to_markdown_wasm.js",
|
|
6
|
+
"types": "dist/html_to_markdown_wasm.d.ts",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": {
|
|
9
|
+
"import": "./dist/html_to_markdown_wasm.js",
|
|
10
|
+
"types": "./dist/html_to_markdown_wasm.d.ts",
|
|
11
|
+
"default": "./dist/html_to_markdown_wasm.js"
|
|
12
|
+
},
|
|
13
|
+
"./dist-node": {
|
|
14
|
+
"import": "./dist-node/html_to_markdown_wasm.js",
|
|
15
|
+
"require": "./dist-node/html_to_markdown_wasm.js",
|
|
16
|
+
"types": "./dist-node/html_to_markdown_wasm.d.ts"
|
|
17
|
+
},
|
|
18
|
+
"./dist-node/*": "./dist-node/*",
|
|
19
|
+
"./dist-web": {
|
|
20
|
+
"import": "./dist-web/html_to_markdown_wasm.js",
|
|
21
|
+
"types": "./dist-web/html_to_markdown_wasm.d.ts"
|
|
22
|
+
},
|
|
23
|
+
"./dist-web/*": "./dist-web/*"
|
|
24
|
+
},
|
|
25
|
+
"repository": "https://github.com/kreuzberg-dev/html-to-markdown",
|
|
26
|
+
"homepage": "https://github.com/kreuzberg-dev/html-to-markdown",
|
|
27
|
+
"license": "MIT",
|
|
28
|
+
"author": "Na'aman Hirschfeld <nhirschfeld@gmail.com>",
|
|
29
|
+
"bugs": "https://github.com/kreuzberg-dev/html-to-markdown/issues",
|
|
30
|
+
"keywords": [
|
|
31
|
+
"html",
|
|
32
|
+
"markdown",
|
|
33
|
+
"converter",
|
|
34
|
+
"rust",
|
|
35
|
+
"wasm",
|
|
36
|
+
"webassembly"
|
|
37
|
+
],
|
|
38
|
+
"files": [
|
|
39
|
+
"dist",
|
|
40
|
+
"dist-node",
|
|
41
|
+
"dist-web",
|
|
42
|
+
"README.md"
|
|
43
|
+
],
|
|
44
|
+
"scripts": {
|
|
45
|
+
"build": "wasm-pack build --target bundler --out-dir dist && node ./scripts/patch-bundler-entry.js",
|
|
46
|
+
"build:nodejs": "wasm-pack build --target nodejs --out-dir dist-node && node ./scripts/patch-bundler-entry.js dist-node --types-only",
|
|
47
|
+
"build:web": "wasm-pack build --target web --out-dir dist-web && node ./scripts/patch-bundler-entry.js dist-web --types-only",
|
|
48
|
+
"build:all": "pnpm run build && pnpm run build:nodejs && pnpm run build:web && pnpm run cleanup:gitignore",
|
|
49
|
+
"cleanup:gitignore": "node ./scripts/cleanup-gitignore.js",
|
|
50
|
+
"test": "vitest run",
|
|
51
|
+
"test:watch": "vitest",
|
|
52
|
+
"test:wasm-pack": "wasm-pack test --headless --chrome",
|
|
53
|
+
"clean": "rm -rf dist dist-node dist-web node_modules pkg"
|
|
54
|
+
},
|
|
55
|
+
"devDependencies": {
|
|
56
|
+
"@types/node": "^25.0.3",
|
|
57
|
+
"tsx": "^4.21.0",
|
|
58
|
+
"vitest": "^4.0.16",
|
|
59
|
+
"wasm-pack": "^0.13.1"
|
|
60
|
+
},
|
|
61
|
+
"publishConfig": {
|
|
62
|
+
"registry": "https://registry.npmjs.org/",
|
|
63
|
+
"access": "public"
|
|
64
|
+
},
|
|
65
|
+
"dependencies": {
|
|
66
|
+
"up": "^1.0.2"
|
|
67
|
+
}
|
|
68
68
|
}
|
package/dist/LICENSE
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
The MIT License (MIT)
|
|
2
|
-
|
|
3
|
-
Copyright 2024-2025 Na'aman Hirschfeld
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
package/dist/README.md
DELETED
|
@@ -1,202 +0,0 @@
|
|
|
1
|
-
# html-to-markdown
|
|
2
|
-
|
|
3
|
-
<img width="1128" height="191" alt="Linkedin- Banner (1)" src="https://github.com/user-attachments/assets/f8e91036-20a5-40f9-9fcc-9e6c6e15f1f5" />
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, PHP extension, Ruby gem, Elixir Rustler NIF, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
|
|
8
|
-
|
|
9
|
-
Part of the Kreuzberg.dev document intelligence ecosystem. Kreuzberg is a polyglot document intelligence framework with a fast Rust core. We build tools that help developers extract, process, and understand documents at scale, from PDFs to Office files, images, archives, emails, in 50+ formats. We've set out to make high-performance document intelligence faster and more ecological.
|
|
10
|
-
|
|
11
|
-
[](https://crates.io/crates/html-to-markdown-rs)
|
|
12
|
-
[](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node)
|
|
13
|
-
[](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm)
|
|
14
|
-
[](https://pypi.org/project/html-to-markdown/)
|
|
15
|
-
[](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
16
|
-
[](https://rubygems.org/gems/html-to-markdown)
|
|
17
|
-
[](https://hex.pm/packages/html_to_markdown)
|
|
18
|
-
[](https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/)
|
|
19
|
-
[](https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown)
|
|
20
|
-
[](https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown)
|
|
21
|
-
[](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE)
|
|
22
|
-
[](https://discord.gg/pXxagNK2zN)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
---
|
|
26
|
-
|
|
27
|
-
## 🎮 **[Try the Live Demo →](https://kreuzberg-dev.github.io/html-to-markdown/)**
|
|
28
|
-
|
|
29
|
-
Experience WebAssembly-powered HTML to Markdown conversion instantly in your browser. No installation needed!
|
|
30
|
-
|
|
31
|
-
---
|
|
32
|
-
|
|
33
|
-
## Why html-to-markdown?
|
|
34
|
-
|
|
35
|
-
- **Blazing Fast**: Rust-powered core delivers 10-80× faster conversion than pure Python alternatives
|
|
36
|
-
- **Universal**: Works everywhere - Node.js, Bun, Deno, browsers, Python, Rust, and standalone CLI
|
|
37
|
-
- **Smart Conversion**: Handles complex documents including nested tables, code blocks, task lists, and hOCR OCR output
|
|
38
|
-
- **Metadata Extraction**: Extract document metadata (title, description, headers, links, images) alongside conversion
|
|
39
|
-
- **Highly Configurable**: Control heading styles, code block fences, list formatting, whitespace handling, and HTML sanitization
|
|
40
|
-
- **Tag Preservation**: Keep specific HTML tags unconverted when markdown isn't expressive enough
|
|
41
|
-
- **Secure by Default**: Built-in HTML sanitization prevents malicious content
|
|
42
|
-
- **Consistent Output**: Identical markdown rendering across all language bindings
|
|
43
|
-
|
|
44
|
-
## Quick Start
|
|
45
|
-
|
|
46
|
-
**Node.js / Bun (Native - Fastest):**
|
|
47
|
-
|
|
48
|
-
```typescript
|
|
49
|
-
import { convert } from '@kreuzberg/html-to-markdown-node';
|
|
50
|
-
|
|
51
|
-
const html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>';
|
|
52
|
-
const markdown = convert(html, {
|
|
53
|
-
headingStyle: 'Atx',
|
|
54
|
-
codeBlockStyle: 'Backticks',
|
|
55
|
-
wrap: true,
|
|
56
|
-
preserveTags: ['table'],
|
|
57
|
-
});
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
**Python:**
|
|
61
|
-
|
|
62
|
-
```python
|
|
63
|
-
from html_to_markdown import convert
|
|
64
|
-
|
|
65
|
-
html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>'
|
|
66
|
-
markdown = convert(html, heading_style='Atx', wrap=True)
|
|
67
|
-
```
|
|
68
|
-
|
|
69
|
-
**Ruby:**
|
|
70
|
-
|
|
71
|
-
```ruby
|
|
72
|
-
require 'html_to_markdown'
|
|
73
|
-
|
|
74
|
-
html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>'
|
|
75
|
-
markdown = HtmlToMarkdown.convert(html, heading_style: :atx, wrap: true)
|
|
76
|
-
```
|
|
77
|
-
|
|
78
|
-
Full language guides: See [Language Guides](#language-guides) below.
|
|
79
|
-
|
|
80
|
-
## Installation
|
|
81
|
-
|
|
82
|
-
| Target | Command(s) |
|
|
83
|
-
| --------------------------- | ---------------------------------------------------------------------------------------------------------------- |
|
|
84
|
-
| **Node.js/Bun** (native) | `npm install @kreuzberg/html-to-markdown-node` |
|
|
85
|
-
| **WebAssembly** (universal) | `npm install @kreuzberg/html-to-markdown-wasm` |
|
|
86
|
-
| **Deno** | `import { convert } from "npm:@kreuzberg/html-to-markdown-wasm"` |
|
|
87
|
-
| **Python** (bindings + CLI) | `pip install html-to-markdown` |
|
|
88
|
-
| **PHP** (extension + helpers) | `PHP_EXTENSION_DIR=$(php-config --extension-dir) pie install goldziher/html-to-markdown`<br>`composer require goldziher/html-to-markdown` |
|
|
89
|
-
| **Ruby** gem | `bundle add html-to-markdown` or `gem install html-to-markdown` |
|
|
90
|
-
| **Elixir** (Rustler NIF) | `{:html_to_markdown, "~> 2.8"}` |
|
|
91
|
-
| **Rust** crate | `cargo add html-to-markdown-rs` |
|
|
92
|
-
| **Java** (Maven) | `<groupId>dev.kreuzberg</groupId><artifactId>html-to-markdown</artifactId>` |
|
|
93
|
-
| **C#/.NET** (NuGet) | `dotnet add package KreuzbergDev.HtmlToMarkdown` |
|
|
94
|
-
| Rust CLI (crates.io) | `cargo install html-to-markdown-cli` |
|
|
95
|
-
| Homebrew CLI | `brew install html-to-markdown` (core) |
|
|
96
|
-
| Releases | [GitHub Releases](https://github.com/kreuzberg-dev/html-to-markdown/releases) |
|
|
97
|
-
|
|
98
|
-
## Performance
|
|
99
|
-
|
|
100
|
-
Benchmarked on Apple M4 using the shared fixture harness in `tools/benchmark-harness`.
|
|
101
|
-
|
|
102
|
-
### Comparative Throughput (Median Across Fixtures)
|
|
103
|
-
|
|
104
|
-
| Runtime | Median ops/sec | Median throughput (MB/s) | Peak memory (MB) | Successes |
|
|
105
|
-
| ------- | -------------- | ------------------------ | ---------------- | --------- |
|
|
106
|
-
| Rust | 1,060.3 | 116.4 | 171.3 | 56/56 |
|
|
107
|
-
| Go | 1,496.3 | 131.1 | 22.9 | 16/16 |
|
|
108
|
-
| Ruby | 2,155.5 | 300.4 | 280.3 | 48/48 |
|
|
109
|
-
| PHP | 2,357.7 | 308.0 | 223.5 | 48/48 |
|
|
110
|
-
| Elixir | 1,564.1 | 269.1 | 384.7 | 48/48 |
|
|
111
|
-
| C# | 1,234.2 | 272.4 | 187.8 | 16/16 |
|
|
112
|
-
| Java | 1,298.7 | 167.1 | 527.2 | 16/16 |
|
|
113
|
-
| WASM | 1,485.8 | 157.6 | 95.3 | 48/48 |
|
|
114
|
-
| Node.js (NAPI) | 2,054.2 | 306.5 | 95.4 | 48/48 |
|
|
115
|
-
| Python (PyO3) | 3,120.3 | 307.5 | 83.5 | 48/48 |
|
|
116
|
-
|
|
117
|
-
Use `task bench:harness` to regenerate throughput numbers. See [Performance Guide](./examples/performance/) for benchmarking strategies and optimization tips.
|
|
118
|
-
|
|
119
|
-
## Language Guides
|
|
120
|
-
|
|
121
|
-
Complete documentation with examples for each language:
|
|
122
|
-
|
|
123
|
-
- **Python** – [README](./packages/python/README.md) | PyO3 bindings, metadata extraction, inline images
|
|
124
|
-
- **JavaScript/TypeScript** – [Node.js](./crates/html-to-markdown-node/README.md) | [TypeScript](./packages/typescript/README.md) | [WASM](./crates/html-to-markdown-wasm/README.md)
|
|
125
|
-
- **Ruby** – [README](./packages/ruby/README.md) | Magnus bindings, RBS type definitions, Steep checking
|
|
126
|
-
- **PHP** – [Package](./packages/php/README.md) | [Extension (PIE)](./packages/php-ext/README.md) | ext-php-rs extension
|
|
127
|
-
- **Go** – [README](./packages/go/README.md) | FFI bindings with cgo
|
|
128
|
-
- **Java** – [README](./packages/java/README.md) | Panama FFI, Maven/Gradle setup
|
|
129
|
-
- **C#/.NET** – [README](./packages/csharp/README.md) | P/Invoke FFI, NuGet distribution
|
|
130
|
-
- **Elixir** – [README](./packages/elixir/README.md) | Rustler NIF bindings
|
|
131
|
-
- **Rust** – [README](./crates/html-to-markdown/README.md) | Core library, error handling, advanced features
|
|
132
|
-
|
|
133
|
-
## Feature Guides
|
|
134
|
-
|
|
135
|
-
### Visitor Pattern
|
|
136
|
-
Customize HTML→Markdown conversion with callbacks for specific elements. Use cases: domain-specific dialects, content filtering, URL rewriting, accessibility validation.
|
|
137
|
-
|
|
138
|
-
**→ [Full Guide with Examples](./examples/visitor-pattern/)** (Python, TypeScript, Ruby)
|
|
139
|
-
|
|
140
|
-
### Metadata Extraction
|
|
141
|
-
Extract comprehensive metadata during conversion: title, description, headers, links, images, structured data. Use cases: SEO extraction, TOC generation, link validation, accessibility auditing, content migration.
|
|
142
|
-
|
|
143
|
-
**→ [Full Guide with Examples](./examples/metadata-extraction/)** (Python, TypeScript, Ruby)
|
|
144
|
-
|
|
145
|
-
### Performance & Benchmarking
|
|
146
|
-
Understand performance characteristics, run benchmarks, optimize for your use case. Includes benchmarking tools, memory profiling, streaming strategies, and optimization tips.
|
|
147
|
-
|
|
148
|
-
**→ [Full Guide](./examples/performance/)**
|
|
149
|
-
|
|
150
|
-
## Examples
|
|
151
|
-
|
|
152
|
-
Explore working code examples in multiple languages:
|
|
153
|
-
|
|
154
|
-
| Example | Path | Languages |
|
|
155
|
-
| ------- | ---- | --------- |
|
|
156
|
-
| **Visitor Pattern** | [examples/visitor-pattern/](./examples/visitor-pattern/) | Python, TypeScript, Ruby |
|
|
157
|
-
| **Metadata Extraction** | [examples/metadata-extraction/](./examples/metadata-extraction/) | Python, TypeScript, Ruby |
|
|
158
|
-
| **Performance** | [examples/performance/](./examples/performance/) | Benchmarks, profiling, optimization |
|
|
159
|
-
|
|
160
|
-
## Testing
|
|
161
|
-
|
|
162
|
-
Run the test suite locally:
|
|
163
|
-
|
|
164
|
-
```bash
|
|
165
|
-
# All core test suites (Rust, Python, Ruby, Node, PHP, Go, C#, Elixir, Java)
|
|
166
|
-
task test
|
|
167
|
-
|
|
168
|
-
# Run the Wasmtime-backed WASM integration tests
|
|
169
|
-
task wasm:test:wasmtime
|
|
170
|
-
```
|
|
171
|
-
|
|
172
|
-
## Compatibility & Migrations
|
|
173
|
-
|
|
174
|
-
### v2.19.0 Breaking Changes (Package Namespace Updates)
|
|
175
|
-
|
|
176
|
-
Several language bindings were updated to use new namespaces and package owners:
|
|
177
|
-
|
|
178
|
-
- **npm packages**: Scoped under `@kreuzberg` organization
|
|
179
|
-
- Old: `html-to-markdown-node` → New: `@kreuzberg/html-to-markdown-node`
|
|
180
|
-
- Old: `html-to-markdown-wasm` → New: `@kreuzberg/html-to-markdown-wasm`
|
|
181
|
-
- **Java**: Package namespace changed from `io.github.goldziher` to `dev.kreuzberg`
|
|
182
|
-
- **C#/.NET**: Package changed from `Goldziher.HtmlToMarkdown` to `KreuzbergDev.HtmlToMarkdown`
|
|
183
|
-
|
|
184
|
-
See [MIGRATION.md](./MIGRATION.md) for step-by-step upgrade instructions for each language.
|
|
185
|
-
|
|
186
|
-
### v1 → v2 Compatibility
|
|
187
|
-
|
|
188
|
-
- V2's Rust core sustains **150–210 MB/s** throughput; V1 averaged **≈ 2.5 MB/s** (60–80× faster).
|
|
189
|
-
- Python compatibility shim available in `html_to_markdown.v1_compat` (deprecated; emits warnings; plan migrations now). See [Python README](./packages/python/README.md#v1-compatibility) for keyword mappings.
|
|
190
|
-
- CLI flag changes and other breaking updates in [CHANGELOG](./CHANGELOG.md#breaking-changes).
|
|
191
|
-
|
|
192
|
-
## Community
|
|
193
|
-
|
|
194
|
-
- **Discord** – [Join our community](https://discord.gg/pXxagNK2zN)
|
|
195
|
-
- **Ecosystem** – Explore [Kreuzberg](https://kreuzberg.dev) document-processing tools
|
|
196
|
-
- **Contribute** – [CONTRIBUTING.md](./CONTRIBUTING.md)
|
|
197
|
-
- **Sponsor** – [GitHub Sponsors](https://github.com/sponsors/kreuzberg-dev)
|
|
198
|
-
- **Changelog** – [Version history](./CHANGELOG.md)
|
|
199
|
-
|
|
200
|
-
## License
|
|
201
|
-
|
|
202
|
-
MIT License – see [LICENSE](./LICENSE) for details.
|
|
package/dist/html_to_markdown_wasm.d.ts
DELETED
|
@@ -1,200 +0,0 @@
|
|
|
1
|
-
/* tslint:disable */
|
|
2
|
-
/* eslint-disable */
|
|
3
|
-
|
|
4
|
-
export class WasmConversionOptionsHandle {
|
|
5
|
-
free(): void;
|
|
6
|
-
[Symbol.dispose](): void;
|
|
7
|
-
constructor(options?: WasmConversionOptions | null);
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
export class WasmHtmlExtraction {
|
|
11
|
-
private constructor();
|
|
12
|
-
free(): void;
|
|
13
|
-
[Symbol.dispose](): void;
|
|
14
|
-
readonly inlineImages: WasmInlineImage[];
|
|
15
|
-
readonly markdown: string;
|
|
16
|
-
readonly warnings: WasmInlineImageWarning[];
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
export class WasmInlineImage {
|
|
20
|
-
private constructor();
|
|
21
|
-
free(): void;
|
|
22
|
-
[Symbol.dispose](): void;
|
|
23
|
-
readonly attributes: Record<string, string>;
|
|
24
|
-
readonly dimensions: Uint32Array | undefined;
|
|
25
|
-
readonly description: string | undefined;
|
|
26
|
-
readonly data: Uint8Array;
|
|
27
|
-
readonly format: string;
|
|
28
|
-
readonly source: string;
|
|
29
|
-
readonly filename: string | undefined;
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
export class WasmInlineImageConfig {
|
|
33
|
-
free(): void;
|
|
34
|
-
[Symbol.dispose](): void;
|
|
35
|
-
constructor(max_decoded_size_bytes?: number | null);
|
|
36
|
-
set captureSvg(value: boolean);
|
|
37
|
-
set filenamePrefix(value: string | null | undefined);
|
|
38
|
-
set inferDimensions(value: boolean);
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
export class WasmInlineImageWarning {
|
|
42
|
-
private constructor();
|
|
43
|
-
free(): void;
|
|
44
|
-
[Symbol.dispose](): void;
|
|
45
|
-
readonly index: number;
|
|
46
|
-
readonly message: string;
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
export class WasmMetadataConfig {
|
|
50
|
-
free(): void;
|
|
51
|
-
[Symbol.dispose](): void;
|
|
52
|
-
/**
|
|
53
|
-
* Create a new metadata configuration with defaults
|
|
54
|
-
*
|
|
55
|
-
* All extraction types enabled by default with 1MB structured data limit
|
|
56
|
-
*/
|
|
57
|
-
constructor();
|
|
58
|
-
extract_links: boolean;
|
|
59
|
-
extract_images: boolean;
|
|
60
|
-
extract_headers: boolean;
|
|
61
|
-
extract_document: boolean;
|
|
62
|
-
extract_structured_data: boolean;
|
|
63
|
-
max_structured_data_size: number;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
/**
|
|
67
|
-
* Convert HTML to Markdown
|
|
68
|
-
*
|
|
69
|
-
* # Arguments
|
|
70
|
-
*
|
|
71
|
-
* * `html` - The HTML string to convert
|
|
72
|
-
* * `options` - Optional conversion options (as a JavaScript object)
|
|
73
|
-
*
|
|
74
|
-
* # Example
|
|
75
|
-
*
|
|
76
|
-
* ```javascript
|
|
77
|
-
* import { convert } from '@kreuzberg/html-to-markdown-wasm';
|
|
78
|
-
*
|
|
79
|
-
* const html = '<h1>Hello World</h1>';
|
|
80
|
-
* const markdown = convert(html);
|
|
81
|
-
* console.log(markdown); // # Hello World
|
|
82
|
-
* ```
|
|
83
|
-
*/
|
|
84
|
-
export function convert(html: string, options?: WasmConversionOptions | null): string;
|
|
85
|
-
|
|
86
|
-
export function convertBytes(html: Uint8Array, options?: WasmConversionOptions | null): string;
|
|
87
|
-
|
|
88
|
-
export function convertBytesWithInlineImages(html: Uint8Array, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
89
|
-
|
|
90
|
-
/**
|
|
91
|
-
* Convert HTML bytes to Markdown with metadata extraction
|
|
92
|
-
*
|
|
93
|
-
* # Arguments
|
|
94
|
-
*
|
|
95
|
-
* * `html` - The HTML bytes to convert
|
|
96
|
-
* * `options` - Optional conversion options (as a JavaScript object)
|
|
97
|
-
* * `metadata_config` - Metadata extraction configuration
|
|
98
|
-
*
|
|
99
|
-
* # Returns
|
|
100
|
-
*
|
|
101
|
-
* JavaScript object with `markdown` (string) and `metadata` (object) fields
|
|
102
|
-
*/
|
|
103
|
-
export function convertBytesWithMetadata(html: Uint8Array, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
|
|
104
|
-
|
|
105
|
-
export function convertBytesWithOptionsHandle(html: Uint8Array, handle: WasmConversionOptionsHandle): string;
|
|
106
|
-
|
|
107
|
-
export function convertWithInlineImages(html: string, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
108
|
-
|
|
109
|
-
/**
|
|
110
|
-
* Convert HTML to Markdown with metadata extraction
|
|
111
|
-
*
|
|
112
|
-
* # Arguments
|
|
113
|
-
*
|
|
114
|
-
* * `html` - The HTML string to convert
|
|
115
|
-
* * `options` - Optional conversion options (as a JavaScript object)
|
|
116
|
-
* * `metadata_config` - Metadata extraction configuration
|
|
117
|
-
*
|
|
118
|
-
* # Returns
|
|
119
|
-
*
|
|
120
|
-
* JavaScript object with `markdown` (string) and `metadata` (object) fields
|
|
121
|
-
*
|
|
122
|
-
* # Example
|
|
123
|
-
*
|
|
124
|
-
* ```javascript
|
|
125
|
-
* import { convertWithMetadata, WasmMetadataConfig } from '@kreuzberg/html-to-markdown-wasm';
|
|
126
|
-
*
|
|
127
|
-
* const html = '<h1>Hello World</h1><a href="https://example.com">Link</a>';
|
|
128
|
-
* const config = new WasmMetadataConfig();
|
|
129
|
-
* config.extract_headers = true;
|
|
130
|
-
* config.extract_links = true;
|
|
131
|
-
*
|
|
132
|
-
* const result = convertWithMetadata(html, null, config);
|
|
133
|
-
* console.log(result.markdown); // # Hello World\n\n[Link](https://example.com)
|
|
134
|
-
* console.log(result.metadata.headers); // [{ level: 1, text: "Hello World", ... }]
|
|
135
|
-
* console.log(result.metadata.links); // [{ href: "https://example.com", text: "Link", ... }]
|
|
136
|
-
* ```
|
|
137
|
-
*/
|
|
138
|
-
export function convertWithMetadata(html: string, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
|
|
139
|
-
|
|
140
|
-
export function convertWithOptionsHandle(html: string, handle: WasmConversionOptionsHandle): string;
|
|
141
|
-
|
|
142
|
-
export function createConversionOptionsHandle(options?: WasmConversionOptions | null): WasmConversionOptionsHandle;
|
|
143
|
-
|
|
144
|
-
/**
|
|
145
|
-
* Initialize panic hook for better error messages in the browser
|
|
146
|
-
*/
|
|
147
|
-
export function init(): void;
|
|
148
|
-
|
|
149
|
-
export declare function initWasm(): Promise<void>;
|
|
150
|
-
export declare const wasmReady: Promise<void>;
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
export type WasmHeadingStyle = "underlined" | "atx" | "atxClosed";
|
|
154
|
-
export type WasmListIndentType = "spaces" | "tabs";
|
|
155
|
-
export type WasmWhitespaceMode = "normalized" | "strict";
|
|
156
|
-
export type WasmNewlineStyle = "spaces" | "backslash";
|
|
157
|
-
export type WasmCodeBlockStyle = "indented" | "backticks" | "tildes";
|
|
158
|
-
export type WasmHighlightStyle = "doubleEqual" | "html" | "bold" | "none";
|
|
159
|
-
export type WasmPreprocessingPreset = "minimal" | "standard" | "aggressive";
|
|
160
|
-
|
|
161
|
-
export interface WasmPreprocessingOptions {
|
|
162
|
-
enabled?: boolean;
|
|
163
|
-
preset?: WasmPreprocessingPreset;
|
|
164
|
-
removeNavigation?: boolean;
|
|
165
|
-
removeForms?: boolean;
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
export interface WasmConversionOptions {
|
|
169
|
-
headingStyle?: WasmHeadingStyle;
|
|
170
|
-
listIndentType?: WasmListIndentType;
|
|
171
|
-
listIndentWidth?: number;
|
|
172
|
-
bullets?: string;
|
|
173
|
-
strongEmSymbol?: string;
|
|
174
|
-
escapeAsterisks?: boolean;
|
|
175
|
-
escapeUnderscores?: boolean;
|
|
176
|
-
escapeMisc?: boolean;
|
|
177
|
-
escapeAscii?: boolean;
|
|
178
|
-
codeLanguage?: string;
|
|
179
|
-
autolinks?: boolean;
|
|
180
|
-
defaultTitle?: boolean;
|
|
181
|
-
brInTables?: boolean;
|
|
182
|
-
hocrSpatialTables?: boolean;
|
|
183
|
-
highlightStyle?: WasmHighlightStyle;
|
|
184
|
-
extractMetadata?: boolean;
|
|
185
|
-
whitespaceMode?: WasmWhitespaceMode;
|
|
186
|
-
stripNewlines?: boolean;
|
|
187
|
-
wrap?: boolean;
|
|
188
|
-
wrapWidth?: number;
|
|
189
|
-
convertAsInline?: boolean;
|
|
190
|
-
subSymbol?: string;
|
|
191
|
-
supSymbol?: string;
|
|
192
|
-
newlineStyle?: WasmNewlineStyle;
|
|
193
|
-
codeBlockStyle?: WasmCodeBlockStyle;
|
|
194
|
-
keepInlineImagesIn?: string[];
|
|
195
|
-
preprocessing?: WasmPreprocessingOptions | null;
|
|
196
|
-
encoding?: string;
|
|
197
|
-
debug?: boolean;
|
|
198
|
-
stripTags?: string[];
|
|
199
|
-
preserveTags?: string[];
|
|
200
|
-
}
|