html-to-markdown-wasm 2.5.5 → 2.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+ export const memory: WebAssembly.Memory;
4
+ export const __wbg_wasmhtmlextraction_free: (a: number, b: number) => void;
5
+ export const __wbg_wasminlineimage_free: (a: number, b: number) => void;
6
+ export const __wbg_wasminlineimageconfig_free: (a: number, b: number) => void;
7
+ export const __wbg_wasminlineimagewarning_free: (a: number, b: number) => void;
8
+ export const convert: (a: number, b: number, c: number, d: number) => void;
9
+ export const convertWithInlineImages: (a: number, b: number, c: number, d: number, e: number) => void;
10
+ export const wasmhtmlextraction_inlineImages: (a: number, b: number) => void;
11
+ export const wasmhtmlextraction_markdown: (a: number, b: number) => void;
12
+ export const wasmhtmlextraction_warnings: (a: number, b: number) => void;
13
+ export const wasminlineimage_attributes: (a: number) => number;
14
+ export const wasminlineimage_data: (a: number) => number;
15
+ export const wasminlineimage_description: (a: number, b: number) => void;
16
+ export const wasminlineimage_dimensions: (a: number, b: number) => void;
17
+ export const wasminlineimage_filename: (a: number, b: number) => void;
18
+ export const wasminlineimage_format: (a: number, b: number) => void;
19
+ export const wasminlineimage_source: (a: number, b: number) => void;
20
+ export const wasminlineimageconfig_new: (a: number, b: number) => number;
21
+ export const wasminlineimageconfig_set_captureSvg: (a: number, b: number) => void;
22
+ export const wasminlineimageconfig_set_filenamePrefix: (a: number, b: number, c: number) => void;
23
+ export const wasminlineimageconfig_set_inferDimensions: (a: number, b: number) => void;
24
+ export const wasminlineimagewarning_index: (a: number) => number;
25
+ export const wasminlineimagewarning_message: (a: number, b: number) => void;
26
+ export const init: () => void;
27
+ export const __wbindgen_export: (a: number, b: number) => number;
28
+ export const __wbindgen_export2: (a: number, b: number, c: number, d: number) => number;
29
+ export const __wbindgen_export3: (a: number) => void;
30
+ export const __wbindgen_export4: (a: number, b: number, c: number) => void;
31
+ export const __wbindgen_add_to_stack_pointer: (a: number) => number;
32
+ export const __wbindgen_start: () => void;
@@ -0,0 +1,20 @@
1
+ {
2
+ "name": "html-to-markdown-wasm",
3
+ "collaborators": [
4
+ "Na'aman Hirschfeld <nhirschfeld@gmail.com>"
5
+ ],
6
+ "version": "2.6.1",
7
+ "license": "MIT",
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "https://github.com/Goldziher/html-to-markdown"
11
+ },
12
+ "files": [
13
+ "html_to_markdown_wasm_bg.wasm",
14
+ "html_to_markdown_wasm.js",
15
+ "html_to_markdown_wasm.d.ts"
16
+ ],
17
+ "main": "html_to_markdown_wasm.js",
18
+ "homepage": "https://github.com/Goldziher/html-to-markdown",
19
+ "types": "html_to_markdown_wasm.d.ts"
20
+ }
@@ -0,0 +1,203 @@
1
+ # html-to-markdown
2
+
3
+ High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, PHP extension, Ruby gem, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
4
+
5
+ [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg)](https://crates.io/crates/html-to-markdown-rs)
6
+ [![npm (node)](https://badge.fury.io/js/html-to-markdown-node.svg)](https://www.npmjs.com/package/html-to-markdown-node)
7
+ [![npm (wasm)](https://badge.fury.io/js/html-to-markdown-wasm.svg)](https://www.npmjs.com/package/html-to-markdown-wasm)
8
+ [![npm (typescript)](https://badge.fury.io/js/html-to-markdown.svg)](https://www.npmjs.com/package/html-to-markdown)
9
+ [![PyPI](https://badge.fury.io/py/html-to-markdown.svg)](https://pypi.org/project/html-to-markdown/)
10
+ [![Packagist](https://img.shields.io/packagist/v/goldziher/html-to-markdown.svg)](https://packagist.org/packages/goldziher/html-to-markdown)
11
+ [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
12
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/Goldziher/html-to-markdown/blob/main/LICENSE)
13
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
14
+
15
+ ---
16
+
17
+ ## 🎮 **[Try the Live Demo →](https://goldziher.github.io/html-to-markdown/)**
18
+
19
+ Experience WebAssembly-powered HTML to Markdown conversion instantly in your browser. No installation needed!
20
+
21
+ ---
22
+
23
+ ## Why html-to-markdown?
24
+
25
+ - **Blazing Fast**: Rust-powered core delivers 10-80× faster conversion than pure Python alternatives
26
+ - **Universal**: Works everywhere - Node.js, Bun, Deno, browsers, Python, PHP, Ruby, Rust, and standalone CLI
27
+ - **Smart Conversion**: Handles complex documents including nested tables, code blocks, task lists, and hOCR OCR output
28
+ - **Highly Configurable**: Control heading styles, code block fences, list formatting, whitespace handling, and HTML sanitization
29
+ - **Tag Preservation**: Keep specific HTML tags unconverted when markdown isn't expressive enough
30
+ - **Secure by Default**: Built-in HTML sanitization prevents malicious content
31
+ - **Consistent Output**: Identical markdown rendering across all language bindings
32
+
33
+ ## Documentation
34
+
35
+ - **JavaScript/TypeScript guides**:
36
+ - Node.js/Bun (native) – [Node.js README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-node/README.md)
37
+ - WebAssembly (universal) – [WASM README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-wasm/README.md)
38
+ - TypeScript wrapper – [TypeScript README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/typescript/README.md)
39
+ - **Python guide** – [Python README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/python/README.md)
40
+ - **PHP guides**:
41
+ - PHP wrapper package – [PHP README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php/README.md)
42
+ - PHP extension (PIE) – [Extension README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/php-ext/README.md)
43
+ - **Ruby guide** – [Ruby README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown-rb/README.md)
44
+ - **Rust guide** – [Rust README](https://github.com/Goldziher/html-to-markdown/blob/main/crates/html-to-markdown/README.md)
45
+ - **Contributing** – [CONTRIBUTING.md](https://github.com/Goldziher/html-to-markdown/blob/main/CONTRIBUTING.md) ⭐ Start here!
46
+ - **Changelog** – [CHANGELOG.md](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md)
47
+
48
+ ## Installation
49
+
50
+ | Target | Command |
51
+ | --------------------------- | ------------------------------------------------------------------------- |
52
+ | **Node.js/Bun** (native) | `npm install html-to-markdown-node` |
53
+ | **WebAssembly** (universal) | `npm install html-to-markdown-wasm` |
54
+ | **Deno** | `import { convert } from "npm:html-to-markdown-wasm"` |
55
+ | **Python** (bindings + CLI) | `pip install html-to-markdown` |
56
+ | **PHP** (extension + helpers) | `pie install goldziher/html-to-markdown`<br>`composer require html-to-markdown/extension` |
57
+ | **Ruby** gem | `bundle add html-to-markdown` or `gem install html-to-markdown` |
58
+ | **Rust** crate | `cargo add html-to-markdown-rs` |
59
+ | Rust CLI | `cargo install html-to-markdown-cli` |
60
+ | Homebrew CLI | `brew tap goldziher/tap`<br>`brew install html-to-markdown` |
61
+ | Releases | [GitHub Releases](https://github.com/Goldziher/html-to-markdown/releases) |
62
+
63
+ ## Quick Start
64
+
65
+ ### JavaScript/TypeScript
66
+
67
+ **Node.js / Bun (Native - Fastest):**
68
+
69
+ ```typescript
70
+ import { convert } from 'html-to-markdown-node';
71
+
72
+ const html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>';
73
+ const markdown = convert(html, {
74
+ headingStyle: 'Atx',
75
+ codeBlockStyle: 'Backticks',
76
+ wrap: true,
77
+ preserveTags: ['table'], // NEW in v2.5: Keep complex HTML as-is
78
+ });
79
+ ```
80
+
81
+ **Deno / Browsers / Edge (Universal):**
82
+
83
+ ```typescript
84
+ import { convert } from "npm:html-to-markdown-wasm"; // Deno
85
+ // or: import { convert } from 'html-to-markdown-wasm'; // Bundlers
86
+
87
+ const markdown = convert(html, {
88
+ headingStyle: 'atx',
89
+ listIndentWidth: 2,
90
+ });
91
+ ```
92
+
93
+ **Performance:** Native bindings average ~18k ops/sec, WASM averages ~16k ops/sec (benchmarked on complex real-world documents).
94
+
95
+ See the JavaScript guides for full API documentation:
96
+
97
+ - [Node.js/Bun guide](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-node)
98
+ - [WebAssembly guide](https://github.com/Goldziher/html-to-markdown/tree/main/crates/html-to-markdown-wasm)
99
+
100
+ ### CLI
101
+
102
+ ```bash
103
+ # Convert a file
104
+ html-to-markdown input.html > output.md
105
+
106
+ # Stream from stdin
107
+ curl https://example.com | html-to-markdown > output.md
108
+
109
+ # Apply options
110
+ html-to-markdown --heading-style atx --list-indent-width 2 input.html
111
+ ```
112
+
113
+ ### Python (v2 API)
114
+
115
+ ```python
116
+ from html_to_markdown import convert, convert_with_inline_images, InlineImageConfig
117
+
118
+ html = "<h1>Hello</h1><p>Rust ❤️ Markdown</p>"
119
+ markdown = convert(html)
120
+
121
+ markdown, inline_images, warnings = convert_with_inline_images(
122
+ '<img src="data:image/png;base64,...==" alt="Pixel">',
123
+ image_config=InlineImageConfig(max_decoded_size_bytes=1024, infer_dimensions=True),
124
+ )
125
+ ```
126
+
127
+ ### Rust
128
+
129
+ ```rust
130
+ use html_to_markdown_rs::{convert, ConversionOptions, HeadingStyle};
131
+
132
+ let html = "<h1>Welcome</h1><p>Fast conversion</p>";
133
+ let markdown = convert(html, None)?;
134
+
135
+ let options = ConversionOptions {
136
+ heading_style: HeadingStyle::Atx,
137
+ ..Default::default()
138
+ };
139
+ let markdown = convert(html, Some(options))?;
140
+ ```
141
+
142
+ See the language-specific READMEs for complete configuration, hOCR workflows, and inline image extraction.
143
+
144
+ ## Performance
145
+
146
+ Benchmarked on Apple M4 with complex real-world documents (Wikipedia articles, tables, lists):
147
+
148
+ ### Operations per Second (higher is better)
149
+
150
+ | Document Type | Node.js (NAPI) | WASM | Python (PyO3) | Speedup (Node vs Python) |
151
+ | -------------------------- | -------------- | ------ | ------------- | ------------------------ |
152
+ | **Small (5 paragraphs)** | 86,233 | 70,300 | 8,443 | **10.2×** |
153
+ | **Medium (25 paragraphs)** | 18,979 | 15,282 | 1,846 | **10.3×** |
154
+ | **Large (100 paragraphs)** | 4,907 | 3,836 | 438 | **11.2×** |
155
+ | **Tables (complex)** | 5,003 | 3,748 | 4,829 | 1.0× |
156
+ | **Lists (nested)** | 1,819 | 1,391 | 1,165 | **1.6×** |
157
+ | **Wikipedia (129KB)** | 1,125 | 1,022 | - | - |
158
+ | **Wikipedia (653KB)** | 156 | 147 | - | - |
159
+
160
+ ### Average Performance Summary
161
+
162
+ | Implementation | Avg ops/sec | vs WASM | vs Python | Best For |
163
+ | --------------------- | ---------------- | ------------ | --------------- | --------------------------------- |
164
+ | **Node.js (NAPI-RS)** | **18,162** | 1.17× faster | **7.4× faster** | Maximum throughput in Node.js/Bun |
165
+ | **WebAssembly** | **15,536** | baseline | **6.3× faster** | Universal (Deno, browsers, edge) |
166
+ | **Python (PyO3)** | **2,465** | 6.3× slower | baseline | Python ecosystem integration |
167
+ | **Rust CLI/Binary** | **150-210 MB/s** | - | - | Standalone processing |
168
+
169
+ ### Key Insights
170
+
171
+ - **JavaScript bindings are fastest**: Native Node.js bindings achieve ~18k ops/sec average, with WASM close behind at ~16k ops/sec
172
+ - **Python is 6-10× slower**: Despite using the same Rust core, PyO3 FFI overhead significantly impacts Python performance
173
+ - **Small documents**: Both JS implementations reach 70-90k ops/sec on simple HTML
174
+ - **Large documents**: Performance gap widens with complexity
175
+
176
+ **Note on Python performance**: The current Python bindings have optimization opportunities. The v2 API with direct `convert()` calls performs best; avoid the v1 compatibility layer for performance-critical applications.
177
+
178
+ ## Compatibility (v1 → v2)
179
+
180
+ - V2’s Rust core sustains **150–210 MB/s** throughput; V1 averaged **≈ 2.5 MB/s** in its Python/BeautifulSoup implementation (60–80× faster).
181
+ - The Python package offers a compatibility shim in `html_to_markdown.v1_compat` (`convert_to_markdown`, `convert_to_markdown_stream`, `markdownify`). Details and keyword mappings live in [Python README](https://github.com/Goldziher/html-to-markdown/blob/main/packages/python/README.md#v1-compatibility).
182
+ - CLI flag changes, option renames, and other breaking updates are summarised in [CHANGELOG](https://github.com/Goldziher/html-to-markdown/blob/main/CHANGELOG.md#breaking-changes).
183
+
184
+ ## Community
185
+
186
+ - Chat with us on [Discord](https://discord.gg/pXxagNK2zN)
187
+ - Explore the broader [Kreuzberg](https://kreuzberg.dev) document-processing ecosystem
188
+ - Sponsor development via [GitHub Sponsors](https://github.com/sponsors/Goldziher)
189
+ ### Ruby
190
+
191
+ ```ruby
192
+ require 'html_to_markdown'
193
+
194
+ html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>'
195
+ markdown = HtmlToMarkdown.convert(html, heading_style: :atx, wrap: true)
196
+
197
+ puts markdown
198
+ # # Hello
199
+ #
200
+ # Rust ❤️ Markdown
201
+ ```
202
+
203
+ See the language-specific READMEs for complete configuration, hOCR workflows, and inline image extraction.
@@ -0,0 +1,152 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+ /**
4
+ * Convert HTML to Markdown while collecting inline images
5
+ *
6
+ * # Arguments
7
+ *
8
+ * * `html` - The HTML string to convert
9
+ * * `options` - Optional conversion options (as a JavaScript object); the example below passes `null`
10
+ * * `image_config` - Configuration for inline image extraction; may be omitted or `null`
11
+ *
12
+ * # Example
13
+ *
14
+ * ```javascript
15
+ * import { convertWithInlineImages, WasmInlineImageConfig } from 'html-to-markdown-wasm';
16
+ *
17
+ * const html = '<img src="data:image/png;base64,..." alt="test">';
18
+ * const config = new WasmInlineImageConfig(1024 * 1024);
19
+ * config.inferDimensions = true;
20
+ *
21
+ * const result = convertWithInlineImages(html, null, config);
22
+ * console.log(result.markdown);
23
+ * console.log(result.inlineImages.length);
24
+ * ```
25
+ */
26
+ export function convertWithInlineImages(html: string, options: any, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
27
+ /**
28
+ * Convert HTML to Markdown
29
+ *
30
+ * # Arguments
31
+ *
32
+ * * `html` - The HTML string to convert
33
+ * * `options` - Optional conversion options (as a JavaScript object); may be omitted
34
+ *
35
+ * # Example
36
+ *
37
+ * ```javascript
38
+ * import { convert } from 'html-to-markdown-wasm';
39
+ *
40
+ * const html = '<h1>Hello World</h1>';
41
+ * const markdown = convert(html);
42
+ * console.log(markdown); // # Hello World
43
+ * ```
44
+ */
45
+ export function convert(html: string, options?: any): string;
46
+ /**
47
+ * Initialize panic hook for better error messages in the browser
48
+ */
49
+ export function init(): void;
50
+ /**
51
+ * Result of HTML extraction with inline images, as returned by `convertWithInlineImages`
52
+ */
53
+ export class WasmHtmlExtraction {
54
+ private constructor();
55
+ free(): void;
56
+ [Symbol.dispose](): void;
57
+ readonly inlineImages: WasmInlineImage[];
58
+ readonly markdown: string;
59
+ readonly warnings: WasmInlineImageWarning[];
60
+ }
61
+ /**
62
+ * Inline image data (one entry of `WasmHtmlExtraction.inlineImages`)
63
+ */
64
+ export class WasmInlineImage {
65
+ private constructor();
66
+ free(): void;
67
+ [Symbol.dispose](): void;
68
+ readonly attributes: any;
69
+ readonly dimensions: Uint32Array | undefined;
70
+ readonly description: string | undefined;
71
+ readonly data: Uint8Array;
72
+ readonly format: string;
73
+ readonly source: string;
74
+ readonly filename: string | undefined;
75
+ }
76
+ /**
77
+ * Inline image configuration (the `image_config` argument of `convertWithInlineImages`); takes an optional `max_decoded_size_bytes`
78
+ */
79
+ export class WasmInlineImageConfig {
80
+ free(): void;
81
+ [Symbol.dispose](): void;
82
+ constructor(max_decoded_size_bytes?: number | null);
83
+ set captureSvg(value: boolean);
84
+ set filenamePrefix(value: string | null | undefined);
85
+ set inferDimensions(value: boolean);
86
+ }
87
+ /**
88
+ * Warning about inline image processing (one entry of `WasmHtmlExtraction.warnings`)
89
+ */
90
+ export class WasmInlineImageWarning {
91
+ private constructor();
92
+ free(): void;
93
+ [Symbol.dispose](): void;
94
+ readonly index: number;
95
+ readonly message: string;
96
+ }
97
+
98
+ export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
99
+
100
+ export interface InitOutput {
101
+ readonly memory: WebAssembly.Memory;
102
+ readonly __wbg_wasmhtmlextraction_free: (a: number, b: number) => void;
103
+ readonly __wbg_wasminlineimage_free: (a: number, b: number) => void;
104
+ readonly __wbg_wasminlineimageconfig_free: (a: number, b: number) => void;
105
+ readonly __wbg_wasminlineimagewarning_free: (a: number, b: number) => void;
106
+ readonly convert: (a: number, b: number, c: number, d: number) => void;
107
+ readonly convertWithInlineImages: (a: number, b: number, c: number, d: number, e: number) => void;
108
+ readonly wasmhtmlextraction_inlineImages: (a: number, b: number) => void;
109
+ readonly wasmhtmlextraction_markdown: (a: number, b: number) => void;
110
+ readonly wasmhtmlextraction_warnings: (a: number, b: number) => void;
111
+ readonly wasminlineimage_attributes: (a: number) => number;
112
+ readonly wasminlineimage_data: (a: number) => number;
113
+ readonly wasminlineimage_description: (a: number, b: number) => void;
114
+ readonly wasminlineimage_dimensions: (a: number, b: number) => void;
115
+ readonly wasminlineimage_filename: (a: number, b: number) => void;
116
+ readonly wasminlineimage_format: (a: number, b: number) => void;
117
+ readonly wasminlineimage_source: (a: number, b: number) => void;
118
+ readonly wasminlineimageconfig_new: (a: number, b: number) => number;
119
+ readonly wasminlineimageconfig_set_captureSvg: (a: number, b: number) => void;
120
+ readonly wasminlineimageconfig_set_filenamePrefix: (a: number, b: number, c: number) => void;
121
+ readonly wasminlineimageconfig_set_inferDimensions: (a: number, b: number) => void;
122
+ readonly wasminlineimagewarning_index: (a: number) => number;
123
+ readonly wasminlineimagewarning_message: (a: number, b: number) => void;
124
+ readonly init: () => void;
125
+ readonly __wbindgen_export: (a: number, b: number) => number;
126
+ readonly __wbindgen_export2: (a: number, b: number, c: number, d: number) => number;
127
+ readonly __wbindgen_export3: (a: number) => void;
128
+ readonly __wbindgen_export4: (a: number, b: number, c: number) => void;
129
+ readonly __wbindgen_add_to_stack_pointer: (a: number) => number;
130
+ readonly __wbindgen_start: () => void;
131
+ }
132
+
133
+ export type SyncInitInput = BufferSource | WebAssembly.Module;
134
+ /**
135
+ * Instantiates the given `module`, which can either be bytes or
136
+ * a precompiled `WebAssembly.Module`.
137
+ *
138
+ * @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated.
139
+ *
140
+ * @returns {InitOutput}
141
+ */
142
+ export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
143
+
144
+ /**
145
+ * If `module_or_path` is {RequestInfo} or {URL}, makes a request and
146
+ * for everything else, calls `WebAssembly.instantiate` directly.
147
+ *
148
+ * @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated.
149
+ *
150
+ * @returns {Promise<InitOutput>}
151
+ */
152
+ export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;