@kreuzberg/html-to-markdown-wasm 2.19.4 → 2.19.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright 2024-2025 Na'aman Hirschfeld
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/dist/README.md ADDED
@@ -0,0 +1,148 @@
1
+ # html-to-markdown
2
+
3
+ <div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
4
+ <!-- Language Bindings -->
5
+ <a href="https://crates.io/crates/html-to-markdown-rs">
6
+ <img src="https://img.shields.io/crates/v/html-to-markdown-rs?label=Rust&color=007ec6" alt="Rust">
7
+ </a>
8
+ <a href="https://pypi.org/project/html-to-markdown/">
9
+ <img src="https://img.shields.io/pypi/v/html-to-markdown?label=Python&color=007ec6" alt="Python">
10
+ </a>
11
+ <a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node">
12
+ <img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-node?label=Node.js&color=007ec6" alt="Node.js">
13
+ </a>
14
+ <a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm">
15
+ <img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-wasm?label=WASM&color=007ec6" alt="WASM">
16
+ </a>
17
+ <a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
18
+ <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
+ </a>
20
+ <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
21
+ <img src="https://img.shields.io/badge/Go-v2.19.0-007ec6" alt="Go">
22
+ </a>
23
+ <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
+ <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
25
+ </a>
26
+ <a href="https://packagist.org/packages/goldziher/html-to-markdown">
27
+ <img src="https://img.shields.io/packagist/v/goldziher/html-to-markdown?label=PHP&color=007ec6" alt="PHP">
28
+ </a>
29
+ <a href="https://rubygems.org/gems/html-to-markdown">
30
+ <img src="https://img.shields.io/gem/v/html-to-markdown?label=Ruby&color=007ec6" alt="Ruby">
31
+ </a>
32
+ <a href="https://hex.pm/packages/html_to_markdown">
33
+ <img src="https://img.shields.io/hexpm/v/html_to_markdown?label=Elixir&color=007ec6" alt="Elixir">
34
+ </a>
35
+
36
+ <!-- Project Info -->
37
+ <a href="https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE">
38
+ <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
39
+ </a>
40
+ </div>
41
+
42
+
43
+ <img width="3384" height="573" alt="Linkedin- Banner" src="https://github.com/user-attachments/assets/1bd52e37-c45d-4f5c-8408-ee12997f6cfd" />
44
+
45
+
46
+ <div align="center" style="margin-top: 20px;">
47
+ <a href="https://discord.gg/pXxagNK2zN">
48
+ <img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
49
+ </a>
50
+ </div>
51
+
52
+ High-performance HTML → Markdown conversion powered by Rust. Ships as a Rust crate, Python package, PHP extension, Ruby gem, Elixir Rustler NIF, Node.js bindings, WebAssembly module, Go module, Java library, C# NuGet package, and standalone CLI, with identical rendering behavior across all runtimes.
53
+
54
+ ## Key Features
55
+
56
+ - **Blazing Fast** – Rust-powered core delivers 10–80× faster conversion than pure Python alternatives (150–280 MB/s)
57
+ - **Polyglot** – Native bindings for Rust, Python, TypeScript/Node.js, Ruby, PHP, Go, Java, C#, and Elixir
58
+ - **Smart Conversion** – Handles complex documents including nested tables, code blocks, task lists, and hOCR OCR output
59
+ - **Metadata Extraction** – Extract document metadata (title, description, headers, links, images, structured data) alongside conversion
60
+ - **Visitor Pattern** – Custom callbacks for domain-specific dialects, content filtering, URL rewriting, accessibility validation
61
+ - **Highly Configurable** – Control heading styles, code block fences, list formatting, whitespace handling, and HTML sanitization
62
+ - **Tag Preservation** – Keep specific HTML tags unconverted when markdown isn't expressive enough
63
+ - **Secure by Default** – Built-in HTML sanitization prevents malicious content
64
+ - **Consistent Output** – Identical markdown rendering across all language bindings
65
+
66
+ **[Try the Live Demo →](https://kreuzberg-dev.github.io/html-to-markdown/)**
67
+
68
+ ## Installation
69
+
70
+ Each language binding provides comprehensive documentation with installation instructions, examples, and best practices. Choose your platform to get started:
71
+
72
+ **Scripting Languages:**
73
+ - **[Python](./packages/python/README.md)** – PyPI package, metadata extraction, visitor pattern, CLI included
74
+ - **[Ruby](./packages/ruby/README.md)** – RubyGems package, RBS type definitions, Steep checking
75
+ - **[PHP](./packages/php/README.md)** – Composer package + PIE extension, PHP 8.2+, PHPStan level 9
76
+ - **[Elixir](./packages/elixir/README.md)** – Hex package, Rustler NIF bindings, Elixir 1.19+
77
+
78
+ **JavaScript/TypeScript:**
79
+ - **[Node.js / TypeScript](./packages/typescript/README.md)** – Native NAPI-RS bindings for Node.js/Bun, fastest performance, WebAssembly for browsers/Deno
80
+
81
+ **Compiled Languages:**
82
+ - **[Go](./packages/go/v2/README.md)** – Go module with FFI bindings, automatic library download
83
+ - **[Java](./packages/java/README.md)** – Maven Central, Panama Foreign Function & Memory API, Java 24+
84
+ - **[C#](./packages/csharp/README.md)** – NuGet package, .NET 8.0+, P/Invoke FFI bindings
85
+
86
+ **Native:**
87
+ - **[Rust](./crates/html-to-markdown/README.md)** – Core library, flexible feature flags, zero-copy APIs
88
+
89
+ **Command-Line:**
90
+ - **[CLI](https://crates.io/crates/html-to-markdown-cli)** – Cross-platform binary via `cargo install html-to-markdown-cli` or [Homebrew](https://formulae.brew.sh/formula/html-to-markdown)
91
+
92
+ <details>
93
+ <summary><strong>Metadata Extraction</strong></summary>
94
+
95
+ Extract comprehensive metadata during conversion: title, description, headers, links, images, structured data (JSON-LD, Microdata, RDFa). Use cases: SEO extraction, table-of-contents generation, link validation, accessibility auditing, content migration.
96
+
97
+ **[Metadata Extraction Guide →](./examples/metadata-extraction/)**
98
+
99
+ </details>
100
+
101
+ <details>
102
+ <summary><strong>Visitor Pattern</strong></summary>
103
+
104
+ Customize HTML→Markdown conversion with callbacks for specific elements. Intercept links, images, headings, lists, and more. Use cases: domain-specific Markdown dialects (Obsidian, Notion), content filtering, URL rewriting, accessibility validation, analytics.
105
+
106
+ **[Visitor Pattern Guide →](./examples/visitor-pattern/)**
107
+
108
+ </details>
109
+
110
+ <details>
111
+ <summary><strong>Performance & Benchmarking</strong></summary>
112
+
113
+ Rust-powered core delivers 150–280 MB/s throughput (10–80× faster than pure Python alternatives). Includes benchmarking tools, memory profiling, streaming strategies, and optimization tips.
114
+
115
+ **[Performance Guide →](./examples/performance/)**
116
+
117
+ </details>
118
+
119
+ <details>
120
+ <summary><strong>Tag Preservation</strong></summary>
121
+
122
+ Keep specific HTML tags unconverted when Markdown isn't expressive enough. Useful for tables, SVG, custom elements, or when you need mixed HTML/Markdown output.
123
+
124
+ See language-specific documentation for `preserveTags` configuration.
125
+
126
+ </details>
127
+
128
+ <details>
129
+ <summary><strong>Secure by Default</strong></summary>
130
+
131
+ Built-in HTML sanitization prevents XSS attacks and malicious content. Powered by ammonia with safe defaults. Configurable via `sanitize` options.
132
+
133
+ </details>
134
+
135
+ ## Contributing
136
+
137
+ Contributions are welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on:
138
+
139
+ - Setting up the development environment
140
+ - Running tests locally (Rust 95%+ coverage, language bindings 80%+)
141
+ - Submitting pull requests
142
+ - Reporting issues
143
+
144
+ All contributions must follow code quality standards enforced via pre-commit hooks (prek).
145
+
146
+ ## License
147
+
148
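+ MIT License – see [LICENSE](LICENSE) for details. You can use html-to-markdown freely in both commercial and closed-source products with no copyleft ("viral") obligations; the only condition is that the copyright notice and permission notice be included in all copies or substantial portions of the software.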
@@ -0,0 +1,200 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+
4
/**
 * Handle object built from an optional `WasmConversionOptions` value.
 *
 * Instances are accepted by `convertWithOptionsHandle` and
 * `convertBytesWithOptionsHandle`, and are also returned by
 * `createConversionOptionsHandle`.
 *
 * NOTE(review): `free()` / `[Symbol.dispose]()` presumably release the
 * WASM-side object backing this handle — confirm against the generated glue.
 */
+ export class WasmConversionOptionsHandle {
5
+ free(): void;
6
+ [Symbol.dispose](): void;
7
+ constructor(options?: WasmConversionOptions | null);
8
+ }
9
+
10
/**
 * Result type of `convertWithInlineImages` and `convertBytesWithInlineImages`.
 *
 * The constructor is private, so instances can only be obtained from those
 * functions. Exposes the converted `markdown` string together with the
 * `inlineImages` list and a list of `warnings`.
 */
+ export class WasmHtmlExtraction {
11
+ private constructor();
12
+ free(): void;
13
+ [Symbol.dispose](): void;
14
+ readonly inlineImages: WasmInlineImage[];
15
+ readonly markdown: string;
16
+ readonly warnings: WasmInlineImageWarning[];
17
+ }
18
+
19
/**
 * Element type of `WasmHtmlExtraction.inlineImages`.
 *
 * The constructor is private; all properties are read-only. `dimensions`,
 * `description` and `filename` may be `undefined`.
 *
 * NOTE(review): `dimensions` is presumably `[width, height]` and `data` the
 * decoded image bytes — neither is established by this declaration; confirm.
 */
+ export class WasmInlineImage {
20
+ private constructor();
21
+ free(): void;
22
+ [Symbol.dispose](): void;
23
+ readonly attributes: Record<string, string>;
24
+ readonly dimensions: Uint32Array | undefined;
25
+ readonly description: string | undefined;
26
+ readonly data: Uint8Array;
27
+ readonly format: string;
28
+ readonly source: string;
29
+ readonly filename: string | undefined;
30
+ }
31
+
32
/**
 * Configuration object passed as `image_config` to `convertWithInlineImages`
 * and `convertBytesWithInlineImages`.
 *
 * The constructor takes an optional `max_decoded_size_bytes` limit. The
 * remaining settings are declared as setters only (no getters), so they can
 * be assigned but not read back through this type.
 */
+ export class WasmInlineImageConfig {
33
+ free(): void;
34
+ [Symbol.dispose](): void;
35
+ constructor(max_decoded_size_bytes?: number | null);
36
+ set captureSvg(value: boolean);
37
+ set filenamePrefix(value: string | null | undefined);
38
+ set inferDimensions(value: boolean);
39
+ }
40
+
41
/**
 * Element type of `WasmHtmlExtraction.warnings`.
 *
 * The constructor is private; each warning carries a numeric `index` and a
 * human-readable `message`.
 *
 * NOTE(review): `index` presumably identifies the inline image the warning
 * refers to — confirm.
 */
+ export class WasmInlineImageWarning {
42
+ private constructor();
43
+ free(): void;
44
+ [Symbol.dispose](): void;
45
+ readonly index: number;
46
+ readonly message: string;
47
+ }
48
+
49
/**
 * Configuration object passed as `metadata_config` to `convertWithMetadata`
 * and `convertBytesWithMetadata`.
 *
 * Note that the properties are declared in snake_case (`extract_links`,
 * `extract_headers`, ...), unlike the camelCase fields of
 * `WasmConversionOptions`.
 */
+ export class WasmMetadataConfig {
50
+ free(): void;
51
+ [Symbol.dispose](): void;
52
+ /**
53
+ * Create a new metadata configuration with defaults
54
+ *
55
+ * All extraction types enabled by default with 1MB structured data limit
56
+ */
57
+ constructor();
58
+ extract_links: boolean;
59
+ extract_images: boolean;
60
+ extract_headers: boolean;
61
+ extract_document: boolean;
62
+ extract_structured_data: boolean;
63
+ max_structured_data_size: number;
64
+ }
65
+
66
+ /**
67
+ * Convert HTML to Markdown
68
+ *
69
+ * # Arguments
70
+ *
71
+ * * `html` - The HTML string to convert
72
+ * * `options` - Optional conversion options (as a JavaScript object); may be omitted or null
73
+ *
74
+ * # Example
75
+ *
76
+ * ```javascript
77
+ * import { convert } from '@kreuzberg/html-to-markdown-wasm';
78
+ *
79
+ * const html = '<h1>Hello World</h1>';
80
+ * const markdown = convert(html);
81
+ * console.log(markdown); // # Hello World
82
+ * ```
83
+ */
84
+ export function convert(html: string, options?: WasmConversionOptions | null): string;
85
+
86
/**
 * Byte-input variant of `convert`: takes the HTML as a `Uint8Array` plus the
 * same optional `options` object and returns a string.
 *
 * NOTE(review): the expected byte encoding is not visible in this declaration
 * (presumably UTF-8) — confirm.
 */
+ export function convertBytes(html: Uint8Array, options?: WasmConversionOptions | null): string;
87
+
88
/**
 * Byte-input variant of `convertWithInlineImages`: takes the HTML as a
 * `Uint8Array` and returns a `WasmHtmlExtraction` (markdown, inline images,
 * warnings). Both `options` and `image_config` may be omitted or null.
 */
+ export function convertBytesWithInlineImages(html: Uint8Array, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
89
+
90
+ /**
91
+ * Convert HTML bytes to Markdown with metadata extraction
92
+ *
93
+ * # Arguments
94
+ *
95
+ * * `html` - The HTML bytes to convert
96
+ * * `options` - Optional conversion options (as a JavaScript object); may be omitted or null
97
+ * * `metadata_config` - Optional metadata extraction configuration (`WasmMetadataConfig`); may be omitted or null
98
+ *
99
+ * # Returns
100
+ *
101
+ * JavaScript object with `markdown` (string) and `metadata` (object) fields (typed as `any`)
102
+ */
103
+ export function convertBytesWithMetadata(html: Uint8Array, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
104
+
105
/**
 * Byte-input variant of `convertWithOptionsHandle`: takes the HTML as a
 * `Uint8Array` and a required `WasmConversionOptionsHandle`, and returns a
 * string.
 */
+ export function convertBytesWithOptionsHandle(html: Uint8Array, handle: WasmConversionOptionsHandle): string;
106
+
107
/**
 * Converts an HTML string and returns a `WasmHtmlExtraction`, which exposes
 * the resulting `markdown` together with `inlineImages` and `warnings`.
 * Both `options` and `image_config` may be omitted or null.
 */
+ export function convertWithInlineImages(html: string, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
108
+
109
+ /**
110
+ * Convert HTML to Markdown with metadata extraction
111
+ *
112
+ * # Arguments
113
+ *
114
+ * * `html` - The HTML string to convert
115
+ * * `options` - Optional conversion options (as a JavaScript object); may be omitted or null
116
+ * * `metadata_config` - Optional metadata extraction configuration (`WasmMetadataConfig`); may be omitted or null
117
+ *
118
+ * # Returns
119
+ *
120
+ * JavaScript object with `markdown` (string) and `metadata` (object) fields (typed as `any`)
121
+ *
122
+ * # Example
123
+ *
124
+ * ```javascript
125
+ * import { convertWithMetadata, WasmMetadataConfig } from '@kreuzberg/html-to-markdown-wasm';
126
+ *
127
+ * const html = '<h1>Hello World</h1><a href="https://example.com">Link</a>';
128
+ * const config = new WasmMetadataConfig();
129
+ * config.extract_headers = true;
130
+ * config.extract_links = true;
131
+ *
132
+ * const result = convertWithMetadata(html, null, config);
133
+ * console.log(result.markdown); // # Hello World\n\n[Link](https://example.com)
134
+ * console.log(result.metadata.headers); // [{ level: 1, text: "Hello World", ... }]
135
+ * console.log(result.metadata.links); // [{ href: "https://example.com", text: "Link", ... }]
136
+ * ```
137
+ */
138
+ export function convertWithMetadata(html: string, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
139
+
140
/**
 * Converts an HTML string using a `WasmConversionOptionsHandle` instead of a
 * plain options object, and returns a string. The handle is required.
 *
 * NOTE(review): presumably this avoids re-processing the options object on
 * every call — confirm.
 */
+ export function convertWithOptionsHandle(html: string, handle: WasmConversionOptionsHandle): string;
141
+
142
/**
 * Factory returning a `WasmConversionOptionsHandle`. Takes the same optional
 * `options` argument as the `WasmConversionOptionsHandle` constructor.
 */
+ export function createConversionOptionsHandle(options?: WasmConversionOptions | null): WasmConversionOptionsHandle;
143
+
144
+ /**
145
+ * Initialize panic hook for better error messages in the browser
+ *
+ * Synchronous; takes no arguments and returns nothing. This is a separate
+ * export from `initWasm()`, which returns a promise for the loading of the
+ * WebAssembly bundle itself.
146
+ */
147
+ export function init(): void;
148
+
149
/**
 * Returns a promise that settles once the WebAssembly bundle has been
 * initialized. The loader rejects it with an Error when it cannot detect a
 * supported module shape.
 */
+ export declare function initWasm(): Promise<void>;
150
/**
 * Promise for the initialization that the loader starts at module load;
 * await it (or `initWasm()`) before converting in runtimes that load WASM
 * asynchronously.
 */
+ export declare const wasmReady: Promise<void>;
151
+
152
+
153
/**
 * String-literal unions used as the value types of the enum-like fields of
 * `WasmConversionOptions` and `WasmPreprocessingOptions`.
 */
+ export type WasmHeadingStyle = "underlined" | "atx" | "atxClosed";
154
+ export type WasmListIndentType = "spaces" | "tabs";
155
+ export type WasmWhitespaceMode = "normalized" | "strict";
156
+ export type WasmNewlineStyle = "spaces" | "backslash";
157
+ export type WasmCodeBlockStyle = "indented" | "backticks" | "tildes";
158
+ export type WasmHighlightStyle = "doubleEqual" | "html" | "bold" | "none";
159
+ export type WasmPreprocessingPreset = "minimal" | "standard" | "aggressive";
160
+
161
/**
 * Value type of `WasmConversionOptions.preprocessing`. Every field is
 * optional; `preset` takes one of the `WasmPreprocessingPreset` strings.
 */
+ export interface WasmPreprocessingOptions {
162
+ enabled?: boolean;
163
+ preset?: WasmPreprocessingPreset;
164
+ removeNavigation?: boolean;
165
+ removeForms?: boolean;
166
+ }
167
+
168
/**
 * Plain-object conversion options accepted by the `convert*` functions, the
 * `WasmConversionOptionsHandle` constructor and `createConversionOptionsHandle`.
 *
 * Every field is optional and camelCase. Enum-like fields take the
 * string-literal unions declared earlier in this file (`WasmHeadingStyle`,
 * `WasmListIndentType`, `WasmHighlightStyle`, `WasmWhitespaceMode`,
 * `WasmNewlineStyle`, `WasmCodeBlockStyle`); `preprocessing` additionally
 * accepts null.
 */
+ export interface WasmConversionOptions {
169
+ headingStyle?: WasmHeadingStyle;
170
+ listIndentType?: WasmListIndentType;
171
+ listIndentWidth?: number;
172
+ bullets?: string;
173
+ strongEmSymbol?: string;
174
+ escapeAsterisks?: boolean;
175
+ escapeUnderscores?: boolean;
176
+ escapeMisc?: boolean;
177
+ escapeAscii?: boolean;
178
+ codeLanguage?: string;
179
+ autolinks?: boolean;
180
+ defaultTitle?: boolean;
181
+ brInTables?: boolean;
182
+ hocrSpatialTables?: boolean;
183
+ highlightStyle?: WasmHighlightStyle;
184
+ extractMetadata?: boolean;
185
+ whitespaceMode?: WasmWhitespaceMode;
186
+ stripNewlines?: boolean;
187
+ wrap?: boolean;
188
+ wrapWidth?: number;
189
+ convertAsInline?: boolean;
190
+ subSymbol?: string;
191
+ supSymbol?: string;
192
+ newlineStyle?: WasmNewlineStyle;
193
+ codeBlockStyle?: WasmCodeBlockStyle;
194
+ keepInlineImagesIn?: string[];
195
+ preprocessing?: WasmPreprocessingOptions | null;
196
+ encoding?: string;
197
+ debug?: boolean;
198
+ stripTags?: string[];
199
+ preserveTags?: string[];
200
+ }
@@ -0,0 +1,116 @@
1
+ import * as wasmModule from "./html_to_markdown_wasm_bg.wasm";
2
+ export * from "./html_to_markdown_wasm_bg.js";
3
+ import * as imports_mod from "./html_to_markdown_wasm_bg.js";
4
+
5
/**
 * Builds the error reported when the WASM exports are touched before
 * initialization has finished. A fresh Error is created on every call.
 *
 * @returns {Error} the "still initializing" error
 */
function notReadyError() {
  return new Error("html-to-markdown-wasm: WebAssembly bundle is still initializing. Await initWasm() before calling convert() in runtimes that load WASM asynchronously (e.g., Cloudflare Workers).");
}

// Proxy traps for the placeholder below: every property read throws the
// not-ready error, except "__esModule", which reports true.
const notReadyHandler = {
  get(_target, prop) {
    if (prop !== "__esModule") {
      throw notReadyError();
    }
    return true;
  },
};

// Placeholder handed to the glue module until the real exports are available.
const notReadyProxy = new Proxy({}, notReadyHandler);

// Module-level initialization state, written by finalize() and
// ensureInitPromise().
let initPromise;
let initialized = false;
let wasmExports;

imports_mod.__wbg_set_wasm(notReadyProxy);
22
+
23
/**
 * Extracts the wasm exports object from the different shapes a bundler may
 * hand back for a `.wasm` import.
 *
 * Recognized shapes, checked in this order:
 *   1. a raw exports object — identified by a `__wbindgen_start` function or
 *      by a `memory` export that is a `WebAssembly.Memory`. The second test
 *      covers exports without a start function, which finalize() already
 *      treats as valid.
 *   2. a `WebAssembly.Instance`
 *   3. an object wrapping an instance as `.instance`, `.default`, or
 *      `.default.instance`
 *
 * @param {*} value - candidate module / instance / wrapper; may be nullish
 * @returns {object|null} the exports object, or null when no shape matches
 */
function asExports(value) {
  if (!value) {
    return null;
  }

  const looksLikeExports =
    typeof value.__wbindgen_start === "function" ||
    value.memory instanceof WebAssembly.Memory;
  if (looksLikeExports) {
    return value;
  }

  if (value instanceof WebAssembly.Instance) {
    return value.exports;
  }

  // Only plain objects (including module namespaces) can wrap an instance.
  if (typeof value !== "object") {
    return null;
  }

  const wrappedInstance = [
    value.instance,
    value.default,
    value.default && value.default.instance,
  ].find((candidate) => candidate instanceof WebAssembly.Instance);

  return wrappedInstance ? wrappedInstance.exports : null;
}
46
+
47
/**
 * Installs `exports` as the live wasm exports and marks the module ready.
 *
 * Stores the exports in `wasmExports`, hands them to the glue module through
 * `__wbg_set_wasm`, runs `__wbindgen_start` when the exports provide one, and
 * only then sets `initialized`.
 *
 * @param {object} exports - the wasm exports object
 * @returns {object} the same `exports` object
 */
function finalize(exports) {
  wasmExports = exports;
  imports_mod.__wbg_set_wasm(exports);

  const hasStart = typeof exports.__wbindgen_start === "function";
  if (hasStart) {
    exports.__wbindgen_start();
  }

  initialized = true;
  return exports;
}
56
+
57
/**
 * Best-effort synchronous initialization at module load.
 *
 * When the imported wasm module already has a shape asExports() recognizes,
 * it is finalized immediately. Any failure is deliberately swallowed so that
 * the asynchronous path in ensureInitPromise() can take over.
 */
function trySyncInit() {
  try {
    const detected = asExports(wasmModule);
    if (!detected) {
      return;
    }
    finalize(detected);
  } catch {
    // ignore and fall back to async init
  }
}

trySyncInit();
69
+
70
/**
 * Builds the imports object for instantiating the wasm module: every
 * `__wbg_*` / `__wbindgen_*` function exported by the glue module (except
 * `__wbg_set_wasm`), keyed under the glue module's import specifier.
 *
 * @returns {object} imports object suitable for WebAssembly.instantiate
 */
function collectGlueImports() {
  const glue = {};
  for (const key of Object.keys(imports_mod)) {
    const isGlueName = key.startsWith("__wbg_") || key.startsWith("__wbindgen_");
    if (isGlueName && key !== "__wbg_set_wasm" && typeof imports_mod[key] === "function") {
      glue[key] = imports_mod[key];
    }
  }
  return { "./html_to_markdown_wasm_bg.js": glue };
}

/**
 * Returns the shared initialization promise, starting initialization on the
 * first call if the synchronous path did not already succeed.
 *
 * Module shapes handled, in order:
 *   - a promise-like module (awaited first)
 *   - a function loader exposed as `default` (e.g. @rollup/plugin-wasm). It is
 *     called with the glue imports object; the original passed the module
 *     namespace itself, leaving the wasm module's imports unresolved.
 *   - a `WebAssembly.Module` exposed as `default` (Wrangler/esbuild), which is
 *     instantiated here with the glue imports
 *   - anything asExports() recognizes
 *
 * @returns {Promise<object>} resolves with the wasm exports; rejects with an
 *   Error when no supported shape is detected. The same promise is reused by
 *   later calls.
 */
async function ensureInitPromise() {
  if (initialized) {
    return wasmExports;
  }

  if (!initPromise) {
    initPromise = (async () => {
      let module = wasmModule;

      // Handle promise-wrapped modules
      if (module && typeof module.then === "function") {
        module = await module;
      }

      // Handle function loaders (like @rollup/plugin-wasm): they take the
      // imports object and return the instantiated module.
      if (module && typeof module.default === "function") {
        module = await module.default(collectGlueImports());
      }

      // Handle WebAssembly.Module (Wrangler/esbuild)
      if (module && module.default instanceof WebAssembly.Module) {
        const instance = await WebAssembly.instantiate(module.default, collectGlueImports());
        return finalize(instance.exports);
      }

      // Try standard export detection
      const exports = asExports(module);
      if (!exports) {
        throw new Error("html-to-markdown-wasm: failed to initialize WebAssembly bundle. Call initWasm() with a supported bundler configuration.");
      }
      return finalize(exports);
    })();
  }

  return initPromise;
}
111
+
112
/**
 * Promise for the initialization started when this module is loaded.
 * Settles the same way as ensureInitPromise().
 */
export const wasmReady = ensureInitPromise();

/**
 * Ensures the WebAssembly bundle is initialized.
 *
 * @returns {Promise<object>} settles with the result of ensureInitPromise()
 */
export async function initWasm() {
  const ready = await ensureInitPromise();
  return ready;
}