@kreuzberg/html-to-markdown-wasm 2.30.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/README.md CHANGED
@@ -17,8 +17,8 @@
17
17
  <a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
18
18
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
19
  </a>
20
- <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
21
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v2.28.4" alt="Go">
20
+ <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
21
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.0" alt="Go">
22
22
  </a>
23
23
  <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
24
  <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
@@ -65,6 +65,7 @@ High-performance HTML to Markdown conversion powered by Rust. Ships as native bi
65
65
 
66
66
  - **150-280 MB/s** throughput (10-80x faster than pure Python alternatives)
67
67
  - **12 language bindings** with consistent output across all runtimes
68
+ - **Structured result** — `convert()` returns `ConversionResult` with `content`, `metadata`, `tables`, `images`, and `warnings`
68
69
  - **Metadata extraction** — title, headers, links, images, structured data (JSON-LD, Microdata, RDFa)
69
70
  - **Visitor pattern** — custom callbacks for content filtering, URL rewriting, domain-specific dialects
70
71
  - **Table extraction** — extract structured table data (cells, headers, rendered markdown) during conversion
@@ -93,6 +94,53 @@ brew install kreuzberg-dev/tap/html-to-markdown
93
94
 
94
95
  See the **[Installation Guide](https://docs.html-to-markdown.kreuzberg.dev/getting-started/installation/)** for all languages including PHP, Go, Java, C#, Elixir, R, and WASM.
95
96
 
97
+ ### Usage
98
+
99
+ `convert()` is the single entry point. It returns a structured `ConversionResult`:
100
+
101
+ ```python
102
+ # Python
103
+ from html_to_markdown import convert
104
+
105
+ result = convert("<h1>Hello</h1><p>World</p>")
106
+ print(result["content"]) # # Hello\n\nWorld
107
+ print(result["metadata"]) # title, links, headings, …
108
+ ```
109
+
110
+ ```typescript
111
+ // TypeScript / Node.js
112
+ import { convert } from "@kreuzberg/html-to-markdown-node";
113
+
114
+ const result = convert("<h1>Hello</h1><p>World</p>");
115
+ console.log(result.content); // # Hello\n\nWorld
116
+ console.log(result.metadata); // title, links, headings, …
117
+ ```
118
+
119
+ ```rust
120
+ // Rust
121
+ use html_to_markdown_rs::convert;
122
+
123
+ let result = convert("<h1>Hello</h1><p>World</p>", None)?;
124
+ println!("{}", result.content.unwrap_or_default());
125
+ ```
126
+
127
+ ## Language Bindings
128
+
129
+ | Language | Package | Install |
130
+ |----------|---------|---------|
131
+ | Rust | [html-to-markdown-rs](https://crates.io/crates/html-to-markdown-rs) | `cargo add html-to-markdown-rs` |
132
+ | Python | [html-to-markdown](https://pypi.org/project/html-to-markdown/) | `pip install html-to-markdown` |
133
+ | TypeScript / Node.js | [@kreuzberg/html-to-markdown-node](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node) | `npm install @kreuzberg/html-to-markdown-node` |
134
+ | WebAssembly | [@kreuzberg/html-to-markdown-wasm](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm) | `npm install @kreuzberg/html-to-markdown-wasm` |
135
+ | Ruby | [html-to-markdown](https://rubygems.org/gems/html-to-markdown) | `gem install html-to-markdown` |
136
+ | PHP | [kreuzberg-dev/html-to-markdown](https://packagist.org/packages/kreuzberg-dev/html-to-markdown) | `composer require kreuzberg-dev/html-to-markdown` |
137
+ | Go | [htmltomarkdown](https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown) | `go get github.com/kreuzberg-dev/html-to-markdown/packages/go/v3` |
138
+ | Java | [dev.kreuzberg:html-to-markdown](https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown) | Maven / Gradle |
139
+ | C# | [KreuzbergDev.HtmlToMarkdown](https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/) | `dotnet add package KreuzbergDev.HtmlToMarkdown` |
140
+ | Elixir | [html_to_markdown](https://hex.pm/packages/html_to_markdown) | add `{:html_to_markdown, "~> 3.0"}` to `mix.exs`, then `mix deps.get` |
141
+ | R | [htmltomarkdown](https://kreuzberg-dev.r-universe.dev/htmltomarkdown) | `install.packages("htmltomarkdown")` |
142
+ | C (FFI) | [releases](https://github.com/kreuzberg-dev/html-to-markdown/releases) | Pre-built `.so` / `.dll` / `.dylib` |
143
+
96
144
  ## Part of the Kreuzberg Ecosystem
97
145
 
98
146
  html-to-markdown is developed by [kreuzberg.dev](https://kreuzberg.dev) and powers the HTML conversion pipeline in [Kreuzberg](https://docs.kreuzberg.dev), a document intelligence library for extracting text from PDFs, images, and office documents.
@@ -1,183 +1,35 @@
1
1
  /* tslint:disable */
2
2
  /* eslint-disable */
3
3
 
4
- export class WasmConversionOptionsHandle {
5
- free(): void;
6
- [Symbol.dispose](): void;
7
- constructor(options?: WasmConversionOptions | null);
8
- }
9
-
10
- /**
11
- * Result of HTML extraction with inline images
12
- */
13
- export class WasmHtmlExtraction {
14
- private constructor();
15
- free(): void;
16
- [Symbol.dispose](): void;
17
- readonly inlineImages: WasmInlineImage[];
18
- readonly markdown: string;
19
- readonly warnings: WasmInlineImageWarning[];
20
- }
21
-
22
4
  /**
23
- * Inline image data
24
- */
25
- export class WasmInlineImage {
26
- private constructor();
27
- free(): void;
28
- [Symbol.dispose](): void;
29
- readonly attributes: Record<string, string>;
30
- readonly data: Uint8Array;
31
- readonly description: string | undefined;
32
- readonly dimensions: Uint32Array | undefined;
33
- readonly filename: string | undefined;
34
- readonly format: string;
35
- readonly source: string;
36
- }
37
-
38
- /**
39
- * Inline image configuration
40
- */
41
- export class WasmInlineImageConfig {
42
- free(): void;
43
- [Symbol.dispose](): void;
44
- constructor(max_decoded_size_bytes?: number | null);
45
- set captureSvg(value: boolean);
46
- set filenamePrefix(value: string | null | undefined);
47
- set inferDimensions(value: boolean);
48
- }
49
-
50
- /**
51
- * Warning about inline image processing
52
- */
53
- export class WasmInlineImageWarning {
54
- private constructor();
55
- free(): void;
56
- [Symbol.dispose](): void;
57
- readonly index: number;
58
- readonly message: string;
59
- }
60
-
61
- export class WasmMetadataConfig {
62
- free(): void;
63
- [Symbol.dispose](): void;
64
- /**
65
- * Create a new metadata configuration with defaults
66
- *
67
- * All extraction types enabled by default with 1MB structured data limit
68
- */
69
- constructor();
70
- extract_document: boolean;
71
- extract_headers: boolean;
72
- extract_images: boolean;
73
- extract_links: boolean;
74
- extract_structured_data: boolean;
75
- max_structured_data_size: number;
76
- }
77
-
78
- /**
79
- * Convert HTML to Markdown
5
+ * Convert HTML to Markdown, returning a JavaScript object with structured content, metadata,
6
+ * images, and warnings in a single pass.
80
7
  *
81
- * # Arguments
82
- *
83
- * * `html` - The HTML string to convert
84
- * * `options` - Optional conversion options (as a JavaScript object)
85
- *
86
- * # Example
87
- *
88
- * ```javascript
89
- * import { convert } from 'html-to-markdown-wasm';
90
- *
91
- * const html = '<h1>Hello World</h1>';
92
- * const markdown = convert(html);
93
- * console.log(markdown); // # Hello World
94
- * ```
95
- */
96
- export function convert(html: string, options?: WasmConversionOptions | null): string;
97
-
98
- export function convertBytes(html: Uint8Array, options?: WasmConversionOptions | null): string;
99
-
100
- export function convertBytesWithInlineImages(html: Uint8Array, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
101
-
102
- /**
103
- * Convert HTML bytes to Markdown with metadata extraction
104
- *
105
- * # Arguments
106
- *
107
- * * `html` - The HTML bytes to convert
108
- * * `options` - Optional conversion options (as a JavaScript object)
109
- * * `metadata_config` - Metadata extraction configuration
110
- *
111
- * # Returns
112
- *
113
- * JavaScript object with `markdown` (string) and `metadata` (object) fields
114
- */
115
- export function convertBytesWithMetadata(html: Uint8Array, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
116
-
117
- export function convertBytesWithOptionsHandle(html: Uint8Array, handle: WasmConversionOptionsHandle): string;
118
-
119
- export function convertWithInlineImages(html: string, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
120
-
121
- /**
122
- * Convert HTML to Markdown with metadata extraction
123
- *
124
- * # Arguments
125
- *
126
- * * `html` - The HTML string to convert
127
- * * `options` - Optional conversion options (as a JavaScript object)
128
- * * `metadata_config` - Metadata extraction configuration
129
- *
130
- * # Returns
131
- *
132
- * JavaScript object with `markdown` (string) and `metadata` (object) fields
133
- *
134
- * # Example
135
- *
136
- * ```javascript
137
- * import { convertWithMetadata, WasmMetadataConfig } from 'html-to-markdown-wasm';
138
- *
139
- * const html = '<h1>Hello World</h1><a href="https://example.com">Link</a>';
140
- * const config = new WasmMetadataConfig();
141
- * config.extractHeaders = true;
142
- * config.extractLinks = true;
143
- *
144
- * const result = convertWithMetadata(html, null, config);
145
- * console.log(result.markdown); // # Hello World\n\n[Link](https://example.com)
146
- * console.log(result.metadata.headers); // [{ level: 1, text: "Hello World", ... }]
147
- * console.log(result.metadata.links); // [{ href: "https://example.com", text: "Link", ... }]
148
- * ```
149
- */
150
- export function convertWithMetadata(html: string, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
151
-
152
- export function convertWithOptionsHandle(html: string, handle: WasmConversionOptionsHandle): string;
153
-
154
- /**
155
- * Convert HTML to Markdown with structured table extraction
8
+ * This is the primary API entry point. Returns a JavaScript object with:
9
+ * - `content`: converted text (string or null)
10
+ * - `document`: structured document tree (object or null)
11
+ * - `metadata`: extracted HTML metadata (object or null)
12
+ * - `tables`: array of extracted table data
13
+ * - `images`: array of extracted inline images
+ * - `warnings`: array of non-fatal processing warnings
156
14
  *
157
15
  * # Arguments
158
16
  *
159
17
  * * `html` - The HTML string to convert
160
18
  * * `options` - Optional conversion options (as a JavaScript object)
161
- * * `metadata_config` - Optional metadata extraction configuration
162
- *
163
- * # Returns
164
- *
165
- * JavaScript object with `content` (string), `tables` (array), and `metadata` (object|null) fields
166
19
  *
167
20
  * # Example
168
21
  *
169
22
  * ```javascript
170
- * import { convertWithTables } from 'html-to-markdown-wasm';
23
+ * import { convert } from '@kreuzberg/html-to-markdown-wasm';
171
24
  *
172
- * const html = '<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>';
173
- * const result = convertWithTables(html, null, null);
174
- * console.log(result.content);
175
- * console.log(result.tables[0].cells);
25
+ * const html = '<h1>Hello World</h1><p>Some text.</p>';
26
+ * const result = convert(html, null);
27
+ * console.log(result.content); // '# Hello World\n\nSome text.'
28
+ * console.log(result.tables); // []
29
+ * console.log(result.warnings); // []
176
30
  * ```
177
31
  */
178
- export function convertWithTables(html: string, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
179
-
180
- export function createConversionOptionsHandle(options?: WasmConversionOptions | null): WasmConversionOptionsHandle;
32
+ export function convert(html: string, options?: WasmConversionOptions | null): WasmConversionResult;
181
33
 
182
34
  /**
183
35
  * Initialize panic hook for better error messages in the browser
@@ -195,6 +47,7 @@ export type WasmNewlineStyle = "spaces" | "backslash";
195
47
  export type WasmCodeBlockStyle = "indented" | "backticks" | "tildes";
196
48
  export type WasmHighlightStyle = "doubleEqual" | "html" | "bold" | "none";
197
49
  export type WasmPreprocessingPreset = "minimal" | "standard" | "aggressive";
50
+ export type WasmOutputFormat = "markdown" | "djot" | "plain";
198
51
 
199
52
  export interface WasmPreprocessingOptions {
200
53
  enabled?: boolean;
@@ -235,4 +88,78 @@ export interface WasmConversionOptions {
235
88
  debug?: boolean;
236
89
  stripTags?: string[];
237
90
  preserveTags?: string[];
91
+ skipImages?: boolean;
92
+ outputFormat?: WasmOutputFormat;
93
+ includeDocumentStructure?: boolean;
94
+ extractImages?: boolean;
95
+ maxImageSize?: number;
96
+ captureSvg?: boolean;
97
+ inferDimensions?: boolean;
98
+ }
99
+
100
+ /** A single cell in a structured table grid. */
101
+ export interface WasmGridCell {
102
+ content: string;
103
+ row: number;
104
+ col: number;
105
+ rowSpan: number;
106
+ colSpan: number;
107
+ isHeader: boolean;
108
+ }
109
+
110
+ /** Structured table grid with cell-level data. */
111
+ export interface WasmTableGrid {
112
+ rows: number;
113
+ cols: number;
114
+ cells: WasmGridCell[];
115
+ }
116
+
117
+ /** A table extracted during conversion. */
118
+ export interface WasmConversionTable {
119
+ grid: WasmTableGrid;
120
+ markdown: string;
121
+ }
122
+
123
+ /** Non-fatal warning emitted during conversion. */
124
+ export interface WasmConversionWarning {
125
+ /** Human-readable warning message. */
126
+ message: string;
127
+ /** Warning kind identifier. */
128
+ kind: string;
129
+ }
130
+
131
+ /** An extracted inline image from the HTML document. */
132
+ export interface WasmInlineImage {
133
+ /** Raw image data as a Uint8Array. */
134
+ data: Uint8Array;
135
+ /** Image format (png, jpeg, gif, svg, etc.). */
136
+ format: string;
137
+ /** Generated or provided filename, or null. */
138
+ filename: string | null;
139
+ /** Alt text or description, or null. */
140
+ description: string | null;
141
+ /** Image width in pixels, or null if not available. */
142
+ width: number | null;
143
+ /** Image height in pixels, or null if not available. */
144
+ height: number | null;
145
+ /** Source type ("img_data_uri" or "svg_element"). */
146
+ source: string;
147
+ /** HTML attributes from the source element. */
148
+ attributes: Record<string, string>;
149
+ }
150
+
151
+ /** Result of the convert() API. */
152
+ export interface WasmConversionResult {
153
+ /** Converted text output (markdown, djot, or plain text), or null. */
154
+ content: string | null;
155
+ /** Structured document tree serialized as a JSON value, or null. */
156
+ document: unknown | null;
157
+ /** Extracted HTML metadata serialized as a JSON value, or null. */
158
+ metadata: unknown | null;
159
+ /** All tables found in the HTML, in document order. */
160
+ tables: WasmConversionTable[];
161
+ /** Extracted inline images (data URIs and SVGs). */
162
+ images: WasmInlineImage[];
163
+ /** Non-fatal processing warnings. */
164
+ warnings: WasmConversionWarning[];
238
165
  }