@kreuzberg/html-to-markdown-wasm 2.30.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +124 -286
- package/dist/README.md +50 -2
- package/dist/html_to_markdown_wasm.d.ts +15 -163
- package/dist/html_to_markdown_wasm_bg.js +73 -887
- package/dist/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist/html_to_markdown_wasm_bg.wasm.d.ts +0 -45
- package/dist/package.json +1 -1
- package/dist-node/README.md +50 -2
- package/dist-node/html_to_markdown_wasm.d.ts +15 -163
- package/dist-node/html_to_markdown_wasm.js +74 -903
- package/dist-node/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-node/html_to_markdown_wasm_bg.wasm.d.ts +0 -45
- package/dist-node/package.json +1 -1
- package/dist-web/README.md +50 -2
- package/dist-web/html_to_markdown_wasm.d.ts +15 -208
- package/dist-web/html_to_markdown_wasm.js +73 -888
- package/dist-web/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-web/html_to_markdown_wasm_bg.wasm.d.ts +0 -45
- package/dist-web/package.json +1 -1
- package/package.json +1 -1
package/dist/README.md
CHANGED
|
@@ -17,8 +17,8 @@
|
|
|
17
17
|
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
|
-
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/
|
|
21
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=
|
|
20
|
+
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
|
|
21
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.0" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
@@ -65,6 +65,7 @@ High-performance HTML to Markdown conversion powered by Rust. Ships as native bi
|
|
|
65
65
|
|
|
66
66
|
- **150-280 MB/s** throughput (10-80x faster than pure Python alternatives)
|
|
67
67
|
- **12 language bindings** with consistent output across all runtimes
|
|
68
|
+
- **Structured result** — `convert()` returns `ConversionResult` with `content`, `metadata`, `tables`, `images`, and `warnings`
|
|
68
69
|
- **Metadata extraction** — title, headers, links, images, structured data (JSON-LD, Microdata, RDFa)
|
|
69
70
|
- **Visitor pattern** — custom callbacks for content filtering, URL rewriting, domain-specific dialects
|
|
70
71
|
- **Table extraction** — extract structured table data (cells, headers, rendered markdown) during conversion
|
|
@@ -93,6 +94,53 @@ brew install kreuzberg-dev/tap/html-to-markdown
|
|
|
93
94
|
|
|
94
95
|
See the **[Installation Guide](https://docs.html-to-markdown.kreuzberg.dev/getting-started/installation/)** for all languages including PHP, Go, Java, C#, Elixir, R, and WASM.
|
|
95
96
|
|
|
97
|
+
### Usage
|
|
98
|
+
|
|
99
|
+
`convert()` is the single entry point. It returns a structured `ConversionResult`:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
# Python
|
|
103
|
+
from html_to_markdown import convert
|
|
104
|
+
|
|
105
|
+
result = convert("<h1>Hello</h1><p>World</p>")
|
|
106
|
+
print(result["content"]) # # Hello\n\nWorld
|
|
107
|
+
print(result["metadata"]) # title, links, headings, …
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
```typescript
|
|
111
|
+
// TypeScript / Node.js
|
|
112
|
+
import { convert } from "@kreuzberg/html-to-markdown-node";
|
|
113
|
+
|
|
114
|
+
const result = convert("<h1>Hello</h1><p>World</p>");
|
|
115
|
+
console.log(result.content); // # Hello\n\nWorld
|
|
116
|
+
console.log(result.metadata); // title, links, headings, …
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
```rust
|
|
120
|
+
// Rust
|
|
121
|
+
use html_to_markdown_rs::convert;
|
|
122
|
+
|
|
123
|
+
let result = convert("<h1>Hello</h1><p>World</p>", None)?;
|
|
124
|
+
println!("{}", result.content.unwrap_or_default());
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Language Bindings
|
|
128
|
+
|
|
129
|
+
| Language | Package | Install |
|
|
130
|
+
|----------|---------|---------|
|
|
131
|
+
| Rust | [html-to-markdown-rs](https://crates.io/crates/html-to-markdown-rs) | `cargo add html-to-markdown-rs` |
|
|
132
|
+
| Python | [html-to-markdown](https://pypi.org/project/html-to-markdown/) | `pip install html-to-markdown` |
|
|
133
|
+
| TypeScript / Node.js | [@kreuzberg/html-to-markdown-node](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node) | `npm install @kreuzberg/html-to-markdown-node` |
|
|
134
|
+
| WebAssembly | [@kreuzberg/html-to-markdown-wasm](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm) | `npm install @kreuzberg/html-to-markdown-wasm` |
|
|
135
|
+
| Ruby | [html-to-markdown](https://rubygems.org/gems/html-to-markdown) | `gem install html-to-markdown` |
|
|
136
|
+
| PHP | [kreuzberg-dev/html-to-markdown](https://packagist.org/packages/kreuzberg-dev/html-to-markdown) | `composer require kreuzberg-dev/html-to-markdown` |
|
|
137
|
+
| Go | [htmltomarkdown](https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown) | `go get github.com/kreuzberg-dev/html-to-markdown/packages/go/v3` |
|
|
138
|
+
| Java | [dev.kreuzberg:html-to-markdown](https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown) | Maven / Gradle |
|
|
139
|
+
| C# | [KreuzbergDev.HtmlToMarkdown](https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/) | `dotnet add package KreuzbergDev.HtmlToMarkdown` |
|
|
140
|
+
| Elixir | [html_to_markdown](https://hex.pm/packages/html_to_markdown) | add `:html_to_markdown` to the deps in `mix.exs`, then run `mix deps.get` |
|
|
141
|
+
| R | [htmltomarkdown](https://kreuzberg-dev.r-universe.dev/htmltomarkdown) | `install.packages("htmltomarkdown")` |
|
|
142
|
+
| C (FFI) | [releases](https://github.com/kreuzberg-dev/html-to-markdown/releases) | Pre-built `.so` / `.dll` / `.dylib` |
|
|
143
|
+
|
|
96
144
|
## Part of the Kreuzberg Ecosystem
|
|
97
145
|
|
|
98
146
|
html-to-markdown is developed by [kreuzberg.dev](https://kreuzberg.dev) and powers the HTML conversion pipeline in [Kreuzberg](https://docs.kreuzberg.dev), a document intelligence library for extracting text from PDFs, images, and office documents.
|
|
@@ -1,183 +1,35 @@
|
|
|
1
1
|
/* tslint:disable */
|
|
2
2
|
/* eslint-disable */
|
|
3
3
|
|
|
4
|
-
export class WasmConversionOptionsHandle {
|
|
5
|
-
free(): void;
|
|
6
|
-
[Symbol.dispose](): void;
|
|
7
|
-
constructor(options?: WasmConversionOptions | null);
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
/**
|
|
11
|
-
* Result of HTML extraction with inline images
|
|
12
|
-
*/
|
|
13
|
-
export class WasmHtmlExtraction {
|
|
14
|
-
private constructor();
|
|
15
|
-
free(): void;
|
|
16
|
-
[Symbol.dispose](): void;
|
|
17
|
-
readonly inlineImages: WasmInlineImage[];
|
|
18
|
-
readonly markdown: string;
|
|
19
|
-
readonly warnings: WasmInlineImageWarning[];
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
/**
|
|
23
|
-
* Inline image data
|
|
24
|
-
*/
|
|
25
|
-
export class WasmInlineImage {
|
|
26
|
-
private constructor();
|
|
27
|
-
free(): void;
|
|
28
|
-
[Symbol.dispose](): void;
|
|
29
|
-
readonly attributes: Record<string, string>;
|
|
30
|
-
readonly data: Uint8Array;
|
|
31
|
-
readonly description: string | undefined;
|
|
32
|
-
readonly dimensions: Uint32Array | undefined;
|
|
33
|
-
readonly filename: string | undefined;
|
|
34
|
-
readonly format: string;
|
|
35
|
-
readonly source: string;
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
/**
|
|
39
|
-
* Inline image configuration
|
|
40
|
-
*/
|
|
41
|
-
export class WasmInlineImageConfig {
|
|
42
|
-
free(): void;
|
|
43
|
-
[Symbol.dispose](): void;
|
|
44
|
-
constructor(max_decoded_size_bytes?: number | null);
|
|
45
|
-
set captureSvg(value: boolean);
|
|
46
|
-
set filenamePrefix(value: string | null | undefined);
|
|
47
|
-
set inferDimensions(value: boolean);
|
|
48
|
-
}
|
|
49
|
-
|
|
50
4
|
/**
|
|
51
|
-
*
|
|
52
|
-
|
|
53
|
-
export class WasmInlineImageWarning {
|
|
54
|
-
private constructor();
|
|
55
|
-
free(): void;
|
|
56
|
-
[Symbol.dispose](): void;
|
|
57
|
-
readonly index: number;
|
|
58
|
-
readonly message: string;
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
export class WasmMetadataConfig {
|
|
62
|
-
free(): void;
|
|
63
|
-
[Symbol.dispose](): void;
|
|
64
|
-
/**
|
|
65
|
-
* Create a new metadata configuration with defaults
|
|
66
|
-
*
|
|
67
|
-
* All extraction types enabled by default with 1MB structured data limit
|
|
68
|
-
*/
|
|
69
|
-
constructor();
|
|
70
|
-
extract_document: boolean;
|
|
71
|
-
extract_headers: boolean;
|
|
72
|
-
extract_images: boolean;
|
|
73
|
-
extract_links: boolean;
|
|
74
|
-
extract_structured_data: boolean;
|
|
75
|
-
max_structured_data_size: number;
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
* Convert HTML to Markdown
|
|
5
|
+
* Convert HTML to Markdown, returning a JavaScript object with structured content, metadata,
|
|
6
|
+
* images, and warnings in a single pass.
|
|
80
7
|
*
|
|
81
|
-
*
|
|
82
|
-
*
|
|
83
|
-
*
|
|
84
|
-
*
|
|
85
|
-
*
|
|
86
|
-
*
|
|
87
|
-
*
|
|
88
|
-
* ```javascript
|
|
89
|
-
* import { convert } from 'html-to-markdown-wasm';
|
|
90
|
-
*
|
|
91
|
-
* const html = '<h1>Hello World</h1>';
|
|
92
|
-
* const markdown = convert(html);
|
|
93
|
-
* console.log(markdown); // # Hello World
|
|
94
|
-
* ```
|
|
95
|
-
*/
|
|
96
|
-
export function convert(html: string, options?: WasmConversionOptions | null): string;
|
|
97
|
-
|
|
98
|
-
export function convertBytes(html: Uint8Array, options?: WasmConversionOptions | null): string;
|
|
99
|
-
|
|
100
|
-
export function convertBytesWithInlineImages(html: Uint8Array, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
101
|
-
|
|
102
|
-
/**
|
|
103
|
-
* Convert HTML bytes to Markdown with metadata extraction
|
|
104
|
-
*
|
|
105
|
-
* # Arguments
|
|
106
|
-
*
|
|
107
|
-
* * `html` - The HTML bytes to convert
|
|
108
|
-
* * `options` - Optional conversion options (as a JavaScript object)
|
|
109
|
-
* * `metadata_config` - Metadata extraction configuration
|
|
110
|
-
*
|
|
111
|
-
* # Returns
|
|
112
|
-
*
|
|
113
|
-
* JavaScript object with `markdown` (string) and `metadata` (object) fields
|
|
114
|
-
*/
|
|
115
|
-
export function convertBytesWithMetadata(html: Uint8Array, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
|
|
116
|
-
|
|
117
|
-
export function convertBytesWithOptionsHandle(html: Uint8Array, handle: WasmConversionOptionsHandle): string;
|
|
118
|
-
|
|
119
|
-
export function convertWithInlineImages(html: string, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
120
|
-
|
|
121
|
-
/**
|
|
122
|
-
* Convert HTML to Markdown with metadata extraction
|
|
123
|
-
*
|
|
124
|
-
* # Arguments
|
|
125
|
-
*
|
|
126
|
-
* * `html` - The HTML string to convert
|
|
127
|
-
* * `options` - Optional conversion options (as a JavaScript object)
|
|
128
|
-
* * `metadata_config` - Metadata extraction configuration
|
|
129
|
-
*
|
|
130
|
-
* # Returns
|
|
131
|
-
*
|
|
132
|
-
* JavaScript object with `markdown` (string) and `metadata` (object) fields
|
|
133
|
-
*
|
|
134
|
-
* # Example
|
|
135
|
-
*
|
|
136
|
-
* ```javascript
|
|
137
|
-
* import { convertWithMetadata, WasmMetadataConfig } from 'html-to-markdown-wasm';
|
|
138
|
-
*
|
|
139
|
-
* const html = '<h1>Hello World</h1><a href="https://example.com">Link</a>';
|
|
140
|
-
* const config = new WasmMetadataConfig();
|
|
141
|
-
* config.extractHeaders = true;
|
|
142
|
-
* config.extractLinks = true;
|
|
143
|
-
*
|
|
144
|
-
* const result = convertWithMetadata(html, null, config);
|
|
145
|
-
* console.log(result.markdown); // # Hello World\n\n[Link](https://example.com)
|
|
146
|
-
* console.log(result.metadata.headers); // [{ level: 1, text: "Hello World", ... }]
|
|
147
|
-
* console.log(result.metadata.links); // [{ href: "https://example.com", text: "Link", ... }]
|
|
148
|
-
* ```
|
|
149
|
-
*/
|
|
150
|
-
export function convertWithMetadata(html: string, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
|
|
151
|
-
|
|
152
|
-
export function convertWithOptionsHandle(html: string, handle: WasmConversionOptionsHandle): string;
|
|
153
|
-
|
|
154
|
-
/**
|
|
155
|
-
* Convert HTML to Markdown with structured table extraction
|
|
8
|
+
* This is the primary API entry point. Returns a JavaScript object with:
|
|
9
|
+
* - `content`: converted text (string or null)
|
|
10
|
+
* - `document`: structured document tree (object or null)
|
|
11
|
+
* - `metadata`: extracted HTML metadata (object or null)
|
|
12
|
+
* - `tables`: array of extracted table data
|
|
13
|
+
* - `warnings`: array of non-fatal processing warnings
|
|
156
14
|
*
|
|
157
15
|
* # Arguments
|
|
158
16
|
*
|
|
159
17
|
* * `html` - The HTML string to convert
|
|
160
18
|
* * `options` - Optional conversion options (as a JavaScript object)
|
|
161
|
-
* * `metadata_config` - Optional metadata extraction configuration
|
|
162
|
-
*
|
|
163
|
-
* # Returns
|
|
164
|
-
*
|
|
165
|
-
* JavaScript object with `content` (string), `tables` (array), and `metadata` (object|null) fields
|
|
166
19
|
*
|
|
167
20
|
* # Example
|
|
168
21
|
*
|
|
169
22
|
* ```javascript
|
|
170
|
-
* import {
|
|
23
|
+
* import { convert } from '@kreuzberg/html-to-markdown-wasm';
|
|
171
24
|
*
|
|
172
|
-
* const html = '<
|
|
173
|
-
* const result =
|
|
174
|
-
* console.log(result.content);
|
|
175
|
-
* console.log(result.tables[
|
|
25
|
+
* const html = '<h1>Hello World</h1><p>Some text.</p>';
|
|
26
|
+
* const result = convert(html, null);
|
|
27
|
+
* console.log(result.content); // '# Hello World\n\nSome text.'
|
|
28
|
+
* console.log(result.tables); // []
|
|
29
|
+
* console.log(result.warnings); // []
|
|
176
30
|
* ```
|
|
177
31
|
*/
|
|
178
|
-
export function
|
|
179
|
-
|
|
180
|
-
export function createConversionOptionsHandle(options?: WasmConversionOptions | null): WasmConversionOptionsHandle;
|
|
32
|
+
export function convert(html: string, options?: WasmConversionOptions | null): any;
|
|
181
33
|
|
|
182
34
|
/**
|
|
183
35
|
* Initialize panic hook for better error messages in the browser
|