@kreuzberg/html-to-markdown-wasm 2.19.0-rc.1 → 2.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +56 -2
- package/package.json +66 -66
- package/dist/LICENSE +0 -21
- package/dist/README.md +0 -202
- package/dist/html_to_markdown_wasm.d.ts +0 -200
- package/dist/html_to_markdown_wasm.js +0 -116
- package/dist/html_to_markdown_wasm_bg.js +0 -1355
- package/dist/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist/html_to_markdown_wasm_bg.wasm.d.ts +0 -55
- package/dist/package.json +0 -27
- package/dist-node/LICENSE +0 -21
- package/dist-node/README.md +0 -202
- package/dist-node/html_to_markdown_wasm.d.ts +0 -197
- package/dist-node/html_to_markdown_wasm.js +0 -1369
- package/dist-node/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-node/html_to_markdown_wasm_bg.wasm.d.ts +0 -55
- package/dist-node/package.json +0 -21
- package/dist-web/LICENSE +0 -21
- package/dist-web/README.md +0 -202
- package/dist-web/html_to_markdown_wasm.d.ts +0 -277
- package/dist-web/html_to_markdown_wasm.js +0 -1395
- package/dist-web/html_to_markdown_wasm_bg.wasm +0 -0
- package/dist-web/html_to_markdown_wasm_bg.wasm.d.ts +0 -55
- package/dist-web/package.json +0 -25
|
Binary file
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
/* tslint:disable */
|
|
2
|
-
/* eslint-disable */
|
|
3
|
-
export const memory: WebAssembly.Memory;
|
|
4
|
-
export const __wbg_wasmconversionoptionshandle_free: (a: number, b: number) => void;
|
|
5
|
-
export const __wbg_wasmhtmlextraction_free: (a: number, b: number) => void;
|
|
6
|
-
export const __wbg_wasminlineimage_free: (a: number, b: number) => void;
|
|
7
|
-
export const __wbg_wasminlineimageconfig_free: (a: number, b: number) => void;
|
|
8
|
-
export const __wbg_wasminlineimagewarning_free: (a: number, b: number) => void;
|
|
9
|
-
export const __wbg_wasmmetadataconfig_free: (a: number, b: number) => void;
|
|
10
|
-
export const convert: (a: number, b: number, c: number, d: number) => void;
|
|
11
|
-
export const convertBytes: (a: number, b: number, c: number) => void;
|
|
12
|
-
export const convertBytesWithInlineImages: (a: number, b: number, c: number, d: number) => void;
|
|
13
|
-
export const convertBytesWithMetadata: (a: number, b: number, c: number, d: number) => void;
|
|
14
|
-
export const convertBytesWithOptionsHandle: (a: number, b: number, c: number) => void;
|
|
15
|
-
export const convertWithInlineImages: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
16
|
-
export const convertWithMetadata: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
17
|
-
export const convertWithOptionsHandle: (a: number, b: number, c: number, d: number) => void;
|
|
18
|
-
export const createConversionOptionsHandle: (a: number, b: number) => void;
|
|
19
|
-
export const wasmconversionoptionshandle_new: (a: number, b: number) => void;
|
|
20
|
-
export const wasmhtmlextraction_inlineImages: (a: number, b: number) => void;
|
|
21
|
-
export const wasmhtmlextraction_markdown: (a: number, b: number) => void;
|
|
22
|
-
export const wasmhtmlextraction_warnings: (a: number, b: number) => void;
|
|
23
|
-
export const wasminlineimage_attributes: (a: number) => number;
|
|
24
|
-
export const wasminlineimage_data: (a: number) => number;
|
|
25
|
-
export const wasminlineimage_description: (a: number, b: number) => void;
|
|
26
|
-
export const wasminlineimage_dimensions: (a: number, b: number) => void;
|
|
27
|
-
export const wasminlineimage_filename: (a: number, b: number) => void;
|
|
28
|
-
export const wasminlineimage_format: (a: number, b: number) => void;
|
|
29
|
-
export const wasminlineimage_source: (a: number, b: number) => void;
|
|
30
|
-
export const wasminlineimageconfig_new: (a: number, b: number) => number;
|
|
31
|
-
export const wasminlineimageconfig_set_captureSvg: (a: number, b: number) => void;
|
|
32
|
-
export const wasminlineimageconfig_set_filenamePrefix: (a: number, b: number, c: number) => void;
|
|
33
|
-
export const wasminlineimageconfig_set_inferDimensions: (a: number, b: number) => void;
|
|
34
|
-
export const wasminlineimagewarning_index: (a: number) => number;
|
|
35
|
-
export const wasminlineimagewarning_message: (a: number, b: number) => void;
|
|
36
|
-
export const wasmmetadataconfig_extract_document: (a: number) => number;
|
|
37
|
-
export const wasmmetadataconfig_extract_headers: (a: number) => number;
|
|
38
|
-
export const wasmmetadataconfig_extract_images: (a: number) => number;
|
|
39
|
-
export const wasmmetadataconfig_extract_links: (a: number) => number;
|
|
40
|
-
export const wasmmetadataconfig_extract_structured_data: (a: number) => number;
|
|
41
|
-
export const wasmmetadataconfig_max_structured_data_size: (a: number) => number;
|
|
42
|
-
export const wasmmetadataconfig_new: () => number;
|
|
43
|
-
export const wasmmetadataconfig_set_extract_document: (a: number, b: number) => void;
|
|
44
|
-
export const wasmmetadataconfig_set_extract_headers: (a: number, b: number) => void;
|
|
45
|
-
export const wasmmetadataconfig_set_extract_images: (a: number, b: number) => void;
|
|
46
|
-
export const wasmmetadataconfig_set_extract_links: (a: number, b: number) => void;
|
|
47
|
-
export const wasmmetadataconfig_set_extract_structured_data: (a: number, b: number) => void;
|
|
48
|
-
export const wasmmetadataconfig_set_max_structured_data_size: (a: number, b: number) => void;
|
|
49
|
-
export const init: () => void;
|
|
50
|
-
export const __wbindgen_export: (a: number, b: number) => number;
|
|
51
|
-
export const __wbindgen_export2: (a: number, b: number, c: number, d: number) => number;
|
|
52
|
-
export const __wbindgen_export3: (a: number) => void;
|
|
53
|
-
export const __wbindgen_export4: (a: number, b: number, c: number) => void;
|
|
54
|
-
export const __wbindgen_add_to_stack_pointer: (a: number) => number;
|
|
55
|
-
export const __wbindgen_start: () => void;
|
package/dist-node/package.json
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "html-to-markdown-wasm",
|
|
3
|
-
"collaborators": [
|
|
4
|
-
"Na'aman Hirschfeld <nhirschfeld@gmail.com>"
|
|
5
|
-
],
|
|
6
|
-
"description": "HTML to Markdown conversion for WebAssembly targets",
|
|
7
|
-
"version": "2.19.0-rc.1",
|
|
8
|
-
"license": "MIT",
|
|
9
|
-
"repository": {
|
|
10
|
-
"type": "git",
|
|
11
|
-
"url": "https://github.com/kreuzberg-dev/html-to-markdown"
|
|
12
|
-
},
|
|
13
|
-
"files": [
|
|
14
|
-
"html_to_markdown_wasm_bg.wasm",
|
|
15
|
-
"html_to_markdown_wasm.js",
|
|
16
|
-
"html_to_markdown_wasm.d.ts"
|
|
17
|
-
],
|
|
18
|
-
"main": "html_to_markdown_wasm.js",
|
|
19
|
-
"homepage": "https://github.com/kreuzberg-dev/html-to-markdown",
|
|
20
|
-
"types": "html_to_markdown_wasm.d.ts"
|
|
21
|
-
}
|
package/dist-web/LICENSE
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
The MIT License (MIT)
|
|
2
|
-
|
|
3
|
-
Copyright 2024-2025 Na'aman Hirschfeld
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
package/dist-web/README.md
DELETED
|
@@ -1,202 +0,0 @@
|
|
|
1
|
-
# html-to-markdown
|
|
2
|
-
|
|
3
|
-
<img width="1128" height="191" alt="Linkedin- Banner (1)" src="https://github.com/user-attachments/assets/f8e91036-20a5-40f9-9fcc-9e6c6e15f1f5" />
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
High-performance HTML → Markdown conversion powered by Rust. Shipping as a Rust crate, Python package, PHP extension, Ruby gem, Elixir Rustler NIF, Node.js bindings, WebAssembly, and standalone CLI with identical rendering behaviour.
|
|
8
|
-
|
|
9
|
-
Part of the Kreuzberg.dev document intelligence ecosystem. Kreuzberg is a polyglot document intelligence framework with a fast Rust core. We build tools that help developers extract, process, and understand documents at scale, from PDFs to Office files, images, archives, and emails, across 50+ formats. We've set out to make high-performance document intelligence faster and more ecological.
|
|
10
|
-
|
|
11
|
-
[crates.io](https://crates.io/crates/html-to-markdown-rs)
|
|
12
|
-
[npm (node)](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node)
|
|
13
|
-
[npm (wasm)](https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm)
|
|
14
|
-
[PyPI](https://pypi.org/project/html-to-markdown/)
|
|
15
|
-
[Packagist](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
16
|
-
[RubyGems](https://rubygems.org/gems/html-to-markdown)
|
|
17
|
-
[Hex.pm](https://hex.pm/packages/html_to_markdown)
|
|
18
|
-
[NuGet](https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/)
|
|
19
|
-
[Maven Central](https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown)
|
|
20
|
-
[Go Reference](https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown)
|
|
21
|
-
[License: MIT](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE)
|
|
22
|
-
[Discord](https://discord.gg/pXxagNK2zN)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
---
|
|
26
|
-
|
|
27
|
-
## 🎮 **[Try the Live Demo →](https://kreuzberg-dev.github.io/html-to-markdown/)**
|
|
28
|
-
|
|
29
|
-
Experience WebAssembly-powered HTML to Markdown conversion instantly in your browser. No installation needed!
|
|
30
|
-
|
|
31
|
-
---
|
|
32
|
-
|
|
33
|
-
## Why html-to-markdown?
|
|
34
|
-
|
|
35
|
-
- **Blazing Fast**: Rust-powered core delivers 10-80× faster conversion than pure Python alternatives
|
|
36
|
-
- **Universal**: Works everywhere - Node.js, Bun, Deno, browsers, Python, Rust, and standalone CLI
|
|
37
|
-
- **Smart Conversion**: Handles complex documents including nested tables, code blocks, task lists, and hOCR OCR output
|
|
38
|
-
- **Metadata Extraction**: Extract document metadata (title, description, headers, links, images) alongside conversion
|
|
39
|
-
- **Highly Configurable**: Control heading styles, code block fences, list formatting, whitespace handling, and HTML sanitization
|
|
40
|
-
- **Tag Preservation**: Keep specific HTML tags unconverted when markdown isn't expressive enough
|
|
41
|
-
- **Secure by Default**: Built-in HTML sanitization prevents malicious content
|
|
42
|
-
- **Consistent Output**: Identical markdown rendering across all language bindings
|
|
43
|
-
|
|
44
|
-
## Quick Start
|
|
45
|
-
|
|
46
|
-
**Node.js / Bun (Native - Fastest):**
|
|
47
|
-
|
|
48
|
-
```typescript
|
|
49
|
-
import { convert } from '@kreuzberg/html-to-markdown-node';
|
|
50
|
-
|
|
51
|
-
const html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>';
|
|
52
|
-
const markdown = convert(html, {
|
|
53
|
-
headingStyle: 'Atx',
|
|
54
|
-
codeBlockStyle: 'Backticks',
|
|
55
|
-
wrap: true,
|
|
56
|
-
preserveTags: ['table'],
|
|
57
|
-
});
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
**Python:**
|
|
61
|
-
|
|
62
|
-
```python
|
|
63
|
-
from html_to_markdown import convert
|
|
64
|
-
|
|
65
|
-
html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>'
|
|
66
|
-
markdown = convert(html, heading_style='Atx', wrap=True)
|
|
67
|
-
```
|
|
68
|
-
|
|
69
|
-
**Ruby:**
|
|
70
|
-
|
|
71
|
-
```ruby
|
|
72
|
-
require 'html_to_markdown'
|
|
73
|
-
|
|
74
|
-
html = '<h1>Hello</h1><p>Rust ❤️ Markdown</p>'
|
|
75
|
-
markdown = HtmlToMarkdown.convert(html, heading_style: :atx, wrap: true)
|
|
76
|
-
```
|
|
77
|
-
|
|
78
|
-
Full language guides: See [Language Guides](#language-guides) below.
|
|
79
|
-
|
|
80
|
-
## Installation
|
|
81
|
-
|
|
82
|
-
| Target | Command(s) |
|
|
83
|
-
| --------------------------- | ---------------------------------------------------------------------------------------------------------------- |
|
|
84
|
-
| **Node.js/Bun** (native) | `npm install @kreuzberg/html-to-markdown-node` |
|
|
85
|
-
| **WebAssembly** (universal) | `npm install @kreuzberg/html-to-markdown-wasm` |
|
|
86
|
-
| **Deno** | `import { convert } from "npm:@kreuzberg/html-to-markdown-wasm"` |
|
|
87
|
-
| **Python** (bindings + CLI) | `pip install html-to-markdown` |
|
|
88
|
-
| **PHP** (extension + helpers) | `PHP_EXTENSION_DIR=$(php-config --extension-dir) pie install goldziher/html-to-markdown`<br>`composer require goldziher/html-to-markdown` |
|
|
89
|
-
| **Ruby** gem | `bundle add html-to-markdown` or `gem install html-to-markdown` |
|
|
90
|
-
| **Elixir** (Rustler NIF) | `{:html_to_markdown, "~> 2.8"}` |
|
|
91
|
-
| **Rust** crate | `cargo add html-to-markdown-rs` |
|
|
92
|
-
| **Java** (Maven) | `<groupId>dev.kreuzberg</groupId><artifactId>html-to-markdown</artifactId>` |
|
|
93
|
-
| **C#/.NET** (NuGet) | `dotnet add package KreuzbergDev.HtmlToMarkdown` |
|
|
94
|
-
| Rust CLI (crates.io) | `cargo install html-to-markdown-cli` |
|
|
95
|
-
| Homebrew CLI | `brew install html-to-markdown` (core) |
|
|
96
|
-
| Releases | [GitHub Releases](https://github.com/kreuzberg-dev/html-to-markdown/releases) |
|
|
97
|
-
|
|
98
|
-
## Performance
|
|
99
|
-
|
|
100
|
-
Benchmarked on Apple M4 using the shared fixture harness in `tools/benchmark-harness`.
|
|
101
|
-
|
|
102
|
-
### Comparative Throughput (Median Across Fixtures)
|
|
103
|
-
|
|
104
|
-
| Runtime | Median ops/sec | Median throughput (MB/s) | Peak memory (MB) | Successes |
|
|
105
|
-
| ------- | -------------- | ------------------------ | ---------------- | --------- |
|
|
106
|
-
| Rust | 1,060.3 | 116.4 | 171.3 | 56/56 |
|
|
107
|
-
| Go | 1,496.3 | 131.1 | 22.9 | 16/16 |
|
|
108
|
-
| Ruby | 2,155.5 | 300.4 | 280.3 | 48/48 |
|
|
109
|
-
| PHP | 2,357.7 | 308.0 | 223.5 | 48/48 |
|
|
110
|
-
| Elixir | 1,564.1 | 269.1 | 384.7 | 48/48 |
|
|
111
|
-
| C# | 1,234.2 | 272.4 | 187.8 | 16/16 |
|
|
112
|
-
| Java | 1,298.7 | 167.1 | 527.2 | 16/16 |
|
|
113
|
-
| WASM | 1,485.8 | 157.6 | 95.3 | 48/48 |
|
|
114
|
-
| Node.js (NAPI) | 2,054.2 | 306.5 | 95.4 | 48/48 |
|
|
115
|
-
| Python (PyO3) | 3,120.3 | 307.5 | 83.5 | 48/48 |
|
|
116
|
-
|
|
117
|
-
Use `task bench:harness` to regenerate throughput numbers. See [Performance Guide](./examples/performance/) for benchmarking strategies and optimization tips.
|
|
118
|
-
|
|
119
|
-
## Language Guides
|
|
120
|
-
|
|
121
|
-
Complete documentation with examples for each language:
|
|
122
|
-
|
|
123
|
-
- **Python** – [README](./packages/python/README.md) | PyO3 bindings, metadata extraction, inline images
|
|
124
|
-
- **JavaScript/TypeScript** – [Node.js](./crates/html-to-markdown-node/README.md) | [TypeScript](./packages/typescript/README.md) | [WASM](./crates/html-to-markdown-wasm/README.md)
|
|
125
|
-
- **Ruby** – [README](./packages/ruby/README.md) | Magnus bindings, RBS type definitions, Steep checking
|
|
126
|
-
- **PHP** – [Package](./packages/php/README.md) | [Extension (PIE)](./packages/php-ext/README.md) | ext-php-rs extension
|
|
127
|
-
- **Go** – [README](./packages/go/README.md) | FFI bindings with cgo
|
|
128
|
-
- **Java** – [README](./packages/java/README.md) | Panama FFI, Maven/Gradle setup
|
|
129
|
-
- **C#/.NET** – [README](./packages/csharp/README.md) | P/Invoke FFI, NuGet distribution
|
|
130
|
-
- **Elixir** – [README](./packages/elixir/README.md) | Rustler NIF bindings
|
|
131
|
-
- **Rust** – [README](./crates/html-to-markdown/README.md) | Core library, error handling, advanced features
|
|
132
|
-
|
|
133
|
-
## Feature Guides
|
|
134
|
-
|
|
135
|
-
### Visitor Pattern
|
|
136
|
-
Customize HTML→Markdown conversion with callbacks for specific elements. Use cases: domain-specific dialects, content filtering, URL rewriting, accessibility validation.
|
|
137
|
-
|
|
138
|
-
**→ [Full Guide with Examples](./examples/visitor-pattern/)** (Python, TypeScript, Ruby)
|
|
139
|
-
|
|
140
|
-
### Metadata Extraction
|
|
141
|
-
Extract comprehensive metadata during conversion: title, description, headers, links, images, structured data. Use cases: SEO extraction, TOC generation, link validation, accessibility auditing, content migration.
|
|
142
|
-
|
|
143
|
-
**→ [Full Guide with Examples](./examples/metadata-extraction/)** (Python, TypeScript, Ruby)
|
|
144
|
-
|
|
145
|
-
### Performance & Benchmarking
|
|
146
|
-
Understand performance characteristics, run benchmarks, optimize for your use case. Includes benchmarking tools, memory profiling, streaming strategies, and optimization tips.
|
|
147
|
-
|
|
148
|
-
**→ [Full Guide](./examples/performance/)**
|
|
149
|
-
|
|
150
|
-
## Examples
|
|
151
|
-
|
|
152
|
-
Explore working code examples in multiple languages:
|
|
153
|
-
|
|
154
|
-
| Example | Path | Languages |
|
|
155
|
-
| ------- | ---- | --------- |
|
|
156
|
-
| **Visitor Pattern** | [examples/visitor-pattern/](./examples/visitor-pattern/) | Python, TypeScript, Ruby |
|
|
157
|
-
| **Metadata Extraction** | [examples/metadata-extraction/](./examples/metadata-extraction/) | Python, TypeScript, Ruby |
|
|
158
|
-
| **Performance** | [examples/performance/](./examples/performance/) | Benchmarks, profiling, optimization |
|
|
159
|
-
|
|
160
|
-
## Testing
|
|
161
|
-
|
|
162
|
-
Run the test suite locally:
|
|
163
|
-
|
|
164
|
-
```bash
|
|
165
|
-
# All core test suites (Rust, Python, Ruby, Node, PHP, Go, C#, Elixir, Java)
|
|
166
|
-
task test
|
|
167
|
-
|
|
168
|
-
# Run the Wasmtime-backed WASM integration tests
|
|
169
|
-
task wasm:test:wasmtime
|
|
170
|
-
```
|
|
171
|
-
|
|
172
|
-
## Compatibility & Migrations
|
|
173
|
-
|
|
174
|
-
### v2.19.0 Breaking Changes (Package Namespace Updates)
|
|
175
|
-
|
|
176
|
-
Several language bindings were updated to use new namespaces and package owners:
|
|
177
|
-
|
|
178
|
-
- **npm packages**: Scoped under `@kreuzberg` organization
|
|
179
|
-
- Old: `html-to-markdown-node` → New: `@kreuzberg/html-to-markdown-node`
|
|
180
|
-
- Old: `html-to-markdown-wasm` → New: `@kreuzberg/html-to-markdown-wasm`
|
|
181
|
-
- **Java**: Package namespace changed from `io.github.goldziher` to `dev.kreuzberg`
|
|
182
|
-
- **C#/.NET**: Package changed from `Goldziher.HtmlToMarkdown` to `KreuzbergDev.HtmlToMarkdown`
|
|
183
|
-
|
|
184
|
-
See [MIGRATION.md](./MIGRATION.md) for step-by-step upgrade instructions for each language.
|
|
185
|
-
|
|
186
|
-
### v1 → v2 Compatibility
|
|
187
|
-
|
|
188
|
-
- V2's Rust core sustains **150–210 MB/s** throughput, 60–80× faster than V1, which averaged **≈ 2.5 MB/s**.
|
|
189
|
-
- Python compatibility shim available in `html_to_markdown.v1_compat` (deprecated; emits warnings; plan migrations now). See [Python README](./packages/python/README.md#v1-compatibility) for keyword mappings.
|
|
190
|
-
- CLI flag changes and other breaking updates in [CHANGELOG](./CHANGELOG.md#breaking-changes).
|
|
191
|
-
|
|
192
|
-
## Community
|
|
193
|
-
|
|
194
|
-
- **Discord** – [Join our community](https://discord.gg/pXxagNK2zN)
|
|
195
|
-
- **Ecosystem** – Explore [Kreuzberg](https://kreuzberg.dev) document-processing tools
|
|
196
|
-
- **Contribute** – [CONTRIBUTING.md](./CONTRIBUTING.md)
|
|
197
|
-
- **Sponsor** – [GitHub Sponsors](https://github.com/sponsors/kreuzberg-dev)
|
|
198
|
-
- **Changelog** – [Version history](./CHANGELOG.md)
|
|
199
|
-
|
|
200
|
-
## License
|
|
201
|
-
|
|
202
|
-
MIT License – see [LICENSE](./LICENSE) for details.
|
|
@@ -1,277 +0,0 @@
|
|
|
1
|
-
/* tslint:disable */
|
|
2
|
-
/* eslint-disable */
|
|
3
|
-
|
|
4
|
-
export class WasmConversionOptionsHandle {
|
|
5
|
-
free(): void;
|
|
6
|
-
[Symbol.dispose](): void;
|
|
7
|
-
constructor(options?: WasmConversionOptions | null);
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
export class WasmHtmlExtraction {
|
|
11
|
-
private constructor();
|
|
12
|
-
free(): void;
|
|
13
|
-
[Symbol.dispose](): void;
|
|
14
|
-
readonly inlineImages: WasmInlineImage[];
|
|
15
|
-
readonly markdown: string;
|
|
16
|
-
readonly warnings: WasmInlineImageWarning[];
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
export class WasmInlineImage {
|
|
20
|
-
private constructor();
|
|
21
|
-
free(): void;
|
|
22
|
-
[Symbol.dispose](): void;
|
|
23
|
-
readonly attributes: Record<string, string>;
|
|
24
|
-
readonly dimensions: Uint32Array | undefined;
|
|
25
|
-
readonly description: string | undefined;
|
|
26
|
-
readonly data: Uint8Array;
|
|
27
|
-
readonly format: string;
|
|
28
|
-
readonly source: string;
|
|
29
|
-
readonly filename: string | undefined;
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
export class WasmInlineImageConfig {
|
|
33
|
-
free(): void;
|
|
34
|
-
[Symbol.dispose](): void;
|
|
35
|
-
constructor(max_decoded_size_bytes?: number | null);
|
|
36
|
-
set captureSvg(value: boolean);
|
|
37
|
-
set filenamePrefix(value: string | null | undefined);
|
|
38
|
-
set inferDimensions(value: boolean);
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
export class WasmInlineImageWarning {
|
|
42
|
-
private constructor();
|
|
43
|
-
free(): void;
|
|
44
|
-
[Symbol.dispose](): void;
|
|
45
|
-
readonly index: number;
|
|
46
|
-
readonly message: string;
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
export class WasmMetadataConfig {
|
|
50
|
-
free(): void;
|
|
51
|
-
[Symbol.dispose](): void;
|
|
52
|
-
/**
|
|
53
|
-
* Create a new metadata configuration with defaults
|
|
54
|
-
*
|
|
55
|
-
* All extraction types enabled by default with 1MB structured data limit
|
|
56
|
-
*/
|
|
57
|
-
constructor();
|
|
58
|
-
extract_links: boolean;
|
|
59
|
-
extract_images: boolean;
|
|
60
|
-
extract_headers: boolean;
|
|
61
|
-
extract_document: boolean;
|
|
62
|
-
extract_structured_data: boolean;
|
|
63
|
-
max_structured_data_size: number;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
/**
|
|
67
|
-
* Convert HTML to Markdown
|
|
68
|
-
*
|
|
69
|
-
* # Arguments
|
|
70
|
-
*
|
|
71
|
-
* * `html` - The HTML string to convert
|
|
72
|
-
* * `options` - Optional conversion options (as a JavaScript object)
|
|
73
|
-
*
|
|
74
|
-
* # Example
|
|
75
|
-
*
|
|
76
|
-
* ```javascript
|
|
77
|
-
* import { convert } from '@kreuzberg/html-to-markdown-wasm';
|
|
78
|
-
*
|
|
79
|
-
* const html = '<h1>Hello World</h1>';
|
|
80
|
-
* const markdown = convert(html);
|
|
81
|
-
* console.log(markdown); // # Hello World
|
|
82
|
-
* ```
|
|
83
|
-
*/
|
|
84
|
-
export function convert(html: string, options?: WasmConversionOptions | null): string;
|
|
85
|
-
|
|
86
|
-
export function convertBytes(html: Uint8Array, options?: WasmConversionOptions | null): string;
|
|
87
|
-
|
|
88
|
-
export function convertBytesWithInlineImages(html: Uint8Array, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
89
|
-
|
|
90
|
-
/**
|
|
91
|
-
* Convert HTML bytes to Markdown with metadata extraction
|
|
92
|
-
*
|
|
93
|
-
* # Arguments
|
|
94
|
-
*
|
|
95
|
-
* * `html` - The HTML bytes to convert
|
|
96
|
-
* * `options` - Optional conversion options (as a JavaScript object)
|
|
97
|
-
* * `metadata_config` - Metadata extraction configuration
|
|
98
|
-
*
|
|
99
|
-
* # Returns
|
|
100
|
-
*
|
|
101
|
-
* JavaScript object with `markdown` (string) and `metadata` (object) fields
|
|
102
|
-
*/
|
|
103
|
-
export function convertBytesWithMetadata(html: Uint8Array, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
|
|
104
|
-
|
|
105
|
-
export function convertBytesWithOptionsHandle(html: Uint8Array, handle: WasmConversionOptionsHandle): string;
|
|
106
|
-
|
|
107
|
-
export function convertWithInlineImages(html: string, options?: WasmConversionOptions | null, image_config?: WasmInlineImageConfig | null): WasmHtmlExtraction;
|
|
108
|
-
|
|
109
|
-
/**
|
|
110
|
-
* Convert HTML to Markdown with metadata extraction
|
|
111
|
-
*
|
|
112
|
-
* # Arguments
|
|
113
|
-
*
|
|
114
|
-
* * `html` - The HTML string to convert
|
|
115
|
-
* * `options` - Optional conversion options (as a JavaScript object)
|
|
116
|
-
* * `metadata_config` - Metadata extraction configuration
|
|
117
|
-
*
|
|
118
|
-
* # Returns
|
|
119
|
-
*
|
|
120
|
-
* JavaScript object with `markdown` (string) and `metadata` (object) fields
|
|
121
|
-
*
|
|
122
|
-
* # Example
|
|
123
|
-
*
|
|
124
|
-
* ```javascript
|
|
125
|
-
* import { convertWithMetadata, WasmMetadataConfig } from '@kreuzberg/html-to-markdown-wasm';
|
|
126
|
-
*
|
|
127
|
-
* const html = '<h1>Hello World</h1><a href="https://example.com">Link</a>';
|
|
128
|
-
* const config = new WasmMetadataConfig();
|
|
129
|
-
* config.extract_headers = true;
|
|
130
|
-
* config.extract_links = true;
|
|
131
|
-
*
|
|
132
|
-
* const result = convertWithMetadata(html, null, config);
|
|
133
|
-
* console.log(result.markdown); // # Hello World\n\n[Link](https://example.com)
|
|
134
|
-
* console.log(result.metadata.headers); // [{ level: 1, text: "Hello World", ... }]
|
|
135
|
-
* console.log(result.metadata.links); // [{ href: "https://example.com", text: "Link", ... }]
|
|
136
|
-
* ```
|
|
137
|
-
*/
|
|
138
|
-
export function convertWithMetadata(html: string, options?: WasmConversionOptions | null, metadata_config?: WasmMetadataConfig | null): any;
|
|
139
|
-
|
|
140
|
-
export function convertWithOptionsHandle(html: string, handle: WasmConversionOptionsHandle): string;
|
|
141
|
-
|
|
142
|
-
export function createConversionOptionsHandle(options?: WasmConversionOptions | null): WasmConversionOptionsHandle;
|
|
143
|
-
|
|
144
|
-
/**
|
|
145
|
-
* Initialize panic hook for better error messages in the browser
|
|
146
|
-
*/
|
|
147
|
-
export function init(): void;
|
|
148
|
-
|
|
149
|
-
export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;
|
|
150
|
-
|
|
151
|
-
export interface InitOutput {
|
|
152
|
-
readonly memory: WebAssembly.Memory;
|
|
153
|
-
readonly __wbg_wasmconversionoptionshandle_free: (a: number, b: number) => void;
|
|
154
|
-
readonly __wbg_wasmhtmlextraction_free: (a: number, b: number) => void;
|
|
155
|
-
readonly __wbg_wasminlineimage_free: (a: number, b: number) => void;
|
|
156
|
-
readonly __wbg_wasminlineimageconfig_free: (a: number, b: number) => void;
|
|
157
|
-
readonly __wbg_wasminlineimagewarning_free: (a: number, b: number) => void;
|
|
158
|
-
readonly __wbg_wasmmetadataconfig_free: (a: number, b: number) => void;
|
|
159
|
-
readonly convert: (a: number, b: number, c: number, d: number) => void;
|
|
160
|
-
readonly convertBytes: (a: number, b: number, c: number) => void;
|
|
161
|
-
readonly convertBytesWithInlineImages: (a: number, b: number, c: number, d: number) => void;
|
|
162
|
-
readonly convertBytesWithMetadata: (a: number, b: number, c: number, d: number) => void;
|
|
163
|
-
readonly convertBytesWithOptionsHandle: (a: number, b: number, c: number) => void;
|
|
164
|
-
readonly convertWithInlineImages: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
165
|
-
readonly convertWithMetadata: (a: number, b: number, c: number, d: number, e: number) => void;
|
|
166
|
-
readonly convertWithOptionsHandle: (a: number, b: number, c: number, d: number) => void;
|
|
167
|
-
readonly createConversionOptionsHandle: (a: number, b: number) => void;
|
|
168
|
-
readonly wasmconversionoptionshandle_new: (a: number, b: number) => void;
|
|
169
|
-
readonly wasmhtmlextraction_inlineImages: (a: number, b: number) => void;
|
|
170
|
-
readonly wasmhtmlextraction_markdown: (a: number, b: number) => void;
|
|
171
|
-
readonly wasmhtmlextraction_warnings: (a: number, b: number) => void;
|
|
172
|
-
readonly wasminlineimage_attributes: (a: number) => number;
|
|
173
|
-
readonly wasminlineimage_data: (a: number) => number;
|
|
174
|
-
readonly wasminlineimage_description: (a: number, b: number) => void;
|
|
175
|
-
readonly wasminlineimage_dimensions: (a: number, b: number) => void;
|
|
176
|
-
readonly wasminlineimage_filename: (a: number, b: number) => void;
|
|
177
|
-
readonly wasminlineimage_format: (a: number, b: number) => void;
|
|
178
|
-
readonly wasminlineimage_source: (a: number, b: number) => void;
|
|
179
|
-
readonly wasminlineimageconfig_new: (a: number, b: number) => number;
|
|
180
|
-
readonly wasminlineimageconfig_set_captureSvg: (a: number, b: number) => void;
|
|
181
|
-
readonly wasminlineimageconfig_set_filenamePrefix: (a: number, b: number, c: number) => void;
|
|
182
|
-
readonly wasminlineimageconfig_set_inferDimensions: (a: number, b: number) => void;
|
|
183
|
-
readonly wasminlineimagewarning_index: (a: number) => number;
|
|
184
|
-
readonly wasminlineimagewarning_message: (a: number, b: number) => void;
|
|
185
|
-
readonly wasmmetadataconfig_extract_document: (a: number) => number;
|
|
186
|
-
readonly wasmmetadataconfig_extract_headers: (a: number) => number;
|
|
187
|
-
readonly wasmmetadataconfig_extract_images: (a: number) => number;
|
|
188
|
-
readonly wasmmetadataconfig_extract_links: (a: number) => number;
|
|
189
|
-
readonly wasmmetadataconfig_extract_structured_data: (a: number) => number;
|
|
190
|
-
readonly wasmmetadataconfig_max_structured_data_size: (a: number) => number;
|
|
191
|
-
readonly wasmmetadataconfig_new: () => number;
|
|
192
|
-
readonly wasmmetadataconfig_set_extract_document: (a: number, b: number) => void;
|
|
193
|
-
readonly wasmmetadataconfig_set_extract_headers: (a: number, b: number) => void;
|
|
194
|
-
readonly wasmmetadataconfig_set_extract_images: (a: number, b: number) => void;
|
|
195
|
-
readonly wasmmetadataconfig_set_extract_links: (a: number, b: number) => void;
|
|
196
|
-
readonly wasmmetadataconfig_set_extract_structured_data: (a: number, b: number) => void;
|
|
197
|
-
readonly wasmmetadataconfig_set_max_structured_data_size: (a: number, b: number) => void;
|
|
198
|
-
readonly init: () => void;
|
|
199
|
-
readonly __wbindgen_export: (a: number, b: number) => number;
|
|
200
|
-
readonly __wbindgen_export2: (a: number, b: number, c: number, d: number) => number;
|
|
201
|
-
readonly __wbindgen_export3: (a: number) => void;
|
|
202
|
-
readonly __wbindgen_export4: (a: number, b: number, c: number) => void;
|
|
203
|
-
readonly __wbindgen_add_to_stack_pointer: (a: number) => number;
|
|
204
|
-
readonly __wbindgen_start: () => void;
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
export type SyncInitInput = BufferSource | WebAssembly.Module;
|
|
208
|
-
|
|
209
|
-
/**
|
|
210
|
-
* Instantiates the given `module`, which can either be bytes or
|
|
211
|
-
* a precompiled `WebAssembly.Module`.
|
|
212
|
-
*
|
|
213
|
-
* @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated.
|
|
214
|
-
*
|
|
215
|
-
* @returns {InitOutput}
|
|
216
|
-
*/
|
|
217
|
-
export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;
|
|
218
|
-
|
|
219
|
-
/**
|
|
220
|
-
* If `module_or_path` is {RequestInfo} or {URL}, makes a request and
|
|
221
|
-
* for everything else, calls `WebAssembly.instantiate` directly.
|
|
222
|
-
*
|
|
223
|
-
* @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated.
|
|
224
|
-
*
|
|
225
|
-
* @returns {Promise<InitOutput>}
|
|
226
|
-
*/
|
|
227
|
-
export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
export type WasmHeadingStyle = "underlined" | "atx" | "atxClosed";
|
|
231
|
-
export type WasmListIndentType = "spaces" | "tabs";
|
|
232
|
-
export type WasmWhitespaceMode = "normalized" | "strict";
|
|
233
|
-
export type WasmNewlineStyle = "spaces" | "backslash";
|
|
234
|
-
export type WasmCodeBlockStyle = "indented" | "backticks" | "tildes";
|
|
235
|
-
export type WasmHighlightStyle = "doubleEqual" | "html" | "bold" | "none";
|
|
236
|
-
export type WasmPreprocessingPreset = "minimal" | "standard" | "aggressive";
|
|
237
|
-
|
|
238
|
-
export interface WasmPreprocessingOptions {
|
|
239
|
-
enabled?: boolean;
|
|
240
|
-
preset?: WasmPreprocessingPreset;
|
|
241
|
-
removeNavigation?: boolean;
|
|
242
|
-
removeForms?: boolean;
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
export interface WasmConversionOptions {
|
|
246
|
-
headingStyle?: WasmHeadingStyle;
|
|
247
|
-
listIndentType?: WasmListIndentType;
|
|
248
|
-
listIndentWidth?: number;
|
|
249
|
-
bullets?: string;
|
|
250
|
-
strongEmSymbol?: string;
|
|
251
|
-
escapeAsterisks?: boolean;
|
|
252
|
-
escapeUnderscores?: boolean;
|
|
253
|
-
escapeMisc?: boolean;
|
|
254
|
-
escapeAscii?: boolean;
|
|
255
|
-
codeLanguage?: string;
|
|
256
|
-
autolinks?: boolean;
|
|
257
|
-
defaultTitle?: boolean;
|
|
258
|
-
brInTables?: boolean;
|
|
259
|
-
hocrSpatialTables?: boolean;
|
|
260
|
-
highlightStyle?: WasmHighlightStyle;
|
|
261
|
-
extractMetadata?: boolean;
|
|
262
|
-
whitespaceMode?: WasmWhitespaceMode;
|
|
263
|
-
stripNewlines?: boolean;
|
|
264
|
-
wrap?: boolean;
|
|
265
|
-
wrapWidth?: number;
|
|
266
|
-
convertAsInline?: boolean;
|
|
267
|
-
subSymbol?: string;
|
|
268
|
-
supSymbol?: string;
|
|
269
|
-
newlineStyle?: WasmNewlineStyle;
|
|
270
|
-
codeBlockStyle?: WasmCodeBlockStyle;
|
|
271
|
-
keepInlineImagesIn?: string[];
|
|
272
|
-
preprocessing?: WasmPreprocessingOptions | null;
|
|
273
|
-
encoding?: string;
|
|
274
|
-
debug?: boolean;
|
|
275
|
-
stripTags?: string[];
|
|
276
|
-
preserveTags?: string[];
|
|
277
|
-
}
|