html-to-markdown 2.18.0 → 2.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -28
- data/README.md +190 -188
- data/ext/html-to-markdown-rb/native/Cargo.toml +2 -2
- data/lib/html_to_markdown/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 846bb8410c19e5a5ee99a7998d78f6f9aa21e698dc2cf1ac4ab6aaf45e257cca
|
|
4
|
+
data.tar.gz: 04cc1deab74259fc77481f30e1fdf236439b86b2fd05b1a5efc06d873e54f806
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4d1adafa16b6633374c5f509881bb8feb5ec670bfd5c7388c9100c6d3d6a53538db86de6cfbce81a0976af7ad8865fbd5242b639631828d67a88cefc938763ae
|
|
7
|
+
data.tar.gz: f292a71719770821f0e5000d263627f8f44facc74f4c0f6f7da0e4c7feb66034e57baf254163ac88a58f6ebf911a8787797e2742bd31bba9a27fe289799c833d
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.18.0)
|
|
4
|
+
html-to-markdown (2.19.0)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -29,16 +29,8 @@ GEM
|
|
|
29
29
|
diff-lcs (1.6.2)
|
|
30
30
|
drb (2.2.3)
|
|
31
31
|
ffi (1.17.2)
|
|
32
|
-
ffi (1.17.2-aarch64-linux-gnu)
|
|
33
|
-
ffi (1.17.2-aarch64-linux-musl)
|
|
34
|
-
ffi (1.17.2-arm-linux-gnu)
|
|
35
|
-
ffi (1.17.2-arm-linux-musl)
|
|
36
32
|
ffi (1.17.2-arm64-darwin)
|
|
37
|
-
ffi (1.17.2-x86-linux-gnu)
|
|
38
|
-
ffi (1.17.2-x86-linux-musl)
|
|
39
|
-
ffi (1.17.2-x86_64-darwin)
|
|
40
33
|
ffi (1.17.2-x86_64-linux-gnu)
|
|
41
|
-
ffi (1.17.2-x86_64-linux-musl)
|
|
42
34
|
fileutils (1.8.0)
|
|
43
35
|
i18n (1.14.8)
|
|
44
36
|
concurrent-ruby (~> 1.0)
|
|
@@ -131,17 +123,8 @@ GEM
|
|
|
131
123
|
uri (1.1.1)
|
|
132
124
|
|
|
133
125
|
PLATFORMS
|
|
134
|
-
aarch64-linux-gnu
|
|
135
|
-
aarch64-linux-musl
|
|
136
|
-
arm-linux-gnu
|
|
137
|
-
arm-linux-musl
|
|
138
126
|
arm64-darwin
|
|
139
|
-
|
|
140
|
-
x86-linux-gnu
|
|
141
|
-
x86-linux-musl
|
|
142
|
-
x86_64-darwin
|
|
143
|
-
x86_64-linux-gnu
|
|
144
|
-
x86_64-linux-musl
|
|
127
|
+
x86_64-linux
|
|
145
128
|
|
|
146
129
|
DEPENDENCIES
|
|
147
130
|
html-to-markdown!
|
|
@@ -164,18 +147,10 @@ CHECKSUMS
|
|
|
164
147
|
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
165
148
|
drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
|
|
166
149
|
ffi (1.17.2) sha256=297235842e5947cc3036ebe64077584bff583cd7a4e94e9a02fdec399ef46da6
|
|
167
|
-
ffi (1.17.2-aarch64-linux-gnu) sha256=c910bd3cae70b76690418cce4572b7f6c208d271f323d692a067d59116211a1a
|
|
168
|
-
ffi (1.17.2-aarch64-linux-musl) sha256=69e6556b091d45df83e6c3b19d3c54177c206910965155a6ec98de5e893c7b7c
|
|
169
|
-
ffi (1.17.2-arm-linux-gnu) sha256=d4a438f2b40224ae42ec72f293b3ebe0ba2159f7d1bd47f8417e6af2f68dbaa5
|
|
170
|
-
ffi (1.17.2-arm-linux-musl) sha256=977dfb7f3a6381206dbda9bc441d9e1f9366bf189a634559c3b7c182c497aaa3
|
|
171
150
|
ffi (1.17.2-arm64-darwin) sha256=54dd9789be1d30157782b8de42d8f887a3c3c345293b57ffb6b45b4d1165f813
|
|
172
|
-
ffi (1.17.2-x86-linux-gnu) sha256=95d8f9ebea23c39888e2ab85a02c98f54acb2f4e79b829250d7267ce741dc7b0
|
|
173
|
-
ffi (1.17.2-x86-linux-musl) sha256=41741449bab2b9530f42a47baa5c26263925306fad0ac2d60887f51af2e3b24c
|
|
174
|
-
ffi (1.17.2-x86_64-darwin) sha256=981f2d4e32ea03712beb26e55e972797c2c5a7b0257955d8667ba58f2da6440e
|
|
175
151
|
ffi (1.17.2-x86_64-linux-gnu) sha256=05d2026fc9dbb7cfd21a5934559f16293815b7ce0314846fee2ac8efbdb823ea
|
|
176
|
-
ffi (1.17.2-x86_64-linux-musl) sha256=97c0eb3981414309285a64dc4d466bd149e981c279a56371ef811395d68cb95c
|
|
177
152
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
178
|
-
html-to-markdown (2.18.0)
|
|
153
|
+
html-to-markdown (2.19.0)
|
|
179
154
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
180
155
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
181
156
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
data/README.md
CHANGED
|
@@ -1,274 +1,276 @@
|
|
|
1
|
-
# html-to-markdown
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
1
|
+
# html-to-markdown
|
|
2
|
+
|
|
3
|
+
<div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
|
|
4
|
+
<!-- Language Bindings -->
|
|
5
|
+
<a href="https://crates.io/crates/html-to-markdown-rs">
|
|
6
|
+
<img src="https://img.shields.io/crates/v/html-to-markdown-rs?label=Rust&color=007ec6" alt="Rust">
|
|
7
|
+
</a>
|
|
8
|
+
<a href="https://pypi.org/project/html-to-markdown/">
|
|
9
|
+
<img src="https://img.shields.io/pypi/v/html-to-markdown?label=Python&color=007ec6" alt="Python">
|
|
10
|
+
</a>
|
|
11
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node">
|
|
12
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-node?label=Node.js&color=007ec6" alt="Node.js">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm">
|
|
15
|
+
<img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-wasm?label=WASM&color=007ec6" alt="WASM">
|
|
16
|
+
</a>
|
|
17
|
+
<a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
|
|
18
|
+
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
|
+
</a>
|
|
20
|
+
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
|
|
21
|
+
<img src="https://img.shields.io/badge/Go-v2.19.0-007ec6" alt="Go">
|
|
22
|
+
</a>
|
|
23
|
+
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
|
+
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
25
|
+
</a>
|
|
26
|
+
<a href="https://packagist.org/packages/goldziher/html-to-markdown">
|
|
27
|
+
<img src="https://img.shields.io/packagist/v/goldziher/html-to-markdown?label=PHP&color=007ec6" alt="PHP">
|
|
28
|
+
</a>
|
|
29
|
+
<a href="https://rubygems.org/gems/html-to-markdown">
|
|
30
|
+
<img src="https://img.shields.io/gem/v/html-to-markdown?label=Ruby&color=007ec6" alt="Ruby">
|
|
31
|
+
</a>
|
|
32
|
+
<a href="https://hex.pm/packages/html_to_markdown">
|
|
33
|
+
<img src="https://img.shields.io/hexpm/v/html_to_markdown?label=Elixir&color=007ec6" alt="Elixir">
|
|
34
|
+
</a>
|
|
35
|
+
|
|
36
|
+
<!-- Project Info -->
|
|
37
|
+
<a href="https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE">
|
|
38
|
+
<img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
|
|
39
|
+
</a>
|
|
40
|
+
</div>
|
|
41
|
+
|
|
42
|
+
<img width="1128" height="191" alt="html-to-markdown" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|
|
43
|
+
|
|
44
|
+
<div align="center" style="margin-top: 20px;">
|
|
45
|
+
<a href="https://discord.gg/pXxagNK2zN">
|
|
46
|
+
<img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
|
|
47
|
+
</a>
|
|
48
|
+
</div>
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
Blazing-fast HTML to Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, WebAssembly, and PHP packages.
|
|
52
|
+
Ship identical Markdown across every runtime while enjoying native extension performance with Magnus bindings.
|
|
53
|
+
|
|
33
54
|
|
|
34
55
|
## Installation
|
|
35
56
|
|
|
36
57
|
```bash
|
|
37
|
-
bundle add html-to-markdown
|
|
38
|
-
# or
|
|
39
58
|
gem install html-to-markdown
|
|
40
59
|
```
|
|
41
60
|
|
|
42
|
-
Add the gem to your project and Bundler will compile the native Rust extension on first install.
|
|
43
61
|
|
|
44
|
-
### Requirements
|
|
45
62
|
|
|
46
|
-
|
|
47
|
-
- Rust toolchain **1.85+** with Cargo available on your `$PATH`
|
|
48
|
-
- Ruby development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)
|
|
63
|
+
Requires Ruby 3.2+ with Magnus native extension bindings. Published for Linux, macOS.
|
|
49
64
|
|
|
50
|
-
**Windows**: install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:
|
|
51
65
|
|
|
52
|
-
```powershell
|
|
53
|
-
ridk exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain
|
|
54
|
-
```
|
|
55
66
|
|
|
56
|
-
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
## Performance Snapshot
|
|
71
|
+
|
|
72
|
+
Apple M4 • Real Wikipedia documents • `convert()` (Ruby)
|
|
73
|
+
|
|
74
|
+
| Document | Size | Latency | Throughput |
|
|
75
|
+
| -------- | ---- | ------- | ---------- |
|
|
76
|
+
| Lists (Timeline) | 129KB | 0.71ms | 182 MB/s |
|
|
77
|
+
| Tables (Countries) | 360KB | 2.15ms | 167 MB/s |
|
|
78
|
+
| Mixed (Python wiki) | 656KB | 4.89ms | 134 MB/s |
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
See [Performance Guide](../../examples/performance/) for detailed benchmarks.
|
|
82
|
+
|
|
57
83
|
|
|
58
84
|
## Quick Start
|
|
59
85
|
|
|
86
|
+
Basic conversion:
|
|
87
|
+
|
|
60
88
|
```ruby
|
|
61
89
|
require 'html_to_markdown'
|
|
62
90
|
|
|
63
|
-
html =
|
|
64
|
-
<h1>Welcome</h1>
|
|
65
|
-
<p>This is <strong>Rust-fast</strong> conversion!</p>
|
|
66
|
-
<ul>
|
|
67
|
-
<li>Native extension</li>
|
|
68
|
-
<li>Identical output across languages</li>
|
|
69
|
-
</ul>
|
|
70
|
-
HTML
|
|
71
|
-
|
|
91
|
+
html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
|
|
72
92
|
markdown = HtmlToMarkdown.convert(html)
|
|
73
|
-
puts markdown
|
|
74
|
-
# # Welcome
|
|
75
|
-
#
|
|
76
|
-
# This is **Rust-fast** conversion!
|
|
77
|
-
#
|
|
78
|
-
# - Native extension
|
|
79
|
-
# - Identical output across languages
|
|
80
93
|
```
|
|
81
94
|
|
|
82
|
-
## API Reference
|
|
83
95
|
|
|
84
|
-
|
|
96
|
+
|
|
97
|
+
With conversion options:
|
|
85
98
|
|
|
86
99
|
```ruby
|
|
87
|
-
|
|
88
|
-
markdown = HtmlToMarkdown.convert(html)
|
|
100
|
+
require 'html_to_markdown'
|
|
89
101
|
|
|
90
|
-
|
|
102
|
+
html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
|
|
91
103
|
markdown = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
|
|
104
|
+
```
|
|
92
105
|
|
|
93
|
-
# With inline images
|
|
94
|
-
result = HtmlToMarkdown.convert_with_inline_images(html, image_config: {...})
|
|
95
|
-
markdown = result.markdown
|
|
96
|
-
images = result.inline_images
|
|
97
106
|
|
|
98
|
-
# With metadata extraction
|
|
99
|
-
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, metadata_config)
|
|
100
107
|
|
|
101
|
-
# With visitor pattern (custom callbacks)
|
|
102
|
-
result = HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new, options: {...})
|
|
103
|
-
```
|
|
104
108
|
|
|
105
|
-
### Conversion Options Hash
|
|
106
109
|
|
|
107
|
-
```ruby
|
|
108
|
-
{
|
|
109
|
-
heading_style: :atx, # :atx or :setext
|
|
110
|
-
code_block_style: :fenced, # :fenced or :indented
|
|
111
|
-
bullets: '*+-', # List bullet chars
|
|
112
|
-
list_indent_type: :spaces, # :spaces or :tabs
|
|
113
|
-
list_indent_width: 2, # Number of indent spaces
|
|
114
|
-
whitespace_mode: :normalized, # :normalized, :preserve, or :collapse
|
|
115
|
-
highlight_style: :double_equal, # Code highlighting style
|
|
116
|
-
hocr_spatial_tables: false, # Special hOCR table handling
|
|
117
|
-
preprocessing: {
|
|
118
|
-
enabled: true,
|
|
119
|
-
preset: :aggressive, # :minimal, :standard, :aggressive
|
|
120
|
-
remove_navigation: true,
|
|
121
|
-
remove_forms: true
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
```
|
|
125
110
|
|
|
126
|
-
|
|
111
|
+
## API Reference
|
|
127
112
|
|
|
128
|
-
|
|
113
|
+
### Core Functions
|
|
129
114
|
|
|
130
|
-
```ruby
|
|
131
|
-
handle = HtmlToMarkdown.options(hocr_spatial_tables: false)
|
|
132
115
|
|
|
133
|
-
|
|
134
|
-
HtmlToMarkdown.convert_with_options(html, handle)
|
|
135
|
-
end
|
|
136
|
-
```
|
|
116
|
+
**`convert(html, options: nil) -> String`**
|
|
137
117
|
|
|
138
|
-
|
|
118
|
+
Basic HTML-to-Markdown conversion. Fast and simple.
|
|
139
119
|
|
|
140
|
-
|
|
120
|
+
**`convert_with_metadata(html, options: nil, config: nil) -> [String, Hash]`**
|
|
141
121
|
|
|
142
|
-
|
|
143
|
-
html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
|
|
144
|
-
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
122
|
+
Extract Markdown plus metadata (headers, links, images, structured data) in a single pass. See [Metadata Extraction Guide](../../examples/metadata-extraction/).
|
|
145
123
|
|
|
146
|
-
|
|
147
|
-
puts metadata[:headers].first[:text] # "Hello"
|
|
148
|
-
```
|
|
124
|
+
**`convert_with_visitor(html, visitor:, options: nil) -> String`**
|
|
149
125
|
|
|
150
|
-
|
|
126
|
+
Customize conversion with visitor callbacks for element interception. See [Visitor Pattern Guide](../../examples/visitor-pattern/).
|
|
151
127
|
|
|
152
|
-
|
|
128
|
+
**`convert_with_inline_images(html, config: nil) -> [String, Array, Array]`**
|
|
153
129
|
|
|
154
|
-
|
|
130
|
+
Extract base64-encoded inline images with metadata.
|
|
155
131
|
|
|
156
|
-
```ruby
|
|
157
|
-
class MyVisitor
|
|
158
|
-
def visit_link(ctx, href, text, title = nil)
|
|
159
|
-
{ type: :custom, output: "[#{text}](#{href})" }
|
|
160
|
-
end
|
|
161
132
|
|
|
162
|
-
def visit_image(ctx, src, alt, title = nil)
|
|
163
|
-
{ type: :skip } # Remove images
|
|
164
|
-
end
|
|
165
|
-
end
|
|
166
133
|
|
|
167
|
-
|
|
168
|
-
```
|
|
134
|
+
### Options
|
|
169
135
|
|
|
170
|
-
|
|
136
|
+
**`ConversionOptions`** – Key configuration fields:
|
|
137
|
+
- `heading_style`: Heading format (`"underlined"` | `"atx"` | `"atx_closed"`) — default: `"underlined"`
|
|
138
|
+
- `list_indent_width`: Spaces per indent level — default: `2`
|
|
139
|
+
- `bullets`: Bullet characters cycle — default: `"*+-"`
|
|
140
|
+
- `wrap`: Enable text wrapping — default: `false`
|
|
141
|
+
- `wrap_width`: Wrap at column — default: `80`
|
|
142
|
+
- `code_language`: Default fenced code block language — default: none
|
|
143
|
+
- `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
|
|
171
144
|
|
|
172
|
-
|
|
145
|
+
**`MetadataConfig`** – Selective metadata extraction:
|
|
146
|
+
- `extract_headers`: h1-h6 elements — default: `true`
|
|
147
|
+
- `extract_links`: Hyperlinks — default: `true`
|
|
148
|
+
- `extract_images`: Image elements — default: `true`
|
|
149
|
+
- `extract_structured_data`: JSON-LD, Microdata, RDFa — default: `true`
|
|
150
|
+
- `max_structured_data_size`: Size limit in bytes — default: `100KB`
|
|
173
151
|
|
|
174
|
-
For advanced examples (image filtering, link analytics, footnote dialects), RBS type-safety patterns, and full method reference, see [Visitor Pattern Guide](../../examples/visitor-pattern/).
|
|
175
152
|
|
|
176
|
-
## RBS Types & Strict Type Checking
|
|
177
153
|
|
|
178
|
-
|
|
154
|
+
## Metadata Extraction
|
|
179
155
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
156
|
+
The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass.
|
|
157
|
+
|
|
158
|
+
**Use Cases:**
|
|
159
|
+
- **SEO analysis** – Extract title, description, Open Graph tags, Twitter cards
|
|
160
|
+
- **Table of contents generation** – Build structured outlines from heading hierarchy
|
|
161
|
+
- **Content migration** – Document all external links and resources
|
|
162
|
+
- **Accessibility audits** – Check for images without alt text, empty links, invalid heading hierarchy
|
|
163
|
+
- **Link validation** – Classify and validate anchor, internal, external, email, and phone links
|
|
183
164
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
- `HtmlToMarkdown::extended_metadata` - Metadata extraction result
|
|
165
|
+
**Zero Overhead When Disabled:** Metadata extraction adds negligible overhead and happens during the HTML parsing pass. Disable unused metadata types in `MetadataConfig` to optimize further.
|
|
166
|
+
|
|
167
|
+
### Example: Quick Start
|
|
188
168
|
|
|
189
|
-
Type-safe visitor implementation:
|
|
190
169
|
|
|
191
170
|
```ruby
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
171
|
+
require 'html_to_markdown'
|
|
172
|
+
|
|
173
|
+
html = '<h1>Article</h1><img src="test.jpg" alt="test">'
|
|
174
|
+
markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
|
|
175
|
+
|
|
176
|
+
puts metadata[:document][:title] # Document title
|
|
177
|
+
puts metadata[:headers] # All h1-h6 elements
|
|
178
|
+
puts metadata[:links] # All hyperlinks
|
|
179
|
+
puts metadata[:images] # All images with alt text
|
|
180
|
+
puts metadata[:structured_data] # JSON-LD, Microdata, RDFa
|
|
202
181
|
```
|
|
203
182
|
|
|
204
|
-
All public methods are typed for early error detection and LSP editor support (Ruby 3+).
|
|
205
183
|
|
|
206
|
-
## Magnus Native Extension
|
|
207
184
|
|
|
208
|
-
|
|
185
|
+
For detailed examples including SEO extraction, table-of-contents generation, link validation, and accessibility audits, see the [Metadata Extraction Guide](../../examples/metadata-extraction/).
|
|
209
186
|
|
|
210
|
-
- **Zero-copy interop**: String and hash data flows directly between Ruby and Rust
|
|
211
|
-
- **Safe bindings**: No segfaults; Rust's type system ensures memory safety
|
|
212
|
-
- **Automatic error mapping**: Rust errors convert to Ruby exceptions with full context
|
|
213
|
-
- **Native performance**: Compiled to `.so` (Linux/macOS) or `.dll` (Windows)
|
|
214
|
-
- **Smart compilation**: Prebuilt binaries for common platforms; falls back to on-install compilation
|
|
215
187
|
|
|
216
|
-
Build manually:
|
|
217
188
|
|
|
218
|
-
```bash
|
|
219
|
-
bundle exec rake compile
|
|
220
|
-
```
|
|
221
189
|
|
|
222
|
-
##
|
|
190
|
+
## Visitor Pattern
|
|
191
|
+
|
|
192
|
+
The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal. Use visitors to transform content, filter elements, validate structure, or collect analytics.
|
|
193
|
+
|
|
194
|
+
**Use Cases:**
|
|
195
|
+
- **Custom Markdown dialects** – Convert to Obsidian, Notion, or other flavors
|
|
196
|
+
- **Content filtering** – Remove tracking pixels, ads, or unwanted elements
|
|
197
|
+
- **URL rewriting** – Rewrite CDN URLs, add query parameters, validate links
|
|
198
|
+
- **Accessibility validation** – Check alt text, heading hierarchy, link text
|
|
199
|
+
- **Analytics** – Track element usage, link destinations, image sources
|
|
200
|
+
|
|
201
|
+
**Supported Visitor Methods:** 40+ callbacks for text, inline elements, links, images, headings, lists, blocks, and tables.
|
|
202
|
+
|
|
203
|
+
### Example: Quick Start
|
|
223
204
|
|
|
224
|
-
Call the Rust CLI directly from Ruby or shell:
|
|
225
205
|
|
|
226
206
|
```ruby
|
|
227
|
-
require 'html_to_markdown
|
|
207
|
+
require 'html_to_markdown'
|
|
228
208
|
|
|
229
|
-
|
|
209
|
+
class MyVisitor
|
|
210
|
+
def visit_link(ctx, href, text, title = nil)
|
|
211
|
+
# Rewrite CDN URLs
|
|
212
|
+
if href.start_with?('https://old-cdn.com')
|
|
213
|
+
href = href.sub('https://old-cdn.com', 'https://new-cdn.com')
|
|
214
|
+
end
|
|
215
|
+
{ type: :custom, output: "[#{text}](#{href})" }
|
|
216
|
+
end
|
|
230
217
|
|
|
231
|
-
|
|
232
|
-
|
|
218
|
+
def visit_image(ctx, src, alt = nil, title = nil)
|
|
219
|
+
# Skip tracking pixels
|
|
220
|
+
src.include?('tracking') ? { type: :skip } : { type: :continue }
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
html = '<a href="https://old-cdn.com/file.pdf">Download</a>'
|
|
225
|
+
markdown = HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
|
|
233
226
|
```
|
|
234
227
|
|
|
235
|
-
## Error Handling
|
|
236
228
|
|
|
237
|
-
- `HtmlToMarkdown::Error` - Conversion errors with Rust error context
|
|
238
|
-
- `HtmlToMarkdown::CLIProxy::MissingBinaryError` - CLI binary not found
|
|
239
|
-
- `HtmlToMarkdown::CLIProxy::CLIExecutionError` - Command execution failed
|
|
240
229
|
|
|
241
|
-
|
|
230
|
+
For comprehensive examples including content filtering, link footnotes, accessibility validation, and asynchronous URL validation, see the [Visitor Pattern Guide](../../examples/visitor-pattern/).
|
|
231
|
+
|
|
232
|
+
|
|
242
233
|
|
|
243
234
|
## Examples
|
|
244
235
|
|
|
245
|
-
|
|
236
|
+
- [Visitor Pattern Guide](../../examples/visitor-pattern/)
|
|
237
|
+
- [Metadata Extraction Guide](../../examples/metadata-extraction/)
|
|
238
|
+
- [Performance Guide](../../examples/performance/)
|
|
246
239
|
|
|
247
|
-
|
|
248
|
-
- **[Metadata Extraction](../../examples/metadata-extraction/)** - SEO data, heading hierarchy, link classification, structured data parsing
|
|
249
|
-
- **[Performance Guide](../../examples/performance/)** - Benchmarking, profiling, throughput optimization
|
|
240
|
+
## Links
|
|
250
241
|
|
|
251
|
-
|
|
242
|
+
- **GitHub:** [github.com/kreuzberg-dev/html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown)
|
|
252
243
|
|
|
253
|
-
|
|
244
|
+
- **RubyGems:** [rubygems.org/gems/html-to-markdown](https://rubygems.org/gems/html-to-markdown)
|
|
254
245
|
|
|
255
|
-
- [
|
|
256
|
-
- [
|
|
257
|
-
- [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)
|
|
258
|
-
- [PHP extension](https://packagist.org/packages/goldziher/html-to-markdown)
|
|
259
|
-
- The Rust crate and CLI
|
|
246
|
+
- **Kreuzberg Ecosystem:** [kreuzberg.dev](https://kreuzberg.dev)
|
|
247
|
+
- **Discord:** [discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
|
|
260
248
|
|
|
261
|
-
|
|
249
|
+
## Contributing
|
|
262
250
|
|
|
263
|
-
|
|
251
|
+
We welcome contributions! Please see our [Contributing Guide](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CONTRIBUTING.md) for details on:
|
|
264
252
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
253
|
+
- Setting up the development environment
|
|
254
|
+
- Running tests locally
|
|
255
|
+
- Submitting pull requests
|
|
256
|
+
- Reporting issues
|
|
257
|
+
|
|
258
|
+
All contributions must follow our code quality standards (enforced via pre-commit hooks):
|
|
269
259
|
|
|
270
|
-
|
|
260
|
+
- Proper test coverage (Rust 95%+, language bindings 80%+)
|
|
261
|
+
- Formatting and linting checks
|
|
262
|
+
- Documentation for public APIs
|
|
271
263
|
|
|
272
264
|
## License
|
|
273
265
|
|
|
274
|
-
MIT
|
|
266
|
+
MIT License – see [LICENSE](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE).
|
|
267
|
+
|
|
268
|
+
## Support
|
|
269
|
+
|
|
270
|
+
If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/kreuzberg-dev).
|
|
271
|
+
|
|
272
|
+
Have questions or run into issues? We're here to help:
|
|
273
|
+
|
|
274
|
+
- **GitHub Issues:** [github.com/kreuzberg-dev/html-to-markdown/issues](https://github.com/kreuzberg-dev/html-to-markdown/issues)
|
|
275
|
+
- **Discussions:** [github.com/kreuzberg-dev/html-to-markdown/discussions](https://github.com/kreuzberg-dev/html-to-markdown/discussions)
|
|
276
|
+
- **Discord Community:** [discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
|
data/ext/html-to-markdown-rb/native/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rb"
|
|
3
|
-
version = "2.18.0"
|
|
3
|
+
version = "2.19.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -18,7 +18,7 @@ name = "html_to_markdown_rb"
|
|
|
18
18
|
crate-type = ["cdylib", "rlib"]
|
|
19
19
|
|
|
20
20
|
[dependencies]
|
|
21
|
-
html-to-markdown-rs = { version = "2.
|
|
21
|
+
html-to-markdown-rs = { version = "2.18.0", features = ["inline-images", "visitor", "metadata"] }
|
|
22
22
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
|
|
23
23
|
|
|
24
24
|
[target.'cfg(not(target_os = "windows"))'.dependencies]
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.18.0
|
|
4
|
+
version: 2.19.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-12-
|
|
11
|
+
date: 2025-12-29 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|