html-to-markdown 2.18.0 → 2.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 46277b2cbcf02b063ea61a2fe083829ac80bd395e10c5799f7ace0e82cfdf4ed
4
- data.tar.gz: fbbd6a3ef8f45cc35d85ac7a51b77361c198ab13c981ebb0dc5ad703df8de54c
3
+ metadata.gz: 846bb8410c19e5a5ee99a7998d78f6f9aa21e698dc2cf1ac4ab6aaf45e257cca
4
+ data.tar.gz: 04cc1deab74259fc77481f30e1fdf236439b86b2fd05b1a5efc06d873e54f806
5
5
  SHA512:
6
- metadata.gz: 57db78a9ffbd68b713a4265a0d7cc68d67fbab07ac8a9963b5b7e1c213ce9ec0aec8ef4cbc5cb76aa67bc0cd4b8f976efd45f0e55c458f85136174f25344e74b
7
- data.tar.gz: 70dcae164812ac269efb2b0a36224b4320b0f1f6341970ee5d626666189d6ede0aac3921edb72f36f30896e257eefdbd55b09328613f02abde501721bd31e944
6
+ metadata.gz: 4d1adafa16b6633374c5f509881bb8feb5ec670bfd5c7388c9100c6d3d6a53538db86de6cfbce81a0976af7ad8865fbd5242b639631828d67a88cefc938763ae
7
+ data.tar.gz: f292a71719770821f0e5000d263627f8f44facc74f4c0f6f7da0e4c7feb66034e57baf254163ac88a58f6ebf911a8787797e2742bd31bba9a27fe289799c833d
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.18.0)
4
+ html-to-markdown (2.19.0)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -29,16 +29,8 @@ GEM
29
29
  diff-lcs (1.6.2)
30
30
  drb (2.2.3)
31
31
  ffi (1.17.2)
32
- ffi (1.17.2-aarch64-linux-gnu)
33
- ffi (1.17.2-aarch64-linux-musl)
34
- ffi (1.17.2-arm-linux-gnu)
35
- ffi (1.17.2-arm-linux-musl)
36
32
  ffi (1.17.2-arm64-darwin)
37
- ffi (1.17.2-x86-linux-gnu)
38
- ffi (1.17.2-x86-linux-musl)
39
- ffi (1.17.2-x86_64-darwin)
40
33
  ffi (1.17.2-x86_64-linux-gnu)
41
- ffi (1.17.2-x86_64-linux-musl)
42
34
  fileutils (1.8.0)
43
35
  i18n (1.14.8)
44
36
  concurrent-ruby (~> 1.0)
@@ -131,17 +123,8 @@ GEM
131
123
  uri (1.1.1)
132
124
 
133
125
  PLATFORMS
134
- aarch64-linux-gnu
135
- aarch64-linux-musl
136
- arm-linux-gnu
137
- arm-linux-musl
138
126
  arm64-darwin
139
- ruby
140
- x86-linux-gnu
141
- x86-linux-musl
142
- x86_64-darwin
143
- x86_64-linux-gnu
144
- x86_64-linux-musl
127
+ x86_64-linux
145
128
 
146
129
  DEPENDENCIES
147
130
  html-to-markdown!
@@ -164,18 +147,10 @@ CHECKSUMS
164
147
  diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
165
148
  drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
166
149
  ffi (1.17.2) sha256=297235842e5947cc3036ebe64077584bff583cd7a4e94e9a02fdec399ef46da6
167
- ffi (1.17.2-aarch64-linux-gnu) sha256=c910bd3cae70b76690418cce4572b7f6c208d271f323d692a067d59116211a1a
168
- ffi (1.17.2-aarch64-linux-musl) sha256=69e6556b091d45df83e6c3b19d3c54177c206910965155a6ec98de5e893c7b7c
169
- ffi (1.17.2-arm-linux-gnu) sha256=d4a438f2b40224ae42ec72f293b3ebe0ba2159f7d1bd47f8417e6af2f68dbaa5
170
- ffi (1.17.2-arm-linux-musl) sha256=977dfb7f3a6381206dbda9bc441d9e1f9366bf189a634559c3b7c182c497aaa3
171
150
  ffi (1.17.2-arm64-darwin) sha256=54dd9789be1d30157782b8de42d8f887a3c3c345293b57ffb6b45b4d1165f813
172
- ffi (1.17.2-x86-linux-gnu) sha256=95d8f9ebea23c39888e2ab85a02c98f54acb2f4e79b829250d7267ce741dc7b0
173
- ffi (1.17.2-x86-linux-musl) sha256=41741449bab2b9530f42a47baa5c26263925306fad0ac2d60887f51af2e3b24c
174
- ffi (1.17.2-x86_64-darwin) sha256=981f2d4e32ea03712beb26e55e972797c2c5a7b0257955d8667ba58f2da6440e
175
151
  ffi (1.17.2-x86_64-linux-gnu) sha256=05d2026fc9dbb7cfd21a5934559f16293815b7ce0314846fee2ac8efbdb823ea
176
- ffi (1.17.2-x86_64-linux-musl) sha256=97c0eb3981414309285a64dc4d466bd149e981c279a56371ef811395d68cb95c
177
152
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
178
- html-to-markdown (2.18.0)
153
+ html-to-markdown (2.19.0)
179
154
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
180
155
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
181
156
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
data/README.md CHANGED
@@ -1,274 +1,276 @@
1
- # html-to-markdown-rb
2
-
3
- Blazing-fast HTML Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, WebAssembly, and PHP packages. Ship identical Markdown across every runtime while enjoying native extension performance.
4
-
5
- [![Crates.io](https://img.shields.io/crates/v/html-to-markdown-rs.svg?logo=rust&label=crates.io)](https://crates.io/crates/html-to-markdown-rs)
6
- [![npm (node)](https://img.shields.io/npm/v/html-to-markdown-node.svg?logo=npm)](https://www.npmjs.com/package/html-to-markdown-node)
7
- [![npm (wasm)](https://img.shields.io/npm/v/html-to-markdown-wasm.svg?logo=npm)](https://www.npmjs.com/package/html-to-markdown-wasm)
8
- [![PyPI](https://img.shields.io/pypi/v/html-to-markdown.svg?logo=pypi)](https://pypi.org/project/html-to-markdown/)
9
- [![Packagist](https://img.shields.io/packagist/v/goldziher/html-to-markdown.svg)](https://packagist.org/packages/goldziher/html-to-markdown)
10
- [![RubyGems](https://badge.fury.io/rb/html-to-markdown.svg)](https://rubygems.org/gems/html-to-markdown)
11
- [![Hex.pm](https://img.shields.io/hexpm/v/html_to_markdown.svg)](https://hex.pm/packages/html_to_markdown)
12
- [![NuGet](https://img.shields.io/nuget/v/Goldziher.HtmlToMarkdown.svg)](https://www.nuget.org/packages/Goldziher.HtmlToMarkdown/)
13
- [![Maven Central](https://img.shields.io/maven-central/v/io.github.goldziher/html-to-markdown.svg)](https://central.sonatype.com/artifact/io.github.goldziher/html-to-markdown)
14
- [![Go Reference](https://pkg.go.dev/badge/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown.svg)](https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown)
15
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE)
16
- [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
17
-
18
- ## Features
19
-
20
- - ⚡ **Rust-fast**: Ruby bindings around a highly optimised Rust core (60‑80× faster than BeautifulSoup-based converters).
21
- - 🔁 **Identical output**: Shares logic with the Python wheels, npm bindings, PHP extension, WASM package, and CLI — consistent Markdown everywhere.
22
- - ⚙️ **Rich configuration**: Control heading styles, list indentation, whitespace handling, HTML preprocessing, and more.
23
- - 🖼️ **Inline image extraction**: Pull out embedded images (PNG/JPEG/SVG/data URIs) alongside Markdown.
24
- - 🧰 **Bundled CLI proxy**: Call the Rust CLI straight from Ruby or shell scripts.
25
- - 🛠️ **First-class Rails support**: Works with `Gem.win_platform?` builds, supports Trusted Publishing, and compiles on install if no native gem matches.
26
-
27
- ## Documentation & Support
28
-
29
- - [GitHub repository](https://github.com/kreuzberg-dev/html-to-markdown)
30
- - [Issue tracker](https://github.com/kreuzberg-dev/html-to-markdown/issues)
31
- - [Changelog](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CHANGELOG.md)
32
- - [Live demo (WASM)](https://kreuzberg-dev.github.io/html-to-markdown/)
1
+ # html-to-markdown
2
+
3
+ <div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
4
+ <!-- Language Bindings -->
5
+ <a href="https://crates.io/crates/html-to-markdown-rs">
6
+ <img src="https://img.shields.io/crates/v/html-to-markdown-rs?label=Rust&color=007ec6" alt="Rust">
7
+ </a>
8
+ <a href="https://pypi.org/project/html-to-markdown/">
9
+ <img src="https://img.shields.io/pypi/v/html-to-markdown?label=Python&color=007ec6" alt="Python">
10
+ </a>
11
+ <a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-node">
12
+ <img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-node?label=Node.js&color=007ec6" alt="Node.js">
13
+ </a>
14
+ <a href="https://www.npmjs.com/package/@kreuzberg/html-to-markdown-wasm">
15
+ <img src="https://img.shields.io/npm/v/@kreuzberg/html-to-markdown-wasm?label=WASM&color=007ec6" alt="WASM">
16
+ </a>
17
+ <a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
18
+ <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
+ </a>
20
+ <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
21
+ <img src="https://img.shields.io/badge/Go-v2.19.0-007ec6" alt="Go">
22
+ </a>
23
+ <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
+ <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
25
+ </a>
26
+ <a href="https://packagist.org/packages/goldziher/html-to-markdown">
27
+ <img src="https://img.shields.io/packagist/v/goldziher/html-to-markdown?label=PHP&color=007ec6" alt="PHP">
28
+ </a>
29
+ <a href="https://rubygems.org/gems/html-to-markdown">
30
+ <img src="https://img.shields.io/gem/v/html-to-markdown?label=Ruby&color=007ec6" alt="Ruby">
31
+ </a>
32
+ <a href="https://hex.pm/packages/html_to_markdown">
33
+ <img src="https://img.shields.io/hexpm/v/html_to_markdown?label=Elixir&color=007ec6" alt="Elixir">
34
+ </a>
35
+
36
+ <!-- Project Info -->
37
+ <a href="https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE">
38
+ <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
39
+ </a>
40
+ </div>
41
+
42
+ <img width="1128" height="191" alt="html-to-markdown" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
43
+
44
+ <div align="center" style="margin-top: 20px;">
45
+ <a href="https://discord.gg/pXxagNK2zN">
46
+ <img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
47
+ </a>
48
+ </div>
49
+
50
+
51
+ Blazing-fast HTML to Markdown conversion for Ruby, powered by the same Rust engine used by our Python, Node.js, WebAssembly, and PHP packages.
52
+ Ship identical Markdown across every runtime while enjoying native extension performance with Magnus bindings.
53
+
33
54
 
34
55
  ## Installation
35
56
 
36
57
  ```bash
37
- bundle add html-to-markdown
38
- # or
39
58
  gem install html-to-markdown
40
59
  ```
41
60
 
42
- Add the gem to your project and Bundler will compile the native Rust extension on first install.
43
61
 
44
- ### Requirements
45
62
 
46
- - Ruby **3.2+** (Magnus relies on the fiber scheduler APIs added in 3.2)
47
- - Rust toolchain **1.85+** with Cargo available on your `$PATH`
48
- - Ruby development headers (`ruby-dev`, `ruby-devel`, or the platform equivalent)
63
+ Requires Ruby 3.2+ with Magnus native extension bindings. Published for Linux and macOS.
49
64
 
50
- **Windows**: install [RubyInstaller with MSYS2](https://rubyinstaller.org/) (UCRT64). Run once:
51
65
 
52
- ```powershell
53
- ridk exec pacman -S --needed --noconfirm base-devel mingw-w64-ucrt-x86_64-toolchain
54
- ```
55
66
 
56
- This provides the standard headers (including `strings.h`) required for the bindgen step.
67
+
68
+
69
+
70
+ ## Performance Snapshot
71
+
72
+ Apple M4 • Real Wikipedia documents • `convert()` (Ruby)
73
+
74
+ | Document | Size | Latency | Throughput |
75
+ | -------- | ---- | ------- | ---------- |
76
+ | Lists (Timeline) | 129KB | 0.71ms | 182 MB/s |
77
+ | Tables (Countries) | 360KB | 2.15ms | 167 MB/s |
78
+ | Mixed (Python wiki) | 656KB | 4.89ms | 134 MB/s |
79
+
80
+
81
+ See [Performance Guide](../../examples/performance/) for detailed benchmarks.
82
+
57
83
 
58
84
  ## Quick Start
59
85
 
86
+ Basic conversion:
87
+
60
88
  ```ruby
61
89
  require 'html_to_markdown'
62
90
 
63
- html = <<~HTML
64
- <h1>Welcome</h1>
65
- <p>This is <strong>Rust-fast</strong> conversion!</p>
66
- <ul>
67
- <li>Native extension</li>
68
- <li>Identical output across languages</li>
69
- </ul>
70
- HTML
71
-
91
+ html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
72
92
  markdown = HtmlToMarkdown.convert(html)
73
- puts markdown
74
- # # Welcome
75
- #
76
- # This is **Rust-fast** conversion!
77
- #
78
- # - Native extension
79
- # - Identical output across languages
80
93
  ```
81
94
 
82
- ## API Reference
83
95
 
84
- ### Basic Conversion
96
+
97
+ With conversion options:
85
98
 
86
99
  ```ruby
87
- # Simple conversion
88
- markdown = HtmlToMarkdown.convert(html)
100
+ require 'html_to_markdown'
89
101
 
90
- # With options (pass a Ruby hash with symbol keys)
102
+ html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
91
103
  markdown = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
104
+ ```
92
105
 
93
- # With inline images
94
- result = HtmlToMarkdown.convert_with_inline_images(html, image_config: {...})
95
- markdown = result.markdown
96
- images = result.inline_images
97
106
 
98
- # With metadata extraction
99
- markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, metadata_config)
100
107
 
101
- # With visitor pattern (custom callbacks)
102
- result = HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new, options: {...})
103
- ```
104
108
 
105
- ### Conversion Options Hash
106
109
 
107
- ```ruby
108
- {
109
- heading_style: :atx, # :atx or :setext
110
- code_block_style: :fenced, # :fenced or :indented
111
- bullets: '*+-', # List bullet chars
112
- list_indent_type: :spaces, # :spaces or :tabs
113
- list_indent_width: 2, # Number of indent spaces
114
- whitespace_mode: :normalized, # :normalized, :preserve, or :collapse
115
- highlight_style: :double_equal, # Code highlighting style
116
- hocr_spatial_tables: false, # Special hOCR table handling
117
- preprocessing: {
118
- enabled: true,
119
- preset: :aggressive, # :minimal, :standard, :aggressive
120
- remove_navigation: true,
121
- remove_forms: true
122
- }
123
- }
124
- ```
125
110
 
126
- ### Performance: Reusing Options
111
+ ## API Reference
127
112
 
128
- For tight loops, build an options handle once:
113
+ ### Core Functions
129
114
 
130
- ```ruby
131
- handle = HtmlToMarkdown.options(hocr_spatial_tables: false)
132
115
 
133
- 100.times do
134
- HtmlToMarkdown.convert_with_options(html, handle)
135
- end
136
- ```
116
+ **`convert(html, options: nil) -> String`**
137
117
 
138
- ### Metadata Extraction
118
+ Basic HTML-to-Markdown conversion. Fast and simple.
139
119
 
140
- Extract document properties (title, description, author, language), social metadata (Open Graph, Twitter cards), heading hierarchy, link analysis, image metadata, and structured data (JSON-LD, Microdata, RDFa):
120
+ **`convert_with_metadata(html, options: nil, config: nil) -> [String, Hash]`**
141
121
 
142
- ```ruby
143
- html = '<html lang="en"><head><title>Test</title></head><body><h1>Hello</h1></body></html>'
144
- markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
122
+ Extract Markdown plus metadata (headers, links, images, structured data) in a single pass. See [Metadata Extraction Guide](../../examples/metadata-extraction/).
145
123
 
146
- puts metadata[:document][:title] # "Test"
147
- puts metadata[:headers].first[:text] # "Hello"
148
- ```
124
+ **`convert_with_visitor(html, visitor:, options: nil) -> String`**
149
125
 
150
- For detailed examples (SEO extraction, heading hierarchy analysis, structured data) and full metadata structure reference, see [Metadata Extraction Guide](../../examples/metadata-extraction/).
126
+ Customize conversion with visitor callbacks for element interception. See [Visitor Pattern Guide](../../examples/visitor-pattern/).
151
127
 
152
- ### Visitor Pattern
128
+ **`convert_with_inline_images(html, config: nil) -> [String, Array, Array]`**
153
129
 
154
- Customize conversion with fine-grained element callbacks. Perfect for custom element handling, analytics during conversion, domain-specific markdown dialects, and conditional rendering:
130
+ Extract base64-encoded inline images with metadata.
155
131
 
156
- ```ruby
157
- class MyVisitor
158
- def visit_link(ctx, href, text, title = nil)
159
- { type: :custom, output: "[#{text}](#{href})" }
160
- end
161
132
 
162
- def visit_image(ctx, src, alt, title = nil)
163
- { type: :skip } # Remove images
164
- end
165
- end
166
133
 
167
- result = HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
168
- ```
134
+ ### Options
169
135
 
170
- **Return types**: `{ type: :continue }` (default), `{ type: :custom, output: "..." }` (replace), `{ type: :skip }` (omit), `{ type: :preserve_html }` (keep HTML), `{ type: :error, message: "..." }` (halt).
136
+ **`ConversionOptions`** Key configuration fields:
137
+ - `heading_style`: Heading format (`"underlined"` | `"atx"` | `"atx_closed"`) — default: `"underlined"`
138
+ - `list_indent_width`: Spaces per indent level — default: `2`
139
+ - `bullets`: Bullet characters cycle — default: `"*+-"`
140
+ - `wrap`: Enable text wrapping — default: `false`
141
+ - `wrap_width`: Wrap at column — default: `80`
142
+ - `code_language`: Default fenced code block language — default: none
143
+ - `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
171
144
 
172
- **40+ visitor methods** for text, inline formatting, blocks, lists, tables, advanced elements, and lifecycle hooks. Callback parameters include `NodeContext` with element metadata (tag_name, attributes, depth, parent_tag, is_inline).
145
+ **`MetadataConfig`** Selective metadata extraction:
146
+ - `extract_headers`: h1-h6 elements — default: `true`
147
+ - `extract_links`: Hyperlinks — default: `true`
148
+ - `extract_images`: Image elements — default: `true`
149
+ - `extract_structured_data`: JSON-LD, Microdata, RDFa — default: `true`
150
+ - `max_structured_data_size`: Size limit in bytes — default: `100KB`
173
151
 
174
- For advanced examples (image filtering, link analytics, footnote dialects), RBS type-safety patterns, and full method reference, see [Visitor Pattern Guide](../../examples/visitor-pattern/).
175
152
 
176
- ## RBS Types & Strict Type Checking
177
153
 
178
- Full RBS type definitions in `sig/html_to_markdown.rbs` enable strict type checking with [Steep](https://github.com/soutaro/steep):
154
+ ## Metadata Extraction
179
155
 
180
- ```bash
181
- steep check
182
- ```
156
+ The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass.
157
+
158
+ **Use Cases:**
159
+ - **SEO analysis** – Extract title, description, Open Graph tags, Twitter cards
160
+ - **Table of contents generation** – Build structured outlines from heading hierarchy
161
+ - **Content migration** – Document all external links and resources
162
+ - **Accessibility audits** – Check for images without alt text, empty links, invalid heading hierarchy
163
+ - **Link validation** – Classify and validate anchor, internal, external, email, and phone links
183
164
 
184
- Key types:
185
- - `HtmlToMarkdown::NodeContext` - Element metadata in visitor callbacks (tag_name, attributes, depth, etc.)
186
- - `HtmlToMarkdown::visitor_result` - Return type union for visitor methods
187
- - `HtmlToMarkdown::extended_metadata` - Metadata extraction result
165
+ **Zero Overhead When Disabled:** Metadata extraction happens during the HTML parsing pass, so it adds negligible overhead when enabled. Disable unused metadata types in `MetadataConfig` to optimize further.
166
+
167
+ ### Example: Quick Start
188
168
 
189
- Type-safe visitor implementation:
190
169
 
191
170
  ```ruby
192
- class TypedVisitor
193
- def visit_link(
194
- ctx : HtmlToMarkdown::NodeContext,
195
- href : String,
196
- text : String,
197
- title : String | nil = nil
198
- ) : HtmlToMarkdown::visitor_result
199
- { type: :custom, output: "[#{text}](#{href})" }
200
- end
201
- end
171
+ require 'html_to_markdown'
172
+
173
+ html = '<h1>Article</h1><img src="test.jpg" alt="test">'
174
+ markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
175
+
176
+ puts metadata[:document][:title] # Document title
177
+ puts metadata[:headers] # All h1-h6 elements
178
+ puts metadata[:links] # All hyperlinks
179
+ puts metadata[:images] # All images with alt text
180
+ puts metadata[:structured_data] # JSON-LD, Microdata, RDFa
202
181
  ```
203
182
 
204
- All public methods are typed for early error detection and LSP editor support (Ruby 3+).
205
183
 
206
- ## Magnus Native Extension
207
184
 
208
- The gem compiles a native Rust extension via [Magnus](https://github.com/matsadler/magnus) FFI bindings:
185
+ For detailed examples including SEO extraction, table-of-contents generation, link validation, and accessibility audits, see the [Metadata Extraction Guide](../../examples/metadata-extraction/).
209
186
 
210
- - **Zero-copy interop**: String and hash data flows directly between Ruby and Rust
211
- - **Safe bindings**: No segfaults; Rust's type system ensures memory safety
212
- - **Automatic error mapping**: Rust errors convert to Ruby exceptions with full context
213
- - **Native performance**: Compiled to `.so` (Linux/macOS) or `.dll` (Windows)
214
- - **Smart compilation**: Prebuilt binaries for common platforms; falls back to on-install compilation
215
187
 
216
- Build manually:
217
188
 
218
- ```bash
219
- bundle exec rake compile
220
- ```
221
189
 
222
- ## CLI Proxy
190
+ ## Visitor Pattern
191
+
192
+ The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal. Use visitors to transform content, filter elements, validate structure, or collect analytics.
193
+
194
+ **Use Cases:**
195
+ - **Custom Markdown dialects** – Convert to Obsidian, Notion, or other flavors
196
+ - **Content filtering** – Remove tracking pixels, ads, or unwanted elements
197
+ - **URL rewriting** – Rewrite CDN URLs, add query parameters, validate links
198
+ - **Accessibility validation** – Check alt text, heading hierarchy, link text
199
+ - **Analytics** – Track element usage, link destinations, image sources
200
+
201
+ **Supported Visitor Methods:** 40+ callbacks for text, inline elements, links, images, headings, lists, blocks, and tables.
202
+
203
+ ### Example: Quick Start
223
204
 
224
- Call the Rust CLI directly from Ruby or shell:
225
205
 
226
206
  ```ruby
227
- require 'html_to_markdown/cli'
207
+ require 'html_to_markdown'
228
208
 
229
- HtmlToMarkdown::CLI.run(%w[--heading-style atx input.html], stdout: $stdout)
209
+ class MyVisitor
210
+ def visit_link(ctx, href, text, title = nil)
211
+ # Rewrite CDN URLs
212
+ if href.start_with?('https://old-cdn.com')
213
+ href = href.sub('https://old-cdn.com', 'https://new-cdn.com')
214
+ end
215
+ { type: :custom, output: "[#{text}](#{href})" }
216
+ end
230
217
 
231
- # Or call the binary directly
232
- HtmlToMarkdown::CLIProxy.call(['--version'])
218
+ def visit_image(ctx, src, alt = nil, title = nil)
219
+ # Skip tracking pixels
220
+ src.include?('tracking') ? { type: :skip } : { type: :continue }
221
+ end
222
+ end
223
+
224
+ html = '<a href="https://old-cdn.com/file.pdf">Download</a>'
225
+ markdown = HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
233
226
  ```
234
227
 
235
- ## Error Handling
236
228
 
237
- - `HtmlToMarkdown::Error` - Conversion errors with Rust error context
238
- - `HtmlToMarkdown::CLIProxy::MissingBinaryError` - CLI binary not found
239
- - `HtmlToMarkdown::CLIProxy::CLIExecutionError` - Command execution failed
240
229
 
241
- Binary data inputs (e.g., PDF bytes as string) raise `HtmlToMarkdown::Error` with "Invalid input" message.
230
+ For comprehensive examples including content filtering, link footnotes, accessibility validation, and asynchronous URL validation, see the [Visitor Pattern Guide](../../examples/visitor-pattern/).
231
+
232
+
242
233
 
243
234
  ## Examples
244
235
 
245
- Comprehensive guides with real-world patterns (Ruby examples included):
236
+ - [Visitor Pattern Guide](../../examples/visitor-pattern/)
237
+ - [Metadata Extraction Guide](../../examples/metadata-extraction/)
238
+ - [Performance Guide](../../examples/performance/)
246
239
 
247
- - **[Visitor Pattern](../../examples/visitor-pattern/)** - Custom callbacks, element-by-element control, analytics, domain-specific markdown dialects
248
- - **[Metadata Extraction](../../examples/metadata-extraction/)** - SEO data, heading hierarchy, link classification, structured data parsing
249
- - **[Performance Guide](../../examples/performance/)** - Benchmarking, profiling, throughput optimization
240
+ ## Links
250
241
 
251
- ## Consistent Across Languages
242
+ - **GitHub:** [github.com/kreuzberg-dev/html-to-markdown](https://github.com/kreuzberg-dev/html-to-markdown)
252
243
 
253
- The Ruby gem shares the exact Rust core with:
244
+ - **RubyGems:** [rubygems.org/gems/html-to-markdown](https://rubygems.org/gems/html-to-markdown)
254
245
 
255
- - [Python wheels](https://pypi.org/project/html-to-markdown/)
256
- - [Node.js / Bun bindings](https://www.npmjs.com/package/html-to-markdown-node)
257
- - [WebAssembly package](https://www.npmjs.com/package/html-to-markdown-wasm)
258
- - [PHP extension](https://packagist.org/packages/goldziher/html-to-markdown)
259
- - The Rust crate and CLI
246
+ - **Kreuzberg Ecosystem:** [kreuzberg.dev](https://kreuzberg.dev)
247
+ - **Discord:** [discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
260
248
 
261
- Use whichever runtime fits your stack while keeping formatting behaviour identical.
249
+ ## Contributing
262
250
 
263
- ## Development
251
+ We welcome contributions! Please see our [Contributing Guide](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/CONTRIBUTING.md) for details on:
264
252
 
265
- ```bash
266
- bundle exec rake compile # build the native extension
267
- bundle exec rspec # run test suite
268
- ```
253
+ - Setting up the development environment
254
+ - Running tests locally
255
+ - Submitting pull requests
256
+ - Reporting issues
257
+
258
+ All contributions must follow our code quality standards (enforced via pre-commit hooks):
269
259
 
270
- When editing Rust code under `src/`, rerun `rake compile`.
260
+ - Proper test coverage (Rust 95%+, language bindings 80%+)
261
+ - Formatting and linting checks
262
+ - Documentation for public APIs
271
263
 
272
264
  ## License
273
265
 
274
- MIT © Na'aman Hirschfeld
266
+ MIT License – see [LICENSE](https://github.com/kreuzberg-dev/html-to-markdown/blob/main/LICENSE).
267
+
268
+ ## Support
269
+
270
+ If you find this library useful, consider [sponsoring the project](https://github.com/sponsors/kreuzberg-dev).
271
+
272
+ Have questions or run into issues? We're here to help:
273
+
274
+ - **GitHub Issues:** [github.com/kreuzberg-dev/html-to-markdown/issues](https://github.com/kreuzberg-dev/html-to-markdown/issues)
275
+ - **Discussions:** [github.com/kreuzberg-dev/html-to-markdown/discussions](https://github.com/kreuzberg-dev/html-to-markdown/discussions)
276
+ - **Discord Community:** [discord.gg/pXxagNK2zN](https://discord.gg/pXxagNK2zN)
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version = "2.18.0"
3
+ version = "2.19.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
6
6
  license = "MIT"
@@ -18,7 +18,7 @@ name = "html_to_markdown_rb"
18
18
  crate-type = ["cdylib", "rlib"]
19
19
 
20
20
  [dependencies]
21
- html-to-markdown-rs = { version = "2.16.1", features = ["inline-images", "visitor", "metadata"] }
21
+ html-to-markdown-rs = { version = "2.18.0", features = ["inline-images", "visitor", "metadata"] }
22
22
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = ["rb-sys"] }
23
23
 
24
24
  [target.'cfg(not(target_os = "windows"))'.dependencies]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.18.0'
4
+ VERSION = '2.19.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.18.0
4
+ version: 2.19.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-12-28 00:00:00.000000000 Z
11
+ date: 2025-12-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys