html-to-markdown 2.29.0-x86_64-linux → 3.0.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 49299fdb1105ea4dbec8393ac879dd0f8928064543f6c4ff743aa82e94d63f88
4
- data.tar.gz: 9896cb02971863cb1e99ba5e84bbba8419b16b0edf50496f23db7d6ba6138fc1
3
+ metadata.gz: 07d676f000540af84276c48d2b0e84768e9f4708098cdda3de3d999520e7e716
4
+ data.tar.gz: 3b7fbe10fc72c7af0965ded5f770e0cf6fec353e39497d029f80d7c77f6c7f24
5
5
  SHA512:
6
- metadata.gz: fb71dbe83523eff32aa73642276468f7f2fe5fc8707f339c39097ad9946b6da5924930e1515986dbb3c1b091b58e1f9e4bf71805bc9d2890bf8678b7d0ae08bf
7
- data.tar.gz: ed4c79f974c170565c46b7ef836d3f11e4e4826d8b9cc98dce76f7f9792a12d5090c377f00a981cb3b10a3020ca6cbe6cb103c343ca646a2ea66e56fbbec0199
6
+ metadata.gz: 207023e2ce048eb36df739aa1166af50b2086ea8c388016250d838230b8c7dba1b5be208540c9afb93277538a9551183a83370d46aa6f71b750073bb71a8cb91
7
+ data.tar.gz: 8801822711dc240f82151044a45248bcc3656080ddf3d922ec6391c4371d49ce801227b0c85cfb5a4488827af1474a8e84c650b576272d1ae6f6c97e0d9cc1ad
data/Gemfile.lock CHANGED
@@ -1,13 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.29.0)
4
+ html-to-markdown (3.0.0)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
8
8
  remote: https://rubygems.org/
9
9
  specs:
10
- activesupport (8.1.2)
10
+ activesupport (8.1.3)
11
11
  base64
12
12
  bigdecimal
13
13
  concurrent-ruby (~> 1.0, >= 1.3.1)
@@ -20,28 +20,20 @@ GEM
20
20
  securerandom (>= 0.3)
21
21
  tzinfo (~> 2.0, >= 2.0.5)
22
22
  uri (>= 0.13.1)
23
- addressable (2.8.9)
24
- public_suffix (>= 2.0.2, < 8.0)
25
23
  ast (2.4.3)
26
24
  base64 (0.3.0)
27
- bigdecimal (4.0.1)
25
+ bigdecimal (4.1.0)
28
26
  concurrent-ruby (1.3.6)
29
27
  connection_pool (3.0.2)
30
28
  csv (3.3.5)
31
29
  diff-lcs (1.6.2)
32
30
  drb (2.2.3)
33
- ffi (1.17.3-aarch64-linux-gnu)
34
- ffi (1.17.3-arm64-darwin)
35
- ffi (1.17.3-x64-mingw-ucrt)
36
- ffi (1.17.3-x86_64-darwin)
37
- ffi (1.17.3-x86_64-linux-gnu)
31
+ ffi (1.17.4-arm64-darwin)
32
+ ffi (1.17.4-x86_64-linux-gnu)
38
33
  fileutils (1.8.0)
39
34
  i18n (1.14.8)
40
35
  concurrent-ruby (~> 1.0)
41
- json (2.19.2)
42
- json-schema (6.2.0)
43
- addressable (~> 2.8)
44
- bigdecimal (>= 3.1, < 5)
36
+ json (2.19.3)
45
37
  language_server-protocol (3.17.0.5)
46
38
  lint_roller (1.1.0)
47
39
  listen (3.10.0)
@@ -49,18 +41,15 @@ GEM
49
41
  rb-fsevent (~> 0.10, >= 0.10.3)
50
42
  rb-inotify (~> 0.9, >= 0.9.10)
51
43
  logger (1.7.0)
52
- mcp (0.9.0)
53
- json-schema (>= 4.1)
54
44
  minitest (6.0.2)
55
45
  drb (~> 2.0)
56
46
  prism (~> 1.5)
57
47
  mutex_m (0.3.0)
58
48
  parallel (1.27.0)
59
- parser (3.3.10.2)
49
+ parser (3.3.11.1)
60
50
  ast (~> 2.4.1)
61
51
  racc
62
52
  prism (1.9.0)
63
- public_suffix (7.0.5)
64
53
  racc (1.8.1)
65
54
  rainbow (3.1.1)
66
55
  rake (13.3.1)
@@ -72,7 +61,7 @@ GEM
72
61
  ffi (~> 1.0)
73
62
  rb_sys (0.9.124)
74
63
  rake-compiler-dock (= 1.11.0)
75
- rbs (3.10.3)
64
+ rbs (3.10.4)
76
65
  logger
77
66
  tsort
78
67
  regexp_parser (2.11.3)
@@ -89,11 +78,10 @@ GEM
89
78
  diff-lcs (>= 1.2.0, < 2.0)
90
79
  rspec-support (~> 3.13.0)
91
80
  rspec-support (3.13.7)
92
- rubocop (1.85.1)
81
+ rubocop (1.86.0)
93
82
  json (~> 2.3)
94
83
  language_server-protocol (~> 3.17.0.2)
95
84
  lint_roller (~> 1.1.0)
96
- mcp (~> 0.6)
97
85
  parallel (~> 1.10)
98
86
  parser (>= 3.3.0.2)
99
87
  rainbow (>= 2.2.2, < 4.0)
@@ -138,12 +126,8 @@ GEM
138
126
  uri (1.1.1)
139
127
 
140
128
  PLATFORMS
141
- aarch64-linux
142
129
  arm64-darwin
143
- x64-mingw-ucrt
144
- x86_64-darwin
145
130
  x86_64-linux
146
- x86_64-linux-gnu
147
131
 
148
132
  DEPENDENCIES
149
133
  html-to-markdown!
@@ -156,37 +140,30 @@ DEPENDENCIES
156
140
  steep
157
141
 
158
142
  CHECKSUMS
159
- activesupport (8.1.2) sha256=88842578ccd0d40f658289b0e8c842acfe9af751afee2e0744a7873f50b6fdae
160
- addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
143
+ activesupport (8.1.3) sha256=21a5e0dfbd4c3ddd9e1317ec6a4d782fa226e7867dc70b0743acda81a1dca20e
161
144
  ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
162
145
  base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
163
- bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
146
+ bigdecimal (4.1.0) sha256=6dc07767aa3dc456ccd48e7ae70a07b474e9afd7c5bc576f80bd6da5c8dd6cae
164
147
  concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
165
148
  connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a
166
149
  csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
167
150
  diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
168
151
  drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
169
- ffi (1.17.3-aarch64-linux-gnu) sha256=28ad573df26560f0aedd8a90c3371279a0b2bd0b4e834b16a2baa10bd7a97068
170
- ffi (1.17.3-arm64-darwin) sha256=0c690555d4cee17a7f07c04d59df39b2fba74ec440b19da1f685c6579bb0717f
171
- ffi (1.17.3-x64-mingw-ucrt) sha256=5f1d7d067a9a1058ad183dba25b05557cd51c85fc1768c49338eabc1cf242d7c
172
- ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
173
- ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
152
+ ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
153
+ ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
174
154
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
175
- html-to-markdown (2.29.0)
155
+ html-to-markdown (3.0.0)
176
156
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
177
- json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
178
- json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
157
+ json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
179
158
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
180
159
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
181
160
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
182
161
  logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
183
- mcp (0.9.0) sha256=a0a3737b0ac9df0772f4ef7e2b013c260ddbcf217a5d50a66bff0baeddf03e47
184
162
  minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
185
163
  mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
186
164
  parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
187
- parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
165
+ parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
188
166
  prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
189
- public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
190
167
  racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
191
168
  rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
192
169
  rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
@@ -195,14 +172,14 @@ CHECKSUMS
195
172
  rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
196
173
  rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
197
174
  rb_sys (0.9.124) sha256=513476557b12eaf73764b3da9f8746024558fe8699bda785fb548c9aa3877ae7
198
- rbs (3.10.3) sha256=70627f3919016134d554e6c99195552ae3ef6020fe034c8e983facc9c192daa6
175
+ rbs (3.10.4) sha256=b17d7c4be4bb31a11a3b529830f0aa206a807ca42f2e7921a3027dfc6b7e5ce8
199
176
  regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
200
177
  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
201
178
  rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
202
179
  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
203
180
  rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
204
181
  rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
205
- rubocop (1.85.1) sha256=3dbcf9e961baa4c376eeeb2a03913dca5e3987033b04d38fa538aa1e7406cc77
182
+ rubocop (1.86.0) sha256=4ff1186fe16ebe9baff5e7aad66bb0ad4cabf5cdcd419f773146dbba2565d186
206
183
  rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
207
184
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
208
185
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
data/README.md CHANGED
@@ -17,8 +17,8 @@
17
17
  <a href="https://central.sonatype.com/artifact/dev.kreuzberg/html-to-markdown">
18
18
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
19
  </a>
20
- <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
21
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v2.29.0" alt="Go">
20
+ <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
21
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.0" alt="Go">
22
22
  </a>
23
23
  <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
24
  <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
@@ -87,7 +87,6 @@ Apple M4 • Real Wikipedia documents • `convert()` (Ruby)
87
87
  | Mixed (Python wiki) | 656KB | 4.89ms | 134 MB/s |
88
88
 
89
89
 
90
- See [Performance Guide](../../examples/performance/) for detailed benchmarks.
91
90
 
92
91
 
93
92
  ## Quick Start
@@ -98,7 +97,8 @@ Basic conversion:
98
97
  require 'html_to_markdown'
99
98
 
100
99
  html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
101
- markdown = HtmlToMarkdown.convert(html)
100
+ result = HtmlToMarkdown.convert(html)
101
+ markdown = result[:content]
102
102
  ```
103
103
 
104
104
 
@@ -109,60 +109,50 @@ With conversion options:
109
109
  require 'html_to_markdown'
110
110
 
111
111
  html = "<h1>Hello</h1><p>This is <strong>fast</strong>!</p>"
112
- markdown = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
112
+ result = HtmlToMarkdown.convert(html, heading_style: :atx, code_block_style: :fenced)
113
+ markdown = result[:content]
113
114
  ```
114
115
 
115
116
 
116
117
 
117
118
 
118
-
119
-
120
119
  ## API Reference
121
120
 
122
- ### Core Functions
123
-
124
-
125
- **`convert(html, options: nil) -> String`**
121
+ ### Core Function
126
122
 
127
- Basic HTML-to-Markdown conversion. Fast and simple.
128
123
 
129
- **`convert_with_metadata(html, options: nil, config: nil) -> [String, Hash]`**
124
+ **`convert(html, options: nil, visitor: nil) -> ConversionResult`**
130
125
 
131
- Extract Markdown plus metadata (headers, links, images, structured data) in a single pass. See [Metadata Extraction Guide](../../examples/metadata-extraction/).
126
+ Converts HTML to Markdown. Returns a `ConversionResult` hash with all results in a single call.
132
127
 
133
- **`convert_with_visitor(html, visitor:, options: nil) -> String`**
134
-
135
- Customize conversion with visitor callbacks for element interception. See [Visitor Pattern Guide](../../examples/visitor-pattern/).
136
-
137
- **`convert_with_inline_images(html, config: nil) -> [String, Array, Array]`**
138
-
139
- Extract base64-encoded inline images with metadata.
140
-
141
- **`convert_with_tables(html, options: nil, config: nil) -> ConversionWithTables`**
128
+ ```ruby
129
+ require 'html_to_markdown'
142
130
 
143
- Extract structured table data (cells, headers, rendered markdown) alongside conversion.
131
+ result = HtmlToMarkdown.convert(html)
132
+ markdown = result[:content] # Converted Markdown string
133
+ metadata = result[:metadata] # Metadata (when extract_metadata: true)
134
+ tables = result[:tables] # Structured table data (when extract_tables: true)
135
+ document = result[:document] # Document-level info
136
+ images = result[:images] # Extracted images
137
+ warnings = result[:warnings] # Any conversion warnings
138
+ ```
144
139
 
145
140
 
146
141
 
147
142
  ### Options
148
143
 
149
144
  **`ConversionOptions`** – Key configuration fields:
145
+
150
146
  - `heading_style`: Heading format (`"underlined"` | `"atx"` | `"atx_closed"`) — default: `"underlined"`
151
147
  - `list_indent_width`: Spaces per indent level — default: `2`
152
148
  - `bullets`: Bullet characters cycle — default: `"*+-"`
153
149
  - `wrap`: Enable text wrapping — default: `false`
154
150
  - `wrap_width`: Wrap at column — default: `80`
155
151
  - `code_language`: Default fenced code block language — default: none
156
- - `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
152
+ - `extract_metadata`: Enable metadata extraction into `result[:metadata]` — default: `false`
153
+ - `extract_tables`: Enable structured table extraction into `result[:tables]` — default: `false`
157
154
  - `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
158
155
 
159
- **`MetadataConfig`** – Selective metadata extraction:
160
- - `extract_headers`: h1-h6 elements — default: `true`
161
- - `extract_links`: Hyperlinks — default: `true`
162
- - `extract_images`: Image elements — default: `true`
163
- - `extract_structured_data`: JSON-LD, Microdata, RDFa — default: `true`
164
- - `max_structured_data_size`: Size limit in bytes — default: `100KB`
165
-
166
156
 
167
157
  ## Djot Output Format
168
158
 
@@ -222,16 +212,17 @@ Plain text mode is useful for search indexing, text extraction, and feeding cont
222
212
 
223
213
  ## Metadata Extraction
224
214
 
225
- The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass.
215
+ The metadata extraction feature enables comprehensive document analysis during conversion. Extract document properties, headers, links, images, and structured data in a single pass — all via the standard `convert()` function.
226
216
 
227
217
  **Use Cases:**
218
+
228
219
  - **SEO analysis** – Extract title, description, Open Graph tags, Twitter cards
229
220
  - **Table of contents generation** – Build structured outlines from heading hierarchy
230
221
  - **Content migration** – Document all external links and resources
231
222
  - **Accessibility audits** – Check for images without alt text, empty links, invalid heading hierarchy
232
223
  - **Link validation** – Classify and validate anchor, internal, external, email, and phone links
233
224
 
234
- **Zero Overhead When Disabled:** Metadata extraction adds negligible overhead and happens during the HTML parsing pass. Disable unused metadata types in `MetadataConfig` to optimize further.
225
+ **Zero Overhead When Disabled:** Metadata extraction is off by default. When enabled, it adds negligible overhead and happens during the HTML parsing pass. Pass `extract_metadata: true` in `ConversionOptions` to enable it; the result is available at `result[:metadata]`.
235
226
 
236
227
  ### Example: Quick Start
237
228
 
@@ -240,27 +231,27 @@ The metadata extraction feature enables comprehensive document analysis during c
240
231
  require 'html_to_markdown'
241
232
 
242
233
  html = '<h1>Article</h1><img src="test.jpg" alt="test">'
243
- markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
244
-
245
- puts metadata[:document][:title] # Document title
246
- puts metadata[:headers] # All h1-h6 elements
247
- puts metadata[:links] # All hyperlinks
248
- puts metadata[:images] # All images with alt text
249
- puts metadata[:structured_data] # JSON-LD, Microdata, RDFa
234
+ result = HtmlToMarkdown.convert(html, extract_metadata: true)
235
+
236
+ puts result[:content] # Converted Markdown
237
+ puts result[:metadata][:document][:title] # Document title
238
+ puts result[:metadata][:headers] # All h1-h6 elements
239
+ puts result[:metadata][:links] # All hyperlinks
240
+ puts result[:metadata][:images] # All images with alt text
241
+ puts result[:metadata][:structured_data] # JSON-LD, Microdata, RDFa
250
242
  ```
251
243
 
252
244
 
253
245
 
254
- For detailed examples including SEO extraction, table-of-contents generation, link validation, and accessibility audits, see the [Metadata Extraction Guide](../../examples/metadata-extraction/).
255
-
256
246
 
257
247
 
258
248
 
259
249
  ## Visitor Pattern
260
250
 
261
- The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal. Use visitors to transform content, filter elements, validate structure, or collect analytics.
251
+ The visitor pattern enables custom HTML→Markdown conversion logic by providing callbacks for specific HTML elements during traversal. Pass a visitor to `convert()` using the `visitor:` keyword.
262
252
 
263
253
  **Use Cases:**
254
+
264
255
  - **Custom Markdown dialects** – Convert to Obsidian, Notion, or other flavors
265
256
  - **Content filtering** – Remove tracking pixels, ads, or unwanted elements
266
257
  - **URL rewriting** – Rewrite CDN URLs, add query parameters, validate links
@@ -291,20 +282,16 @@ class MyVisitor
291
282
  end
292
283
 
293
284
  html = '<a href="https://old-cdn.com/file.pdf">Download</a>'
294
- markdown = HtmlToMarkdown.convert_with_visitor(html, visitor: MyVisitor.new)
285
+ result = HtmlToMarkdown.convert(html, visitor: MyVisitor.new)
286
+ markdown = result[:content]
295
287
  ```
296
288
 
297
289
 
298
290
 
299
- For comprehensive examples including content filtering, link footnotes, accessibility validation, and asynchronous URL validation, see the [Visitor Pattern Guide](../../examples/visitor-pattern/).
300
-
301
291
 
302
292
 
303
293
  ## Examples
304
294
 
305
- - [Visitor Pattern Guide](../../examples/visitor-pattern/)
306
- - [Metadata Extraction Guide](../../examples/metadata-extraction/)
307
- - [Performance Guide](../../examples/performance/)
308
295
 
309
296
  ## Links
310
297
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.29.0'
4
+ VERSION = '3.0.0'
5
5
  end
@@ -7,205 +7,24 @@ module HtmlToMarkdown
7
7
  autoload :CLI, 'html_to_markdown/cli'
8
8
  autoload :CLIProxy, 'html_to_markdown/cli_proxy'
9
9
 
10
- class Options; end # rubocop:disable Lint/EmptyClass
11
-
12
10
  class << self
13
11
  alias native_convert convert
14
- alias native_convert_with_inline_images convert_with_inline_images
15
- alias native_convert_with_inline_images_handle convert_with_inline_images_handle
16
- alias native_options options
17
- alias native_convert_with_options convert_with_options
18
- alias native_convert_with_metadata convert_with_metadata
19
- alias native_convert_with_metadata_handle convert_with_metadata_handle
20
- alias native_convert_with_visitor convert_with_visitor
21
- alias native_convert_with_tables convert_with_tables
22
12
  end
23
13
 
24
14
  module_function
25
15
 
26
- def convert(html, options = nil, visitor = nil)
27
- if visitor
28
- native_convert_with_visitor(html.to_s, options, visitor)
29
- else
30
- native_convert(html.to_s, options)
31
- end
32
- end
33
-
34
- def convert_with_options(html, options_handle)
35
- native_convert_with_options(html.to_s, options_handle)
36
- end
37
-
38
- def convert_with_inline_images(html, options = nil, image_config = nil, _visitor = nil)
39
- # NOTE: visitor parameter is accepted for API compatibility but not used in inline images mode
40
- # The visitor pattern is only supported in the standard convert() method
41
- native_convert_with_inline_images(html.to_s, options, image_config)
42
- end
43
-
44
- def convert_with_inline_images_handle(html, options_handle, image_config = nil)
45
- native_convert_with_inline_images_handle(html.to_s, options_handle, image_config)
46
- end
47
-
48
- def options(options_hash = nil)
49
- native_options(options_hash)
50
- end
51
-
52
- # Convert HTML to Markdown with comprehensive metadata extraction.
53
- #
54
- # Performs HTML-to-Markdown conversion while extracting document metadata, headers,
55
- # links, images, and structured data in a single pass. Ideal for content analysis,
56
- # SEO workflows, and document indexing.
57
- #
58
- # @param html [String] HTML string to convert. Line endings are normalized (CRLF -> LF).
59
- # @param options [ConversionOptions, Hash, nil] Optional conversion configuration.
60
- # When a Hash, keys should match ConversionOptions field names (as symbols or strings).
61
- # Common options:
62
- # - :heading_style [String] "atx", "atx_closed", or "underlined" (default: "underlined")
63
- # - :list_indent_type [String] "spaces" or "tabs" (default: "spaces")
64
- # - :list_indent_width [Integer] Spaces per indent level (default: 4)
65
- # - :wrap [true, false] Enable text wrapping (default: false)
66
- # - :wrap_width [Integer] Wrap at this column width (default: 80)
67
- # See ConversionOptions documentation for complete list.
68
- #
69
- # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
70
- # Keys should be symbols or strings. Supported keys:
71
- # - :extract_headers [true, false] Extract h1-h6 heading elements (default: true)
72
- # - :extract_links [true, false] Extract hyperlinks with type classification (default: true)
73
- # - :extract_images [true, false] Extract image elements (default: true)
74
- # - :extract_structured_data [true, false] Extract JSON-LD/Microdata/RDFa (default: true)
75
- # - :max_structured_data_size [Integer] Size limit for structured data in bytes (default: 1_000_000)
76
- #
77
- # @return [Array<String, Hash>] Tuple of [markdown_string, metadata_hash]
78
- # markdown_string: String - The converted Markdown output
79
- #
80
- # metadata_hash: Hash with keys:
81
- # - :document [Hash] Document-level metadata:
82
- # - :title [String, nil] From <title> tag
83
- # - :description [String, nil] From <meta name="description">
84
- # - :keywords [Array<String>] From <meta name="keywords">
85
- # - :author [String, nil] From <meta name="author">
86
- # - :language [String, nil] From lang attribute (e.g., "en")
87
- # - :text_direction [String, nil] "ltr", "rtl", or "auto"
88
- # - :canonical_url [String, nil] From <link rel="canonical">
89
- # - :base_href [String, nil] From <base href="">
90
- # - :open_graph [Hash<String, String>] Open Graph properties (og:* meta tags)
91
- # - :twitter_card [Hash<String, String>] Twitter Card properties (twitter:* meta tags)
92
- # - :meta_tags [Hash<String, String>] Other meta tags
93
- #
94
- # - :headers [Array<Hash>] Heading elements:
95
- # - :level [Integer] 1-6
96
- # - :text [String] Header text content
97
- # - :id [String, nil] HTML id attribute
98
- # - :depth [Integer] Tree nesting depth
99
- # - :html_offset [Integer] Byte offset in original HTML
100
- #
101
- # - :links [Array<Hash>] Hyperlinks:
102
- # - :href [String] Link URL
103
- # - :text [String] Link text content
104
- # - :title [String, nil] Title attribute
105
- # - :link_type [String] "anchor", "internal", "external", "email", "phone", or "other"
106
- # - :rel [Array<String>] Rel attribute values
107
- # - :attributes [Hash<String, String>] Additional HTML attributes
108
- #
109
- # - :images [Array<Hash>] Image elements:
110
- # - :src [String] Image source URL or data URI
111
- # - :alt [String, nil] Alt text for accessibility
112
- # - :title [String, nil] Title attribute
113
- # - :dimensions [Array<Integer>, nil] [width, height] if available
114
- # - :image_type [String] "data_uri", "external", "relative", or "inline_svg"
115
- # - :attributes [Hash<String, String>] Additional HTML attributes
116
- #
117
- # - :structured_data [Array<Hash>] Structured data blocks:
118
- # - :data_type [String] "json_ld", "microdata", or "rdfa"
119
- # - :raw_json [String] Raw JSON content
120
- # - :schema_type [String, nil] Schema type (e.g., "Article", "Event")
121
- #
122
- # @raise [StandardError] If conversion fails or invalid configuration
123
- #
124
- # @example Basic usage
125
- # html = <<~HTML
126
- # <html lang="en">
127
- # <head>
128
- # <title>My Article</title>
129
- # <meta name="description" content="A great read">
130
- # </head>
131
- # <body>
132
- # <h1 id="intro">Introduction</h1>
133
- # <p>Visit <a href="https://example.com">our site</a></p>
134
- # <img src="photo.jpg" alt="Beautiful landscape">
135
- # </body>
136
- # </html>
137
- # HTML
138
- #
139
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html)
140
- #
141
- # puts metadata[:document][:title] # => "My Article"
142
- # puts metadata[:document][:language] # => "en"
143
- # puts metadata[:headers].length # => 1
144
- # puts metadata[:headers][0][:text] # => "Introduction"
145
- # puts metadata[:links].length # => 1
146
- # puts metadata[:images].length # => 1
147
- #
148
- # @example With selective metadata extraction
149
- # config = {
150
- # extract_headers: true,
151
- # extract_links: true,
152
- # extract_images: false, # Skip images
153
- # extract_structured_data: false # Skip structured data
154
- # }
155
- #
156
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, nil, config)
157
- # puts metadata[:images].empty? # => true (not extracted)
158
- #
159
- # @example With conversion options
160
- # options = {
161
- # heading_style: "atx", # Use # H1, ## H2 style
162
- # wrap: true,
163
- # wrap_width: 80
164
- # }
165
- #
166
- # config = { extract_headers: true }
167
- #
168
- # markdown, metadata = HtmlToMarkdown.convert_with_metadata(html, options, config)
169
- # # Markdown uses ATX-style headings and wraps at 80 characters
170
- #
171
- # @see #convert Simple conversion without metadata
172
- # @see #convert_with_inline_images Extract inline images during conversion
173
- # @see ConversionOptions Detailed conversion configuration
174
- def convert_with_metadata(html, options = nil, metadata_config = nil, _visitor = nil)
175
- # NOTE: visitor parameter is accepted for API compatibility but not used in metadata extraction mode
176
- # The visitor pattern is only supported in the standard convert() method
177
- native_convert_with_metadata(html.to_s, options, metadata_config)
178
- end
179
-
180
- def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
181
- native_convert_with_metadata_handle(html.to_s, options_handle, metadata_config)
182
- end
183
-
184
- # Convert HTML to Markdown with table extraction.
185
- #
186
- # Performs HTML-to-Markdown conversion while extracting structured table data
187
- # (cells, markdown representation, header row flags) in a single pass.
188
- #
189
- # @param html [String] HTML string to convert.
190
- # @param options [Hash, nil] Optional conversion configuration.
191
- # @param metadata_config [Hash, nil] Optional metadata extraction configuration.
192
- #
193
- # @return [Hash] A hash with keys:
194
- # - :content [String] The converted Markdown output
195
- # - :metadata [Hash, nil] Extended metadata (if metadata extraction was configured)
196
- # - :tables [Array<Hash>] Extracted tables, each with:
197
- # - :cells [Array<Array<String>>] Table cells organized as rows x columns
198
- # - :markdown [String] Complete rendered table in Markdown format
199
- # - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
200
- #
201
- # @raise [StandardError] If conversion fails or invalid configuration
202
- #
203
- # @example Basic usage
204
- # html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
205
- # result = HtmlToMarkdown.convert_with_tables(html)
206
- # puts result[:tables].length # => 1
207
- # puts result[:tables][0][:cells] # => [["Name"], ["Alice"]]
208
- def convert_with_tables(html, options = nil, metadata_config = nil)
209
- native_convert_with_tables(html.to_s, options, metadata_config)
16
+ # Convert HTML to Markdown, returning a Hash with:
17
+ # - :content [String, nil] the converted Markdown output
18
+ # - :document [nil] document structure (not yet exposed)
19
+ # - :metadata [Hash, nil] extracted HTML metadata
20
+ # - :tables [Array<Hash>] extracted tables with :grid and :markdown
21
+ # - :images [Array<Hash>] extracted inline images
22
+ # - :warnings [Array<Hash>] processing warnings
23
+ #
24
+ # @param html [String] HTML string to convert
25
+ # @param options [Hash, nil] optional conversion options
26
+ # @return [Hash] conversion result
27
+ def convert(html, options = nil)
28
+ native_convert(html.to_s, options)
210
29
  end
211
30
  end
Binary file