html-to-markdown 2.27.3 → 2.28.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -29,7 +29,7 @@ homepage = "https://github.com/uuid-rs/uuid"
29
29
  name = "uuid"
30
30
  readme = "README.md"
31
31
  repository = "https://github.com/uuid-rs/uuid"
32
- version = "1.21.0" # remember to update html_root_url in lib.rs
32
+ version = "1.22.0" # remember to update html_root_url in lib.rs
33
33
  rust-version = "1.85.0"
34
34
 
35
35
  [package.metadata.docs.rs]
@@ -86,7 +86,7 @@ borsh = ["dep:borsh", "dep:borsh-derive"]
86
86
 
87
87
  # Public: Used in trait impls on `Uuid`
88
88
  [dependencies.bytemuck]
89
- version = "1.21.0"
89
+ version = "1.22.0"
90
90
  optional = true
91
91
  features = ["derive"]
92
92
 
@@ -140,13 +140,13 @@ optional = true
140
140
  [target.'cfg(all(target_arch = "wasm32", any(target_os = "unknown", target_os = "none")))'.dependencies.uuid-rng-internal-lib]
141
141
  # Work-around lack of support for both `dep:x` and `x/` in MSRV
142
142
  package = "uuid-rng-internal"
143
- version = "1.21.0"
143
+ version = "1.22.0"
144
144
  path = "rng"
145
145
  optional = true
146
146
 
147
147
  # Private
148
148
  [target.'cfg(not(all(target_arch = "wasm32", any(target_os = "unknown", target_os = "none"))))'.dependencies.rand]
149
- version = "0.9"
149
+ version = "0.10"
150
150
  optional = true
151
151
 
152
152
  # Private
@@ -28,7 +28,7 @@ Add the following to your `Cargo.toml`:
28
28
 
29
29
  ```toml
30
30
  [dependencies.uuid]
31
- version = "1.21.0"
31
+ version = "1.22.0"
32
32
  # Lets you generate random UUIDs
33
33
  features = [
34
34
  "v4",
@@ -64,11 +64,11 @@ assert_eq!(Some(Version::Random), my_uuid.get_version());
64
64
  If you'd like to parse UUIDs _really_ fast, check out the [`uuid-simd`](https://github.com/nugine/uuid-simd)
65
65
  library.
66
66
 
67
- For more details on using `uuid`, [see the library documentation](https://docs.rs/uuid/1.21.0/uuid).
67
+ For more details on using `uuid`, [see the library documentation](https://docs.rs/uuid/1.22.0/uuid).
68
68
 
69
69
  ## References
70
70
 
71
- * [`uuid` library docs](https://docs.rs/uuid/1.21.0/uuid).
71
+ * [`uuid` library docs](https://docs.rs/uuid/1.22.0/uuid).
72
72
  * [Wikipedia: Universally Unique Identifier](http://en.wikipedia.org/wiki/Universally_unique_identifier).
73
73
  * [RFC 9562: Universally Unique IDentifiers (UUID)](https://www.ietf.org/rfc/rfc9562.html).
74
74
 
@@ -38,7 +38,7 @@
38
38
  //!
39
39
  //! ```toml
40
40
  //! [dependencies.uuid]
41
- //! version = "1.21.0"
41
+ //! version = "1.22.0"
42
42
  //! # Lets you generate random UUIDs
43
43
  //! features = [
44
44
  //! "v4",
@@ -138,7 +138,7 @@
138
138
  //!
139
139
  //! ```toml
140
140
  //! [dependencies.uuid]
141
- //! version = "1.21.0"
141
+ //! version = "1.22.0"
142
142
  //! features = [
143
143
  //! "v4",
144
144
  //! "v7",
@@ -153,7 +153,7 @@
153
153
  //!
154
154
  //! ```toml
155
155
  //! [dependencies.uuid]
156
- //! version = "1.21.0"
156
+ //! version = "1.22.0"
157
157
  //! default-features = false
158
158
  //! ```
159
159
  //!
@@ -211,7 +211,7 @@
211
211
  #![doc(
212
212
  html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
213
213
  html_favicon_url = "https://www.rust-lang.org/favicon.ico",
214
- html_root_url = "https://docs.rs/uuid/1.21.0"
214
+ html_root_url = "https://docs.rs/uuid/1.22.0"
215
215
  )]
216
216
 
217
217
  #[cfg(any(feature = "std", test))]
@@ -185,6 +185,18 @@ module HtmlToMarkdown
185
185
  structured_data: Array[structured_data]
186
186
  }
187
187
 
188
+ type table_data = {
189
+ cells: Array[Array[String]],
190
+ markdown: String,
191
+ is_header_row: Array[bool]
192
+ }
193
+
194
+ type table_extraction_result = {
195
+ content: String,
196
+ metadata: extended_metadata?,
197
+ tables: Array[table_data]
198
+ }
199
+
188
200
  # Native methods (implemented in Rust via Magnus/rb-sys)
189
201
  # These are aliased from the Rust extension and available as both module and instance methods
190
202
  private
@@ -217,6 +229,11 @@ module HtmlToMarkdown
217
229
  conversion_options? options,
218
230
  visitor? visitor
219
231
  ) -> String
232
+ def self.native_convert_with_tables: (
233
+ String html,
234
+ conversion_options? options,
235
+ metadata_config? metadata_config
236
+ ) -> table_extraction_result
220
237
 
221
238
  def native_convert: (String html, conversion_options? options) -> String
222
239
  def native_options: (conversion_options? options_hash) -> Options
@@ -246,6 +263,11 @@ module HtmlToMarkdown
246
263
  conversion_options? options,
247
264
  visitor? visitor
248
265
  ) -> String
266
+ def native_convert_with_tables: (
267
+ String html,
268
+ conversion_options? options,
269
+ metadata_config? metadata_config
270
+ ) -> table_extraction_result
249
271
 
250
272
  # Visitor interface for customizing conversion behavior
251
273
  type visitor = Object
@@ -422,6 +444,25 @@ module HtmlToMarkdown
422
444
  ?metadata_config metadata_config
423
445
  ) -> [String, extended_metadata]
424
446
 
447
+ # Convert HTML and extract tables as structured data
448
+ #
449
+ # Args:
450
+ # html: HTML string to convert
451
+ # options: Optional conversion configuration
452
+ # metadata_config: Optional metadata extraction configuration
453
+ #
454
+ # Returns:
455
+ # table_extraction_result: Hash containing content, metadata, and tables array
456
+ #
457
+ # Example:
458
+ # result = HtmlToMarkdown.convert_with_tables(html)
459
+ # puts result[:tables].length
460
+ def self.convert_with_tables: (
461
+ String html,
462
+ ?conversion_options options,
463
+ ?metadata_config metadata_config
464
+ ) -> table_extraction_result
465
+
425
466
  # Instance method versions (created by module_function)
426
467
  def convert: (String html, ?conversion_options options, ?visitor visitor) -> String
427
468
  def options: (?conversion_options options_hash) -> Options
@@ -449,4 +490,9 @@ module HtmlToMarkdown
449
490
  Options options_handle,
450
491
  ?metadata_config metadata_config
451
492
  ) -> [String, extended_metadata]
493
+ def convert_with_tables: (
494
+ String html,
495
+ ?conversion_options options,
496
+ ?metadata_config metadata_config
497
+ ) -> table_extraction_result
452
498
  end
@@ -0,0 +1,194 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe HtmlToMarkdown do
6
+ describe '.convert_with_tables' do
7
+ it 'returns a hash with content, metadata, and tables keys' do
8
+ html = '<table><tr><td>Cell</td></tr></table>'
9
+ result = described_class.convert_with_tables(html)
10
+
11
+ expect(result).to be_a(Hash)
12
+ expect(result).to include(:content, :metadata, :tables)
13
+ end
14
+
15
+ context 'with a basic table with header' do
16
+ let(:html) do
17
+ <<~HTML
18
+ <table>
19
+ <thead>
20
+ <tr><th>Name</th><th>Age</th></tr>
21
+ </thead>
22
+ <tbody>
23
+ <tr><td>Alice</td><td>30</td></tr>
24
+ </tbody>
25
+ </table>
26
+ HTML
27
+ end
28
+
29
+ it 'extracts exactly one table' do
30
+ result = described_class.convert_with_tables(html)
31
+
32
+ expect(result[:tables].length).to eq(1)
33
+ end
34
+
35
+ it 'extracts cells as rows of columns' do
36
+ result = described_class.convert_with_tables(html)
37
+ table = result[:tables][0]
38
+
39
+ expect(table[:cells]).to be_an(Array)
40
+ expect(table[:cells].length).to eq(2)
41
+ expect(table[:cells][0]).to eq(%w[Name Age])
42
+ expect(table[:cells][1]).to eq(%w[Alice 30])
43
+ end
44
+
45
+ it 'provides markdown representation' do
46
+ result = described_class.convert_with_tables(html)
47
+ table = result[:tables][0]
48
+
49
+ expect(table[:markdown]).to be_a(String)
50
+ expect(table[:markdown]).to include('Name')
51
+ expect(table[:markdown]).to include('Alice')
52
+ end
53
+
54
+ it 'marks header rows correctly' do
55
+ result = described_class.convert_with_tables(html)
56
+ table = result[:tables][0]
57
+
58
+ expect(table[:is_header_row]).to be_an(Array)
59
+ expect(table[:is_header_row].length).to eq(2)
60
+ expect(table[:is_header_row][0]).to be true
61
+ expect(table[:is_header_row][1]).to be false
62
+ end
63
+
64
+ it 'includes converted markdown content' do
65
+ result = described_class.convert_with_tables(html)
66
+
67
+ expect(result[:content]).to be_a(String)
68
+ expect(result[:content]).not_to be_empty
69
+ end
70
+ end
71
+
72
+ context 'with empty HTML' do
73
+ it 'returns empty tables array' do
74
+ result = described_class.convert_with_tables('')
75
+
76
+ expect(result[:tables]).to eq([])
77
+ expect(result[:content]).to be_a(String)
78
+ end
79
+ end
80
+
81
+ context 'with HTML containing no tables' do
82
+ it 'returns empty tables array' do
83
+ html = '<p>No tables here</p>'
84
+ result = described_class.convert_with_tables(html)
85
+
86
+ expect(result[:tables]).to eq([])
87
+ expect(result[:content]).to include('No tables here')
88
+ end
89
+ end
90
+
91
+ context 'with multiple tables' do
92
+ let(:html) do
93
+ <<~HTML
94
+ <table>
95
+ <tr><th>A</th></tr>
96
+ <tr><td>1</td></tr>
97
+ </table>
98
+ <p>Some text between tables</p>
99
+ <table>
100
+ <tr><th>B</th><th>C</th></tr>
101
+ <tr><td>2</td><td>3</td></tr>
102
+ <tr><td>4</td><td>5</td></tr>
103
+ </table>
104
+ HTML
105
+ end
106
+
107
+ it 'extracts all tables' do
108
+ result = described_class.convert_with_tables(html)
109
+
110
+ expect(result[:tables].length).to eq(2)
111
+ end
112
+
113
+ it 'preserves table order' do
114
+ result = described_class.convert_with_tables(html)
115
+
116
+ first_table = result[:tables][0]
117
+ second_table = result[:tables][1]
118
+
119
+ expect(first_table[:cells][0]).to eq(['A'])
120
+ expect(second_table[:cells][0]).to eq(%w[B C])
121
+ end
122
+
123
+ it 'extracts correct row counts per table' do
124
+ result = described_class.convert_with_tables(html)
125
+
126
+ expect(result[:tables][0][:cells].length).to eq(2)
127
+ expect(result[:tables][1][:cells].length).to eq(3)
128
+ end
129
+ end
130
+
131
+ context 'with special characters in cells' do
132
+ let(:html) do
133
+ <<~HTML
134
+ <table>
135
+ <tr><th>Key</th><th>Value</th></tr>
136
+ <tr><td>Brackets &lt;&gt;</td><td>Ampersand &amp;</td></tr>
137
+ <tr><td>Quotes "double"</td><td>Quotes 'single'</td></tr>
138
+ <tr><td>Unicode: cafe\u0301</td><td>Emoji: test</td></tr>
139
+ </table>
140
+ HTML
141
+ end
142
+
143
+ it 'handles HTML entities in cells' do
144
+ result = described_class.convert_with_tables(html)
145
+ table = result[:tables][0]
146
+
147
+ expect(table[:cells][1][0]).to include('<>')
148
+ expect(table[:cells][1][1]).to include('&')
149
+ end
150
+
151
+ it 'handles quotes in cells' do
152
+ result = described_class.convert_with_tables(html)
153
+ table = result[:tables][0]
154
+
155
+ expect(table[:cells][2][0]).to include('"double"')
156
+ expect(table[:cells][2][1]).to include("'single'")
157
+ end
158
+
159
+ it 'handles unicode in cells' do
160
+ result = described_class.convert_with_tables(html)
161
+ table = result[:tables][0]
162
+
163
+ expect(table[:cells][3][0]).to be_a(String)
164
+ end
165
+ end
166
+
167
+ context 'with conversion options' do
168
+ it 'accepts options hash' do
169
+ html = '<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>'
170
+ result = described_class.convert_with_tables(html, { heading_style: :atx })
171
+
172
+ expect(result).to be_a(Hash)
173
+ expect(result[:tables].length).to eq(1)
174
+ end
175
+
176
+ it 'accepts nil options' do
177
+ html = '<table><tr><td>Data</td></tr></table>'
178
+ result = described_class.convert_with_tables(html, nil, nil)
179
+
180
+ expect(result).to be_a(Hash)
181
+ expect(result[:tables].length).to eq(1)
182
+ end
183
+ end
184
+
185
+ context 'with metadata config' do
186
+ it 'includes metadata when configured' do
187
+ html = '<html><head><title>Test</title></head><body><table><tr><td>Data</td></tr></table></body></html>'
188
+ result = described_class.convert_with_tables(html, nil, { extract_headers: true })
189
+
190
+ expect(result[:metadata]).to be_a(Hash).or(be_nil)
191
+ end
192
+ end
193
+ end
194
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.27.3
4
+ version: 2.28.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-05 00:00:00.000000000 Z
11
+ date: 2026-03-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -61,6 +61,7 @@ files:
61
61
  - ext/html-to-markdown-rb/native/src/conversion/inline_images.rs
62
62
  - ext/html-to-markdown-rb/native/src/conversion/metadata.rs
63
63
  - ext/html-to-markdown-rb/native/src/conversion/mod.rs
64
+ - ext/html-to-markdown-rb/native/src/conversion/tables.rs
64
65
  - ext/html-to-markdown-rb/native/src/lib.rs
65
66
  - ext/html-to-markdown-rb/native/src/options.rs
66
67
  - ext/html-to-markdown-rb/native/src/profiling.rs
@@ -1958,6 +1959,7 @@ files:
1958
1959
  - rust-vendor/html-to-markdown-rs/tests/tables_test.rs
1959
1960
  - rust-vendor/html-to-markdown-rs/tests/test_custom_elements.rs
1960
1961
  - rust-vendor/html-to-markdown-rs/tests/test_issue_187.rs
1962
+ - rust-vendor/html-to-markdown-rs/tests/test_issue_218.rs
1961
1963
  - rust-vendor/html-to-markdown-rs/tests/test_nested_simple.rs
1962
1964
  - rust-vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs
1963
1965
  - rust-vendor/html-to-markdown-rs/tests/test_spa_bisect.rs
@@ -9737,6 +9739,7 @@ files:
9737
9739
  - sig/open3.rbs
9738
9740
  - spec/cli_proxy_spec.rb
9739
9741
  - spec/convert_spec.rb
9742
+ - spec/convert_with_tables_spec.rb
9740
9743
  - spec/metadata_extraction_spec.rb
9741
9744
  - spec/spec_helper.rb
9742
9745
  - spec/visitor_issue_187_spec.rb