html-to-markdown 2.27.3 → 2.28.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +6 -0
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +71 -0
- data/ext/html-to-markdown-rb/native/src/lib.rs +27 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +29 -0
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/rust-vendor/html-to-markdown-rs/README.md +29 -0
- data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +368 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/main.rs +10 -5
- data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/utility/content.rs +17 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +8 -5
- data/rust-vendor/html-to-markdown-rs/src/lib.rs +3 -0
- data/rust-vendor/html-to-markdown-rs/src/prelude.rs +3 -0
- data/rust-vendor/html-to-markdown-rs/tests/test_issue_218.rs +56 -0
- data/rust-vendor/uuid/.cargo-checksum.json +1 -1
- data/rust-vendor/uuid/.cargo_vcs_info.json +1 -1
- data/rust-vendor/uuid/Cargo.lock +71 -137
- data/rust-vendor/uuid/Cargo.toml +4 -4
- data/rust-vendor/uuid/Cargo.toml.orig +4 -4
- data/rust-vendor/uuid/README.md +3 -3
- data/rust-vendor/uuid/src/lib.rs +4 -4
- data/sig/html_to_markdown.rbs +46 -0
- data/spec/convert_with_tables_spec.rb +194 -0
- metadata +5 -2
|
@@ -29,7 +29,7 @@ homepage = "https://github.com/uuid-rs/uuid"
|
|
|
29
29
|
name = "uuid"
|
|
30
30
|
readme = "README.md"
|
|
31
31
|
repository = "https://github.com/uuid-rs/uuid"
|
|
32
|
-
version = "1.
|
|
32
|
+
version = "1.22.0" # remember to update html_root_url in lib.rs
|
|
33
33
|
rust-version = "1.85.0"
|
|
34
34
|
|
|
35
35
|
[package.metadata.docs.rs]
|
|
@@ -86,7 +86,7 @@ borsh = ["dep:borsh", "dep:borsh-derive"]
|
|
|
86
86
|
|
|
87
87
|
# Public: Used in trait impls on `Uuid`
|
|
88
88
|
[dependencies.bytemuck]
|
|
89
|
-
version = "1.
|
|
89
|
+
version = "1.22.0"
|
|
90
90
|
optional = true
|
|
91
91
|
features = ["derive"]
|
|
92
92
|
|
|
@@ -140,13 +140,13 @@ optional = true
|
|
|
140
140
|
[target.'cfg(all(target_arch = "wasm32", any(target_os = "unknown", target_os = "none")))'.dependencies.uuid-rng-internal-lib]
|
|
141
141
|
# Work-around lack of support for both `dep:x` and `x/` in MSRV
|
|
142
142
|
package = "uuid-rng-internal"
|
|
143
|
-
version = "1.
|
|
143
|
+
version = "1.22.0"
|
|
144
144
|
path = "rng"
|
|
145
145
|
optional = true
|
|
146
146
|
|
|
147
147
|
# Private
|
|
148
148
|
[target.'cfg(not(all(target_arch = "wasm32", any(target_os = "unknown", target_os = "none"))))'.dependencies.rand]
|
|
149
|
-
version = "0.
|
|
149
|
+
version = "0.10"
|
|
150
150
|
optional = true
|
|
151
151
|
|
|
152
152
|
# Private
|
data/rust-vendor/uuid/README.md
CHANGED
|
@@ -28,7 +28,7 @@ Add the following to your `Cargo.toml`:
|
|
|
28
28
|
|
|
29
29
|
```toml
|
|
30
30
|
[dependencies.uuid]
|
|
31
|
-
version = "1.
|
|
31
|
+
version = "1.22.0"
|
|
32
32
|
# Lets you generate random UUIDs
|
|
33
33
|
features = [
|
|
34
34
|
"v4",
|
|
@@ -64,11 +64,11 @@ assert_eq!(Some(Version::Random), my_uuid.get_version());
|
|
|
64
64
|
If you'd like to parse UUIDs _really_ fast, check out the [`uuid-simd`](https://github.com/nugine/uuid-simd)
|
|
65
65
|
library.
|
|
66
66
|
|
|
67
|
-
For more details on using `uuid`, [see the library documentation](https://docs.rs/uuid/1.
|
|
67
|
+
For more details on using `uuid`, [see the library documentation](https://docs.rs/uuid/1.22.0/uuid).
|
|
68
68
|
|
|
69
69
|
## References
|
|
70
70
|
|
|
71
|
-
* [`uuid` library docs](https://docs.rs/uuid/1.
|
|
71
|
+
* [`uuid` library docs](https://docs.rs/uuid/1.22.0/uuid).
|
|
72
72
|
* [Wikipedia: Universally Unique Identifier](http://en.wikipedia.org/wiki/Universally_unique_identifier).
|
|
73
73
|
* [RFC 9562: Universally Unique IDentifiers (UUID)](https://www.ietf.org/rfc/rfc9562.html).
|
|
74
74
|
|
data/rust-vendor/uuid/src/lib.rs
CHANGED
|
@@ -38,7 +38,7 @@
|
|
|
38
38
|
//!
|
|
39
39
|
//! ```toml
|
|
40
40
|
//! [dependencies.uuid]
|
|
41
|
-
//! version = "1.
|
|
41
|
+
//! version = "1.22.0"
|
|
42
42
|
//! # Lets you generate random UUIDs
|
|
43
43
|
//! features = [
|
|
44
44
|
//! "v4",
|
|
@@ -138,7 +138,7 @@
|
|
|
138
138
|
//!
|
|
139
139
|
//! ```toml
|
|
140
140
|
//! [dependencies.uuid]
|
|
141
|
-
//! version = "1.
|
|
141
|
+
//! version = "1.22.0"
|
|
142
142
|
//! features = [
|
|
143
143
|
//! "v4",
|
|
144
144
|
//! "v7",
|
|
@@ -153,7 +153,7 @@
|
|
|
153
153
|
//!
|
|
154
154
|
//! ```toml
|
|
155
155
|
//! [dependencies.uuid]
|
|
156
|
-
//! version = "1.
|
|
156
|
+
//! version = "1.22.0"
|
|
157
157
|
//! default-features = false
|
|
158
158
|
//! ```
|
|
159
159
|
//!
|
|
@@ -211,7 +211,7 @@
|
|
|
211
211
|
#![doc(
|
|
212
212
|
html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
|
|
213
213
|
html_favicon_url = "https://www.rust-lang.org/favicon.ico",
|
|
214
|
-
html_root_url = "https://docs.rs/uuid/1.
|
|
214
|
+
html_root_url = "https://docs.rs/uuid/1.22.0"
|
|
215
215
|
)]
|
|
216
216
|
|
|
217
217
|
#[cfg(any(feature = "std", test))]
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -185,6 +185,18 @@ module HtmlToMarkdown
|
|
|
185
185
|
structured_data: Array[structured_data]
|
|
186
186
|
}
|
|
187
187
|
|
|
188
|
+
type table_data = {
|
|
189
|
+
cells: Array[Array[String]],
|
|
190
|
+
markdown: String,
|
|
191
|
+
is_header_row: Array[bool]
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
type table_extraction_result = {
|
|
195
|
+
content: String,
|
|
196
|
+
metadata: extended_metadata?,
|
|
197
|
+
tables: Array[table_data]
|
|
198
|
+
}
|
|
199
|
+
|
|
188
200
|
# Native methods (implemented in Rust via Magnus/rb-sys)
|
|
189
201
|
# These are aliased from the Rust extension and available as both module and instance methods
|
|
190
202
|
private
|
|
@@ -217,6 +229,11 @@ module HtmlToMarkdown
|
|
|
217
229
|
conversion_options? options,
|
|
218
230
|
visitor? visitor
|
|
219
231
|
) -> String
|
|
232
|
+
def self.native_convert_with_tables: (
|
|
233
|
+
String html,
|
|
234
|
+
conversion_options? options,
|
|
235
|
+
metadata_config? metadata_config
|
|
236
|
+
) -> table_extraction_result
|
|
220
237
|
|
|
221
238
|
def native_convert: (String html, conversion_options? options) -> String
|
|
222
239
|
def native_options: (conversion_options? options_hash) -> Options
|
|
@@ -246,6 +263,11 @@ module HtmlToMarkdown
|
|
|
246
263
|
conversion_options? options,
|
|
247
264
|
visitor? visitor
|
|
248
265
|
) -> String
|
|
266
|
+
def native_convert_with_tables: (
|
|
267
|
+
String html,
|
|
268
|
+
conversion_options? options,
|
|
269
|
+
metadata_config? metadata_config
|
|
270
|
+
) -> table_extraction_result
|
|
249
271
|
|
|
250
272
|
# Visitor interface for customizing conversion behavior
|
|
251
273
|
type visitor = Object
|
|
@@ -422,6 +444,25 @@ module HtmlToMarkdown
|
|
|
422
444
|
?metadata_config metadata_config
|
|
423
445
|
) -> [String, extended_metadata]
|
|
424
446
|
|
|
447
|
+
# Convert HTML and extract tables as structured data
|
|
448
|
+
#
|
|
449
|
+
# Args:
|
|
450
|
+
# html: HTML string to convert
|
|
451
|
+
# options: Optional conversion configuration
|
|
452
|
+
# metadata_config: Optional metadata extraction configuration
|
|
453
|
+
#
|
|
454
|
+
# Returns:
|
|
455
|
+
# table_extraction_result: Hash containing content, metadata, and tables array
|
|
456
|
+
#
|
|
457
|
+
# Example:
|
|
458
|
+
# result = HtmlToMarkdown.convert_with_tables(html)
|
|
459
|
+
# puts result[:tables].length
|
|
460
|
+
def self.convert_with_tables: (
|
|
461
|
+
String html,
|
|
462
|
+
?conversion_options options,
|
|
463
|
+
?metadata_config metadata_config
|
|
464
|
+
) -> table_extraction_result
|
|
465
|
+
|
|
425
466
|
# Instance method versions (created by module_function)
|
|
426
467
|
def convert: (String html, ?conversion_options options, ?visitor visitor) -> String
|
|
427
468
|
def options: (?conversion_options options_hash) -> Options
|
|
@@ -449,4 +490,9 @@ module HtmlToMarkdown
|
|
|
449
490
|
Options options_handle,
|
|
450
491
|
?metadata_config metadata_config
|
|
451
492
|
) -> [String, extended_metadata]
|
|
493
|
+
def convert_with_tables: (
|
|
494
|
+
String html,
|
|
495
|
+
?conversion_options options,
|
|
496
|
+
?metadata_config metadata_config
|
|
497
|
+
) -> table_extraction_result
|
|
452
498
|
end
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
RSpec.describe HtmlToMarkdown do
|
|
6
|
+
describe '.convert_with_tables' do
|
|
7
|
+
it 'returns a hash with content, metadata, and tables keys' do
|
|
8
|
+
html = '<table><tr><td>Cell</td></tr></table>'
|
|
9
|
+
result = described_class.convert_with_tables(html)
|
|
10
|
+
|
|
11
|
+
expect(result).to be_a(Hash)
|
|
12
|
+
expect(result).to include(:content, :metadata, :tables)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
context 'with a basic table with header' do
|
|
16
|
+
let(:html) do
|
|
17
|
+
<<~HTML
|
|
18
|
+
<table>
|
|
19
|
+
<thead>
|
|
20
|
+
<tr><th>Name</th><th>Age</th></tr>
|
|
21
|
+
</thead>
|
|
22
|
+
<tbody>
|
|
23
|
+
<tr><td>Alice</td><td>30</td></tr>
|
|
24
|
+
</tbody>
|
|
25
|
+
</table>
|
|
26
|
+
HTML
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it 'extracts exactly one table' do
|
|
30
|
+
result = described_class.convert_with_tables(html)
|
|
31
|
+
|
|
32
|
+
expect(result[:tables].length).to eq(1)
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
it 'extracts cells as rows of columns' do
|
|
36
|
+
result = described_class.convert_with_tables(html)
|
|
37
|
+
table = result[:tables][0]
|
|
38
|
+
|
|
39
|
+
expect(table[:cells]).to be_an(Array)
|
|
40
|
+
expect(table[:cells].length).to eq(2)
|
|
41
|
+
expect(table[:cells][0]).to eq(%w[Name Age])
|
|
42
|
+
expect(table[:cells][1]).to eq(%w[Alice 30])
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
it 'provides markdown representation' do
|
|
46
|
+
result = described_class.convert_with_tables(html)
|
|
47
|
+
table = result[:tables][0]
|
|
48
|
+
|
|
49
|
+
expect(table[:markdown]).to be_a(String)
|
|
50
|
+
expect(table[:markdown]).to include('Name')
|
|
51
|
+
expect(table[:markdown]).to include('Alice')
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
it 'marks header rows correctly' do
|
|
55
|
+
result = described_class.convert_with_tables(html)
|
|
56
|
+
table = result[:tables][0]
|
|
57
|
+
|
|
58
|
+
expect(table[:is_header_row]).to be_an(Array)
|
|
59
|
+
expect(table[:is_header_row].length).to eq(2)
|
|
60
|
+
expect(table[:is_header_row][0]).to be true
|
|
61
|
+
expect(table[:is_header_row][1]).to be false
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it 'includes converted markdown content' do
|
|
65
|
+
result = described_class.convert_with_tables(html)
|
|
66
|
+
|
|
67
|
+
expect(result[:content]).to be_a(String)
|
|
68
|
+
expect(result[:content]).not_to be_empty
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
context 'with empty HTML' do
|
|
73
|
+
it 'returns empty tables array' do
|
|
74
|
+
result = described_class.convert_with_tables('')
|
|
75
|
+
|
|
76
|
+
expect(result[:tables]).to eq([])
|
|
77
|
+
expect(result[:content]).to be_a(String)
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
context 'with HTML containing no tables' do
|
|
82
|
+
it 'returns empty tables array' do
|
|
83
|
+
html = '<p>No tables here</p>'
|
|
84
|
+
result = described_class.convert_with_tables(html)
|
|
85
|
+
|
|
86
|
+
expect(result[:tables]).to eq([])
|
|
87
|
+
expect(result[:content]).to include('No tables here')
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
context 'with multiple tables' do
|
|
92
|
+
let(:html) do
|
|
93
|
+
<<~HTML
|
|
94
|
+
<table>
|
|
95
|
+
<tr><th>A</th></tr>
|
|
96
|
+
<tr><td>1</td></tr>
|
|
97
|
+
</table>
|
|
98
|
+
<p>Some text between tables</p>
|
|
99
|
+
<table>
|
|
100
|
+
<tr><th>B</th><th>C</th></tr>
|
|
101
|
+
<tr><td>2</td><td>3</td></tr>
|
|
102
|
+
<tr><td>4</td><td>5</td></tr>
|
|
103
|
+
</table>
|
|
104
|
+
HTML
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
it 'extracts all tables' do
|
|
108
|
+
result = described_class.convert_with_tables(html)
|
|
109
|
+
|
|
110
|
+
expect(result[:tables].length).to eq(2)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
it 'preserves table order' do
|
|
114
|
+
result = described_class.convert_with_tables(html)
|
|
115
|
+
|
|
116
|
+
first_table = result[:tables][0]
|
|
117
|
+
second_table = result[:tables][1]
|
|
118
|
+
|
|
119
|
+
expect(first_table[:cells][0]).to eq(['A'])
|
|
120
|
+
expect(second_table[:cells][0]).to eq(%w[B C])
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
it 'extracts correct row counts per table' do
|
|
124
|
+
result = described_class.convert_with_tables(html)
|
|
125
|
+
|
|
126
|
+
expect(result[:tables][0][:cells].length).to eq(2)
|
|
127
|
+
expect(result[:tables][1][:cells].length).to eq(3)
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
context 'with special characters in cells' do
|
|
132
|
+
let(:html) do
|
|
133
|
+
<<~HTML
|
|
134
|
+
<table>
|
|
135
|
+
<tr><th>Key</th><th>Value</th></tr>
|
|
136
|
+
<tr><td>Brackets &lt;&gt;</td><td>Ampersand &amp;</td></tr>
|
|
137
|
+
<tr><td>Quotes "double"</td><td>Quotes 'single'</td></tr>
|
|
138
|
+
<tr><td>Unicode: cafe\u0301</td><td>Emoji: test</td></tr>
|
|
139
|
+
</table>
|
|
140
|
+
HTML
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
it 'handles HTML entities in cells' do
|
|
144
|
+
result = described_class.convert_with_tables(html)
|
|
145
|
+
table = result[:tables][0]
|
|
146
|
+
|
|
147
|
+
expect(table[:cells][1][0]).to include('<>')
|
|
148
|
+
expect(table[:cells][1][1]).to include('&')
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
it 'handles quotes in cells' do
|
|
152
|
+
result = described_class.convert_with_tables(html)
|
|
153
|
+
table = result[:tables][0]
|
|
154
|
+
|
|
155
|
+
expect(table[:cells][2][0]).to include('"double"')
|
|
156
|
+
expect(table[:cells][2][1]).to include("'single'")
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it 'handles unicode in cells' do
|
|
160
|
+
result = described_class.convert_with_tables(html)
|
|
161
|
+
table = result[:tables][0]
|
|
162
|
+
|
|
163
|
+
expect(table[:cells][3][0]).to be_a(String)
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
context 'with conversion options' do
|
|
168
|
+
it 'accepts options hash' do
|
|
169
|
+
html = '<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>'
|
|
170
|
+
result = described_class.convert_with_tables(html, { heading_style: :atx })
|
|
171
|
+
|
|
172
|
+
expect(result).to be_a(Hash)
|
|
173
|
+
expect(result[:tables].length).to eq(1)
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
it 'accepts nil options' do
|
|
177
|
+
html = '<table><tr><td>Data</td></tr></table>'
|
|
178
|
+
result = described_class.convert_with_tables(html, nil, nil)
|
|
179
|
+
|
|
180
|
+
expect(result).to be_a(Hash)
|
|
181
|
+
expect(result[:tables].length).to eq(1)
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
context 'with metadata config' do
|
|
186
|
+
it 'includes metadata when configured' do
|
|
187
|
+
html = '<html><head><title>Test</title></head><body><table><tr><td>Data</td></tr></table></body></html>'
|
|
188
|
+
result = described_class.convert_with_tables(html, nil, { extract_headers: true })
|
|
189
|
+
|
|
190
|
+
expect(result[:metadata]).to be_a(Hash).or(be_nil)
|
|
191
|
+
end
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.27.3
|
|
4
|
+
version: 2.28.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-03-
|
|
11
|
+
date: 2026-03-07 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -61,6 +61,7 @@ files:
|
|
|
61
61
|
- ext/html-to-markdown-rb/native/src/conversion/inline_images.rs
|
|
62
62
|
- ext/html-to-markdown-rb/native/src/conversion/metadata.rs
|
|
63
63
|
- ext/html-to-markdown-rb/native/src/conversion/mod.rs
|
|
64
|
+
- ext/html-to-markdown-rb/native/src/conversion/tables.rs
|
|
64
65
|
- ext/html-to-markdown-rb/native/src/lib.rs
|
|
65
66
|
- ext/html-to-markdown-rb/native/src/options.rs
|
|
66
67
|
- ext/html-to-markdown-rb/native/src/profiling.rs
|
|
@@ -1958,6 +1959,7 @@ files:
|
|
|
1958
1959
|
- rust-vendor/html-to-markdown-rs/tests/tables_test.rs
|
|
1959
1960
|
- rust-vendor/html-to-markdown-rs/tests/test_custom_elements.rs
|
|
1960
1961
|
- rust-vendor/html-to-markdown-rs/tests/test_issue_187.rs
|
|
1962
|
+
- rust-vendor/html-to-markdown-rs/tests/test_issue_218.rs
|
|
1961
1963
|
- rust-vendor/html-to-markdown-rs/tests/test_nested_simple.rs
|
|
1962
1964
|
- rust-vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs
|
|
1963
1965
|
- rust-vendor/html-to-markdown-rs/tests/test_spa_bisect.rs
|
|
@@ -9737,6 +9739,7 @@ files:
|
|
|
9737
9739
|
- sig/open3.rbs
|
|
9738
9740
|
- spec/cli_proxy_spec.rb
|
|
9739
9741
|
- spec/convert_spec.rb
|
|
9742
|
+
- spec/convert_with_tables_spec.rb
|
|
9740
9743
|
- spec/metadata_extraction_spec.rb
|
|
9741
9744
|
- spec/spec_helper.rb
|
|
9742
9745
|
- spec/visitor_issue_187_spec.rb
|