html-to-markdown 2.27.3 → 2.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +6 -0
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +71 -0
- data/ext/html-to-markdown-rb/native/src/lib.rs +27 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +29 -0
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/rust-vendor/html-to-markdown-rs/README.md +29 -0
- data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +368 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/main.rs +10 -5
- data/rust-vendor/html-to-markdown-rs/src/lib.rs +3 -0
- data/rust-vendor/html-to-markdown-rs/src/prelude.rs +3 -0
- data/sig/html_to_markdown.rbs +46 -0
- data/spec/convert_with_tables_spec.rb +194 -0
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ee40f54b9f0a1b031a8a4c6a75caac7797e452dfb2fa076d79a6b69fdcd6093d
|
|
4
|
+
data.tar.gz: dd1fbfbcc08e4a562a4c096c7530104104054eb3546acbb8fbf265941ab381a7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7bef90afd805e6d0333d76fbec1c2698b62ca2f3260c4d5566d638d3950d606e2fd6b1f85c02f2c40fa39279929951ba8e8e666fde9e09ad6d0e2753c2d730e7
|
|
7
|
+
data.tar.gz: adf885b78edf97063b30e9285f2f33d3e1cee2a657e282e50d6f548fe13b686b0e3c4f4008248482d78f686ddf0041d4e1581ddfcd02bdb37a3bc66fdb88310a
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.27.3)
|
|
4
|
+
html-to-markdown (2.28.0)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -172,7 +172,7 @@ CHECKSUMS
|
|
|
172
172
|
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
|
|
173
173
|
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
|
|
174
174
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
175
|
-
html-to-markdown (2.27.3)
|
|
175
|
+
html-to-markdown (2.28.0)
|
|
176
176
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
177
177
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
178
178
|
json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
|
|
@@ -5,7 +5,13 @@ pub mod inline_images;
|
|
|
5
5
|
#[cfg(feature = "metadata")]
|
|
6
6
|
pub mod metadata;
|
|
7
7
|
|
|
8
|
+
#[cfg(feature = "visitor")]
|
|
9
|
+
pub mod tables;
|
|
10
|
+
|
|
8
11
|
pub use inline_images::*;
|
|
9
12
|
|
|
10
13
|
#[cfg(feature = "metadata")]
|
|
11
14
|
pub use metadata::*;
|
|
15
|
+
|
|
16
|
+
#[cfg(feature = "visitor")]
|
|
17
|
+
pub use tables::*;
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
//! Table extraction conversion functions for Ruby bindings.
|
|
2
|
+
|
|
3
|
+
use html_to_markdown_rs::{ConversionWithTables, TableData};
|
|
4
|
+
use magnus::prelude::*;
|
|
5
|
+
use magnus::{Error, Ruby, Value};
|
|
6
|
+
|
|
7
|
+
#[cfg(feature = "metadata")]
|
|
8
|
+
use super::metadata::extended_metadata_to_ruby;
|
|
9
|
+
|
|
10
|
+
/// Build the Ruby Hash that represents one extracted table.
///
/// Three entries are stored, in this order, under the interned names
/// `cells`, `markdown` and `is_header_row`. Any error reported while pushing
/// into an array or storing into the hash is returned to the caller.
fn table_data_to_ruby(ruby: &Ruby, table: TableData) -> Result<Value, Error> {
    let out = ruby.hash_new();

    // cells: Array[Array[String]] — one inner array per table row.
    let rows = ruby.ary_new();
    for row in table.cells {
        let columns = ruby.ary_new();
        row.into_iter().try_for_each(|cell| columns.push(cell))?;
        rows.push(columns)?;
    }
    out.aset(ruby.intern("cells"), rows)?;

    // markdown: String
    out.aset(ruby.intern("markdown"), table.markdown)?;

    // is_header_row: Array[bool]
    let header_flags = ruby.ary_new();
    table
        .is_header_row
        .into_iter()
        .try_for_each(|flag| header_flags.push(flag))?;
    out.aset(ruby.intern("is_header_row"), header_flags)?;

    Ok(out.as_value())
}
|
|
36
|
+
|
|
37
|
+
/// Convert a `ConversionWithTables` result to a Ruby Hash.
|
|
38
|
+
///
|
|
39
|
+
/// Returns a Hash with keys `:content`, `:metadata`, `:tables`.
|
|
40
|
+
pub fn tables_result_to_ruby(ruby: &Ruby, result: ConversionWithTables) -> Result<Value, Error> {
|
|
41
|
+
let hash = ruby.hash_new();
|
|
42
|
+
|
|
43
|
+
// content: String
|
|
44
|
+
hash.aset(ruby.intern("content"), result.content)?;
|
|
45
|
+
|
|
46
|
+
// metadata: Hash or nil
|
|
47
|
+
#[cfg(feature = "metadata")]
|
|
48
|
+
{
|
|
49
|
+
match result.metadata {
|
|
50
|
+
Some(metadata) => {
|
|
51
|
+
hash.aset(ruby.intern("metadata"), extended_metadata_to_ruby(ruby, metadata)?)?;
|
|
52
|
+
}
|
|
53
|
+
None => {
|
|
54
|
+
hash.aset(ruby.intern("metadata"), ruby.qnil())?;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
#[cfg(not(feature = "metadata"))]
|
|
59
|
+
{
|
|
60
|
+
hash.aset(ruby.intern("metadata"), ruby.qnil())?;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// tables: Array[Hash]
|
|
64
|
+
let tables_array = ruby.ary_new();
|
|
65
|
+
for table in result.tables {
|
|
66
|
+
tables_array.push(table_data_to_ruby(ruby, table)?)?;
|
|
67
|
+
}
|
|
68
|
+
hash.aset(ruby.intern("tables"), tables_array)?;
|
|
69
|
+
|
|
70
|
+
Ok(hash.as_value())
|
|
71
|
+
}
|
|
@@ -8,6 +8,9 @@ use html_to_markdown_rs::{
|
|
|
8
8
|
#[cfg(feature = "visitor")]
|
|
9
9
|
use html_to_markdown_rs::convert_with_visitor as convert_with_visitor_inner;
|
|
10
10
|
|
|
11
|
+
#[cfg(feature = "visitor")]
|
|
12
|
+
use html_to_markdown_rs::convert_with_tables as convert_with_tables_inner;
|
|
13
|
+
|
|
11
14
|
#[cfg(feature = "metadata")]
|
|
12
15
|
use html_to_markdown_rs::convert_with_metadata as convert_with_metadata_inner;
|
|
13
16
|
|
|
@@ -26,6 +29,9 @@ use types::{arg_error, runtime_error};
|
|
|
26
29
|
#[cfg(feature = "metadata")]
|
|
27
30
|
use conversion::{build_metadata_config, extended_metadata_to_ruby};
|
|
28
31
|
|
|
32
|
+
#[cfg(feature = "visitor")]
|
|
33
|
+
use conversion::tables_result_to_ruby;
|
|
34
|
+
|
|
29
35
|
#[cfg(feature = "visitor")]
|
|
30
36
|
use visitor::RubyVisitorWrapper;
|
|
31
37
|
|
|
@@ -138,6 +144,23 @@ fn convert_with_metadata_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<Value,
|
|
|
138
144
|
Ok(array.as_value())
|
|
139
145
|
}
|
|
140
146
|
|
|
147
|
+
/// Native implementation registered on the module as `convert_with_tables`.
///
/// Expects one required String (the HTML) followed by up to two optional
/// values: the conversion options and the metadata configuration. The second
/// optional value is only read when the `metadata` feature is enabled.
/// The conversion runs inside `guard_panic`; its error is mapped through
/// `conversion_error` before being returned.
#[cfg(feature = "visitor")]
fn convert_with_tables_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
    let scanned = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
    let html = scanned.required.0;
    let options = build_conversion_options(ruby, scanned.optional.0)?;

    // The core function takes a different placeholder type for the metadata
    // argument depending on whether the `metadata` feature is compiled in.
    #[cfg(feature = "metadata")]
    let metadata_config = Some(build_metadata_config(ruby, scanned.optional.1)?);
    #[cfg(not(feature = "metadata"))]
    let metadata_config: Option<()> = None;

    let outcome = guard_panic(|| convert_with_tables_inner(&html, Some(options), metadata_config));
    let result = outcome.map_err(conversion_error)?;

    tables_result_to_ruby(ruby, result)
}
|
|
163
|
+
|
|
141
164
|
#[cfg(feature = "visitor")]
|
|
142
165
|
fn convert_with_visitor_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
|
|
143
166
|
let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
|
|
@@ -227,7 +250,10 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
227
250
|
}
|
|
228
251
|
|
|
229
252
|
#[cfg(feature = "visitor")]
|
|
230
|
-
|
|
253
|
+
{
|
|
254
|
+
module.define_singleton_method("convert_with_visitor", function!(convert_with_visitor_fn, -1))?;
|
|
255
|
+
module.define_singleton_method("convert_with_tables", function!(convert_with_tables_fn, -1))?;
|
|
256
|
+
}
|
|
231
257
|
|
|
232
258
|
#[cfg(feature = "profiling")]
|
|
233
259
|
{
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -18,6 +18,7 @@ module HtmlToMarkdown
|
|
|
18
18
|
alias native_convert_with_metadata convert_with_metadata
|
|
19
19
|
alias native_convert_with_metadata_handle convert_with_metadata_handle
|
|
20
20
|
alias native_convert_with_visitor convert_with_visitor
|
|
21
|
+
alias native_convert_with_tables convert_with_tables
|
|
21
22
|
end
|
|
22
23
|
|
|
23
24
|
module_function
|
|
@@ -179,4 +180,32 @@ module HtmlToMarkdown
|
|
|
179
180
|
def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
|
|
180
181
|
native_convert_with_metadata_handle(html.to_s, options_handle, metadata_config)
|
|
181
182
|
end
|
|
183
|
+
|
|
184
|
+
# Convert HTML to Markdown with table extraction.
#
# Performs HTML-to-Markdown conversion while extracting structured table data
# (cells, markdown representation, header row flags) in a single pass.
# The input is coerced with +to_s+ before it is handed to the native extension.
#
# @param html [String] HTML string to convert.
# @param options [Hash, nil] Optional conversion configuration.
# @param metadata_config [Hash, nil] Optional metadata extraction configuration.
#
# @return [Hash] A hash with keys:
#   - :content [String] The converted Markdown output
#   - :metadata [Hash, nil] Extended metadata, or nil when the native extension returns none
#   - :tables [Array<Hash>] Extracted tables, each with:
#     - :cells [Array<Array<String>>] Table cells organized as rows x columns
#     - :markdown [String] Complete rendered table in Markdown format
#     - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
#
# @raise [StandardError] If conversion fails or invalid configuration
#
# @example Basic usage
#   html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
#   result = HtmlToMarkdown.convert_with_tables(html)
#   result[:tables].length      # => 1
#   result[:tables][0][:cells]  # => [["Name"], ["Alice"]]
def convert_with_tables(html, options = nil, metadata_config = nil)
  native_convert_with_tables(html.to_s, options, metadata_config)
end
|
|
182
211
|
end
|
|
@@ -148,6 +148,35 @@ for (i, img) in extraction.inline_images.iter().enumerate() {
|
|
|
148
148
|
}
|
|
149
149
|
```
|
|
150
150
|
|
|
151
|
+
## Table Extraction
|
|
152
|
+
|
|
153
|
+
Extract structured table data alongside the Markdown conversion. Each table found in the HTML is returned with its cell contents, header row flags, and rendered Markdown output.
|
|
154
|
+
|
|
155
|
+
Requires the `visitor` feature.
|
|
156
|
+
|
|
157
|
+
```rust
|
|
158
|
+
use html_to_markdown_rs::convert_with_tables;
|
|
159
|
+
|
|
160
|
+
let html = r#"
|
|
161
|
+
<table>
|
|
162
|
+
<tr><th>Name</th><th>Age</th></tr>
|
|
163
|
+
<tr><td>Alice</td><td>30</td></tr>
|
|
164
|
+
<tr><td>Bob</td><td>25</td></tr>
|
|
165
|
+
</table>
|
|
166
|
+
"#;
|
|
167
|
+
|
|
168
|
+
let result = convert_with_tables(html, None, None)?;
|
|
169
|
+
|
|
170
|
+
println!("{}", result.content);
|
|
171
|
+
for table in &result.tables {
|
|
172
|
+
println!("Table with {} rows:", table.cells.len());
|
|
173
|
+
for (i, row) in table.cells.iter().enumerate() {
|
|
174
|
+
let prefix = if table.is_header_row[i] { "Header" } else { "Row" };
|
|
175
|
+
println!(" {}: {:?}", prefix, row);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
```
|
|
179
|
+
|
|
151
180
|
## Other Language Bindings
|
|
152
181
|
|
|
153
182
|
This is the core Rust library. For other languages:
|
|
@@ -681,3 +681,371 @@ pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
|
|
|
681
681
|
let update: crate::MetadataConfigUpdate = parse_json(json)?;
|
|
682
682
|
Ok(MetadataConfig::from(update))
|
|
683
683
|
}
|
|
684
|
+
|
|
685
|
+
// ============================================================================
|
|
686
|
+
// Table Extraction API (requires visitor feature)
|
|
687
|
+
// ============================================================================
|
|
688
|
+
|
|
689
|
+
/// Extracted table data from HTML conversion.
///
/// Each instance represents a single `<table>` element for which at least one
/// row was reported during conversion. Instances are appended to the result
/// as each table finishes.
#[cfg(feature = "visitor")]
#[derive(Debug, Clone)]
#[cfg_attr(
    any(feature = "serde", feature = "metadata"),
    derive(serde::Serialize, serde::Deserialize)
)]
pub struct TableData {
    /// Table cells organized as rows x columns. Cell contents are already
    /// converted to the target output format (markdown/djot/plain).
    pub cells: Vec<Vec<String>>,
    /// Complete rendered table in the target output format, as passed to the
    /// visitor when the table ended.
    pub markdown: String,
    /// Per-row header flag, parallel to `cells`, exactly as reported by the
    /// converter for each row. Rows inside `<thead>` are flagged; the unit
    /// tests in this file also expect a leading row of `<th>` cells without
    /// an enclosing `<thead>` to be flagged.
    pub is_header_row: Vec<bool>,
}
|
|
708
|
+
|
|
709
|
+
/// Result of HTML-to-markdown conversion with extracted table data.
#[cfg(feature = "visitor")]
#[derive(Debug, Clone)]
#[cfg_attr(
    any(feature = "serde", feature = "metadata"),
    derive(serde::Serialize, serde::Deserialize)
)]
pub struct ConversionWithTables {
    /// Converted markdown/djot/plain text content.
    pub content: String,
    /// Extended metadata gathered during the conversion.
    ///
    /// `convert_with_tables` always fills this with `Some(..)`: when no
    /// configuration is passed it falls back to the default metadata
    /// configuration rather than skipping extraction.
    #[cfg(feature = "metadata")]
    pub metadata: Option<ExtendedMetadata>,
    /// All tables found in the HTML, in the order they were completed.
    pub tables: Vec<TableData>,
}
|
|
725
|
+
|
|
726
|
+
/// Rows gathered for one `<table>` whose end callback has not fired yet.
#[cfg(feature = "visitor")]
#[derive(Debug, Default)]
struct PendingTable {
    rows: Vec<Vec<String>>,
    is_header: Vec<bool>,
}

/// Visitor that records every table reported during a conversion.
///
/// Tables that have started but not ended are kept on a stack, so a table
/// that starts while another is still open (a table nested in a cell) gets
/// its own row buffer instead of wiping the rows already gathered for its
/// parent.
// NOTE(review): assumes the converter fires `visit_table_start` /
// `visit_table_end` for a nested table between the callbacks of its parent —
// confirm against the DOM walker.
#[cfg(feature = "visitor")]
#[derive(Debug)]
struct TableCollector {
    /// Finished tables, in the order their end callbacks fired.
    tables: Vec<TableData>,
    /// Tables currently open, innermost last.
    open: Vec<PendingTable>,
}

#[cfg(feature = "visitor")]
impl TableCollector {
    /// Create a collector with no finished and no open tables.
    fn new() -> Self {
        Self {
            tables: Vec::new(),
            open: Vec::new(),
        }
    }
}

#[cfg(feature = "visitor")]
impl visitor::HtmlVisitor for TableCollector {
    /// Open a fresh row buffer for the table that is starting.
    fn visit_table_start(&mut self, _ctx: &visitor::NodeContext) -> visitor::VisitResult {
        self.open.push(PendingTable::default());
        visitor::VisitResult::Continue
    }

    /// Append one row and its header flag to the innermost open table.
    fn visit_table_row(
        &mut self,
        _ctx: &visitor::NodeContext,
        cells: &[String],
        is_header: bool,
    ) -> visitor::VisitResult {
        // A row reported without a preceding start callback is still kept,
        // matching the previous single-buffer behaviour.
        if self.open.is_empty() {
            self.open.push(PendingTable::default());
        }
        if let Some(pending) = self.open.last_mut() {
            pending.rows.push(cells.to_vec());
            pending.is_header.push(is_header);
        }
        visitor::VisitResult::Continue
    }

    /// Close the innermost open table and record it if it has any rows.
    fn visit_table_end(&mut self, _ctx: &visitor::NodeContext, output: &str) -> visitor::VisitResult {
        if let Some(finished) = self.open.pop() {
            // Tables without rows (e.g. `<table></table>`) produce no entry.
            if !finished.rows.is_empty() {
                self.tables.push(TableData {
                    cells: finished.rows,
                    markdown: output.to_string(),
                    is_header_row: finished.is_header,
                });
            }
        }
        visitor::VisitResult::Continue
    }
}
|
|
775
|
+
|
|
776
|
+
/// Convert HTML and collect structured data for every table encountered.
///
/// A private table-collecting visitor is attached to the conversion, so the
/// converted text, the tables and (with the `metadata` feature) the metadata
/// all come from one conversion call.
///
/// # Arguments
///
/// * `html` - The HTML string to convert
/// * `options` - Optional conversion options, forwarded unchanged
/// * `metadata_cfg` - Optional metadata extraction configuration; when `None`
///   the default configuration is used. Without the `metadata` feature this
///   parameter is an ignored `Option<()>` placeholder.
///
/// # Example
///
/// ```ignore
/// use html_to_markdown_rs::convert_with_tables;
///
/// let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
/// let result = convert_with_tables(html, None, None).unwrap();
/// assert_eq!(result.tables.len(), 1);
/// assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
/// ```
///
/// # Errors
///
/// Propagates any error returned by the underlying conversion. Also returns
/// `ConversionError::Other` if the collector is still shared once the
/// conversion has finished and its tables therefore cannot be taken out.
#[cfg(feature = "visitor")]
pub fn convert_with_tables(
    html: &str,
    options: Option<ConversionOptions>,
    #[cfg(feature = "metadata")] metadata_cfg: Option<MetadataConfig>,
    #[cfg(not(feature = "metadata"))] _metadata_cfg: Option<()>,
) -> Result<ConversionWithTables> {
    use std::cell::RefCell;
    use std::rc::Rc;

    // Keep one handle locally so the collected tables can be recovered after
    // the conversion has consumed the other.
    let collector = Rc::new(RefCell::new(TableCollector::new()));
    let visitor_handle: visitor::VisitorHandle = Rc::clone(&collector) as visitor::VisitorHandle;

    #[cfg(feature = "metadata")]
    let (content, metadata) = convert_with_metadata(
        html,
        options,
        metadata_cfg.unwrap_or_default(),
        Some(visitor_handle),
    )?;
    #[cfg(not(feature = "metadata"))]
    let content = convert_with_visitor(html, options, Some(visitor_handle))?;

    let tables = Rc::try_unwrap(collector)
        .map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
        .into_inner()
        .tables;

    Ok(ConversionWithTables {
        content,
        #[cfg(feature = "metadata")]
        metadata: Some(metadata),
        tables,
    })
}
|
|
842
|
+
|
|
843
|
+
#[cfg(test)]
#[cfg(feature = "visitor")]
mod table_extraction_tests {
    use super::*;

    /// Run `convert_with_tables` with the given options and no metadata
    /// configuration, panicking if the conversion fails.
    fn tables_with_options(html: &str, options: Option<ConversionOptions>) -> ConversionWithTables {
        // A bare `None` fits both shapes of the feature-dependent third parameter.
        convert_with_tables(html, options, None).unwrap()
    }

    /// Run `convert_with_tables` with default options.
    fn tables_from_html(html: &str) -> ConversionWithTables {
        tables_with_options(html, None)
    }

    #[test]
    fn test_convert_with_tables_basic() {
        let result = tables_from_html(
            r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#,
        );
        assert_eq!(result.tables.len(), 1);

        let table = &result.tables[0];
        assert_eq!(table.cells.len(), 2);
        assert_eq!(table.cells[0], vec!["Name", "Age"]);
        assert_eq!(table.cells[1], vec!["Alice", "30"]);
        assert!(table.is_header_row[0]);
        assert!(!table.is_header_row[1]);
        assert!(table.markdown.contains('|'));
    }

    #[test]
    fn test_convert_with_tables_nested() {
        let result = tables_from_html(
            r#"
<table>
<tr><th>Category</th><th>Details</th></tr>
<tr>
<td>Project Alpha</td>
<td>
<table>
<tr><th>Task</th><th>Status</th></tr>
<tr><td>001</td><td>Done</td></tr>
</table>
</td>
</tr>
</table>"#,
        );
        let found = result.tables.len();
        assert!(found >= 2, "Expected at least 2 tables (outer + nested), got {}", found);
    }

    #[test]
    fn test_convert_with_tables_no_tables() {
        let result = tables_from_html("<p>No tables here</p>");
        assert!(result.tables.is_empty());
        assert!(result.content.contains("No tables here"));
    }

    #[test]
    fn test_convert_with_tables_empty_table() {
        let tables = tables_from_html("<table></table>").tables;
        assert!(tables.is_empty(), "Empty table should not produce TableData");
    }

    #[test]
    fn test_convert_with_tables_headers_only() {
        let result = tables_from_html(r#"<table><thead><tr><th>A</th><th>B</th></tr></thead></table>"#);
        assert_eq!(result.tables.len(), 1);

        let table = &result.tables[0];
        assert!(table.is_header_row[0]);
        assert_eq!(table.cells[0], vec!["A", "B"]);
    }

    #[test]
    fn test_convert_with_tables_thead_tbody_tfoot() {
        let result = tables_from_html(
            r#"
<table>
<thead><tr><th>H1</th></tr></thead>
<tbody><tr><td>B1</td></tr></tbody>
<tfoot><tr><td>F1</td></tr></tfoot>
</table>"#,
        );
        assert_eq!(result.tables.len(), 1);

        let table = &result.tables[0];
        assert!(table.is_header_row[0], "thead row should be header");
        assert!(!table.is_header_row[1], "tbody row should not be header");
        assert_eq!(table.cells[0], vec!["H1"]);
        assert_eq!(table.cells[1], vec!["B1"]);
    }

    #[test]
    fn test_convert_with_tables_multiple_separate() {
        let result = tables_from_html(
            r#"
<table><tr><td>T1</td></tr></table>
<p>Between tables</p>
<table><tr><td>T2</td></tr></table>"#,
        );
        assert_eq!(result.tables.len(), 2, "Should find 2 separate tables");
    }

    #[test]
    fn test_convert_with_tables_special_chars() {
        let result = tables_from_html(r#"<table><tr><td>a | b</td><td>c*d</td></tr></table>"#);
        assert_eq!(result.tables.len(), 1);
        assert!(!result.tables[0].cells[0].is_empty());
    }

    #[test]
    fn test_convert_with_tables_single_cell() {
        let result = tables_from_html(r#"<table><tr><td>Only cell</td></tr></table>"#);
        assert_eq!(result.tables.len(), 1);

        let table = &result.tables[0];
        assert_eq!(table.cells.len(), 1);
        assert_eq!(table.cells[0], vec!["Only cell"]);
    }

    #[test]
    fn test_convert_with_tables_content_preserved() {
        let content = tables_from_html(r#"<p>Before</p><table><tr><td>Cell</td></tr></table><p>After</p>"#).content;
        assert!(content.contains("Before"));
        assert!(content.contains("After"));
        assert!(content.contains('|'), "Markdown table should appear in content");
    }

    #[test]
    fn test_convert_with_tables_with_options() {
        let options = ConversionOptions {
            heading_style: crate::options::HeadingStyle::Underlined,
            ..ConversionOptions::default()
        };
        let result = tables_with_options(
            r#"<h1>Title</h1><table><tr><td>Cell</td></tr></table>"#,
            Some(options),
        );
        assert_eq!(result.tables.len(), 1);
        assert!(result.content.contains("Title"));
    }

    #[test]
    fn test_convert_with_tables_plain_text_format() {
        let options = ConversionOptions {
            output_format: crate::options::OutputFormat::Plain,
            ..ConversionOptions::default()
        };
        let result = tables_with_options(
            r#"<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>"#,
            Some(options),
        );
        assert!(
            !result.tables.is_empty(),
            "Tables should be populated even with plain text output format"
        );
        assert_eq!(result.tables[0].cells[0], vec!["Name"]);
    }

    #[cfg(feature = "metadata")]
    #[test]
    fn test_convert_with_tables_metadata_integration() {
        let html = r#"<html lang="en"><head><title>Test</title></head><body>
<table><tr><th>Col</th></tr><tr><td>Val</td></tr></table>
</body></html>"#;
        let result = convert_with_tables(html, None, Some(MetadataConfig::default())).unwrap();
        assert_eq!(result.tables.len(), 1);

        let meta = result.metadata.as_ref().expect("metadata should be present");
        assert_eq!(meta.document.language, Some("en".to_string()));
    }

    #[cfg(feature = "metadata")]
    #[test]
    fn test_convert_with_tables_plain_text_metadata() {
        let options = ConversionOptions {
            output_format: crate::options::OutputFormat::Plain,
            ..ConversionOptions::default()
        };
        let html = r#"<html lang="fr"><body>
<table><tr><td>Cell</td></tr></table>
</body></html>"#;
        let result = convert_with_tables(html, Some(options), Some(MetadataConfig::default())).unwrap();
        assert!(
            !result.tables.is_empty(),
            "Tables should be populated in plain text mode"
        );

        let meta = result.metadata.as_ref().expect("metadata should be present");
        assert_eq!(
            meta.document.language,
            Some("fr".to_string()),
            "Metadata should be populated in plain text mode"
        );
    }
}
|
|
@@ -136,11 +136,9 @@ pub(crate) fn convert_html_impl(
|
|
|
136
136
|
}
|
|
137
137
|
}
|
|
138
138
|
|
|
139
|
-
//
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
return Ok(plain);
|
|
143
|
-
}
|
|
139
|
+
// Plain text output: run the full pipeline (for metadata + visitor callbacks),
|
|
140
|
+
// then return plain text instead of markdown.
|
|
141
|
+
let is_plain_text = options.output_format == OutputFormat::Plain;
|
|
144
142
|
|
|
145
143
|
let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
|
|
146
144
|
#[cfg(feature = "metadata")]
|
|
@@ -230,6 +228,13 @@ pub(crate) fn convert_html_impl(
|
|
|
230
228
|
return Err(crate::error::ConversionError::Visitor(err.clone()));
|
|
231
229
|
}
|
|
232
230
|
|
|
231
|
+
// If plain text was requested, discard the markdown output and return plain text.
|
|
232
|
+
// The full pipeline was still run above so that metadata + visitor callbacks fire.
|
|
233
|
+
if is_plain_text {
|
|
234
|
+
let plain = extract_plain_text(&dom, parser, options);
|
|
235
|
+
return Ok(plain);
|
|
236
|
+
}
|
|
237
|
+
|
|
233
238
|
trim_line_end_whitespace(&mut output);
|
|
234
239
|
let trimmed = output.trim_end_matches('\n');
|
|
235
240
|
if trimmed.is_empty() {
|
|
@@ -98,6 +98,9 @@ pub use convert_api::convert_with_metadata;
|
|
|
98
98
|
#[cfg(feature = "visitor")]
|
|
99
99
|
pub use convert_api::convert_with_visitor;
|
|
100
100
|
|
|
101
|
+
#[cfg(feature = "visitor")]
|
|
102
|
+
pub use convert_api::{ConversionWithTables, TableData, convert_with_tables};
|
|
103
|
+
|
|
101
104
|
#[cfg(feature = "async-visitor")]
|
|
102
105
|
pub use convert_api::convert_with_async_visitor;
|
|
103
106
|
|
|
@@ -19,5 +19,8 @@ pub use crate::convert_with_metadata;
|
|
|
19
19
|
#[cfg(feature = "visitor")]
|
|
20
20
|
pub use crate::convert_with_visitor;
|
|
21
21
|
|
|
22
|
+
#[cfg(feature = "visitor")]
|
|
23
|
+
pub use crate::{ConversionWithTables, TableData, convert_with_tables};
|
|
24
|
+
|
|
22
25
|
#[cfg(feature = "async-visitor")]
|
|
23
26
|
pub use crate::convert_with_async_visitor;
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -185,6 +185,18 @@ module HtmlToMarkdown
|
|
|
185
185
|
structured_data: Array[structured_data]
|
|
186
186
|
}
|
|
187
187
|
|
|
188
|
+
type table_data = {
|
|
189
|
+
cells: Array[Array[String]],
|
|
190
|
+
markdown: String,
|
|
191
|
+
is_header_row: Array[bool]
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
type table_extraction_result = {
|
|
195
|
+
content: String,
|
|
196
|
+
metadata: extended_metadata?,
|
|
197
|
+
tables: Array[table_data]
|
|
198
|
+
}
|
|
199
|
+
|
|
188
200
|
# Native methods (implemented in Rust via Magnus/rb-sys)
|
|
189
201
|
# These are aliased from the Rust extension and available as both module and instance methods
|
|
190
202
|
private
|
|
@@ -217,6 +229,11 @@ module HtmlToMarkdown
|
|
|
217
229
|
conversion_options? options,
|
|
218
230
|
visitor? visitor
|
|
219
231
|
) -> String
|
|
232
|
+
def self.native_convert_with_tables: (
|
|
233
|
+
String html,
|
|
234
|
+
conversion_options? options,
|
|
235
|
+
metadata_config? metadata_config
|
|
236
|
+
) -> table_extraction_result
|
|
220
237
|
|
|
221
238
|
def native_convert: (String html, conversion_options? options) -> String
|
|
222
239
|
def native_options: (conversion_options? options_hash) -> Options
|
|
@@ -246,6 +263,11 @@ module HtmlToMarkdown
|
|
|
246
263
|
conversion_options? options,
|
|
247
264
|
visitor? visitor
|
|
248
265
|
) -> String
|
|
266
|
+
def native_convert_with_tables: (
|
|
267
|
+
String html,
|
|
268
|
+
conversion_options? options,
|
|
269
|
+
metadata_config? metadata_config
|
|
270
|
+
) -> table_extraction_result
|
|
249
271
|
|
|
250
272
|
# Visitor interface for customizing conversion behavior
|
|
251
273
|
type visitor = Object
|
|
@@ -422,6 +444,25 @@ module HtmlToMarkdown
|
|
|
422
444
|
?metadata_config metadata_config
|
|
423
445
|
) -> [String, extended_metadata]
|
|
424
446
|
|
|
447
|
+
# Convert HTML and extract tables as structured data
|
|
448
|
+
#
|
|
449
|
+
# Args:
|
|
450
|
+
# html: HTML string to convert
|
|
451
|
+
# options: Optional conversion configuration
|
|
452
|
+
# metadata_config: Optional metadata extraction configuration
|
|
453
|
+
#
|
|
454
|
+
# Returns:
|
|
455
|
+
# table_extraction_result: Hash containing content, metadata, and tables array
|
|
456
|
+
#
|
|
457
|
+
# Example:
|
|
458
|
+
# result = HtmlToMarkdown.convert_with_tables(html)
|
|
459
|
+
# puts result[:tables].length
|
|
460
|
+
def self.convert_with_tables: (
|
|
461
|
+
String html,
|
|
462
|
+
?conversion_options options,
|
|
463
|
+
?metadata_config metadata_config
|
|
464
|
+
) -> table_extraction_result
|
|
465
|
+
|
|
425
466
|
# Instance method versions (created by module_function)
|
|
426
467
|
def convert: (String html, ?conversion_options options, ?visitor visitor) -> String
|
|
427
468
|
def options: (?conversion_options options_hash) -> Options
|
|
@@ -449,4 +490,9 @@ module HtmlToMarkdown
|
|
|
449
490
|
Options options_handle,
|
|
450
491
|
?metadata_config metadata_config
|
|
451
492
|
) -> [String, extended_metadata]
|
|
493
|
+
def convert_with_tables: (
|
|
494
|
+
String html,
|
|
495
|
+
?conversion_options options,
|
|
496
|
+
?metadata_config metadata_config
|
|
497
|
+
) -> table_extraction_result
|
|
452
498
|
end
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
RSpec.describe HtmlToMarkdown do
|
|
6
|
+
describe '.convert_with_tables' do
|
|
7
|
+
it 'returns a hash with content, metadata, and tables keys' do
|
|
8
|
+
html = '<table><tr><td>Cell</td></tr></table>'
|
|
9
|
+
result = described_class.convert_with_tables(html)
|
|
10
|
+
|
|
11
|
+
expect(result).to be_a(Hash)
|
|
12
|
+
expect(result).to include(:content, :metadata, :tables)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
context 'with a basic table with header' do
|
|
16
|
+
let(:html) do
|
|
17
|
+
<<~HTML
|
|
18
|
+
<table>
|
|
19
|
+
<thead>
|
|
20
|
+
<tr><th>Name</th><th>Age</th></tr>
|
|
21
|
+
</thead>
|
|
22
|
+
<tbody>
|
|
23
|
+
<tr><td>Alice</td><td>30</td></tr>
|
|
24
|
+
</tbody>
|
|
25
|
+
</table>
|
|
26
|
+
HTML
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it 'extracts exactly one table' do
|
|
30
|
+
result = described_class.convert_with_tables(html)
|
|
31
|
+
|
|
32
|
+
expect(result[:tables].length).to eq(1)
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
it 'extracts cells as rows of columns' do
|
|
36
|
+
result = described_class.convert_with_tables(html)
|
|
37
|
+
table = result[:tables][0]
|
|
38
|
+
|
|
39
|
+
expect(table[:cells]).to be_an(Array)
|
|
40
|
+
expect(table[:cells].length).to eq(2)
|
|
41
|
+
expect(table[:cells][0]).to eq(%w[Name Age])
|
|
42
|
+
expect(table[:cells][1]).to eq(%w[Alice 30])
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
it 'provides markdown representation' do
|
|
46
|
+
result = described_class.convert_with_tables(html)
|
|
47
|
+
table = result[:tables][0]
|
|
48
|
+
|
|
49
|
+
expect(table[:markdown]).to be_a(String)
|
|
50
|
+
expect(table[:markdown]).to include('Name')
|
|
51
|
+
expect(table[:markdown]).to include('Alice')
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
it 'marks header rows correctly' do
|
|
55
|
+
result = described_class.convert_with_tables(html)
|
|
56
|
+
table = result[:tables][0]
|
|
57
|
+
|
|
58
|
+
expect(table[:is_header_row]).to be_an(Array)
|
|
59
|
+
expect(table[:is_header_row].length).to eq(2)
|
|
60
|
+
expect(table[:is_header_row][0]).to be true
|
|
61
|
+
expect(table[:is_header_row][1]).to be false
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it 'includes converted markdown content' do
|
|
65
|
+
result = described_class.convert_with_tables(html)
|
|
66
|
+
|
|
67
|
+
expect(result[:content]).to be_a(String)
|
|
68
|
+
expect(result[:content]).not_to be_empty
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
context 'with empty HTML' do
|
|
73
|
+
it 'returns empty tables array' do
|
|
74
|
+
result = described_class.convert_with_tables('')
|
|
75
|
+
|
|
76
|
+
expect(result[:tables]).to eq([])
|
|
77
|
+
expect(result[:content]).to be_a(String)
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
context 'with HTML containing no tables' do
|
|
82
|
+
it 'returns empty tables array' do
|
|
83
|
+
html = '<p>No tables here</p>'
|
|
84
|
+
result = described_class.convert_with_tables(html)
|
|
85
|
+
|
|
86
|
+
expect(result[:tables]).to eq([])
|
|
87
|
+
expect(result[:content]).to include('No tables here')
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
context 'with multiple tables' do
|
|
92
|
+
let(:html) do
|
|
93
|
+
<<~HTML
|
|
94
|
+
<table>
|
|
95
|
+
<tr><th>A</th></tr>
|
|
96
|
+
<tr><td>1</td></tr>
|
|
97
|
+
</table>
|
|
98
|
+
<p>Some text between tables</p>
|
|
99
|
+
<table>
|
|
100
|
+
<tr><th>B</th><th>C</th></tr>
|
|
101
|
+
<tr><td>2</td><td>3</td></tr>
|
|
102
|
+
<tr><td>4</td><td>5</td></tr>
|
|
103
|
+
</table>
|
|
104
|
+
HTML
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
it 'extracts all tables' do
|
|
108
|
+
result = described_class.convert_with_tables(html)
|
|
109
|
+
|
|
110
|
+
expect(result[:tables].length).to eq(2)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
it 'preserves table order' do
|
|
114
|
+
result = described_class.convert_with_tables(html)
|
|
115
|
+
|
|
116
|
+
first_table = result[:tables][0]
|
|
117
|
+
second_table = result[:tables][1]
|
|
118
|
+
|
|
119
|
+
expect(first_table[:cells][0]).to eq(['A'])
|
|
120
|
+
expect(second_table[:cells][0]).to eq(%w[B C])
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
it 'extracts correct row counts per table' do
|
|
124
|
+
result = described_class.convert_with_tables(html)
|
|
125
|
+
|
|
126
|
+
expect(result[:tables][0][:cells].length).to eq(2)
|
|
127
|
+
expect(result[:tables][1][:cells].length).to eq(3)
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
context 'with special characters in cells' do
|
|
132
|
+
let(:html) do
|
|
133
|
+
<<~HTML
|
|
134
|
+
<table>
|
|
135
|
+
<tr><th>Key</th><th>Value</th></tr>
|
|
136
|
+
<tr><td>Brackets &lt;&gt;</td><td>Ampersand &amp;</td></tr>
|
|
137
|
+
<tr><td>Quotes "double"</td><td>Quotes 'single'</td></tr>
|
|
138
|
+
<tr><td>Unicode: cafe\u0301</td><td>Emoji: test</td></tr>
|
|
139
|
+
</table>
|
|
140
|
+
HTML
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
it 'handles HTML entities in cells' do
|
|
144
|
+
result = described_class.convert_with_tables(html)
|
|
145
|
+
table = result[:tables][0]
|
|
146
|
+
|
|
147
|
+
expect(table[:cells][1][0]).to include('<>')
|
|
148
|
+
expect(table[:cells][1][1]).to include('&')
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
it 'handles quotes in cells' do
|
|
152
|
+
result = described_class.convert_with_tables(html)
|
|
153
|
+
table = result[:tables][0]
|
|
154
|
+
|
|
155
|
+
expect(table[:cells][2][0]).to include('"double"')
|
|
156
|
+
expect(table[:cells][2][1]).to include("'single'")
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
it 'handles unicode in cells' do
|
|
160
|
+
result = described_class.convert_with_tables(html)
|
|
161
|
+
table = result[:tables][0]
|
|
162
|
+
|
|
163
|
+
expect(table[:cells][3][0]).to be_a(String)
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
context 'with conversion options' do
|
|
168
|
+
it 'accepts options hash' do
|
|
169
|
+
html = '<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>'
|
|
170
|
+
result = described_class.convert_with_tables(html, { heading_style: :atx })
|
|
171
|
+
|
|
172
|
+
expect(result).to be_a(Hash)
|
|
173
|
+
expect(result[:tables].length).to eq(1)
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
it 'accepts nil options' do
|
|
177
|
+
html = '<table><tr><td>Data</td></tr></table>'
|
|
178
|
+
result = described_class.convert_with_tables(html, nil, nil)
|
|
179
|
+
|
|
180
|
+
expect(result).to be_a(Hash)
|
|
181
|
+
expect(result[:tables].length).to eq(1)
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
context 'with metadata config' do
|
|
186
|
+
it 'includes metadata when configured' do
|
|
187
|
+
html = '<html><head><title>Test</title></head><body><table><tr><td>Data</td></tr></table></body></html>'
|
|
188
|
+
result = described_class.convert_with_tables(html, nil, { extract_headers: true })
|
|
189
|
+
|
|
190
|
+
expect(result[:metadata]).to be_a(Hash).or(be_nil)
|
|
191
|
+
end
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.27.3
|
|
4
|
+
version: 2.28.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
@@ -61,6 +61,7 @@ files:
|
|
|
61
61
|
- ext/html-to-markdown-rb/native/src/conversion/inline_images.rs
|
|
62
62
|
- ext/html-to-markdown-rb/native/src/conversion/metadata.rs
|
|
63
63
|
- ext/html-to-markdown-rb/native/src/conversion/mod.rs
|
|
64
|
+
- ext/html-to-markdown-rb/native/src/conversion/tables.rs
|
|
64
65
|
- ext/html-to-markdown-rb/native/src/lib.rs
|
|
65
66
|
- ext/html-to-markdown-rb/native/src/options.rs
|
|
66
67
|
- ext/html-to-markdown-rb/native/src/profiling.rs
|
|
@@ -9737,6 +9738,7 @@ files:
|
|
|
9737
9738
|
- sig/open3.rbs
|
|
9738
9739
|
- spec/cli_proxy_spec.rb
|
|
9739
9740
|
- spec/convert_spec.rb
|
|
9741
|
+
- spec/convert_with_tables_spec.rb
|
|
9740
9742
|
- spec/metadata_extraction_spec.rb
|
|
9741
9743
|
- spec/spec_helper.rb
|
|
9742
9744
|
- spec/visitor_issue_187_spec.rb
|