html-to-markdown 2.27.3 → 2.28.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +6 -0
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +71 -0
- data/ext/html-to-markdown-rb/native/src/lib.rs +27 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +29 -0
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/rust-vendor/html-to-markdown-rs/README.md +29 -0
- data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +368 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/main.rs +10 -5
- data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/utility/content.rs +17 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +8 -5
- data/rust-vendor/html-to-markdown-rs/src/lib.rs +3 -0
- data/rust-vendor/html-to-markdown-rs/src/prelude.rs +3 -0
- data/rust-vendor/html-to-markdown-rs/tests/test_issue_218.rs +56 -0
- data/rust-vendor/uuid/.cargo-checksum.json +1 -1
- data/rust-vendor/uuid/.cargo_vcs_info.json +1 -1
- data/rust-vendor/uuid/Cargo.lock +71 -137
- data/rust-vendor/uuid/Cargo.toml +4 -4
- data/rust-vendor/uuid/Cargo.toml.orig +4 -4
- data/rust-vendor/uuid/README.md +3 -3
- data/rust-vendor/uuid/src/lib.rs +4 -4
- data/sig/html_to_markdown.rbs +46 -0
- data/spec/convert_with_tables_spec.rb +194 -0
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f36fc4bbd42216b6e57e8ab45f3efa66e841eac072ede248c51541b4c4b9c7c1
|
|
4
|
+
data.tar.gz: b7ce1c2842173054e6d3eea461fabbc9d5063ae51a5d2c37f6d303ac47c91de6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b82552c3f5b55afc82c226710c39955c8f4a6c79bf7a730d88fe1c65e60db64c8cfc7d93044666fd324c3d13ea8c3f57709d3d2c6014034075691c166a646984
|
|
7
|
+
data.tar.gz: ce1b059c010f246b10cb93cfba87d0bf635422f6cb75cf6168f55eb134fe0603a6cb74434e87736ec4d245bd2ba59b74eb8bcba1b9c7d0e70e91d3a25124ef09
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.27.3)
|
|
4
|
+
html-to-markdown (2.28.1)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -172,7 +172,7 @@ CHECKSUMS
|
|
|
172
172
|
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
|
|
173
173
|
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
|
|
174
174
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
175
|
-
html-to-markdown (2.27.3)
|
|
175
|
+
html-to-markdown (2.28.1)
|
|
176
176
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
177
177
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
178
178
|
json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
|
data/ext/html-to-markdown-rb/native/src/conversion/mod.rs
CHANGED
|
@@ -5,7 +5,13 @@ pub mod inline_images;
|
|
|
5
5
|
#[cfg(feature = "metadata")]
|
|
6
6
|
pub mod metadata;
|
|
7
7
|
|
|
8
|
+
#[cfg(feature = "visitor")]
|
|
9
|
+
pub mod tables;
|
|
10
|
+
|
|
8
11
|
pub use inline_images::*;
|
|
9
12
|
|
|
10
13
|
#[cfg(feature = "metadata")]
|
|
11
14
|
pub use metadata::*;
|
|
15
|
+
|
|
16
|
+
#[cfg(feature = "visitor")]
|
|
17
|
+
pub use tables::*;
|
data/ext/html-to-markdown-rb/native/src/conversion/tables.rs
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
//! Table extraction conversion functions for Ruby bindings.
|
|
2
|
+
|
|
3
|
+
use html_to_markdown_rs::{ConversionWithTables, TableData};
|
|
4
|
+
use magnus::prelude::*;
|
|
5
|
+
use magnus::{Error, Ruby, Value};
|
|
6
|
+
|
|
7
|
+
#[cfg(feature = "metadata")]
|
|
8
|
+
use super::metadata::extended_metadata_to_ruby;
|
|
9
|
+
|
|
10
|
+
fn table_data_to_ruby(ruby: &Ruby, table: TableData) -> Result<Value, Error> {
|
|
11
|
+
let hash = ruby.hash_new();
|
|
12
|
+
|
|
13
|
+
// cells: Array[Array[String]]
|
|
14
|
+
let cells_array = ruby.ary_new();
|
|
15
|
+
for row in table.cells {
|
|
16
|
+
let row_array = ruby.ary_new();
|
|
17
|
+
for cell in row {
|
|
18
|
+
row_array.push(cell)?;
|
|
19
|
+
}
|
|
20
|
+
cells_array.push(row_array)?;
|
|
21
|
+
}
|
|
22
|
+
hash.aset(ruby.intern("cells"), cells_array)?;
|
|
23
|
+
|
|
24
|
+
// markdown: String
|
|
25
|
+
hash.aset(ruby.intern("markdown"), table.markdown)?;
|
|
26
|
+
|
|
27
|
+
// is_header_row: Array[bool]
|
|
28
|
+
let header_array = ruby.ary_new();
|
|
29
|
+
for is_header in table.is_header_row {
|
|
30
|
+
header_array.push(is_header)?;
|
|
31
|
+
}
|
|
32
|
+
hash.aset(ruby.intern("is_header_row"), header_array)?;
|
|
33
|
+
|
|
34
|
+
Ok(hash.as_value())
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/// Convert a `ConversionWithTables` result to a Ruby Hash.
|
|
38
|
+
///
|
|
39
|
+
/// Returns a Hash with keys `:content`, `:metadata`, `:tables`.
|
|
40
|
+
pub fn tables_result_to_ruby(ruby: &Ruby, result: ConversionWithTables) -> Result<Value, Error> {
|
|
41
|
+
let hash = ruby.hash_new();
|
|
42
|
+
|
|
43
|
+
// content: String
|
|
44
|
+
hash.aset(ruby.intern("content"), result.content)?;
|
|
45
|
+
|
|
46
|
+
// metadata: Hash or nil
|
|
47
|
+
#[cfg(feature = "metadata")]
|
|
48
|
+
{
|
|
49
|
+
match result.metadata {
|
|
50
|
+
Some(metadata) => {
|
|
51
|
+
hash.aset(ruby.intern("metadata"), extended_metadata_to_ruby(ruby, metadata)?)?;
|
|
52
|
+
}
|
|
53
|
+
None => {
|
|
54
|
+
hash.aset(ruby.intern("metadata"), ruby.qnil())?;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
#[cfg(not(feature = "metadata"))]
|
|
59
|
+
{
|
|
60
|
+
hash.aset(ruby.intern("metadata"), ruby.qnil())?;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// tables: Array[Hash]
|
|
64
|
+
let tables_array = ruby.ary_new();
|
|
65
|
+
for table in result.tables {
|
|
66
|
+
tables_array.push(table_data_to_ruby(ruby, table)?)?;
|
|
67
|
+
}
|
|
68
|
+
hash.aset(ruby.intern("tables"), tables_array)?;
|
|
69
|
+
|
|
70
|
+
Ok(hash.as_value())
|
|
71
|
+
}
|
data/ext/html-to-markdown-rb/native/src/lib.rs
CHANGED
|
@@ -8,6 +8,9 @@ use html_to_markdown_rs::{
|
|
|
8
8
|
#[cfg(feature = "visitor")]
|
|
9
9
|
use html_to_markdown_rs::convert_with_visitor as convert_with_visitor_inner;
|
|
10
10
|
|
|
11
|
+
#[cfg(feature = "visitor")]
|
|
12
|
+
use html_to_markdown_rs::convert_with_tables as convert_with_tables_inner;
|
|
13
|
+
|
|
11
14
|
#[cfg(feature = "metadata")]
|
|
12
15
|
use html_to_markdown_rs::convert_with_metadata as convert_with_metadata_inner;
|
|
13
16
|
|
|
@@ -26,6 +29,9 @@ use types::{arg_error, runtime_error};
|
|
|
26
29
|
#[cfg(feature = "metadata")]
|
|
27
30
|
use conversion::{build_metadata_config, extended_metadata_to_ruby};
|
|
28
31
|
|
|
32
|
+
#[cfg(feature = "visitor")]
|
|
33
|
+
use conversion::tables_result_to_ruby;
|
|
34
|
+
|
|
29
35
|
#[cfg(feature = "visitor")]
|
|
30
36
|
use visitor::RubyVisitorWrapper;
|
|
31
37
|
|
|
@@ -138,6 +144,23 @@ fn convert_with_metadata_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<Value,
|
|
|
138
144
|
Ok(array.as_value())
|
|
139
145
|
}
|
|
140
146
|
|
|
147
|
+
#[cfg(feature = "visitor")]
|
|
148
|
+
fn convert_with_tables_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
|
|
149
|
+
let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
|
|
150
|
+
let html = parsed.required.0;
|
|
151
|
+
let options = build_conversion_options(ruby, parsed.optional.0)?;
|
|
152
|
+
|
|
153
|
+
#[cfg(feature = "metadata")]
|
|
154
|
+
let metadata_config = Some(build_metadata_config(ruby, parsed.optional.1)?);
|
|
155
|
+
#[cfg(not(feature = "metadata"))]
|
|
156
|
+
let metadata_config: Option<()> = None;
|
|
157
|
+
|
|
158
|
+
let result =
|
|
159
|
+
guard_panic(|| convert_with_tables_inner(&html, Some(options), metadata_config)).map_err(conversion_error)?;
|
|
160
|
+
|
|
161
|
+
tables_result_to_ruby(ruby, result)
|
|
162
|
+
}
|
|
163
|
+
|
|
141
164
|
#[cfg(feature = "visitor")]
|
|
142
165
|
fn convert_with_visitor_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
|
|
143
166
|
let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
|
|
@@ -227,7 +250,10 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
|
227
250
|
}
|
|
228
251
|
|
|
229
252
|
#[cfg(feature = "visitor")]
|
|
230
|
-
|
|
253
|
+
{
|
|
254
|
+
module.define_singleton_method("convert_with_visitor", function!(convert_with_visitor_fn, -1))?;
|
|
255
|
+
module.define_singleton_method("convert_with_tables", function!(convert_with_tables_fn, -1))?;
|
|
256
|
+
}
|
|
231
257
|
|
|
232
258
|
#[cfg(feature = "profiling")]
|
|
233
259
|
{
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -18,6 +18,7 @@ module HtmlToMarkdown
|
|
|
18
18
|
alias native_convert_with_metadata convert_with_metadata
|
|
19
19
|
alias native_convert_with_metadata_handle convert_with_metadata_handle
|
|
20
20
|
alias native_convert_with_visitor convert_with_visitor
|
|
21
|
+
alias native_convert_with_tables convert_with_tables
|
|
21
22
|
end
|
|
22
23
|
|
|
23
24
|
module_function
|
|
@@ -179,4 +180,32 @@ module HtmlToMarkdown
|
|
|
179
180
|
def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
|
|
180
181
|
native_convert_with_metadata_handle(html.to_s, options_handle, metadata_config)
|
|
181
182
|
end
|
|
183
|
+
|
|
184
|
+
# Convert HTML to Markdown with table extraction.
|
|
185
|
+
#
|
|
186
|
+
# Performs HTML-to-Markdown conversion while extracting structured table data
|
|
187
|
+
# (cells, markdown representation, header row flags) in a single pass.
|
|
188
|
+
#
|
|
189
|
+
# @param html [String] HTML string to convert.
|
|
190
|
+
# @param options [Hash, nil] Optional conversion configuration.
|
|
191
|
+
# @param metadata_config [Hash, nil] Optional metadata extraction configuration.
|
|
192
|
+
#
|
|
193
|
+
# @return [Hash] A hash with keys:
|
|
194
|
+
# - :content [String] The converted Markdown output
|
|
195
|
+
# - :metadata [Hash, nil] Extended metadata (if metadata extraction was configured)
|
|
196
|
+
# - :tables [Array<Hash>] Extracted tables, each with:
|
|
197
|
+
# - :cells [Array<Array<String>>] Table cells organized as rows x columns
|
|
198
|
+
# - :markdown [String] Complete rendered table in Markdown format
|
|
199
|
+
# - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
|
|
200
|
+
#
|
|
201
|
+
# @raise [StandardError] If conversion fails or invalid configuration
|
|
202
|
+
#
|
|
203
|
+
# @example Basic usage
|
|
204
|
+
# html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
|
|
205
|
+
# result = HtmlToMarkdown.convert_with_tables(html)
|
|
206
|
+
# puts result[:tables].length # => 1
|
|
207
|
+
# puts result[:tables][0][:cells] # => [["Name"], ["Alice"]]
|
|
208
|
+
def convert_with_tables(html, options = nil, metadata_config = nil)
|
|
209
|
+
native_convert_with_tables(html.to_s, options, metadata_config)
|
|
210
|
+
end
|
|
182
211
|
end
|
data/rust-vendor/html-to-markdown-rs/README.md
CHANGED
|
@@ -148,6 +148,35 @@ for (i, img) in extraction.inline_images.iter().enumerate() {
|
|
|
148
148
|
}
|
|
149
149
|
```
|
|
150
150
|
|
|
151
|
+
## Table Extraction
|
|
152
|
+
|
|
153
|
+
Extract structured table data alongside the Markdown conversion. Each table found in the HTML is returned with its cell contents, header row flags, and rendered Markdown output.
|
|
154
|
+
|
|
155
|
+
Requires the `visitor` feature.
|
|
156
|
+
|
|
157
|
+
```rust
|
|
158
|
+
use html_to_markdown_rs::convert_with_tables;
|
|
159
|
+
|
|
160
|
+
let html = r#"
|
|
161
|
+
<table>
|
|
162
|
+
<tr><th>Name</th><th>Age</th></tr>
|
|
163
|
+
<tr><td>Alice</td><td>30</td></tr>
|
|
164
|
+
<tr><td>Bob</td><td>25</td></tr>
|
|
165
|
+
</table>
|
|
166
|
+
"#;
|
|
167
|
+
|
|
168
|
+
let result = convert_with_tables(html, None, None)?;
|
|
169
|
+
|
|
170
|
+
println!("{}", result.content);
|
|
171
|
+
for table in &result.tables {
|
|
172
|
+
println!("Table with {} rows:", table.cells.len());
|
|
173
|
+
for (i, row) in table.cells.iter().enumerate() {
|
|
174
|
+
let prefix = if table.is_header_row[i] { "Header" } else { "Row" };
|
|
175
|
+
println!(" {}: {:?}", prefix, row);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
```
|
|
179
|
+
|
|
151
180
|
## Other Language Bindings
|
|
152
181
|
|
|
153
182
|
This is the core Rust library. For other languages:
|
data/rust-vendor/html-to-markdown-rs/src/convert_api.rs
CHANGED
|
@@ -681,3 +681,371 @@ pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
|
|
|
681
681
|
let update: crate::MetadataConfigUpdate = parse_json(json)?;
|
|
682
682
|
Ok(MetadataConfig::from(update))
|
|
683
683
|
}
|
|
684
|
+
|
|
685
|
+
// ============================================================================
|
|
686
|
+
// Table Extraction API (requires visitor feature)
|
|
687
|
+
// ============================================================================
|
|
688
|
+
|
|
689
|
+
/// Extracted table data from HTML conversion.
|
|
690
|
+
///
|
|
691
|
+
/// Each instance represents a single `<table>` element found during conversion.
|
|
692
|
+
/// Tables are collected in document order.
|
|
693
|
+
#[cfg(feature = "visitor")]
|
|
694
|
+
#[derive(Debug, Clone)]
|
|
695
|
+
#[cfg_attr(
|
|
696
|
+
any(feature = "serde", feature = "metadata"),
|
|
697
|
+
derive(serde::Serialize, serde::Deserialize)
|
|
698
|
+
)]
|
|
699
|
+
pub struct TableData {
|
|
700
|
+
/// Table cells organized as rows x columns. Cell contents are already
|
|
701
|
+
/// converted to the target output format (markdown/djot/plain).
|
|
702
|
+
pub cells: Vec<Vec<String>>,
|
|
703
|
+
/// Complete rendered table in the target output format.
|
|
704
|
+
pub markdown: String,
|
|
705
|
+
/// Per-row flag indicating whether the row was inside `<thead>`.
|
|
706
|
+
pub is_header_row: Vec<bool>,
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
/// Result of HTML-to-markdown conversion with extracted table data.
|
|
710
|
+
#[cfg(feature = "visitor")]
|
|
711
|
+
#[derive(Debug, Clone)]
|
|
712
|
+
#[cfg_attr(
|
|
713
|
+
any(feature = "serde", feature = "metadata"),
|
|
714
|
+
derive(serde::Serialize, serde::Deserialize)
|
|
715
|
+
)]
|
|
716
|
+
pub struct ConversionWithTables {
|
|
717
|
+
/// Converted markdown/djot/plain text content.
|
|
718
|
+
pub content: String,
|
|
719
|
+
/// Extended metadata (if metadata extraction was requested).
|
|
720
|
+
#[cfg(feature = "metadata")]
|
|
721
|
+
pub metadata: Option<ExtendedMetadata>,
|
|
722
|
+
/// All tables found in the HTML, in document order.
|
|
723
|
+
pub tables: Vec<TableData>,
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
#[cfg(feature = "visitor")]
|
|
727
|
+
#[derive(Debug)]
|
|
728
|
+
struct TableCollector {
|
|
729
|
+
tables: Vec<TableData>,
|
|
730
|
+
current_rows: Vec<Vec<String>>,
|
|
731
|
+
current_is_header: Vec<bool>,
|
|
732
|
+
}
|
|
733
|
+
|
|
734
|
+
#[cfg(feature = "visitor")]
|
|
735
|
+
impl TableCollector {
|
|
736
|
+
fn new() -> Self {
|
|
737
|
+
Self {
|
|
738
|
+
tables: Vec::new(),
|
|
739
|
+
current_rows: Vec::new(),
|
|
740
|
+
current_is_header: Vec::new(),
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
}
|
|
744
|
+
|
|
745
|
+
#[cfg(feature = "visitor")]
|
|
746
|
+
impl visitor::HtmlVisitor for TableCollector {
|
|
747
|
+
fn visit_table_start(&mut self, _ctx: &visitor::NodeContext) -> visitor::VisitResult {
|
|
748
|
+
self.current_rows.clear();
|
|
749
|
+
self.current_is_header.clear();
|
|
750
|
+
visitor::VisitResult::Continue
|
|
751
|
+
}
|
|
752
|
+
|
|
753
|
+
fn visit_table_row(
|
|
754
|
+
&mut self,
|
|
755
|
+
_ctx: &visitor::NodeContext,
|
|
756
|
+
cells: &[String],
|
|
757
|
+
is_header: bool,
|
|
758
|
+
) -> visitor::VisitResult {
|
|
759
|
+
self.current_rows.push(cells.to_vec());
|
|
760
|
+
self.current_is_header.push(is_header);
|
|
761
|
+
visitor::VisitResult::Continue
|
|
762
|
+
}
|
|
763
|
+
|
|
764
|
+
fn visit_table_end(&mut self, _ctx: &visitor::NodeContext, output: &str) -> visitor::VisitResult {
|
|
765
|
+
if !self.current_rows.is_empty() {
|
|
766
|
+
self.tables.push(TableData {
|
|
767
|
+
cells: std::mem::take(&mut self.current_rows),
|
|
768
|
+
markdown: output.to_string(),
|
|
769
|
+
is_header_row: std::mem::take(&mut self.current_is_header),
|
|
770
|
+
});
|
|
771
|
+
}
|
|
772
|
+
visitor::VisitResult::Continue
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
/// Convert HTML to markdown/djot/plain text with structured table extraction.
|
|
777
|
+
///
|
|
778
|
+
/// Combines conversion, optional metadata extraction, and table data collection
|
|
779
|
+
/// in a single DOM walk. Each table found in the HTML is returned with its
|
|
780
|
+
/// cell contents (already converted to the target format) and rendered output.
|
|
781
|
+
///
|
|
782
|
+
/// # Arguments
|
|
783
|
+
///
|
|
784
|
+
/// * `html` - The HTML string to convert
|
|
785
|
+
/// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
|
|
786
|
+
/// * `metadata_cfg` - Optional metadata extraction configuration (requires `metadata` feature)
|
|
787
|
+
///
|
|
788
|
+
/// # Example
|
|
789
|
+
///
|
|
790
|
+
/// ```ignore
|
|
791
|
+
/// use html_to_markdown_rs::convert_with_tables;
|
|
792
|
+
///
|
|
793
|
+
/// let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
|
|
794
|
+
/// let result = convert_with_tables(html, None, None).unwrap();
|
|
795
|
+
/// assert_eq!(result.tables.len(), 1);
|
|
796
|
+
/// assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
|
|
797
|
+
/// ```
|
|
798
|
+
///
|
|
799
|
+
/// # Errors
|
|
800
|
+
///
|
|
801
|
+
/// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
|
|
802
|
+
#[cfg(feature = "visitor")]
|
|
803
|
+
pub fn convert_with_tables(
|
|
804
|
+
html: &str,
|
|
805
|
+
options: Option<ConversionOptions>,
|
|
806
|
+
#[cfg(feature = "metadata")] metadata_cfg: Option<MetadataConfig>,
|
|
807
|
+
#[cfg(not(feature = "metadata"))] _metadata_cfg: Option<()>,
|
|
808
|
+
) -> Result<ConversionWithTables> {
|
|
809
|
+
use std::cell::RefCell;
|
|
810
|
+
use std::rc::Rc;
|
|
811
|
+
|
|
812
|
+
let collector = Rc::new(RefCell::new(TableCollector::new()));
|
|
813
|
+
let visitor_handle: visitor::VisitorHandle = Rc::clone(&collector) as visitor::VisitorHandle;
|
|
814
|
+
|
|
815
|
+
#[cfg(feature = "metadata")]
|
|
816
|
+
let result = {
|
|
817
|
+
let metadata_config = metadata_cfg.unwrap_or_default();
|
|
818
|
+
let (content, metadata) = convert_with_metadata(html, options, metadata_config, Some(visitor_handle))?;
|
|
819
|
+
let tables = Rc::try_unwrap(collector)
|
|
820
|
+
.map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
|
|
821
|
+
.into_inner()
|
|
822
|
+
.tables;
|
|
823
|
+
ConversionWithTables {
|
|
824
|
+
content,
|
|
825
|
+
metadata: Some(metadata),
|
|
826
|
+
tables,
|
|
827
|
+
}
|
|
828
|
+
};
|
|
829
|
+
|
|
830
|
+
#[cfg(not(feature = "metadata"))]
|
|
831
|
+
let result = {
|
|
832
|
+
let content = convert_with_visitor(html, options, Some(visitor_handle))?;
|
|
833
|
+
let tables = Rc::try_unwrap(collector)
|
|
834
|
+
.map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?
|
|
835
|
+
.into_inner()
|
|
836
|
+
.tables;
|
|
837
|
+
ConversionWithTables { content, tables }
|
|
838
|
+
};
|
|
839
|
+
|
|
840
|
+
Ok(result)
|
|
841
|
+
}
|
|
842
|
+
|
|
843
|
+
#[cfg(test)]
|
|
844
|
+
#[cfg(feature = "visitor")]
|
|
845
|
+
mod table_extraction_tests {
|
|
846
|
+
use super::*;
|
|
847
|
+
|
|
848
|
+
fn tables_from_html(html: &str) -> ConversionWithTables {
|
|
849
|
+
convert_with_tables(
|
|
850
|
+
html,
|
|
851
|
+
None,
|
|
852
|
+
#[cfg(feature = "metadata")]
|
|
853
|
+
None,
|
|
854
|
+
#[cfg(not(feature = "metadata"))]
|
|
855
|
+
None,
|
|
856
|
+
)
|
|
857
|
+
.unwrap()
|
|
858
|
+
}
|
|
859
|
+
|
|
860
|
+
#[test]
|
|
861
|
+
fn test_convert_with_tables_basic() {
|
|
862
|
+
let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
|
|
863
|
+
let result = tables_from_html(html);
|
|
864
|
+
assert_eq!(result.tables.len(), 1);
|
|
865
|
+
assert_eq!(result.tables[0].cells.len(), 2);
|
|
866
|
+
assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
|
|
867
|
+
assert_eq!(result.tables[0].cells[1], vec!["Alice", "30"]);
|
|
868
|
+
assert!(result.tables[0].is_header_row[0]);
|
|
869
|
+
assert!(!result.tables[0].is_header_row[1]);
|
|
870
|
+
assert!(result.tables[0].markdown.contains('|'));
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
#[test]
|
|
874
|
+
fn test_convert_with_tables_nested() {
|
|
875
|
+
let html = r#"
|
|
876
|
+
<table>
|
|
877
|
+
<tr><th>Category</th><th>Details</th></tr>
|
|
878
|
+
<tr>
|
|
879
|
+
<td>Project Alpha</td>
|
|
880
|
+
<td>
|
|
881
|
+
<table>
|
|
882
|
+
<tr><th>Task</th><th>Status</th></tr>
|
|
883
|
+
<tr><td>001</td><td>Done</td></tr>
|
|
884
|
+
</table>
|
|
885
|
+
</td>
|
|
886
|
+
</tr>
|
|
887
|
+
</table>"#;
|
|
888
|
+
let result = tables_from_html(html);
|
|
889
|
+
assert!(
|
|
890
|
+
result.tables.len() >= 2,
|
|
891
|
+
"Expected at least 2 tables (outer + nested), got {}",
|
|
892
|
+
result.tables.len()
|
|
893
|
+
);
|
|
894
|
+
}
|
|
895
|
+
|
|
896
|
+
#[test]
|
|
897
|
+
fn test_convert_with_tables_no_tables() {
|
|
898
|
+
let html = "<p>No tables here</p>";
|
|
899
|
+
let result = tables_from_html(html);
|
|
900
|
+
assert!(result.tables.is_empty());
|
|
901
|
+
assert!(result.content.contains("No tables here"));
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
#[test]
|
|
905
|
+
fn test_convert_with_tables_empty_table() {
|
|
906
|
+
let result = tables_from_html("<table></table>");
|
|
907
|
+
assert!(result.tables.is_empty(), "Empty table should not produce TableData");
|
|
908
|
+
}
|
|
909
|
+
|
|
910
|
+
#[test]
|
|
911
|
+
fn test_convert_with_tables_headers_only() {
|
|
912
|
+
let html = r#"<table><thead><tr><th>A</th><th>B</th></tr></thead></table>"#;
|
|
913
|
+
let result = tables_from_html(html);
|
|
914
|
+
assert_eq!(result.tables.len(), 1);
|
|
915
|
+
assert!(result.tables[0].is_header_row[0]);
|
|
916
|
+
assert_eq!(result.tables[0].cells[0], vec!["A", "B"]);
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
#[test]
|
|
920
|
+
fn test_convert_with_tables_thead_tbody_tfoot() {
|
|
921
|
+
let html = r#"
|
|
922
|
+
<table>
|
|
923
|
+
<thead><tr><th>H1</th></tr></thead>
|
|
924
|
+
<tbody><tr><td>B1</td></tr></tbody>
|
|
925
|
+
<tfoot><tr><td>F1</td></tr></tfoot>
|
|
926
|
+
</table>"#;
|
|
927
|
+
let result = tables_from_html(html);
|
|
928
|
+
assert_eq!(result.tables.len(), 1);
|
|
929
|
+
let t = &result.tables[0];
|
|
930
|
+
assert!(t.is_header_row[0], "thead row should be header");
|
|
931
|
+
assert!(!t.is_header_row[1], "tbody row should not be header");
|
|
932
|
+
assert_eq!(t.cells[0], vec!["H1"]);
|
|
933
|
+
assert_eq!(t.cells[1], vec!["B1"]);
|
|
934
|
+
}
|
|
935
|
+
|
|
936
|
+
#[test]
|
|
937
|
+
fn test_convert_with_tables_multiple_separate() {
|
|
938
|
+
let html = r#"
|
|
939
|
+
<table><tr><td>T1</td></tr></table>
|
|
940
|
+
<p>Between tables</p>
|
|
941
|
+
<table><tr><td>T2</td></tr></table>"#;
|
|
942
|
+
let result = tables_from_html(html);
|
|
943
|
+
assert_eq!(result.tables.len(), 2, "Should find 2 separate tables");
|
|
944
|
+
}
|
|
945
|
+
|
|
946
|
+
#[test]
|
|
947
|
+
fn test_convert_with_tables_special_chars() {
|
|
948
|
+
let html = r#"<table><tr><td>a | b</td><td>c*d</td></tr></table>"#;
|
|
949
|
+
let result = tables_from_html(html);
|
|
950
|
+
assert_eq!(result.tables.len(), 1);
|
|
951
|
+
assert!(!result.tables[0].cells[0].is_empty());
|
|
952
|
+
}
|
|
953
|
+
|
|
954
|
+
#[test]
|
|
955
|
+
fn test_convert_with_tables_single_cell() {
|
|
956
|
+
let html = r#"<table><tr><td>Only cell</td></tr></table>"#;
|
|
957
|
+
let result = tables_from_html(html);
|
|
958
|
+
assert_eq!(result.tables.len(), 1);
|
|
959
|
+
assert_eq!(result.tables[0].cells.len(), 1);
|
|
960
|
+
assert_eq!(result.tables[0].cells[0], vec!["Only cell"]);
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
#[test]
|
|
964
|
+
fn test_convert_with_tables_content_preserved() {
|
|
965
|
+
let html = r#"<p>Before</p><table><tr><td>Cell</td></tr></table><p>After</p>"#;
|
|
966
|
+
let result = tables_from_html(html);
|
|
967
|
+
assert!(result.content.contains("Before"));
|
|
968
|
+
assert!(result.content.contains("After"));
|
|
969
|
+
assert!(result.content.contains('|'), "Markdown table should appear in content");
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
#[test]
|
|
973
|
+
fn test_convert_with_tables_with_options() {
|
|
974
|
+
let options = ConversionOptions {
|
|
975
|
+
heading_style: crate::options::HeadingStyle::Underlined,
|
|
976
|
+
..ConversionOptions::default()
|
|
977
|
+
};
|
|
978
|
+
let html = r#"<h1>Title</h1><table><tr><td>Cell</td></tr></table>"#;
|
|
979
|
+
let result = convert_with_tables(
|
|
980
|
+
html,
|
|
981
|
+
Some(options),
|
|
982
|
+
#[cfg(feature = "metadata")]
|
|
983
|
+
None,
|
|
984
|
+
#[cfg(not(feature = "metadata"))]
|
|
985
|
+
None,
|
|
986
|
+
)
|
|
987
|
+
.unwrap();
|
|
988
|
+
assert_eq!(result.tables.len(), 1);
|
|
989
|
+
assert!(result.content.contains("Title"));
|
|
990
|
+
}
|
|
991
|
+
|
|
992
|
+
#[test]
|
|
993
|
+
fn test_convert_with_tables_plain_text_format() {
|
|
994
|
+
let options = ConversionOptions {
|
|
995
|
+
output_format: crate::options::OutputFormat::Plain,
|
|
996
|
+
..ConversionOptions::default()
|
|
997
|
+
};
|
|
998
|
+
let html = r#"<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>"#;
|
|
999
|
+
let result = convert_with_tables(
|
|
1000
|
+
html,
|
|
1001
|
+
Some(options),
|
|
1002
|
+
#[cfg(feature = "metadata")]
|
|
1003
|
+
None,
|
|
1004
|
+
#[cfg(not(feature = "metadata"))]
|
|
1005
|
+
None,
|
|
1006
|
+
)
|
|
1007
|
+
.unwrap();
|
|
1008
|
+
assert!(
|
|
1009
|
+
!result.tables.is_empty(),
|
|
1010
|
+
"Tables should be populated even with plain text output format"
|
|
1011
|
+
);
|
|
1012
|
+
assert_eq!(result.tables[0].cells[0], vec!["Name"]);
|
|
1013
|
+
}
|
|
1014
|
+
|
|
1015
|
+
#[cfg(feature = "metadata")]
|
|
1016
|
+
#[test]
|
|
1017
|
+
fn test_convert_with_tables_metadata_integration() {
|
|
1018
|
+
let html = r#"<html lang="en"><head><title>Test</title></head><body>
|
|
1019
|
+
<table><tr><th>Col</th></tr><tr><td>Val</td></tr></table>
|
|
1020
|
+
</body></html>"#;
|
|
1021
|
+
let config = MetadataConfig::default();
|
|
1022
|
+
let result = convert_with_tables(html, None, Some(config)).unwrap();
|
|
1023
|
+
assert_eq!(result.tables.len(), 1);
|
|
1024
|
+
let meta = result.metadata.as_ref().expect("metadata should be present");
|
|
1025
|
+
assert_eq!(meta.document.language, Some("en".to_string()));
|
|
1026
|
+
}
|
|
1027
|
+
|
|
1028
|
+
#[cfg(feature = "metadata")]
|
|
1029
|
+
#[test]
|
|
1030
|
+
fn test_convert_with_tables_plain_text_metadata() {
|
|
1031
|
+
let options = ConversionOptions {
|
|
1032
|
+
output_format: crate::options::OutputFormat::Plain,
|
|
1033
|
+
..ConversionOptions::default()
|
|
1034
|
+
};
|
|
1035
|
+
let html = r#"<html lang="fr"><body>
|
|
1036
|
+
<table><tr><td>Cell</td></tr></table>
|
|
1037
|
+
</body></html>"#;
|
|
1038
|
+
let config = MetadataConfig::default();
|
|
1039
|
+
let result = convert_with_tables(html, Some(options), Some(config)).unwrap();
|
|
1040
|
+
assert!(
|
|
1041
|
+
!result.tables.is_empty(),
|
|
1042
|
+
"Tables should be populated in plain text mode"
|
|
1043
|
+
);
|
|
1044
|
+
let meta = result.metadata.as_ref().expect("metadata should be present");
|
|
1045
|
+
assert_eq!(
|
|
1046
|
+
meta.document.language,
|
|
1047
|
+
Some("fr".to_string()),
|
|
1048
|
+
"Metadata should be populated in plain text mode"
|
|
1049
|
+
);
|
|
1050
|
+
}
|
|
1051
|
+
}
|
data/rust-vendor/html-to-markdown-rs/src/converter/main.rs
CHANGED
|
@@ -136,11 +136,9 @@ pub(crate) fn convert_html_impl(
|
|
|
136
136
|
}
|
|
137
137
|
}
|
|
138
138
|
|
|
139
|
-
//
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
return Ok(plain);
|
|
143
|
-
}
|
|
139
|
+
// Plain text output: run the full pipeline (for metadata + visitor callbacks),
|
|
140
|
+
// then return plain text instead of markdown.
|
|
141
|
+
let is_plain_text = options.output_format == OutputFormat::Plain;
|
|
144
142
|
|
|
145
143
|
let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
|
|
146
144
|
#[cfg(feature = "metadata")]
|
|
@@ -230,6 +228,13 @@ pub(crate) fn convert_html_impl(
|
|
|
230
228
|
return Err(crate::error::ConversionError::Visitor(err.clone()));
|
|
231
229
|
}
|
|
232
230
|
|
|
231
|
+
// If plain text was requested, discard the markdown output and return plain text.
|
|
232
|
+
// The full pipeline was still run above so that metadata + visitor callbacks fire.
|
|
233
|
+
if is_plain_text {
|
|
234
|
+
let plain = extract_plain_text(&dom, parser, options);
|
|
235
|
+
return Ok(plain);
|
|
236
|
+
}
|
|
237
|
+
|
|
233
238
|
trim_line_end_whitespace(&mut output);
|
|
234
239
|
let trimmed = output.trim_end_matches('\n');
|
|
235
240
|
if trimmed.is_empty() {
|