html-to-markdown 2.27.3 → 2.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2c1f001483774ae8b601ba6953cf5e42a798840cc2d99f1048f54977cde509cf
4
- data.tar.gz: 1d3953f85ef0cd47ba65ee7f83f15f7233a31b8e8dd1c370024d85f893539eda
3
+ metadata.gz: ee40f54b9f0a1b031a8a4c6a75caac7797e452dfb2fa076d79a6b69fdcd6093d
4
+ data.tar.gz: dd1fbfbcc08e4a562a4c096c7530104104054eb3546acbb8fbf265941ab381a7
5
5
  SHA512:
6
- metadata.gz: 95258f98b61c5029f06e8c91f7f78e37fa74275038b6846ade5cbf7a5d16bced41336ec4f5eb7ddf6c110c41a30a2461eb6f293c19adde5c0c038f0bf67a272f
7
- data.tar.gz: 6dae1754667432115bbc7a18b3eb92f95a20bc1e8fc1de24672d23a63fb7f59d80beef17204a60eb2ab2a1d886232b031a141ba827f925bbbbbac77d6cc97e3c
6
+ metadata.gz: 7bef90afd805e6d0333d76fbec1c2698b62ca2f3260c4d5566d638d3950d606e2fd6b1f85c02f2c40fa39279929951ba8e8e666fde9e09ad6d0e2753c2d730e7
7
+ data.tar.gz: adf885b78edf97063b30e9285f2f33d3e1cee2a657e282e50d6f548fe13b686b0e3c4f4008248482d78f686ddf0041d4e1581ddfcd02bdb37a3bc66fdb88310a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.27.3)
4
+ html-to-markdown (2.28.0)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -172,7 +172,7 @@ CHECKSUMS
172
172
  ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
173
173
  ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
174
174
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
175
- html-to-markdown (2.27.3)
175
+ html-to-markdown (2.28.0)
176
176
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
177
177
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
178
178
  json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version ="2.27.3"
3
+ version ="2.28.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -5,7 +5,13 @@ pub mod inline_images;
5
5
  #[cfg(feature = "metadata")]
6
6
  pub mod metadata;
7
7
 
8
+ #[cfg(feature = "visitor")]
9
+ pub mod tables;
10
+
8
11
  pub use inline_images::*;
9
12
 
10
13
  #[cfg(feature = "metadata")]
11
14
  pub use metadata::*;
15
+
16
+ #[cfg(feature = "visitor")]
17
+ pub use tables::*;
@@ -0,0 +1,71 @@
1
+ //! Table extraction conversion functions for Ruby bindings.
2
+
3
+ use html_to_markdown_rs::{ConversionWithTables, TableData};
4
+ use magnus::prelude::*;
5
+ use magnus::{Error, Ruby, Value};
6
+
7
+ #[cfg(feature = "metadata")]
8
+ use super::metadata::extended_metadata_to_ruby;
9
+
10
+ fn table_data_to_ruby(ruby: &Ruby, table: TableData) -> Result<Value, Error> {
11
+ let hash = ruby.hash_new();
12
+
13
+ // cells: Array[Array[String]]
14
+ let cells_array = ruby.ary_new();
15
+ for row in table.cells {
16
+ let row_array = ruby.ary_new();
17
+ for cell in row {
18
+ row_array.push(cell)?;
19
+ }
20
+ cells_array.push(row_array)?;
21
+ }
22
+ hash.aset(ruby.intern("cells"), cells_array)?;
23
+
24
+ // markdown: String
25
+ hash.aset(ruby.intern("markdown"), table.markdown)?;
26
+
27
+ // is_header_row: Array[bool]
28
+ let header_array = ruby.ary_new();
29
+ for is_header in table.is_header_row {
30
+ header_array.push(is_header)?;
31
+ }
32
+ hash.aset(ruby.intern("is_header_row"), header_array)?;
33
+
34
+ Ok(hash.as_value())
35
+ }
36
+
37
+ /// Convert a `ConversionWithTables` result to a Ruby Hash.
38
+ ///
39
+ /// Returns a Hash with keys `:content`, `:metadata`, `:tables`.
40
+ pub fn tables_result_to_ruby(ruby: &Ruby, result: ConversionWithTables) -> Result<Value, Error> {
41
+ let hash = ruby.hash_new();
42
+
43
+ // content: String
44
+ hash.aset(ruby.intern("content"), result.content)?;
45
+
46
+ // metadata: Hash or nil
47
+ #[cfg(feature = "metadata")]
48
+ {
49
+ match result.metadata {
50
+ Some(metadata) => {
51
+ hash.aset(ruby.intern("metadata"), extended_metadata_to_ruby(ruby, metadata)?)?;
52
+ }
53
+ None => {
54
+ hash.aset(ruby.intern("metadata"), ruby.qnil())?;
55
+ }
56
+ }
57
+ }
58
+ #[cfg(not(feature = "metadata"))]
59
+ {
60
+ hash.aset(ruby.intern("metadata"), ruby.qnil())?;
61
+ }
62
+
63
+ // tables: Array[Hash]
64
+ let tables_array = ruby.ary_new();
65
+ for table in result.tables {
66
+ tables_array.push(table_data_to_ruby(ruby, table)?)?;
67
+ }
68
+ hash.aset(ruby.intern("tables"), tables_array)?;
69
+
70
+ Ok(hash.as_value())
71
+ }
@@ -8,6 +8,9 @@ use html_to_markdown_rs::{
8
8
  #[cfg(feature = "visitor")]
9
9
  use html_to_markdown_rs::convert_with_visitor as convert_with_visitor_inner;
10
10
 
11
+ #[cfg(feature = "visitor")]
12
+ use html_to_markdown_rs::convert_with_tables as convert_with_tables_inner;
13
+
11
14
  #[cfg(feature = "metadata")]
12
15
  use html_to_markdown_rs::convert_with_metadata as convert_with_metadata_inner;
13
16
 
@@ -26,6 +29,9 @@ use types::{arg_error, runtime_error};
26
29
  #[cfg(feature = "metadata")]
27
30
  use conversion::{build_metadata_config, extended_metadata_to_ruby};
28
31
 
32
+ #[cfg(feature = "visitor")]
33
+ use conversion::tables_result_to_ruby;
34
+
29
35
  #[cfg(feature = "visitor")]
30
36
  use visitor::RubyVisitorWrapper;
31
37
 
@@ -138,6 +144,23 @@ fn convert_with_metadata_handle_fn(ruby: &Ruby, args: &[Value]) -> Result<Value,
138
144
  Ok(array.as_value())
139
145
  }
140
146
 
147
/// Ruby entry point for `convert_with_tables`.
///
/// Accepts one required argument (the HTML String) and up to two optional
/// arguments (conversion options, metadata configuration). The conversion
/// runs inside `guard_panic`; a failure is mapped through `conversion_error`
/// before being returned to Ruby. On success the result is converted to a
/// Ruby Hash by `tables_result_to_ruby`.
#[cfg(feature = "visitor")]
fn convert_with_tables_fn(ruby: &Ruby, args: &[Value]) -> Result<Value, Error> {
    let scanned = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
    let (html,) = scanned.required;
    let options = build_conversion_options(ruby, scanned.optional.0)?;

    // The second optional argument is only interpreted when metadata support is compiled in.
    #[cfg(feature = "metadata")]
    let metadata_config = Some(build_metadata_config(ruby, scanned.optional.1)?);
    #[cfg(not(feature = "metadata"))]
    let metadata_config: Option<()> = None;

    guard_panic(|| convert_with_tables_inner(&html, Some(options), metadata_config))
        .map_err(conversion_error)
        .and_then(|converted| tables_result_to_ruby(ruby, converted))
}
163
+
141
164
  #[cfg(feature = "visitor")]
142
165
  fn convert_with_visitor_fn(ruby: &Ruby, args: &[Value]) -> Result<String, Error> {
143
166
  let parsed = scan_args::<(String,), (Option<Value>, Option<Value>), (), (), (), ()>(args)?;
@@ -227,7 +250,10 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
227
250
  }
228
251
 
229
252
  #[cfg(feature = "visitor")]
230
- module.define_singleton_method("convert_with_visitor", function!(convert_with_visitor_fn, -1))?;
253
+ {
254
+ module.define_singleton_method("convert_with_visitor", function!(convert_with_visitor_fn, -1))?;
255
+ module.define_singleton_method("convert_with_tables", function!(convert_with_tables_fn, -1))?;
256
+ }
231
257
 
232
258
  #[cfg(feature = "profiling")]
233
259
  {
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.27.3'
4
+ VERSION = '2.28.0'
5
5
  end
@@ -18,6 +18,7 @@ module HtmlToMarkdown
18
18
  alias native_convert_with_metadata convert_with_metadata
19
19
  alias native_convert_with_metadata_handle convert_with_metadata_handle
20
20
  alias native_convert_with_visitor convert_with_visitor
21
+ alias native_convert_with_tables convert_with_tables
21
22
  end
22
23
 
23
24
  module_function
@@ -179,4 +180,32 @@ module HtmlToMarkdown
179
180
  def convert_with_metadata_handle(html, options_handle, metadata_config = nil)
180
181
  native_convert_with_metadata_handle(html.to_s, options_handle, metadata_config)
181
182
  end
183
+
184
# Convert HTML to Markdown with table extraction.
#
# Performs HTML-to-Markdown conversion while extracting structured table data
# (cells, markdown representation, header row flags) in a single pass.
# The +html+ argument is coerced with +to_s+ before it is handed to the
# native extension; +options+ and +metadata_config+ are passed through as given.
#
# @param html [String, #to_s] HTML to convert.
# @param options [Hash, nil] Optional conversion configuration.
# @param metadata_config [Hash, nil] Optional metadata extraction configuration.
#
# @return [Hash] A hash with keys:
#   - :content [String] The converted Markdown output
#   - :metadata [Hash, nil] Extended metadata, or nil when the native extension
#     has none to return (for example when built without metadata support)
#   - :tables [Array<Hash>] Extracted tables, each with:
#     - :cells [Array<Array<String>>] Table cells organized as rows x columns
#     - :markdown [String] Complete rendered table in Markdown format
#     - :is_header_row [Array<Boolean>] Per-row flag indicating header rows
#
# @raise [StandardError] If conversion fails or invalid configuration
#
# @example Basic usage
#   html = '<table><thead><tr><th>Name</th></tr></thead><tbody><tr><td>Alice</td></tr></tbody></table>'
#   result = HtmlToMarkdown.convert_with_tables(html)
#   puts result[:tables].length        # => 1
#   puts result[:tables][0][:cells]    # => [["Name"], ["Alice"]]
def convert_with_tables(html, options = nil, metadata_config = nil)
  native_convert_with_tables(html.to_s, options, metadata_config)
end
182
211
  end
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "2.27.3"
3
+ version = "2.28.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -148,6 +148,35 @@ for (i, img) in extraction.inline_images.iter().enumerate() {
148
148
  }
149
149
  ```
150
150
 
151
+ ## Table Extraction
152
+
153
+ Extract structured table data alongside the Markdown conversion. Each table found in the HTML is returned with its cell contents, header row flags, and rendered Markdown output.
154
+
155
+ Requires the `visitor` feature.
156
+
157
+ ```rust
158
+ use html_to_markdown_rs::convert_with_tables;
159
+
160
+ let html = r#"
161
+ <table>
162
+ <tr><th>Name</th><th>Age</th></tr>
163
+ <tr><td>Alice</td><td>30</td></tr>
164
+ <tr><td>Bob</td><td>25</td></tr>
165
+ </table>
166
+ "#;
167
+
168
+ let result = convert_with_tables(html, None, None)?;
169
+
170
+ println!("{}", result.content);
171
+ for table in &result.tables {
172
+ println!("Table with {} rows:", table.cells.len());
173
+ for (i, row) in table.cells.iter().enumerate() {
174
+ let prefix = if table.is_header_row[i] { "Header" } else { "Row" };
175
+ println!(" {}: {:?}", prefix, row);
176
+ }
177
+ }
178
+ ```
179
+
151
180
  ## Other Language Bindings
152
181
 
153
182
  This is the core Rust library. For other languages:
@@ -681,3 +681,371 @@ pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
681
681
  let update: crate::MetadataConfigUpdate = parse_json(json)?;
682
682
  Ok(MetadataConfig::from(update))
683
683
  }
684
+
685
+ // ============================================================================
686
+ // Table Extraction API (requires visitor feature)
687
+ // ============================================================================
688
+
689
/// Extracted table data from HTML conversion.
///
/// Each instance represents a single `<table>` element found during conversion.
/// Tables are collected in document order. A `<table>` for which no rows were
/// reported produces no `TableData` at all.
#[cfg(feature = "visitor")]
#[derive(Debug, Clone)]
#[cfg_attr(
    any(feature = "serde", feature = "metadata"),
    derive(serde::Serialize, serde::Deserialize)
)]
pub struct TableData {
    /// Table cells organized as rows x columns. Cell contents are already
    /// converted to the target output format (markdown/djot/plain).
    pub cells: Vec<Vec<String>>,
    /// Complete rendered table in the target output format.
    pub markdown: String,
    /// Per-row header flag, parallel to `cells` (one entry per row), exactly as
    /// reported by the converter's table-row callback. Rows inside `<thead>` are
    /// flagged; this crate's tests also expect a leading row made of `<th>` cells
    /// to be flagged when no `<thead>` is present.
    pub is_header_row: Vec<bool>,
}
708
+
709
/// Result of HTML-to-markdown conversion with extracted table data.
#[cfg(feature = "visitor")]
#[derive(Debug, Clone)]
#[cfg_attr(
    any(feature = "serde", feature = "metadata"),
    derive(serde::Serialize, serde::Deserialize)
)]
pub struct ConversionWithTables {
    /// Converted markdown/djot/plain text content.
    pub content: String,
    /// Extended metadata. The field only exists when the `metadata` feature is
    /// enabled. `convert_with_tables` always stores `Some` here, using the
    /// default `MetadataConfig` when the caller supplies none; the `Option`
    /// leaves room for values built elsewhere (e.g. deserialized) to omit it.
    #[cfg(feature = "metadata")]
    pub metadata: Option<ExtendedMetadata>,
    /// All tables found in the HTML, in document order.
    pub tables: Vec<TableData>,
}
725
+
726
+ #[cfg(feature = "visitor")]
727
+ #[derive(Debug)]
728
+ struct TableCollector {
729
+ tables: Vec<TableData>,
730
+ current_rows: Vec<Vec<String>>,
731
+ current_is_header: Vec<bool>,
732
+ }
733
+
734
+ #[cfg(feature = "visitor")]
735
+ impl TableCollector {
736
+ fn new() -> Self {
737
+ Self {
738
+ tables: Vec::new(),
739
+ current_rows: Vec::new(),
740
+ current_is_header: Vec::new(),
741
+ }
742
+ }
743
+ }
744
+
745
+ #[cfg(feature = "visitor")]
746
+ impl visitor::HtmlVisitor for TableCollector {
747
+ fn visit_table_start(&mut self, _ctx: &visitor::NodeContext) -> visitor::VisitResult {
748
+ self.current_rows.clear();
749
+ self.current_is_header.clear();
750
+ visitor::VisitResult::Continue
751
+ }
752
+
753
+ fn visit_table_row(
754
+ &mut self,
755
+ _ctx: &visitor::NodeContext,
756
+ cells: &[String],
757
+ is_header: bool,
758
+ ) -> visitor::VisitResult {
759
+ self.current_rows.push(cells.to_vec());
760
+ self.current_is_header.push(is_header);
761
+ visitor::VisitResult::Continue
762
+ }
763
+
764
+ fn visit_table_end(&mut self, _ctx: &visitor::NodeContext, output: &str) -> visitor::VisitResult {
765
+ if !self.current_rows.is_empty() {
766
+ self.tables.push(TableData {
767
+ cells: std::mem::take(&mut self.current_rows),
768
+ markdown: output.to_string(),
769
+ is_header_row: std::mem::take(&mut self.current_is_header),
770
+ });
771
+ }
772
+ visitor::VisitResult::Continue
773
+ }
774
+ }
775
+
776
/// Convert HTML and, in the same run, collect every table as structured data.
///
/// A private table-collecting visitor is attached to the conversion. With the
/// `metadata` feature enabled the work is delegated to `convert_with_metadata`
/// (so the result also carries metadata); otherwise it is delegated to
/// `convert_with_visitor`.
///
/// # Arguments
///
/// * `html` - The HTML string to convert
/// * `options` - Optional conversion options, forwarded unchanged
/// * `metadata_cfg` - Optional metadata extraction configuration; the default
///   configuration is used when `None` (only meaningful with the `metadata`
///   feature — without it the argument is ignored)
///
/// # Example
///
/// ```ignore
/// use html_to_markdown_rs::convert_with_tables;
///
/// let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
/// let result = convert_with_tables(html, None, None).unwrap();
/// assert_eq!(result.tables.len(), 1);
/// assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
/// ```
///
/// # Errors
///
/// Propagates any error returned by the underlying conversion call. Also
/// returns `ConversionError::Other` if the table collector is still shared
/// after the conversion finishes and its state cannot be taken back.
#[cfg(feature = "visitor")]
pub fn convert_with_tables(
    html: &str,
    options: Option<ConversionOptions>,
    #[cfg(feature = "metadata")] metadata_cfg: Option<MetadataConfig>,
    #[cfg(not(feature = "metadata"))] _metadata_cfg: Option<()>,
) -> Result<ConversionWithTables> {
    use std::cell::RefCell;
    use std::rc::Rc;

    // One reference stays here; the clone is handed to the converter.
    let shared = Rc::new(RefCell::new(TableCollector::new()));
    let handle: visitor::VisitorHandle = Rc::clone(&shared) as visitor::VisitorHandle;

    #[cfg(feature = "metadata")]
    let (content, metadata) = convert_with_metadata(html, options, metadata_cfg.unwrap_or_default(), Some(handle))?;
    #[cfg(not(feature = "metadata"))]
    let content = convert_with_visitor(html, options, Some(handle))?;

    // Taking the collector back only succeeds once the converter has released its clone.
    let tables = Rc::try_unwrap(shared)
        .map(|cell| cell.into_inner().tables)
        .map_err(|_| ConversionError::Other("failed to recover table collector state".into()))?;

    Ok(ConversionWithTables {
        content,
        #[cfg(feature = "metadata")]
        metadata: Some(metadata),
        tables,
    })
}
842
+
843
+ #[cfg(test)]
844
+ #[cfg(feature = "visitor")]
845
+ mod table_extraction_tests {
846
+ use super::*;
847
+
848
+ fn tables_from_html(html: &str) -> ConversionWithTables {
849
+ convert_with_tables(
850
+ html,
851
+ None,
852
+ #[cfg(feature = "metadata")]
853
+ None,
854
+ #[cfg(not(feature = "metadata"))]
855
+ None,
856
+ )
857
+ .unwrap()
858
+ }
859
+
860
+ #[test]
861
+ fn test_convert_with_tables_basic() {
862
+ let html = r#"<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>"#;
863
+ let result = tables_from_html(html);
864
+ assert_eq!(result.tables.len(), 1);
865
+ assert_eq!(result.tables[0].cells.len(), 2);
866
+ assert_eq!(result.tables[0].cells[0], vec!["Name", "Age"]);
867
+ assert_eq!(result.tables[0].cells[1], vec!["Alice", "30"]);
868
+ assert!(result.tables[0].is_header_row[0]);
869
+ assert!(!result.tables[0].is_header_row[1]);
870
+ assert!(result.tables[0].markdown.contains('|'));
871
+ }
872
+
873
+ #[test]
874
+ fn test_convert_with_tables_nested() {
875
+ let html = r#"
876
+ <table>
877
+ <tr><th>Category</th><th>Details</th></tr>
878
+ <tr>
879
+ <td>Project Alpha</td>
880
+ <td>
881
+ <table>
882
+ <tr><th>Task</th><th>Status</th></tr>
883
+ <tr><td>001</td><td>Done</td></tr>
884
+ </table>
885
+ </td>
886
+ </tr>
887
+ </table>"#;
888
+ let result = tables_from_html(html);
889
+ assert!(
890
+ result.tables.len() >= 2,
891
+ "Expected at least 2 tables (outer + nested), got {}",
892
+ result.tables.len()
893
+ );
894
+ }
895
+
896
+ #[test]
897
+ fn test_convert_with_tables_no_tables() {
898
+ let html = "<p>No tables here</p>";
899
+ let result = tables_from_html(html);
900
+ assert!(result.tables.is_empty());
901
+ assert!(result.content.contains("No tables here"));
902
+ }
903
+
904
+ #[test]
905
+ fn test_convert_with_tables_empty_table() {
906
+ let result = tables_from_html("<table></table>");
907
+ assert!(result.tables.is_empty(), "Empty table should not produce TableData");
908
+ }
909
+
910
+ #[test]
911
+ fn test_convert_with_tables_headers_only() {
912
+ let html = r#"<table><thead><tr><th>A</th><th>B</th></tr></thead></table>"#;
913
+ let result = tables_from_html(html);
914
+ assert_eq!(result.tables.len(), 1);
915
+ assert!(result.tables[0].is_header_row[0]);
916
+ assert_eq!(result.tables[0].cells[0], vec!["A", "B"]);
917
+ }
918
+
919
+ #[test]
920
+ fn test_convert_with_tables_thead_tbody_tfoot() {
921
+ let html = r#"
922
+ <table>
923
+ <thead><tr><th>H1</th></tr></thead>
924
+ <tbody><tr><td>B1</td></tr></tbody>
925
+ <tfoot><tr><td>F1</td></tr></tfoot>
926
+ </table>"#;
927
+ let result = tables_from_html(html);
928
+ assert_eq!(result.tables.len(), 1);
929
+ let t = &result.tables[0];
930
+ assert!(t.is_header_row[0], "thead row should be header");
931
+ assert!(!t.is_header_row[1], "tbody row should not be header");
932
+ assert_eq!(t.cells[0], vec!["H1"]);
933
+ assert_eq!(t.cells[1], vec!["B1"]);
934
+ }
935
+
936
+ #[test]
937
+ fn test_convert_with_tables_multiple_separate() {
938
+ let html = r#"
939
+ <table><tr><td>T1</td></tr></table>
940
+ <p>Between tables</p>
941
+ <table><tr><td>T2</td></tr></table>"#;
942
+ let result = tables_from_html(html);
943
+ assert_eq!(result.tables.len(), 2, "Should find 2 separate tables");
944
+ }
945
+
946
+ #[test]
947
+ fn test_convert_with_tables_special_chars() {
948
+ let html = r#"<table><tr><td>a | b</td><td>c*d</td></tr></table>"#;
949
+ let result = tables_from_html(html);
950
+ assert_eq!(result.tables.len(), 1);
951
+ assert!(!result.tables[0].cells[0].is_empty());
952
+ }
953
+
954
+ #[test]
955
+ fn test_convert_with_tables_single_cell() {
956
+ let html = r#"<table><tr><td>Only cell</td></tr></table>"#;
957
+ let result = tables_from_html(html);
958
+ assert_eq!(result.tables.len(), 1);
959
+ assert_eq!(result.tables[0].cells.len(), 1);
960
+ assert_eq!(result.tables[0].cells[0], vec!["Only cell"]);
961
+ }
962
+
963
+ #[test]
964
+ fn test_convert_with_tables_content_preserved() {
965
+ let html = r#"<p>Before</p><table><tr><td>Cell</td></tr></table><p>After</p>"#;
966
+ let result = tables_from_html(html);
967
+ assert!(result.content.contains("Before"));
968
+ assert!(result.content.contains("After"));
969
+ assert!(result.content.contains('|'), "Markdown table should appear in content");
970
+ }
971
+
972
+ #[test]
973
+ fn test_convert_with_tables_with_options() {
974
+ let options = ConversionOptions {
975
+ heading_style: crate::options::HeadingStyle::Underlined,
976
+ ..ConversionOptions::default()
977
+ };
978
+ let html = r#"<h1>Title</h1><table><tr><td>Cell</td></tr></table>"#;
979
+ let result = convert_with_tables(
980
+ html,
981
+ Some(options),
982
+ #[cfg(feature = "metadata")]
983
+ None,
984
+ #[cfg(not(feature = "metadata"))]
985
+ None,
986
+ )
987
+ .unwrap();
988
+ assert_eq!(result.tables.len(), 1);
989
+ assert!(result.content.contains("Title"));
990
+ }
991
+
992
+ #[test]
993
+ fn test_convert_with_tables_plain_text_format() {
994
+ let options = ConversionOptions {
995
+ output_format: crate::options::OutputFormat::Plain,
996
+ ..ConversionOptions::default()
997
+ };
998
+ let html = r#"<table><tr><th>Name</th></tr><tr><td>Alice</td></tr></table>"#;
999
+ let result = convert_with_tables(
1000
+ html,
1001
+ Some(options),
1002
+ #[cfg(feature = "metadata")]
1003
+ None,
1004
+ #[cfg(not(feature = "metadata"))]
1005
+ None,
1006
+ )
1007
+ .unwrap();
1008
+ assert!(
1009
+ !result.tables.is_empty(),
1010
+ "Tables should be populated even with plain text output format"
1011
+ );
1012
+ assert_eq!(result.tables[0].cells[0], vec!["Name"]);
1013
+ }
1014
+
1015
+ #[cfg(feature = "metadata")]
1016
+ #[test]
1017
+ fn test_convert_with_tables_metadata_integration() {
1018
+ let html = r#"<html lang="en"><head><title>Test</title></head><body>
1019
+ <table><tr><th>Col</th></tr><tr><td>Val</td></tr></table>
1020
+ </body></html>"#;
1021
+ let config = MetadataConfig::default();
1022
+ let result = convert_with_tables(html, None, Some(config)).unwrap();
1023
+ assert_eq!(result.tables.len(), 1);
1024
+ let meta = result.metadata.as_ref().expect("metadata should be present");
1025
+ assert_eq!(meta.document.language, Some("en".to_string()));
1026
+ }
1027
+
1028
+ #[cfg(feature = "metadata")]
1029
+ #[test]
1030
+ fn test_convert_with_tables_plain_text_metadata() {
1031
+ let options = ConversionOptions {
1032
+ output_format: crate::options::OutputFormat::Plain,
1033
+ ..ConversionOptions::default()
1034
+ };
1035
+ let html = r#"<html lang="fr"><body>
1036
+ <table><tr><td>Cell</td></tr></table>
1037
+ </body></html>"#;
1038
+ let config = MetadataConfig::default();
1039
+ let result = convert_with_tables(html, Some(options), Some(config)).unwrap();
1040
+ assert!(
1041
+ !result.tables.is_empty(),
1042
+ "Tables should be populated in plain text mode"
1043
+ );
1044
+ let meta = result.metadata.as_ref().expect("metadata should be present");
1045
+ assert_eq!(
1046
+ meta.document.language,
1047
+ Some("fr".to_string()),
1048
+ "Metadata should be populated in plain text mode"
1049
+ );
1050
+ }
1051
+ }
@@ -136,11 +136,9 @@ pub(crate) fn convert_html_impl(
136
136
  }
137
137
  }
138
138
 
139
- // Fast path for plain text output: skip the full conversion pipeline
140
- if options.output_format == OutputFormat::Plain {
141
- let plain = extract_plain_text(&dom, parser, options);
142
- return Ok(plain);
143
- }
139
+ // Plain text output: run the full pipeline (for metadata + visitor callbacks),
140
+ // then return plain text instead of markdown.
141
+ let is_plain_text = options.output_format == OutputFormat::Plain;
144
142
 
145
143
  let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
146
144
  #[cfg(feature = "metadata")]
@@ -230,6 +228,13 @@ pub(crate) fn convert_html_impl(
230
228
  return Err(crate::error::ConversionError::Visitor(err.clone()));
231
229
  }
232
230
 
231
+ // If plain text was requested, discard the markdown output and return plain text.
232
+ // The full pipeline was still run above so that metadata + visitor callbacks fire.
233
+ if is_plain_text {
234
+ let plain = extract_plain_text(&dom, parser, options);
235
+ return Ok(plain);
236
+ }
237
+
233
238
  trim_line_end_whitespace(&mut output);
234
239
  let trimmed = output.trim_end_matches('\n');
235
240
  if trimmed.is_empty() {
@@ -98,6 +98,9 @@ pub use convert_api::convert_with_metadata;
98
98
  #[cfg(feature = "visitor")]
99
99
  pub use convert_api::convert_with_visitor;
100
100
 
101
+ #[cfg(feature = "visitor")]
102
+ pub use convert_api::{ConversionWithTables, TableData, convert_with_tables};
103
+
101
104
  #[cfg(feature = "async-visitor")]
102
105
  pub use convert_api::convert_with_async_visitor;
103
106
 
@@ -19,5 +19,8 @@ pub use crate::convert_with_metadata;
19
19
  #[cfg(feature = "visitor")]
20
20
  pub use crate::convert_with_visitor;
21
21
 
22
+ #[cfg(feature = "visitor")]
23
+ pub use crate::{ConversionWithTables, TableData, convert_with_tables};
24
+
22
25
  #[cfg(feature = "async-visitor")]
23
26
  pub use crate::convert_with_async_visitor;
@@ -185,6 +185,18 @@ module HtmlToMarkdown
185
185
  structured_data: Array[structured_data]
186
186
  }
187
187
 
188
+ type table_data = {
189
+ cells: Array[Array[String]],
190
+ markdown: String,
191
+ is_header_row: Array[bool]
192
+ }
193
+
194
+ type table_extraction_result = {
195
+ content: String,
196
+ metadata: extended_metadata?,
197
+ tables: Array[table_data]
198
+ }
199
+
188
200
  # Native methods (implemented in Rust via Magnus/rb-sys)
189
201
  # These are aliased from the Rust extension and available as both module and instance methods
190
202
  private
@@ -217,6 +229,11 @@ module HtmlToMarkdown
217
229
  conversion_options? options,
218
230
  visitor? visitor
219
231
  ) -> String
232
+ def self.native_convert_with_tables: (
233
+ String html,
234
+ conversion_options? options,
235
+ metadata_config? metadata_config
236
+ ) -> table_extraction_result
220
237
 
221
238
  def native_convert: (String html, conversion_options? options) -> String
222
239
  def native_options: (conversion_options? options_hash) -> Options
@@ -246,6 +263,11 @@ module HtmlToMarkdown
246
263
  conversion_options? options,
247
264
  visitor? visitor
248
265
  ) -> String
266
+ def native_convert_with_tables: (
267
+ String html,
268
+ conversion_options? options,
269
+ metadata_config? metadata_config
270
+ ) -> table_extraction_result
249
271
 
250
272
  # Visitor interface for customizing conversion behavior
251
273
  type visitor = Object
@@ -422,6 +444,25 @@ module HtmlToMarkdown
422
444
  ?metadata_config metadata_config
423
445
  ) -> [String, extended_metadata]
424
446
 
447
+ # Convert HTML and extract tables as structured data
448
+ #
449
+ # Args:
450
+ # html: HTML string to convert
451
+ # options: Optional conversion configuration
452
+ # metadata_config: Optional metadata extraction configuration
453
+ #
454
+ # Returns:
455
+ # table_extraction_result: Hash containing content, metadata, and tables array
456
+ #
457
+ # Example:
458
+ # result = HtmlToMarkdown.convert_with_tables(html)
459
+ # puts result[:tables].length
460
+ def self.convert_with_tables: (
461
+ String html,
462
+ ?conversion_options options,
463
+ ?metadata_config metadata_config
464
+ ) -> table_extraction_result
465
+
425
466
  # Instance method versions (created by module_function)
426
467
  def convert: (String html, ?conversion_options options, ?visitor visitor) -> String
427
468
  def options: (?conversion_options options_hash) -> Options
@@ -449,4 +490,9 @@ module HtmlToMarkdown
449
490
  Options options_handle,
450
491
  ?metadata_config metadata_config
451
492
  ) -> [String, extended_metadata]
493
+ def convert_with_tables: (
494
+ String html,
495
+ ?conversion_options options,
496
+ ?metadata_config metadata_config
497
+ ) -> table_extraction_result
452
498
  end
@@ -0,0 +1,194 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe HtmlToMarkdown do
6
+ describe '.convert_with_tables' do
7
+ it 'returns a hash with content, metadata, and tables keys' do
8
+ html = '<table><tr><td>Cell</td></tr></table>'
9
+ result = described_class.convert_with_tables(html)
10
+
11
+ expect(result).to be_a(Hash)
12
+ expect(result).to include(:content, :metadata, :tables)
13
+ end
14
+
15
+ context 'with a basic table with header' do
16
+ let(:html) do
17
+ <<~HTML
18
+ <table>
19
+ <thead>
20
+ <tr><th>Name</th><th>Age</th></tr>
21
+ </thead>
22
+ <tbody>
23
+ <tr><td>Alice</td><td>30</td></tr>
24
+ </tbody>
25
+ </table>
26
+ HTML
27
+ end
28
+
29
+ it 'extracts exactly one table' do
30
+ result = described_class.convert_with_tables(html)
31
+
32
+ expect(result[:tables].length).to eq(1)
33
+ end
34
+
35
+ it 'extracts cells as rows of columns' do
36
+ result = described_class.convert_with_tables(html)
37
+ table = result[:tables][0]
38
+
39
+ expect(table[:cells]).to be_an(Array)
40
+ expect(table[:cells].length).to eq(2)
41
+ expect(table[:cells][0]).to eq(%w[Name Age])
42
+ expect(table[:cells][1]).to eq(%w[Alice 30])
43
+ end
44
+
45
+ it 'provides markdown representation' do
46
+ result = described_class.convert_with_tables(html)
47
+ table = result[:tables][0]
48
+
49
+ expect(table[:markdown]).to be_a(String)
50
+ expect(table[:markdown]).to include('Name')
51
+ expect(table[:markdown]).to include('Alice')
52
+ end
53
+
54
+ it 'marks header rows correctly' do
55
+ result = described_class.convert_with_tables(html)
56
+ table = result[:tables][0]
57
+
58
+ expect(table[:is_header_row]).to be_an(Array)
59
+ expect(table[:is_header_row].length).to eq(2)
60
+ expect(table[:is_header_row][0]).to be true
61
+ expect(table[:is_header_row][1]).to be false
62
+ end
63
+
64
+ it 'includes converted markdown content' do
65
+ result = described_class.convert_with_tables(html)
66
+
67
+ expect(result[:content]).to be_a(String)
68
+ expect(result[:content]).not_to be_empty
69
+ end
70
+ end
71
+
72
+ context 'with empty HTML' do
73
+ it 'returns empty tables array' do
74
+ result = described_class.convert_with_tables('')
75
+
76
+ expect(result[:tables]).to eq([])
77
+ expect(result[:content]).to be_a(String)
78
+ end
79
+ end
80
+
81
+ context 'with HTML containing no tables' do
82
+ it 'returns empty tables array' do
83
+ html = '<p>No tables here</p>'
84
+ result = described_class.convert_with_tables(html)
85
+
86
+ expect(result[:tables]).to eq([])
87
+ expect(result[:content]).to include('No tables here')
88
+ end
89
+ end
90
+
91
+ context 'with multiple tables' do
92
+ let(:html) do
93
+ <<~HTML
94
+ <table>
95
+ <tr><th>A</th></tr>
96
+ <tr><td>1</td></tr>
97
+ </table>
98
+ <p>Some text between tables</p>
99
+ <table>
100
+ <tr><th>B</th><th>C</th></tr>
101
+ <tr><td>2</td><td>3</td></tr>
102
+ <tr><td>4</td><td>5</td></tr>
103
+ </table>
104
+ HTML
105
+ end
106
+
107
+ it 'extracts all tables' do
108
+ result = described_class.convert_with_tables(html)
109
+
110
+ expect(result[:tables].length).to eq(2)
111
+ end
112
+
113
+ it 'preserves table order' do
114
+ result = described_class.convert_with_tables(html)
115
+
116
+ first_table = result[:tables][0]
117
+ second_table = result[:tables][1]
118
+
119
+ expect(first_table[:cells][0]).to eq(['A'])
120
+ expect(second_table[:cells][0]).to eq(%w[B C])
121
+ end
122
+
123
+ it 'extracts correct row counts per table' do
124
+ result = described_class.convert_with_tables(html)
125
+
126
+ expect(result[:tables][0][:cells].length).to eq(2)
127
+ expect(result[:tables][1][:cells].length).to eq(3)
128
+ end
129
+ end
130
+
131
+ context 'with special characters in cells' do
132
+ let(:html) do
133
+ <<~HTML
134
+ <table>
135
+ <tr><th>Key</th><th>Value</th></tr>
136
+ <tr><td>Brackets &lt;&gt;</td><td>Ampersand &amp;</td></tr>
137
+ <tr><td>Quotes "double"</td><td>Quotes 'single'</td></tr>
138
+ <tr><td>Unicode: cafe\u0301</td><td>Emoji: test</td></tr>
139
+ </table>
140
+ HTML
141
+ end
142
+
143
+ it 'handles HTML entities in cells' do
144
+ result = described_class.convert_with_tables(html)
145
+ table = result[:tables][0]
146
+
147
+ expect(table[:cells][1][0]).to include('<>')
148
+ expect(table[:cells][1][1]).to include('&')
149
+ end
150
+
151
+ it 'handles quotes in cells' do
152
+ result = described_class.convert_with_tables(html)
153
+ table = result[:tables][0]
154
+
155
+ expect(table[:cells][2][0]).to include('"double"')
156
+ expect(table[:cells][2][1]).to include("'single'")
157
+ end
158
+
159
+ it 'handles unicode in cells' do
160
+ result = described_class.convert_with_tables(html)
161
+ table = result[:tables][0]
162
+
163
+ expect(table[:cells][3][0]).to be_a(String)
164
+ end
165
+ end
166
+
167
+ context 'with conversion options' do
168
+ it 'accepts options hash' do
169
+ html = '<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>'
170
+ result = described_class.convert_with_tables(html, { heading_style: :atx })
171
+
172
+ expect(result).to be_a(Hash)
173
+ expect(result[:tables].length).to eq(1)
174
+ end
175
+
176
+ it 'accepts nil options' do
177
+ html = '<table><tr><td>Data</td></tr></table>'
178
+ result = described_class.convert_with_tables(html, nil, nil)
179
+
180
+ expect(result).to be_a(Hash)
181
+ expect(result[:tables].length).to eq(1)
182
+ end
183
+ end
184
+
185
+ context 'with metadata config' do
186
+ it 'includes metadata when configured' do
187
+ html = '<html><head><title>Test</title></head><body><table><tr><td>Data</td></tr></table></body></html>'
188
+ result = described_class.convert_with_tables(html, nil, { extract_headers: true })
189
+
190
+ expect(result[:metadata]).to be_a(Hash).or(be_nil)
191
+ end
192
+ end
193
+ end
194
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.27.3
4
+ version: 2.28.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
@@ -61,6 +61,7 @@ files:
61
61
  - ext/html-to-markdown-rb/native/src/conversion/inline_images.rs
62
62
  - ext/html-to-markdown-rb/native/src/conversion/metadata.rs
63
63
  - ext/html-to-markdown-rb/native/src/conversion/mod.rs
64
+ - ext/html-to-markdown-rb/native/src/conversion/tables.rs
64
65
  - ext/html-to-markdown-rb/native/src/lib.rs
65
66
  - ext/html-to-markdown-rb/native/src/options.rs
66
67
  - ext/html-to-markdown-rb/native/src/profiling.rs
@@ -9737,6 +9738,7 @@ files:
9737
9738
  - sig/open3.rbs
9738
9739
  - spec/cli_proxy_spec.rb
9739
9740
  - spec/convert_spec.rb
9741
+ - spec/convert_with_tables_spec.rb
9740
9742
  - spec/metadata_extraction_spec.rb
9741
9743
  - spec/spec_helper.rb
9742
9744
  - spec/visitor_issue_187_spec.rb