html-to-markdown 2.30.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +6 -19
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +6 -3
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -7,6 +7,8 @@
7
7
  //! - Spacing management for various contexts
8
8
  //! - Visitor callbacks for custom blockquote processing
9
9
 
10
+ #[cfg(feature = "visitor")]
11
+ use crate::converter::utility::content::collect_tag_attributes;
10
12
  use crate::options::ConversionOptions;
11
13
  #[allow(unused_imports)]
12
14
  use std::collections::BTreeMap;
@@ -88,11 +90,7 @@ pub(crate) fn handle(
88
90
 
89
91
  if let Some(node) = node_handle.get(parser) {
90
92
  if let tl::Node::Tag(tag) = node {
91
- let attributes: BTreeMap<String, String> = tag
92
- .attributes()
93
- .iter()
94
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
95
- .collect();
93
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
96
94
 
97
95
  let node_id = node_handle.get_inner();
98
96
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -6,6 +6,7 @@
6
6
  //! - List continuations: Uses list indentation
7
7
  //! - Block context: Adds surrounding newlines for proper block separation
8
8
 
9
+ use crate::converter::main_helpers::trim_trailing_whitespace;
9
10
  use crate::options::ConversionOptions;
10
11
  use tl::{NodeHandle, Parser};
11
12
 
@@ -131,13 +132,6 @@ pub(crate) fn handle(
131
132
  }
132
133
  }
133
134
 
134
- /// Helper function to trim trailing whitespace
135
- fn trim_trailing_whitespace(output: &mut String) {
136
- while output.ends_with(' ') || output.ends_with('\t') {
137
- output.pop();
138
- }
139
- }
140
-
141
135
  /// Helper function to add list continuation indentation
142
136
  fn add_list_continuation_indent(
143
137
  output: &mut String,
@@ -6,6 +6,8 @@
6
6
  //! - Metadata collection (headers, IDs)
7
7
  //! - Visitor callbacks for custom heading processing
8
8
 
9
+ #[cfg(feature = "visitor")]
10
+ use crate::converter::utility::content::collect_tag_attributes;
9
11
  use crate::options::{ConversionOptions, HeadingStyle};
10
12
  use std::borrow::Cow;
11
13
  #[allow(unused_imports)]
@@ -124,6 +126,21 @@ pub(crate) fn handle(
124
126
  }
125
127
  }
126
128
  }
129
+
130
+ // Notify the structure collector if present.
131
+ if let Some(ref sc) = ctx.structure_collector {
132
+ if let Some(node) = node_handle.get(parser) {
133
+ if let tl::Node::Tag(tag) = node {
134
+ let id = tag
135
+ .attributes()
136
+ .get("id")
137
+ .flatten()
138
+ .map(|v| v.as_utf8_str().to_string());
139
+ sc.borrow_mut()
140
+ .push_heading(level as u8, normalized.as_ref(), id.as_deref());
141
+ }
142
+ }
143
+ }
127
144
  }
128
145
  }
129
146
 
@@ -292,11 +309,7 @@ fn visitor_heading_output(
292
309
  .flatten()
293
310
  .map(|v| v.as_utf8_str().to_string());
294
311
 
295
- let attributes: BTreeMap<String, String> = tag
296
- .attributes()
297
- .iter()
298
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
299
- .collect();
312
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
300
313
 
301
314
  let node_id = node_handle.get_inner();
302
315
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -96,6 +96,16 @@ pub(crate) fn handle(
96
96
  if has_content && !ctx.convert_as_inline && !ctx.in_table_cell {
97
97
  output.push_str("\n\n");
98
98
  }
99
+
100
+ // Notify the structure collector if present and we produced non-empty top-level paragraph content.
101
+ if has_content && !ctx.in_table_cell && !ctx.in_list_item && !ctx.convert_as_inline {
102
+ if let Some(ref sc) = ctx.structure_collector {
103
+ let text = output[content_start_pos..].trim().to_string();
104
+ if !text.is_empty() {
105
+ sc.borrow_mut().push_paragraph(&text);
106
+ }
107
+ }
108
+ }
99
109
  }
100
110
 
101
111
  /// Add continuation indentation for list items.
@@ -7,6 +7,8 @@
7
7
  //! - Inline code formatting with backtick management
8
8
  //! - Visitor callbacks for custom code processing
9
9
 
10
+ #[cfg(feature = "visitor")]
11
+ use crate::converter::utility::content::collect_tag_attributes;
10
12
  use crate::options::{CodeBlockStyle, ConversionOptions, WhitespaceMode};
11
13
  #[allow(unused_imports)]
12
14
  use std::collections::BTreeMap;
@@ -93,11 +95,7 @@ pub(crate) fn handle_pre(
93
95
 
94
96
  if let Some(node) = node_handle.get(parser) {
95
97
  if let tl::Node::Tag(tag) = node {
96
- let attributes: BTreeMap<String, String> = tag
97
- .attributes()
98
- .iter()
99
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
100
- .collect();
98
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
101
99
 
102
100
  let node_id = node_handle.get_inner();
103
101
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -9,6 +9,8 @@ use super::cell::{collect_table_cells, get_colspan};
9
9
  use super::cells::{append_layout_row, convert_table_row};
10
10
  use super::scanner::scan_table;
11
11
  use super::utils::{is_tag_name, normalized_tag_name};
12
+ #[cfg(feature = "visitor")]
13
+ use crate::converter::utility::content::collect_tag_attributes;
12
14
 
13
15
  /// Maximum allowed table columns to prevent unbounded memory usage.
14
16
  const MAX_TABLE_COLS: usize = 1000;
@@ -106,11 +108,7 @@ pub fn handle_table(
106
108
  use crate::visitor::{NodeContext, NodeType, VisitResult};
107
109
  use std::collections::BTreeMap;
108
110
 
109
- let attributes: BTreeMap<String, String> = tag
110
- .attributes()
111
- .iter()
112
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
113
- .collect();
111
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
114
112
 
115
113
  let node_id = node_handle.get_inner();
116
114
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -160,7 +158,7 @@ pub fn handle_table(
160
158
  .get("border")
161
159
  .is_some_and(|v| v.as_ref().is_some_and(|b| b.as_utf8_str() == "0"));
162
160
  let looks_like_layout =
163
- table_scan.has_nested_table || distinct_counts.len() > 1 || (table_scan.has_span && has_border_zero);
161
+ table_scan.nested_table_count > 1 || distinct_counts.len() > 1 || (table_scan.has_span && has_border_zero);
164
162
  let link_count = table_scan.link_count;
165
163
  let is_blank_table = !table_scan.has_text;
166
164
 
@@ -343,11 +341,7 @@ pub fn handle_table(
343
341
  use crate::visitor::{NodeContext, NodeType, VisitResult};
344
342
  use std::collections::BTreeMap;
345
343
 
346
- let attributes: BTreeMap<String, String> = tag
347
- .attributes()
348
- .iter()
349
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
350
- .collect();
344
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
351
345
 
352
346
  let node_id = node_handle.get_inner();
353
347
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -400,3 +394,14 @@ pub fn handle_table(
400
394
  }
401
395
  }
402
396
  }
397
+
398
+ #[cfg(test)]
399
+ mod tests {
400
+ #[test]
401
+ fn single_nested_table_stays_as_table() {
402
+ let html = r"<table><tr><td>Label</td><td><table><tr><td>A</td><td>B</td></tr></table></td></tr></table>";
403
+ let result = crate::convert(html, None).unwrap();
404
+ let content = result.content.unwrap_or_default();
405
+ assert!(content.contains('|'), "should produce pipe table, not list");
406
+ }
407
+ }
@@ -179,3 +179,23 @@ pub fn convert_table_cell(
179
179
  output.push_str(" |");
180
180
  }
181
181
  }
182
+
183
+ #[cfg(test)]
184
+ mod tests {
185
+ #[test]
186
+ fn rich_formatting_preserved_in_cells() {
187
+ let html = "<table><tr><th>H</th></tr><tr><td><strong>Bold</strong> and <em>italic</em></td></tr></table>";
188
+ let result = crate::convert(html, None).unwrap();
189
+ let content = result.content.unwrap_or_default();
190
+ assert!(
191
+ content.contains("**Bold**") || content.contains("__Bold__"),
192
+ "bold should be preserved: {}",
193
+ content
194
+ );
195
+ assert!(
196
+ content.contains("*italic*") || content.contains("_italic_"),
197
+ "italic should be preserved: {}",
198
+ content
199
+ );
200
+ }
201
+ }
@@ -5,6 +5,9 @@
5
5
  //! - Cell layout handling with colspan/rowspan support
6
6
  //! - Layout table row conversion to list items
7
7
 
8
+ #[cfg(feature = "visitor")]
9
+ use crate::converter::utility::content::collect_tag_attributes;
10
+ use crate::converter::utility::content::normalized_tag_name;
8
11
  use std::borrow::Cow;
9
12
 
10
13
  use super::cell::{collect_table_cells, convert_table_cell, get_colspan_rowspan};
@@ -84,18 +87,6 @@ pub fn append_layout_row(
84
87
  }
85
88
  }
86
89
 
87
- /// Normalize HTML tag names to lowercase.
88
- ///
89
- /// Converts tag names to a consistent lowercase form for comparison.
90
- fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
91
- let lowercased = raw.to_lowercase();
92
- if lowercased.as_str() == raw.as_ref() {
93
- raw
94
- } else {
95
- Cow::Owned(lowercased)
96
- }
97
- }
98
-
99
90
  /// Convert a table row (tr) to Markdown format.
100
91
  ///
101
92
  /// Processes all cells in a row, handling colspan and rowspan for proper
@@ -167,11 +158,7 @@ pub fn convert_table_row(
167
158
  use std::collections::BTreeMap;
168
159
 
169
160
  if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
170
- let attributes: BTreeMap<String, String> = tag
171
- .attributes()
172
- .iter()
173
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
174
- .collect();
161
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
175
162
 
176
163
  let node_ctx = NodeContext {
177
164
  node_type: NodeType::TableRow,
@@ -96,6 +96,13 @@ pub(crate) fn handle_table_with_context(
96
96
  let mut table_output = String::new();
97
97
  builder::handle_table(node_handle, parser, &mut table_output, options, ctx, dom_ctx, depth);
98
98
 
99
+ // Feed the table into the structure collector when document structure extraction is enabled.
100
+ if let Some(ref sc) = ctx.structure_collector {
101
+ if let Some(grid) = collect_table_grid(node_handle, parser, options, ctx, dom_ctx) {
102
+ sc.borrow_mut().push_table(grid);
103
+ }
104
+ }
105
+
99
106
  if ctx.in_list_item {
100
107
  let has_caption = table_output.starts_with('*');
101
108
 
@@ -124,3 +131,136 @@ pub(crate) fn handle_table_with_context(
124
131
  output.push('\n');
125
132
  }
126
133
  }
134
+
135
+ /// Collect a [`crate::types::TableGrid`] from the DOM for the structure collector.
136
+ ///
137
+ /// Walks the table's rows and cells, extracting text content and span attributes
138
+ /// to build a structured grid representation.
139
+ fn collect_table_grid(
140
+ node_handle: &tl::NodeHandle,
141
+ parser: &tl::Parser,
142
+ options: &crate::options::ConversionOptions,
143
+ ctx: &super::super::Context,
144
+ dom_ctx: &super::super::DomContext,
145
+ ) -> Option<crate::types::TableGrid> {
146
+ use utils::{is_tag_name, normalized_tag_name};
147
+
148
+ let tl::Node::Tag(tag) = node_handle.get(parser)? else {
149
+ return None;
150
+ };
151
+
152
+ let mut grid_cells = Vec::new();
153
+ let mut row_index: u32 = 0;
154
+ let mut max_cols: u32 = 0;
155
+ let mut cell_handles = Vec::new();
156
+
157
+ let children = tag.children();
158
+ for child_handle in children.top().iter() {
159
+ if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
160
+ let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
161
+ match tag_name.as_ref() {
162
+ "thead" | "tbody" | "tfoot" => {
163
+ let is_header_section = tag_name.as_ref() == "thead";
164
+ for row_handle in child_tag.children().top().iter() {
165
+ if is_tag_name(row_handle, parser, dom_ctx, "tr") {
166
+ collect_grid_row(
167
+ row_handle,
168
+ parser,
169
+ options,
170
+ ctx,
171
+ dom_ctx,
172
+ &mut cell_handles,
173
+ &mut grid_cells,
174
+ &mut row_index,
175
+ &mut max_cols,
176
+ is_header_section,
177
+ );
178
+ }
179
+ }
180
+ }
181
+ "tr" | "row" => {
182
+ let is_first = row_index == 0;
183
+ collect_grid_row(
184
+ child_handle,
185
+ parser,
186
+ options,
187
+ ctx,
188
+ dom_ctx,
189
+ &mut cell_handles,
190
+ &mut grid_cells,
191
+ &mut row_index,
192
+ &mut max_cols,
193
+ is_first,
194
+ );
195
+ }
196
+ _ => {}
197
+ }
198
+ }
199
+ }
200
+
201
+ if row_index == 0 {
202
+ return None;
203
+ }
204
+
205
+ Some(crate::types::TableGrid {
206
+ rows: row_index,
207
+ cols: max_cols,
208
+ cells: grid_cells,
209
+ })
210
+ }
211
+
212
+ /// Process a single table row for grid collection.
213
+ #[allow(clippy::too_many_arguments)]
214
+ fn collect_grid_row(
215
+ row_handle: &tl::NodeHandle,
216
+ parser: &tl::Parser,
217
+ options: &crate::options::ConversionOptions,
218
+ ctx: &super::super::Context,
219
+ dom_ctx: &super::super::DomContext,
220
+ cell_handles: &mut Vec<tl::NodeHandle>,
221
+ grid_cells: &mut Vec<crate::types::GridCell>,
222
+ row_index: &mut u32,
223
+ max_cols: &mut u32,
224
+ is_header_section: bool,
225
+ ) {
226
+ use cell::{collect_table_cells, get_colspan_rowspan};
227
+
228
+ collect_table_cells(row_handle, parser, dom_ctx, cell_handles);
229
+
230
+ let mut col_index: u32 = 0;
231
+ for cell_handle in cell_handles.iter() {
232
+ let is_header = is_header_section
233
+ || dom_ctx
234
+ .tag_name_for(*cell_handle, parser)
235
+ .is_some_and(|name| name.as_ref() == "th");
236
+
237
+ let mut text = String::new();
238
+ let cell_ctx = super::super::Context {
239
+ in_table_cell: true,
240
+ ..ctx.clone()
241
+ };
242
+ if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
243
+ for child_handle in cell_tag.children().top().iter() {
244
+ super::super::walk_node(child_handle, parser, &mut text, options, &cell_ctx, 0, dom_ctx);
245
+ }
246
+ }
247
+ let content = crate::text::normalize_whitespace_cow(&text).trim().to_string();
248
+
249
+ let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
250
+
251
+ grid_cells.push(crate::types::GridCell {
252
+ content,
253
+ row: *row_index,
254
+ col: col_index,
255
+ row_span: rowspan as u32,
256
+ col_span: colspan as u32,
257
+ is_header,
258
+ });
259
+
260
+ col_index += colspan as u32;
261
+ }
262
+ if col_index > *max_cols {
263
+ *max_cols = col_index;
264
+ }
265
+ *row_index += 1;
266
+ }
@@ -3,11 +3,9 @@
3
3
  //! Provides the TableScan struct and scanning functions for analyzing table structure
4
4
  //! to determine if it should be rendered as a Markdown table or converted to list format.
5
5
 
6
+ use crate::converter::utility::content::normalized_tag_name;
6
7
  use std::borrow::Cow;
7
8
 
8
- /// Maximum allowed table columns to prevent unbounded memory usage.
9
- const MAX_TABLE_COLS: usize = 1000;
10
-
11
9
  /// Scan results for a table element.
12
10
  ///
13
11
  /// Contains metadata about table structure to determine optimal rendering:
@@ -25,8 +23,8 @@ pub struct TableScan {
25
23
  pub has_header: bool,
26
24
  /// Whether the table has a caption element
27
25
  pub has_caption: bool,
28
- /// Whether the table contains nested tables
29
- pub has_nested_table: bool,
26
+ /// Number of nested tables found inside this table
27
+ pub nested_table_count: usize,
30
28
  /// Count of anchor elements in the table
31
29
  pub link_count: usize,
32
30
  /// Whether the table contains text content (not empty)
@@ -111,7 +109,7 @@ fn scan_table_node(
111
109
  }
112
110
  }
113
111
  }
114
- "table" if !is_root => scan.has_nested_table = true,
112
+ "table" if !is_root => scan.nested_table_count += 1,
115
113
  "tr" | "row" => {
116
114
  let mut cell_count = 0;
117
115
  for child in tag.children().top().iter() {
@@ -146,15 +144,3 @@ fn scan_table_node(
146
144
  }
147
145
  }
148
146
  }
149
-
150
- /// Normalize HTML tag names to lowercase.
151
- ///
152
- /// Converts tag names to a consistent lowercase form for comparison.
153
- fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
154
- let lowercased = raw.to_lowercase();
155
- if lowercased.as_str() == raw.as_ref() {
156
- raw
157
- } else {
158
- Cow::Owned(lowercased)
159
- }
160
- }
@@ -2,24 +2,8 @@
2
2
  //!
3
3
  //! Provides helper functions for tag name normalization and comparison.
4
4
 
5
- use std::borrow::Cow;
6
-
7
- /// Normalize HTML tag names to lowercase.
8
- ///
9
- /// Converts tag names to a consistent lowercase form for comparison.
10
- pub(super) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
11
- let lowercased = raw.to_lowercase();
12
- if lowercased.as_str() == raw.as_ref() {
13
- raw
14
- } else {
15
- Cow::Owned(lowercased)
16
- }
17
- }
18
-
19
- /// Check tag name equality with case-insensitive comparison.
20
- pub(super) fn tag_name_eq(name: Cow<'_, str>, needle: &str) -> bool {
21
- name.eq_ignore_ascii_case(needle)
22
- }
5
+ pub(super) use crate::converter::main_helpers::tag_name_eq;
6
+ pub(super) use crate::converter::utility::content::normalized_tag_name;
23
7
 
24
8
  /// Check if a node has a specific tag name.
25
9
  ///
@@ -12,6 +12,8 @@ use std::rc::Rc;
12
12
  #[cfg(feature = "inline-images")]
13
13
  use crate::inline_images::InlineImageCollector;
14
14
 
15
+ use crate::types::structure_collector::StructureCollectorHandle;
16
+
15
17
  /// Handle type for inline image collector when feature is enabled.
16
18
  #[cfg(feature = "inline-images")]
17
19
  pub type InlineCollectorHandle = Rc<RefCell<InlineImageCollector>>;
@@ -99,6 +101,10 @@ pub struct Context {
99
101
  #[cfg(feature = "visitor")]
100
102
  /// Stores the first visitor error encountered during traversal.
101
103
  pub(crate) visitor_error: Rc<RefCell<Option<String>>>,
104
+ /// Optional structure collector for building a [`crate::types::DocumentStructure`].
105
+ ///
106
+ /// Populated when `options.include_document_structure == true`.
107
+ pub(crate) structure_collector: Option<StructureCollectorHandle>,
102
108
  }
103
109
 
104
110
  impl Context {
@@ -115,6 +121,7 @@ impl Context {
115
121
  #[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
116
122
  #[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
117
123
  #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
124
+ structure_collector: Option<StructureCollectorHandle>,
118
125
  ) -> Self {
119
126
  #[cfg(feature = "metadata")]
120
127
  let (
@@ -178,6 +185,7 @@ impl Context {
178
185
  visitor: visitor.clone(),
179
186
  #[cfg(feature = "visitor")]
180
187
  visitor_error: Rc::new(RefCell::new(None)),
188
+ structure_collector,
181
189
  }
182
190
  }
183
191
  }
@@ -11,8 +11,6 @@ use crate::converter::main_helpers::is_inline_element;
11
11
  use crate::converter::utility::content::{is_block_level_name, normalized_tag_name};
12
12
  use crate::text;
13
13
 
14
- const TEXT_CACHE_CAPACITY: usize = 4096;
15
-
16
14
  /// Cached information about an HTML tag element.
17
15
  ///
18
16
  /// This struct stores pre-computed information about tag elements to avoid
@@ -236,11 +234,8 @@ impl DomContext {
236
234
  .or_else(|| siblings.iter().position(|handle| handle.get_inner() == id))?;
237
235
 
238
236
  for sibling in siblings.iter().skip(position + 1) {
239
- if let Some(info) = self.tag_info(sibling.get_inner(), parser) {
237
+ if self.tag_info(sibling.get_inner(), parser).is_some() {
240
238
  let sibling_id = sibling.get_inner();
241
- if info.name == "script" || info.name == "style" {
242
- return Some(sibling_id);
243
- }
244
239
  return Some(sibling_id);
245
240
  }
246
241
  if let Some(tl::Node::Raw(raw)) = sibling.get(parser) {
@@ -27,7 +27,7 @@ use std::borrow::Cow;
27
27
  /// - **Inline mode**: Children are processed inline without block spacing
28
28
  /// - **Block mode**: Content is collected, trimmed, and wrapped with blank lines
29
29
  /// - **Empty content**: Skipped entirely
30
- pub fn handle_form(
30
+ pub(crate) fn handle_form(
31
31
  _tag_name: &str,
32
32
  node_handle: &tl::NodeHandle,
33
33
  parser: &tl::Parser,
@@ -82,7 +82,7 @@ pub fn handle_form(
82
82
  /// - **Inline mode**: Children are processed inline without block spacing
83
83
  /// - **Block mode**: Content is collected, trimmed, and wrapped with blank lines
84
84
  /// - **Empty content**: Skipped entirely
85
- pub fn handle_fieldset(
85
+ pub(crate) fn handle_fieldset(
86
86
  _tag_name: &str,
87
87
  node_handle: &tl::NodeHandle,
88
88
  parser: &tl::Parser,
@@ -137,7 +137,7 @@ pub fn handle_fieldset(
137
137
  /// - **Block mode**: Content is wrapped in strong markers (e.g., `**text**`)
138
138
  /// - **Inline mode**: Content is rendered without emphasis
139
139
  /// - Uses the configured strong/emphasis symbol from ConversionOptions
140
- pub fn handle_legend(
140
+ pub(crate) fn handle_legend(
141
141
  _tag_name: &str,
142
142
  node_handle: &tl::NodeHandle,
143
143
  parser: &tl::Parser,
@@ -198,7 +198,7 @@ pub fn handle_legend(
198
198
  /// - Content is collected from children
199
199
  /// - Non-empty content is output followed by blank lines (in block mode)
200
200
  /// - Blank lines are suppressed in inline mode
201
- pub fn handle_label(
201
+ pub(crate) fn handle_label(
202
202
  _tag_name: &str,
203
203
  node_handle: &tl::NodeHandle,
204
204
  parser: &tl::Parser,
@@ -231,7 +231,7 @@ pub fn handle_label(
231
231
  ///
232
232
  /// An input element represents a form control for user input. Since input
233
233
  /// elements typically have no text content, this handler produces no output.
234
- pub fn handle_input(
234
+ pub(crate) fn handle_input(
235
235
  _tag_name: &str,
236
236
  _node_handle: &tl::NodeHandle,
237
237
  _parser: &tl::Parser,
@@ -253,7 +253,7 @@ pub fn handle_input(
253
253
  ///
254
254
  /// - Content is collected from children
255
255
  /// - Blank lines are added after content in block mode only
256
- pub fn handle_textarea(
256
+ pub(crate) fn handle_textarea(
257
257
  _tag_name: &str,
258
258
  node_handle: &tl::NodeHandle,
259
259
  parser: &tl::Parser,
@@ -287,7 +287,7 @@ pub fn handle_textarea(
287
287
  ///
288
288
  /// - Content (options) is collected from children
289
289
  /// - A single newline is added after the select in block mode
290
- pub fn handle_select(
290
+ pub(crate) fn handle_select(
291
291
  _tag_name: &str,
292
292
  node_handle: &tl::NodeHandle,
293
293
  parser: &tl::Parser,
@@ -322,7 +322,7 @@ pub fn handle_select(
322
322
  /// - Content is collected from children
323
323
  /// - If the option has the `selected` attribute, it's prefixed with `* ` in block mode
324
324
  /// - A newline is added after each option in block mode
325
- pub fn handle_option(
325
+ pub(crate) fn handle_option(
326
326
  _tag_name: &str,
327
327
  node_handle: &tl::NodeHandle,
328
328
  parser: &tl::Parser,
@@ -365,7 +365,7 @@ pub fn handle_option(
365
365
  ///
366
366
  /// - The `label` attribute is output as strong text (if present)
367
367
  /// - Options within the group are rendered normally
368
- pub fn handle_optgroup(
368
+ pub(crate) fn handle_optgroup(
369
369
  _tag_name: &str,
370
370
  node_handle: &tl::NodeHandle,
371
371
  parser: &tl::Parser,
@@ -410,7 +410,7 @@ pub fn handle_optgroup(
410
410
  ///
411
411
  /// - Content is collected from children
412
412
  /// - Blank lines are added after content in block mode only
413
- pub fn handle_button(
413
+ pub(crate) fn handle_button(
414
414
  _tag_name: &str,
415
415
  node_handle: &tl::NodeHandle,
416
416
  parser: &tl::Parser,
@@ -444,7 +444,7 @@ pub fn handle_button(
444
444
  ///
445
445
  /// - Content is collected from children (usually empty)
446
446
  /// - Blank lines are added after content in block mode only
447
- pub fn handle_progress(
447
+ pub(crate) fn handle_progress(
448
448
  _tag_name: &str,
449
449
  node_handle: &tl::NodeHandle,
450
450
  parser: &tl::Parser,
@@ -478,7 +478,7 @@ pub fn handle_progress(
478
478
  ///
479
479
  /// - Content is collected from children (usually empty)
480
480
  /// - Blank lines are added after content in block mode only
481
- pub fn handle_meter(
481
+ pub(crate) fn handle_meter(
482
482
  _tag_name: &str,
483
483
  node_handle: &tl::NodeHandle,
484
484
  parser: &tl::Parser,
@@ -512,7 +512,7 @@ pub fn handle_meter(
512
512
  ///
513
513
  /// - Content is collected from children
514
514
  /// - Blank lines are added after content in block mode only
515
- pub fn handle_output(
515
+ pub(crate) fn handle_output(
516
516
  _tag_name: &str,
517
517
  node_handle: &tl::NodeHandle,
518
518
  parser: &tl::Parser,
@@ -546,7 +546,7 @@ pub fn handle_output(
546
546
  ///
547
547
  /// - Content (options) is collected from children
548
548
  /// - A single newline is added after the datalist in block mode
549
- pub fn handle_datalist(
549
+ pub(crate) fn handle_datalist(
550
550
  _tag_name: &str,
551
551
  node_handle: &tl::NodeHandle,
552
552
  parser: &tl::Parser,