html-to-markdown 3.2.4 → 3.4.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. checksums.yaml +4 -4
  2. data/Steepfile +6 -0
  3. data/ext/html_to_markdown_rb/Cargo.toml +2 -2
  4. data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
  5. data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
  6. data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
  7. data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
  8. data/lib/bin/html-to-markdown +0 -0
  9. data/lib/html_to_markdown/version.rb +1 -1
  10. data/lib/html_to_markdown.rb +5 -3
  11. data/sig/types.rbs +769 -0
  12. data/vendor/Cargo.toml +2 -2
  13. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  14. data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
  15. data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
  16. data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
  17. data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
  18. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
  19. data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
  20. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
  21. data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
  22. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
  23. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
  25. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
  26. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
  27. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
  29. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
  30. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
  32. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
  38. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
  44. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
  45. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
  46. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
  47. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
  48. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
  49. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
  50. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
  52. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
  53. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
  54. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
  56. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
  57. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
  58. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
  59. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
  60. data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
  61. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
  62. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
  63. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  65. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
  66. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
  67. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
  68. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
  69. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
  70. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
  71. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
  72. data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
  73. data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
  74. data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
  75. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
  76. data/vendor/html-to-markdown-rs/src/text.rs +0 -44
  77. data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
  78. data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
  79. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
  80. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
  81. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
  82. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
  83. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
  84. data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
  85. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
  86. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
  87. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
  88. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
  89. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
  90. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
  91. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
  92. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
  93. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
  94. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
  95. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
  96. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
  97. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
  98. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
  99. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
  100. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
  101. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
  102. data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
  103. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
  104. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
  105. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
  106. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
  107. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
  108. data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
  109. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
  110. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
  111. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
  112. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
  113. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
  114. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
  115. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
  116. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
  117. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
  118. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
  119. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
  120. metadata +21 -43
  121. data/.bundle/config +0 -2
  122. data/.gitignore +0 -3
  123. data/.rubocop.yml +0 -59
  124. data/Gemfile +0 -18
  125. data/Gemfile.lock +0 -173
  126. data/README.md +0 -331
  127. data/Rakefile +0 -26
  128. data/exe/html-to-markdown +0 -6
  129. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
  130. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
  131. data/html-to-markdown-rb.gemspec +0 -99
  132. data/lib/html_to_markdown_rs.rb +0 -3
  133. data/sig/html_to_markdown.rbs +0 -149
  134. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
  135. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
  136. data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "3.2.4"
6
+ version = "3.4.0-rc.13"
7
7
  edition = "2024"
8
8
  rust-version = "1.85"
9
9
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -18,7 +18,7 @@ clap = { version = "4.6", features = ["derive"] }
18
18
  clap_complete = "4.6"
19
19
  clap_mangen = "0.3"
20
20
  encoding_rs = "0.8"
21
- ext-php-rs = "0.15.10"
21
+ ext-php-rs = "0.15.12"
22
22
  html5ever = "0.39.0"
23
23
  once_cell = "1.21"
24
24
  pyo3 = { version = "0.28.3", features = ["abi3-py310"] }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "3.2.4"
3
+ version = "3.4.0-rc.13"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  fn main() {
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  fn main() {
@@ -7,6 +7,6 @@ fn main() {
7
7
  let opts: ConversionOptions = serde_json::from_str(json).unwrap();
8
8
  println!("code_block_style: {:?}", opts.code_block_style);
9
9
 
10
- let result = html_to_markdown_rs::convert("<pre><code>some code</code></pre>", Some(opts)).unwrap();
10
+ let result = html_to_markdown_rs::convert("<pre><code>some code</code></pre>", Some(opts), None).unwrap();
11
11
  println!("result: {:?}", result.content);
12
12
  }
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  fn main() {
@@ -3,7 +3,7 @@ fn convert(
3
3
  html: &str,
4
4
  opts: Option<html_to_markdown_rs::ConversionOptions>,
5
5
  ) -> html_to_markdown_rs::error::Result<String> {
6
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
6
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
7
7
  }
8
8
 
9
9
  use html_to_markdown_rs::ConversionOptions;
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  fn main() {
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  fn main() {
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  fn main() {
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  fn main() {
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  fn main() {
@@ -30,14 +30,18 @@ use crate::{HtmlMetadata, MetadataConfig};
30
30
  /// use html_to_markdown_rs::{convert, ConversionOptions};
31
31
  ///
32
32
  /// let html = "<h1>Hello World</h1>";
33
- /// let result = convert(html, None).unwrap();
33
+ /// let result = convert(html, None, None).unwrap();
34
34
  /// assert!(result.content.as_deref().unwrap_or("").contains("Hello World"));
35
35
  /// ```
36
36
  ///
37
37
  /// # Errors
38
38
  ///
39
39
  /// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
40
- pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<ConversionResult> {
40
+ pub fn convert(
41
+ html: &str,
42
+ options: Option<ConversionOptions>,
43
+ #[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
44
+ ) -> Result<ConversionResult> {
41
45
  use std::cell::RefCell;
42
46
  use std::rc::Rc;
43
47
 
@@ -96,6 +100,11 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
96
100
  None
97
101
  };
98
102
 
103
+ // When the visitor feature is not enabled, there is no visitor parameter.
104
+ // convert_html_impl expects `Option<()>` in the non-visitor slot.
105
+ #[cfg(not(feature = "visitor"))]
106
+ let visitor: Option<()> = None;
107
+
99
108
  // Run the conversion pipeline.
100
109
  // Pass structure_collector by value — convert_html_impl will consume it via Rc::try_unwrap
101
110
  // to return the finished DocumentStructure. We must not hold a second Rc reference.
@@ -107,7 +116,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
107
116
  &options,
108
117
  image_collector.as_ref().map(Rc::clone),
109
118
  metadata_collector.as_ref().map(Rc::clone),
110
- None,
119
+ visitor,
111
120
  structure_collector,
112
121
  )?
113
122
  }
@@ -118,7 +127,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
118
127
  &options,
119
128
  None,
120
129
  metadata_collector.as_ref().map(Rc::clone),
121
- None,
130
+ visitor,
122
131
  structure_collector,
123
132
  )?
124
133
  }
@@ -129,7 +138,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
129
138
  &options,
130
139
  image_collector.as_ref().map(Rc::clone),
131
140
  None,
132
- None,
141
+ visitor,
133
142
  structure_collector,
134
143
  )?
135
144
  }
@@ -140,7 +149,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
140
149
  &options,
141
150
  None,
142
151
  None,
143
- None,
152
+ visitor,
144
153
  structure_collector,
145
154
  )?
146
155
  }
@@ -201,25 +210,6 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
201
210
  })
202
211
  }
203
212
 
204
- /// Internal: convert with visitor support. Used by FFI crate.
205
- /// Will be removed when convert() accepts visitor parameter directly.
206
- #[cfg(feature = "visitor")]
207
- #[doc(hidden)]
208
- pub fn convert_with_visitor(
209
- html: &str,
210
- options: Option<ConversionOptions>,
211
- visitor: Option<crate::visitor::VisitorHandle>,
212
- ) -> Result<String> {
213
- let options = options.unwrap_or_default();
214
- let normalized_html = normalize_input(html)?;
215
- let markdown = crate::converter::convert_html_with_visitor(normalized_html.as_ref(), &options, visitor)?;
216
- if options.wrap {
217
- Ok(crate::wrapper::wrap_markdown(&markdown, &options))
218
- } else {
219
- Ok(markdown)
220
- }
221
- }
222
-
223
213
  /// Validate and normalize HTML input for conversion.
224
214
  fn normalize_input(html: &str) -> Result<Cow<'_, str>> {
225
215
  let decoded = decode_utf16_if_needed(html);
@@ -22,7 +22,7 @@ type DomContext = crate::converter::DomContext;
22
22
  ///
23
23
  /// Processes blockquote content, applies `> ` prefix to each line,
24
24
  /// handles optional `cite` attribution, and manages spacing.
25
- pub(crate) fn handle(
25
+ pub fn handle(
26
26
  node_handle: &NodeHandle,
27
27
  parser: &Parser,
28
28
  output: &mut String,
@@ -28,7 +28,7 @@ type DomContext = crate::converter::DomContext;
28
28
  /// * `ctx` - Current conversion context
29
29
  /// * `depth` - Current recursion depth
30
30
  /// * `dom_ctx` - DOM context for tracking relationships
31
- pub(crate) fn handle_structural_container(
31
+ pub fn handle_structural_container(
32
32
  node_handle: &NodeHandle,
33
33
  parser: &Parser,
34
34
  output: &mut String,
@@ -64,7 +64,7 @@ pub(crate) fn handle_structural_container(
64
64
  /// * `ctx` - Current conversion context
65
65
  /// * `depth` - Current recursion depth
66
66
  /// * `dom_ctx` - DOM context for tracking relationships
67
- pub(crate) fn handle_passthrough(
67
+ pub fn handle_passthrough(
68
68
  node_handle: &NodeHandle,
69
69
  parser: &Parser,
70
70
  output: &mut String,
@@ -101,7 +101,7 @@ pub(crate) fn handle_passthrough(
101
101
  /// * `_depth` - Current recursion depth (unused)
102
102
  /// * `_dom_ctx` - DOM context (unused)
103
103
  #[inline]
104
- pub(crate) fn handle_noop(
104
+ pub fn handle_noop(
105
105
  _node_handle: &NodeHandle,
106
106
  _parser: &Parser,
107
107
  _output: &mut String,
@@ -25,7 +25,7 @@ type DomContext = crate::converter::DomContext;
25
25
  /// # Note
26
26
  /// This function references `walk_node` and helper functions from converter.rs
27
27
  /// which must be accessible (pub(crate)) for this module to work correctly.
28
- pub(crate) fn handle(
28
+ pub fn handle(
29
29
  node_handle: &NodeHandle,
30
30
  parser: &Parser,
31
31
  output: &mut String,
@@ -27,7 +27,7 @@ type DomContext = crate::converter::DomContext;
27
27
  /// # Note
28
28
  /// This function references `walk_node` from converter.rs which must be
29
29
  /// accessible (pub(crate)) for this module to work correctly.
30
- pub(crate) fn handle(
30
+ pub fn handle(
31
31
  tag_name: &str,
32
32
  node_handle: &NodeHandle,
33
33
  parser: &Parser,
@@ -149,7 +149,7 @@ pub(crate) fn handle(
149
149
  }
150
150
 
151
151
  /// Determine if a heading element should allow inline images.
152
- pub(crate) fn heading_allows_inline_images(
152
+ pub fn heading_allows_inline_images(
153
153
  tag_name: &str,
154
154
  keep_inline_images_in: &std::rc::Rc<std::collections::HashSet<String>>,
155
155
  ) -> bool {
@@ -189,7 +189,7 @@ fn normalize_heading_text(text: &str) -> Cow<'_, str> {
189
189
  }
190
190
 
191
191
  /// Format heading output with appropriate markdown syntax.
192
- pub(crate) fn push_heading(output: &mut String, ctx: &Context, options: &ConversionOptions, level: usize, text: &str) {
192
+ pub fn push_heading(output: &mut String, ctx: &Context, options: &ConversionOptions, level: usize, text: &str) {
193
193
  if text.is_empty() {
194
194
  return;
195
195
  }
@@ -374,7 +374,7 @@ fn visitor_heading_output(
374
374
  /// - Multiple headings are found
375
375
  /// - Non-whitespace non-heading content exists
376
376
  /// - Non-text comments exist
377
- pub(crate) fn find_single_heading_child(node_handle: NodeHandle, parser: &Parser) -> Option<(usize, NodeHandle)> {
377
+ pub fn find_single_heading_child(node_handle: NodeHandle, parser: &Parser) -> Option<(usize, NodeHandle)> {
378
378
  let node = node_handle.get(parser)?;
379
379
 
380
380
  let tl::Node::Tag(tag) = node else {
@@ -397,13 +397,12 @@ pub(crate) fn find_single_heading_child(node_handle: NodeHandle, parser: &Parser
397
397
  }
398
398
  tl::Node::Tag(child_tag) => {
399
399
  let name = crate::converter::utility::content::normalized_tag_name(child_tag.name().as_utf8_str());
400
- if let Some(level) = heading_level_from_name(name.as_ref()) {
400
+ {
401
+ let level = heading_level_from_name(name.as_ref())?;
401
402
  if heading_data.is_some() {
402
403
  return None;
403
404
  }
404
405
  heading_data = Some((level, *child_handle));
405
- } else {
406
- return None;
407
406
  }
408
407
  }
409
408
  tl::Node::Comment(_) => return None,
@@ -15,7 +15,7 @@ type DomContext = crate::converter::DomContext;
15
15
  ///
16
16
  /// Converts to Markdown horizontal rule (---) with appropriate blank line
17
17
  /// spacing based on context and previous siblings.
18
- pub(crate) fn handle(
18
+ pub fn handle(
19
19
  node_handle: &NodeHandle,
20
20
  parser: &Parser,
21
21
  output: &mut String,
@@ -15,7 +15,7 @@ type DomContext = crate::converter::DomContext;
15
15
  ///
16
16
  /// Converts to appropriate Markdown line break syntax based on the configured
17
17
  /// newline style and current context (e.g., in headings).
18
- pub(crate) fn handle(
18
+ pub fn handle(
19
19
  _node_handle: &NodeHandle,
20
20
  _parser: &Parser,
21
21
  output: &mut String,
@@ -1,22 +1,3 @@
1
- //! Block element handlers for HTML to Markdown conversion.
2
- //!
3
- //! This module provides specialized handlers for block-level HTML elements:
4
- //! - Headings (h1-h6)
5
- //! - Paragraphs (p)
6
- //! - Blockquotes (blockquote)
7
- //! - Preformatted code (pre)
8
- //! - Tables (table, thead, tbody, tfoot, tr, th, td)
9
- //!
10
- //! These handlers are designed to be extracted from the main `converter.rs`
11
- //! file and integrated once the converter module is refactored.
12
- //!
13
- //! **Note on Current Integration:**
14
- //! This module cannot currently be fully integrated into converter.rs due to
15
- //! Rust's module system rules (cannot have both converter.rs and converter/mod.rs).
16
- //! Once converter.rs is refactored to use converter/main.rs or similar pattern,
17
- //! these handlers should be exposed through converter/mod.rs and used in the
18
- //! main walk_node function via the dispatch_block_handler function below.
19
-
20
1
  pub mod blockquote;
21
2
  pub mod container;
22
3
  pub mod div;
@@ -27,92 +8,3 @@ pub mod paragraph;
27
8
  pub mod preformatted;
28
9
  pub mod table;
29
10
  pub mod unknown;
30
-
31
- // Re-export types from parent module for submodule access
32
- pub use super::{Context, DomContext};
33
-
34
- // Re-export for internal use by dispatcher (crate-private)
35
-
36
- /// Dispatches block element handling to the appropriate handler.
37
- ///
38
- /// This function is designed to be called from the main walk_node function
39
- /// in converter.rs once the module is refactored. It returns `true` if the
40
- /// element was handled, `false` otherwise.
41
- ///
42
- /// # Usage in converter.rs
43
- /// ```text
44
- /// if crate::converter::block::dispatch_block_handler(
45
- /// &tag_name,
46
- /// node_handle,
47
- /// parser,
48
- /// output,
49
- /// options,
50
- /// ctx,
51
- /// depth,
52
- /// dom_ctx,
53
- /// ) {
54
- /// return; // Element was handled
55
- /// }
56
- /// ```
57
- pub fn dispatch_block_handler(
58
- tag_name: &str,
59
- node_handle: &tl::NodeHandle,
60
- parser: &tl::Parser,
61
- output: &mut String,
62
- options: &crate::options::ConversionOptions,
63
- ctx: &super::Context,
64
- depth: usize,
65
- dom_ctx: &super::DomContext,
66
- ) -> bool {
67
- match tag_name {
68
- "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
69
- heading::handle(tag_name, node_handle, parser, output, options, ctx, depth, dom_ctx);
70
- true
71
- }
72
- "p" => {
73
- paragraph::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
74
- true
75
- }
76
- "blockquote" => {
77
- blockquote::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
78
- true
79
- }
80
- "pre" => {
81
- preformatted::handle_pre(node_handle, parser, output, options, ctx, depth, dom_ctx);
82
- true
83
- }
84
- "br" => {
85
- line_break::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
86
- true
87
- }
88
- "hr" => {
89
- horizontal_rule::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
90
- true
91
- }
92
- "div" => {
93
- div::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
94
- true
95
- }
96
- "table" => {
97
- table::handle_table(node_handle, parser, output, options, ctx, dom_ctx, depth);
98
- true
99
- }
100
- "caption" => {
101
- table::handle_caption(node_handle, parser, output, options, ctx, depth, dom_ctx);
102
- true
103
- }
104
- "body" | "html" => {
105
- container::handle_structural_container(node_handle, parser, output, options, ctx, depth, dom_ctx);
106
- true
107
- }
108
- "time" | "data" => {
109
- container::handle_passthrough(node_handle, parser, output, options, ctx, depth, dom_ctx);
110
- true
111
- }
112
- "wbr" | "source" | "thead" | "tbody" | "tfoot" | "tr" | "th" | "td" => {
113
- container::handle_noop(node_handle, parser, output, options, ctx, depth, dom_ctx);
114
- true
115
- }
116
- _ => false,
117
- }
118
- }
@@ -18,7 +18,7 @@ type DomContext = crate::converter::DomContext;
18
18
  ///
19
19
  /// Processes children with proper context, manages spacing,
20
20
  /// and handles special cases for table cells and list items.
21
- pub(crate) fn handle(
21
+ pub fn handle(
22
22
  node_handle: &NodeHandle,
23
23
  parser: &Parser,
24
24
  output: &mut String,
@@ -19,7 +19,7 @@ type Context = crate::converter::Context;
19
19
  type DomContext = crate::converter::DomContext;
20
20
 
21
21
  /// Handle preformatted code blocks (pre element).
22
- pub(crate) fn handle_pre(
22
+ pub fn handle_pre(
23
23
  node_handle: &NodeHandle,
24
24
  parser: &Parser,
25
25
  output: &mut String,
@@ -400,7 +400,7 @@ mod tests {
400
400
  #[test]
401
401
  fn single_nested_table_stays_as_table() {
402
402
  let html = r"<table><tr><td>Label</td><td><table><tr><td>A</td><td>B</td></tr></table></td></tr></table>";
403
- let result = crate::convert(html, None).unwrap();
403
+ let result = crate::convert(html, None, None).unwrap();
404
404
  let content = result.content.unwrap_or_default();
405
405
  assert!(content.contains('|'), "should produce pipe table, not list");
406
406
  }
@@ -185,7 +185,7 @@ mod tests {
185
185
  #[test]
186
186
  fn rich_formatting_preserved_in_cells() {
187
187
  let html = "<table><tr><th>H</th></tr><tr><td><strong>Bold</strong> and <em>italic</em></td></tr></table>";
188
- let result = crate::convert(html, None).unwrap();
188
+ let result = crate::convert(html, None, None).unwrap();
189
189
  let content = result.content.unwrap_or_default();
190
190
  assert!(
191
191
  content.contains("**Bold**") || content.contains("__Bold__"),
@@ -17,7 +17,7 @@ use crate::options::ListIndentType;
17
17
  ///
18
18
  /// # Returns
19
19
  /// Indented table content
20
- pub(crate) fn indent_table_for_list(
20
+ pub fn indent_table_for_list(
21
21
  table_content: &str,
22
22
  list_depth: usize,
23
23
  options: &crate::options::ConversionOptions,
@@ -20,11 +20,9 @@ pub mod scanner;
20
20
  pub(super) mod utils;
21
21
 
22
22
  // Re-export types from parent module for submodule access
23
- pub use super::super::{Context, DomContext};
24
23
 
25
24
  // Re-export for use in converter.rs
26
- pub(crate) use builder::handle_table;
27
- pub(crate) use caption::handle_caption;
25
+ pub use caption::handle_caption;
28
26
 
29
27
  /// Dispatches table element handling to the main convert_table function.
30
28
  ///
@@ -84,7 +82,7 @@ pub fn dispatch_table_handler(
84
82
  /// * `ctx` - Conversion context (includes list state)
85
83
  /// * `dom_ctx` - DOM context for tree structure info
86
84
  /// * `depth` - Current nesting depth
87
- pub(crate) fn handle_table_with_context(
85
+ pub fn handle_table_with_context(
88
86
  node_handle: &tl::NodeHandle,
89
87
  parser: &tl::Parser,
90
88
  output: &mut String,
@@ -32,7 +32,7 @@ type DomContext = crate::converter::DomContext;
32
32
  /// # Code Block Detection
33
33
  /// Code blocks (identified by markdown formatting) are always preserved,
34
34
  /// even if they appear "empty" according to trim().
35
- pub(crate) fn handle(
35
+ pub fn handle(
36
36
  node_handle: &NodeHandle,
37
37
  parser: &Parser,
38
38
  output: &mut String,
@@ -80,6 +80,8 @@ pub struct Context {
80
80
  pub(crate) preserve_tags: Rc<HashSet<String>>,
81
81
  /// Tag names that allow inline images inside headings.
82
82
  pub(crate) keep_inline_images_in: Rc<HashSet<String>>,
83
+ /// Node IDs matching `exclude_selectors` — these nodes and all descendants are dropped.
84
+ pub(crate) excluded_node_ids: Rc<HashSet<u32>>,
83
85
  #[cfg(feature = "inline-images")]
84
86
  /// Shared collector for inline images when enabled.
85
87
  pub(crate) inline_collector: Option<InlineCollectorHandle>,
@@ -111,6 +113,13 @@ pub struct Context {
111
113
  }
112
114
 
113
115
  impl Context {
116
+ /// Set the pre-computed set of node IDs that match `exclude_selectors`.
117
+ ///
118
+ /// Called in `convert_html_impl` after DOM parsing, before the walk starts.
119
+ pub(crate) fn set_excluded_node_ids(&mut self, ids: HashSet<u32>) {
120
+ self.excluded_node_ids = Rc::new(ids);
121
+ }
122
+
114
123
  /// Create a new conversion context from options and optional collectors.
115
124
  #[allow(clippy::too_many_arguments)]
116
125
  #[cfg_attr(
@@ -171,6 +180,7 @@ impl Context {
171
180
  strip_tags: Rc::new(options.strip_tags.iter().cloned().collect()),
172
181
  preserve_tags: Rc::new(options.preserve_tags.iter().cloned().collect()),
173
182
  keep_inline_images_in: Rc::new(options.keep_inline_images_in.iter().cloned().collect()),
183
+ excluded_node_ids: Rc::new(HashSet::new()),
174
184
  #[cfg(feature = "inline-images")]
175
185
  inline_collector,
176
186
  #[cfg(feature = "metadata")]
@@ -15,7 +15,7 @@ use crate::text;
15
15
  ///
16
16
  /// This struct stores pre-computed information about tag elements to avoid
17
17
  /// repeated parsing during tree traversal.
18
- pub(crate) struct TagInfo {
18
+ pub struct TagInfo {
19
19
  /// The normalized (lowercase) tag name.
20
20
  pub(crate) name: String,
21
21
  /// Whether this element behaves like an inline element (including script/style).