html-to-markdown 2.30.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +6 -19
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +13 -701
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +6 -3
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +126 -52
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -68
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -323
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -42
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -11,6 +11,9 @@ use crate::converter::dom_context::DomContext;
11
11
  use crate::converter::main::walk_node;
12
12
  use crate::options::ConversionOptions;
13
13
 
14
+ #[cfg(feature = "visitor")]
15
+ #[cfg(feature = "visitor")]
16
+ use crate::converter::utility::content::collect_tag_attributes;
14
17
  #[cfg(feature = "visitor")]
15
18
  use std::collections::BTreeMap;
16
19
 
@@ -86,11 +89,7 @@ pub fn handle_blockquote(
86
89
  if let Some(ref visitor) = ctx.visitor {
87
90
  use crate::visitor::{NodeContext, NodeType, VisitResult};
88
91
 
89
- let attributes: BTreeMap<String, String> = tag
90
- .attributes()
91
- .iter()
92
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
93
- .collect();
92
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
94
93
 
95
94
  let node_id = node_handle.get_inner();
96
95
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -13,6 +13,9 @@ use crate::converter::main::walk_node;
13
13
  use crate::converter::text::dedent_code_block;
14
14
  use crate::options::ConversionOptions;
15
15
 
16
+ #[cfg(feature = "visitor")]
17
+ #[cfg(feature = "visitor")]
18
+ use crate::converter::utility::content::collect_tag_attributes;
16
19
  #[cfg(feature = "visitor")]
17
20
  use std::collections::BTreeMap;
18
21
 
@@ -75,11 +78,7 @@ pub fn handle_code(
75
78
  let code_output = if let Some(ref visitor_handle) = ctx.visitor {
76
79
  use crate::visitor::{NodeContext, NodeType, VisitResult};
77
80
 
78
- let attributes: BTreeMap<String, String> = tag
79
- .attributes()
80
- .iter()
81
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
82
- .collect();
81
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
83
82
 
84
83
  let node_id = node_handle.get_inner();
85
84
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -255,11 +254,7 @@ pub fn handle_pre(
255
254
  let code_block_output = if let Some(ref visitor_handle) = ctx.visitor {
256
255
  use crate::visitor::{NodeContext, NodeType, VisitResult};
257
256
 
258
- let attributes: BTreeMap<String, String> = tag
259
- .attributes()
260
- .iter()
261
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
262
- .collect();
257
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
263
258
 
264
259
  let node_id = node_handle.get_inner();
265
260
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -11,6 +11,8 @@ use std::collections::BTreeMap;
11
11
 
12
12
  use crate::converter::Context;
13
13
  use crate::converter::dom_context::DomContext;
14
+ #[cfg(feature = "visitor")]
15
+ use crate::converter::utility::content::collect_tag_attributes;
14
16
  use crate::options::ConversionOptions;
15
17
 
16
18
  #[cfg(feature = "visitor")]
@@ -100,11 +102,7 @@ pub fn handle_graphic(
100
102
  let graphic_output = if let Some(ref visitor_handle) = ctx.visitor {
101
103
  use crate::visitor::{NodeContext, NodeType, VisitResult};
102
104
 
103
- let attributes: BTreeMap<String, String> = tag
104
- .attributes()
105
- .iter()
106
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
107
- .collect();
105
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
108
106
 
109
107
  let node_id = node_handle.get_inner();
110
108
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -11,6 +11,8 @@ use std::collections::BTreeMap;
11
11
 
12
12
  use crate::converter::Context;
13
13
  use crate::converter::dom_context::DomContext;
14
+ #[cfg(feature = "visitor")]
15
+ use crate::converter::utility::content::collect_tag_attributes;
14
16
  use crate::converter::utility::preprocessing::sanitize_markdown_url;
15
17
  use crate::options::ConversionOptions;
16
18
 
@@ -123,11 +125,7 @@ pub fn handle_img(
123
125
  let image_output = if let Some(ref visitor_handle) = ctx.visitor {
124
126
  use crate::visitor::{NodeContext, NodeType, VisitResult};
125
127
 
126
- let attributes: BTreeMap<String, String> = tag
127
- .attributes()
128
- .iter()
129
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
130
- .collect();
128
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
131
129
 
132
130
  let node_id = node_handle.get_inner();
133
131
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -15,6 +15,8 @@ use crate::converter::block::heading::{find_single_heading_child, heading_allows
15
15
  use crate::converter::dom_context::DomContext;
16
16
  use crate::converter::inline::link::append_markdown_link;
17
17
  use crate::converter::main::walk_node;
18
+ #[cfg(feature = "visitor")]
19
+ use crate::converter::utility::content::collect_tag_attributes;
18
20
  use crate::converter::utility::content::{
19
21
  collect_link_label_text, escape_link_label, get_text_content, normalize_link_label, normalized_tag_name,
20
22
  };
@@ -194,11 +196,7 @@ pub fn handle_link(
194
196
  let link_output = if let Some(ref visitor_handle) = ctx.visitor {
195
197
  use crate::visitor::{NodeContext, NodeType, VisitResult};
196
198
 
197
- let attributes: BTreeMap<String, String> = tag
198
- .attributes()
199
- .iter()
200
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
201
- .collect();
199
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
202
200
 
203
201
  let node_id = node_handle.get_inner();
204
202
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -10,6 +10,8 @@
10
10
  //! - Visitor callbacks for custom code processing
11
11
  //! - Whitespace normalization for kbd/samp elements
12
12
 
13
+ #[cfg(feature = "visitor")]
14
+ use crate::converter::utility::content::collect_tag_attributes;
13
15
  use crate::options::ConversionOptions;
14
16
  use crate::text;
15
17
  #[allow(unused_imports)]
@@ -120,11 +122,7 @@ fn handle_code(
120
122
  let code_output = if let Some(ref visitor_handle) = ctx.visitor {
121
123
  use crate::visitor::{NodeContext, NodeType, VisitResult};
122
124
 
123
- let attributes: BTreeMap<String, String> = tag
124
- .attributes()
125
- .iter()
126
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
127
- .collect();
125
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
128
126
 
129
127
  let node_id = node_handle.get_inner();
130
128
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -8,6 +8,8 @@
8
8
  //! - Visitor callbacks for custom emphasis processing
9
9
  //! - Bootstrap caret detection (.caret class)
10
10
 
11
+ #[cfg(feature = "visitor")]
12
+ use crate::converter::utility::content::collect_tag_attributes;
11
13
  use crate::options::{ConversionOptions, OutputFormat};
12
14
  #[allow(unused_imports)]
13
15
  use std::collections::BTreeMap;
@@ -106,11 +108,7 @@ fn handle_strong(
106
108
  use crate::visitor::{NodeContext, NodeType, VisitResult};
107
109
 
108
110
  let text_content = get_text_content(node_handle, parser, dom_ctx);
109
- let attributes: BTreeMap<String, String> = tag
110
- .attributes()
111
- .iter()
112
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
113
- .collect();
111
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
114
112
 
115
113
  let node_id = node_handle.get_inner();
116
114
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -246,11 +244,7 @@ fn handle_emphasis(
246
244
  use crate::visitor::{NodeContext, NodeType, VisitResult};
247
245
 
248
246
  let text_content = get_text_content(node_handle, parser, dom_ctx);
249
- let attributes: BTreeMap<String, String> = tag
250
- .attributes()
251
- .iter()
252
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
253
- .collect();
247
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
254
248
 
255
249
  let node_id = node_handle.get_inner();
256
250
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -9,7 +9,9 @@
9
9
  //! - Metadata collection for links (links, URLs, titles, rel attributes)
10
10
  //! - Block-level content within links (via inline context)
11
11
 
12
- use crate::converter::utility::content::{is_block_level_element, normalized_tag_name};
12
+ #[cfg(feature = "visitor")]
13
+ use crate::converter::utility::content::collect_tag_attributes;
14
+ use crate::converter::utility::content::{collect_link_label_text, escape_link_label, normalize_link_label};
13
15
  use crate::converter::utility::preprocessing::sanitize_markdown_url;
14
16
  use crate::options::ConversionOptions;
15
17
  use std::collections::BTreeMap;
@@ -230,11 +232,7 @@ pub(crate) fn handle(
230
232
  let link_output = if let Some(ref visitor_handle) = ctx.visitor {
231
233
  use crate::visitor::{NodeContext, NodeType, VisitResult};
232
234
 
233
- let attributes: BTreeMap<String, String> = tag
234
- .attributes()
235
- .iter()
236
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
237
- .collect();
235
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
238
236
 
239
237
  let node_id = node_handle.get_inner();
240
238
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -341,58 +339,6 @@ pub(crate) fn handle(
341
339
  }
342
340
  }
343
341
 
344
- /// Escape special Markdown characters in link labels.
345
- ///
346
- /// Escapes unmatched closing brackets `]` to prevent accidental link termination.
347
- /// Tracks bracket nesting to avoid escaping matched closing brackets.
348
- ///
349
- /// # Examples
350
- /// ```text
351
- /// Input: "Click [here] for more"
352
- /// Output: "Click [here\\] for more" (closing bracket is escaped because it's unmatched)
353
- ///
354
- /// Input: "Normal text"
355
- /// Output: "Normal text" (no escaping needed)
356
- /// ```
357
- fn escape_link_label(text: &str) -> String {
358
- if text.is_empty() {
359
- return String::new();
360
- }
361
-
362
- let mut result = String::with_capacity(text.len());
363
- let mut backslash_count = 0usize;
364
- let mut bracket_depth = 0usize;
365
-
366
- for ch in text.chars() {
367
- if ch == '\\' {
368
- result.push('\\');
369
- backslash_count += 1;
370
- continue;
371
- }
372
-
373
- let is_escaped = backslash_count % 2 == 1;
374
- backslash_count = 0;
375
-
376
- match ch {
377
- '[' if !is_escaped => {
378
- bracket_depth = bracket_depth.saturating_add(1);
379
- result.push('[');
380
- }
381
- ']' if !is_escaped => {
382
- if bracket_depth == 0 {
383
- result.push('\\');
384
- } else {
385
- bracket_depth -= 1;
386
- }
387
- result.push(']');
388
- }
389
- _ => result.push(ch),
390
- }
391
- }
392
-
393
- result
394
- }
395
-
396
342
  /// Format and append a Markdown link to the output string.
397
343
  ///
398
344
  /// Generates the link syntax: `[label](href "title")`
@@ -462,115 +408,3 @@ pub(crate) fn append_markdown_link(
462
408
 
463
409
  output.push(')');
464
410
  }
465
-
466
- /// Collect text content from direct inline children of a link element.
467
- ///
468
- /// Performs a shallow scan to find text content, distinguishing between:
469
- /// - Inline text (normal flow, accumulated)
470
- /// - Block-level elements (stop at them, mark `saw_block`)
471
- /// - Comments (stop processing)
472
- ///
473
- /// Returns:
474
- /// - `(text, block_nodes, saw_block)` where:
475
- /// - `text` is concatenated inline text
476
- /// - `block_nodes` is list of block-level children found
477
- /// - `saw_block` indicates if any block elements were encountered
478
- ///
479
- /// # Algorithm
480
- /// Uses a stack-based approach to traverse the DOM tree, accumulating text
481
- /// from inline elements while identifying block-level boundaries.
482
- fn collect_link_label_text(
483
- children: &[NodeHandle],
484
- parser: &Parser,
485
- dom_ctx: &DomContext,
486
- ) -> (String, Vec<NodeHandle>, bool) {
487
- let mut text = String::new();
488
- let mut saw_block = false;
489
- let mut block_nodes = Vec::new();
490
- let mut stack: Vec<_> = children.iter().rev().copied().collect();
491
-
492
- while let Some(handle) = stack.pop() {
493
- if let Some(node) = handle.get(parser) {
494
- match node {
495
- tl::Node::Raw(bytes) => {
496
- let raw = bytes.as_utf8_str();
497
- let decoded = crate::text::decode_html_entities_cow(raw.as_ref());
498
- text.push_str(decoded.as_ref());
499
- }
500
- tl::Node::Tag(tag) => {
501
- let is_block = dom_ctx.tag_info(handle.get_inner(), parser).map_or_else(
502
- || {
503
- let tag_name = normalized_tag_name(tag.name().as_utf8_str());
504
- is_block_level_element(tag_name.as_ref())
505
- },
506
- |info| info.is_block,
507
- );
508
- if is_block {
509
- saw_block = true;
510
- block_nodes.push(handle);
511
- continue;
512
- }
513
-
514
- if let Some(children) = dom_ctx.children_of(handle.get_inner()) {
515
- for child in children.iter().rev() {
516
- stack.push(*child);
517
- }
518
- } else {
519
- let tag_children = tag.children();
520
- let mut child_nodes: Vec<_> = tag_children.top().iter().copied().collect();
521
- child_nodes.reverse();
522
- stack.extend(child_nodes);
523
- }
524
- }
525
- _ => {}
526
- }
527
- }
528
- }
529
-
530
- (text, block_nodes, saw_block)
531
- }
532
-
533
- /// Normalize link label text.
534
- ///
535
- /// Collapses line breaks and normalizes whitespace:
536
- /// - Replaces `\n` and `\r` with spaces
537
- /// - Collapses multiple consecutive spaces to single space
538
- /// - Trims leading/trailing whitespace
539
- ///
540
- /// This is required by the Markdown spec for link labels to function properly.
541
- ///
542
- /// # Examples
543
- /// ```text
544
- /// Input: "Line 1\nLine 2"
545
- /// Output: "Line 1 Line 2"
546
- ///
547
- /// Input: "Text with spaces"
548
- /// Output: "Text with spaces"
549
- /// ```
550
- #[allow(clippy::trivially_copy_pass_by_ref)]
551
- fn normalize_link_label(label: &str) -> String {
552
- let mut needs_collapse = false;
553
- for ch in label.chars() {
554
- if ch == '\n' || ch == '\r' {
555
- needs_collapse = true;
556
- break;
557
- }
558
- }
559
-
560
- let collapsed = if needs_collapse {
561
- let mut collapsed = String::with_capacity(label.len());
562
- for ch in label.chars() {
563
- if ch == '\n' || ch == '\r' {
564
- collapsed.push(' ');
565
- } else {
566
- collapsed.push(ch);
567
- }
568
- }
569
- std::borrow::Cow::Owned(collapsed)
570
- } else {
571
- std::borrow::Cow::Borrowed(label)
572
- };
573
-
574
- let normalized = crate::text::normalize_whitespace_cow(collapsed.as_ref());
575
- normalized.as_ref().trim().to_string()
576
- }
@@ -5,7 +5,11 @@
5
5
  //! - Strikethrough (del, s tags) with ~~ syntax
6
6
  //! - Inserted/underlined text (ins, u tags) with == syntax
7
7
 
8
+ #[cfg(feature = "visitor")]
9
+ use crate::converter::utility::content::collect_tag_attributes;
8
10
  use crate::options::{ConversionOptions, OutputFormat};
11
+ #[cfg(feature = "visitor")]
12
+ use std::collections::BTreeMap;
9
13
  use tl::{NodeHandle, Parser};
10
14
 
11
15
  type Context = crate::converter::Context;
@@ -135,14 +139,8 @@ pub fn handle_strikethrough(
135
139
  let strikethrough_output = if let Some(ref visitor_handle) = ctx.visitor {
136
140
  use crate::converter::get_text_content;
137
141
  use crate::visitor::{NodeContext, NodeType, VisitResult};
138
- use std::collections::BTreeMap;
139
-
140
142
  let text_content = get_text_content(node_handle, parser, dom_ctx);
141
- let attributes: BTreeMap<String, String> = tag
142
- .attributes()
143
- .iter()
144
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
145
- .collect();
143
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
146
144
 
147
145
  let node_id = node_handle.get_inner();
148
146
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -262,14 +260,9 @@ pub fn handle_inserted(
262
260
  let underline_output = if let Some(ref visitor_handle) = ctx.visitor {
263
261
  use crate::converter::get_text_content;
264
262
  use crate::visitor::{NodeContext, NodeType, VisitResult};
265
- use std::collections::BTreeMap;
266
263
 
267
264
  let text_content = get_text_content(node_handle, parser, dom_ctx);
268
- let attributes: BTreeMap<String, String> = tag
269
- .attributes()
270
- .iter()
271
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
272
- .collect();
265
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
273
266
 
274
267
  let node_id = node_handle.get_inner();
275
268
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -377,14 +370,9 @@ pub fn handle_underline(
377
370
  if let Some(ref visitor_handle) = ctx.visitor {
378
371
  use crate::converter::get_text_content;
379
372
  use crate::visitor::{NodeContext, NodeType, VisitResult};
380
- use std::collections::BTreeMap;
381
373
 
382
374
  let text_content = get_text_content(node_handle, parser, dom_ctx);
383
- let attributes: BTreeMap<String, String> = tag
384
- .attributes()
385
- .iter()
386
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
387
- .collect();
375
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
388
376
 
389
377
  let node_id = node_handle.get_inner();
390
378
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -8,6 +8,8 @@
8
8
 
9
9
  use crate::converter::main_helpers::tag_name_eq;
10
10
  use crate::converter::main_helpers::trim_trailing_whitespace;
11
+ #[cfg(feature = "visitor")]
12
+ use crate::converter::utility::content::collect_tag_attributes;
11
13
  use crate::converter::utility::content::normalized_tag_name;
12
14
  use crate::converter::walk_node;
13
15
  use crate::options::ConversionOptions;
@@ -216,11 +218,7 @@ pub(crate) fn handle_li(
216
218
  use crate::visitor::{NodeContext, NodeType, VisitResult};
217
219
  use std::collections::BTreeMap;
218
220
 
219
- let attributes: BTreeMap<String, String> = tag
220
- .attributes()
221
- .iter()
222
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
223
- .collect();
221
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
224
222
 
225
223
  let parent_tag = dom_ctx
226
224
  .parent_of(node_handle.get_inner())
@@ -10,6 +10,8 @@ use super::utils::{
10
10
  add_list_leading_separator, add_nested_list_trailing_separator, calculate_list_nesting_depth, is_loose_list,
11
11
  process_list_children,
12
12
  };
13
+ #[cfg(feature = "visitor")]
14
+ use crate::converter::utility::content::collect_tag_attributes;
13
15
  use crate::options::ConversionOptions;
14
16
  #[allow(unused_imports)]
15
17
  use std::collections::BTreeMap;
@@ -60,11 +62,7 @@ pub(crate) fn handle_ol(
60
62
  if let Some(ref visitor_handle) = ctx.visitor {
61
63
  use crate::visitor::{NodeContext, NodeType, VisitResult};
62
64
 
63
- let attributes: BTreeMap<String, String> = tag
64
- .attributes()
65
- .iter()
66
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
67
- .collect();
65
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
68
66
 
69
67
  let parent_tag = dom_ctx
70
68
  .parent_of(node_handle.get_inner())
@@ -129,11 +127,7 @@ pub(crate) fn handle_ol(
129
127
  if let Some(ref visitor_handle) = ctx.visitor {
130
128
  use crate::visitor::{NodeContext, NodeType, VisitResult};
131
129
 
132
- let attributes: BTreeMap<String, String> = tag
133
- .attributes()
134
- .iter()
135
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
136
- .collect();
130
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
137
131
 
138
132
  let parent_tag = dom_ctx
139
133
  .parent_of(node_handle.get_inner())
@@ -10,7 +10,11 @@ use super::utils::{
10
10
  add_list_leading_separator, add_nested_list_trailing_separator, calculate_list_nesting_depth, is_loose_list,
11
11
  process_list_children,
12
12
  };
13
+ #[cfg(feature = "visitor")]
14
+ use crate::converter::utility::content::collect_tag_attributes;
13
15
  use crate::options::ConversionOptions;
16
+ #[cfg(feature = "visitor")]
17
+ use std::collections::BTreeMap;
14
18
  use tl;
15
19
 
16
20
  // Type aliases for Context and DomContext to avoid circular imports
@@ -51,13 +55,8 @@ pub(crate) fn handle_ul(
51
55
  #[cfg(feature = "visitor")]
52
56
  if let Some(ref visitor_handle) = ctx.visitor {
53
57
  use crate::visitor::{NodeContext, NodeType, VisitResult};
54
- use std::collections::BTreeMap;
55
58
 
56
- let attributes: BTreeMap<String, String> = tag
57
- .attributes()
58
- .iter()
59
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
60
- .collect();
59
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
61
60
 
62
61
  let parent_tag = dom_ctx
63
62
  .parent_of(node_handle.get_inner())
@@ -121,13 +120,8 @@ pub(crate) fn handle_ul(
121
120
  #[cfg(feature = "visitor")]
122
121
  if let Some(ref visitor_handle) = ctx.visitor {
123
122
  use crate::visitor::{NodeContext, NodeType, VisitResult};
124
- use std::collections::BTreeMap;
125
123
 
126
- let attributes: BTreeMap<String, String> = tag
127
- .attributes()
128
- .iter()
129
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
130
- .collect();
124
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
131
125
 
132
126
  let parent_tag = dom_ctx
133
127
  .parent_of(node_handle.get_inner())
@@ -3,6 +3,7 @@
3
3
  //! Contains helper functions for loose list detection, indentation calculation,
4
4
  //! list spacing, and list child processing.
5
5
 
6
+ use crate::converter::main_helpers::{tag_name_eq, trim_trailing_whitespace};
6
7
  use crate::options::{ConversionOptions, ListIndentType};
7
8
  use tl;
8
9
 
@@ -11,18 +12,6 @@ use tl;
11
12
  type Context = crate::converter::Context;
12
13
  type DomContext = crate::converter::DomContext;
13
14
 
14
- /// Remove trailing spaces and tabs from output string.
15
- fn trim_trailing_whitespace(output: &mut String) {
16
- while output.ends_with(' ') || output.ends_with('\t') {
17
- output.pop();
18
- }
19
- }
20
-
21
- /// Check if tag names are equal (case-insensitive).
22
- fn tag_name_eq<'a>(a: impl AsRef<str>, b: &str) -> bool {
23
- a.as_ref().eq_ignore_ascii_case(b)
24
- }
25
-
26
15
  /// Calculate indentation level for list item continuations.
27
16
  ///
28
17
  /// Returns the number of 4-space indent groups needed for list continuations.