html-to-markdown 2.29.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166):
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -15,14 +15,14 @@ use std::collections::BTreeMap;
15
15
 
16
16
  use crate::converter::dom_context::DomContext;
17
17
  use crate::converter::main_helpers::{
18
- extract_head_metadata, format_metadata_frontmatter, handle_hocr_document, has_custom_element_tags,
19
- repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace,
18
+ extract_head_metadata, format_metadata_frontmatter, has_custom_element_tags, repair_with_html5ever,
19
+ trim_line_end_whitespace, trim_trailing_whitespace,
20
20
  };
21
21
  use crate::converter::plain_text::extract_plain_text;
22
22
  use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing};
23
23
  use crate::converter::utility::caching::build_dom_context;
24
24
  use crate::converter::utility::content::normalized_tag_name;
25
- use crate::converter::utility::preprocessing::{preprocess_html, strip_script_and_style_tags};
25
+ use crate::converter::utility::preprocessing::{preprocess_html, strip_hidden_elements, strip_script_and_style_tags};
26
26
  use crate::converter::utility::serialization::serialize_tag_to_html;
27
27
  use crate::options::OutputFormat;
28
28
 
@@ -31,12 +31,13 @@ use crate::error::Result;
31
31
  use crate::options::ConversionOptions;
32
32
 
33
33
  use crate::converter::context::{Context, InlineCollectorHandle};
34
+ use crate::types::structure_collector::StructureCollectorHandle;
34
35
 
35
36
  /// Converts HTML to Markdown using the provided conversion options.
36
37
  ///
37
38
  /// This is the main entry point for HTML to Markdown conversion.
38
39
  pub fn convert_html(html: &str, options: &ConversionOptions) -> Result<String> {
39
- convert_html_impl(html, options, None, None, None)
40
+ convert_html_impl(html, options, None, None, None, None).map(|(md, _)| md)
40
41
  }
41
42
 
42
43
  /// Converts HTML to Markdown with a custom visitor for callbacks during traversal.
@@ -49,26 +50,13 @@ pub fn convert_html_with_visitor(
49
50
  options: &ConversionOptions,
50
51
  visitor: Option<crate::visitor::VisitorHandle>,
51
52
  ) -> Result<String> {
52
- convert_html_impl(html, options, None, None, visitor)
53
- }
54
-
55
- /// Converts HTML to Markdown with an async visitor for callbacks during traversal.
56
- ///
57
- /// Async variant with async visitor callbacks for Promise-based bindings.
58
- #[cfg(feature = "async-visitor")]
59
- #[allow(clippy::future_not_send)]
60
- pub async fn convert_html_with_visitor_async(
61
- html: &str,
62
- options: &ConversionOptions,
63
- visitor: Option<crate::visitor_helpers::AsyncVisitorHandle>,
64
- ) -> Result<String> {
65
- convert_html_impl_async(html, options, None, None, visitor).await
53
+ convert_html_impl(html, options, None, None, visitor, None).map(|(md, _)| md)
66
54
  }
67
55
 
68
56
  /// Internal implementation of HTML to Markdown conversion.
69
57
  ///
70
- /// This function handles the actual conversion logic with optional inline image collection,
71
- /// metadata extraction, and visitor callbacks depending on enabled features.
58
+ /// Returns `(markdown, Option<DocumentStructure>)`. The structure is populated when
59
+ /// `options.include_document_structure == true` and a `structure_collector` handle is provided.
72
60
  #[cfg_attr(
73
61
  any(not(feature = "inline-images"), not(feature = "metadata"), not(feature = "visitor")),
74
62
  allow(unused_variables)
@@ -82,10 +70,13 @@ pub(crate) fn convert_html_impl(
82
70
  #[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
83
71
  #[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
84
72
  #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
85
- ) -> Result<String> {
73
+ structure_collector: Option<StructureCollectorHandle>,
74
+ ) -> Result<(String, Option<crate::types::DocumentStructure>)> {
86
75
  // Strip script and style tags completely to prevent parser confusion from HTML-like content
87
76
  // inside script/style elements. This preserves JSON-LD for metadata extraction.
88
77
  let stripped = strip_script_and_style_tags(html);
78
+ // Strip elements with the `hidden` attribute before parsing.
79
+ let stripped = strip_hidden_elements(&stripped);
89
80
  let mut preprocessed = preprocess_html(&stripped).into_owned();
90
81
  let mut preprocessed_len = preprocessed.len();
91
82
 
@@ -113,11 +104,6 @@ pub(crate) fn convert_html_impl(
113
104
  let mut parser = dom.parser();
114
105
  let mut output = String::with_capacity(preprocessed_len.saturating_add(preprocessed_len / 4));
115
106
 
116
- // Check and handle hOCR documents
117
- if handle_hocr_document(&dom, parser, options, &mut output) {
118
- return Ok(output);
119
- }
120
-
121
107
  let mut dom_ctx = build_dom_context(&dom, parser, preprocessed_len);
122
108
 
123
109
  // Check for inline-block misnesting and repair if needed
@@ -211,13 +197,37 @@ pub(crate) fn convert_html_impl(
211
197
  }
212
198
 
213
199
  #[cfg(all(feature = "metadata", feature = "visitor"))]
214
- let ctx = Context::new(options, inline_collector, metadata_collector, visitor);
200
+ let ctx = Context::new(
201
+ options,
202
+ inline_collector,
203
+ metadata_collector,
204
+ visitor,
205
+ structure_collector.as_ref().map(std::rc::Rc::clone),
206
+ );
215
207
  #[cfg(all(feature = "metadata", not(feature = "visitor")))]
216
- let ctx = Context::new(options, inline_collector, metadata_collector, _visitor);
208
+ let ctx = Context::new(
209
+ options,
210
+ inline_collector,
211
+ metadata_collector,
212
+ _visitor,
213
+ structure_collector.as_ref().map(std::rc::Rc::clone),
214
+ );
217
215
  #[cfg(all(not(feature = "metadata"), feature = "visitor"))]
218
- let ctx = Context::new(options, inline_collector, _metadata_collector, visitor);
216
+ let ctx = Context::new(
217
+ options,
218
+ inline_collector,
219
+ _metadata_collector,
220
+ visitor,
221
+ structure_collector.as_ref().map(std::rc::Rc::clone),
222
+ );
219
223
  #[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
220
- let ctx = Context::new(options, inline_collector, _metadata_collector, _visitor);
224
+ let ctx = Context::new(
225
+ options,
226
+ inline_collector,
227
+ _metadata_collector,
228
+ _visitor,
229
+ structure_collector.as_ref().map(std::rc::Rc::clone),
230
+ );
221
231
 
222
232
  for child_handle in dom.children() {
223
233
  walk_node(child_handle, parser, &mut output, options, &ctx, 0, &dom_ctx);
@@ -228,20 +238,32 @@ pub(crate) fn convert_html_impl(
228
238
  return Err(crate::error::ConversionError::Visitor(err.clone()));
229
239
  }
230
240
 
241
+ // Drop ctx before unwrapping the structure collector Rc — ctx holds a cloned Rc
242
+ // reference to the same collector, and Rc::try_unwrap requires exactly one reference.
243
+ drop(ctx);
244
+
231
245
  // If plain text was requested, discard the markdown output and return plain text.
232
246
  // The full pipeline was still run above so that metadata + visitor callbacks fire.
233
247
  if is_plain_text {
234
248
  let plain = extract_plain_text(&dom, parser, options);
235
- return Ok(plain);
249
+ let document =
250
+ structure_collector.and_then(|sc| std::rc::Rc::try_unwrap(sc).ok().map(|cell| cell.into_inner().finish()));
251
+ return Ok((plain, document));
236
252
  }
237
253
 
238
254
  trim_line_end_whitespace(&mut output);
239
255
  let trimmed = output.trim_end_matches('\n');
240
- if trimmed.is_empty() {
241
- Ok(String::new())
256
+ let markdown = if trimmed.is_empty() {
257
+ String::new()
242
258
  } else {
243
- Ok(format!("{trimmed}\n"))
244
- }
259
+ format!("{trimmed}\n")
260
+ };
261
+
262
+ // Finish the structure collector if present.
263
+ let document =
264
+ structure_collector.and_then(|sc| std::rc::Rc::try_unwrap(sc).ok().map(|cell| cell.into_inner().finish()));
265
+
266
+ Ok((markdown, document))
245
267
  }
246
268
  // has_more_than_one_char moved to main_helpers
247
269
  // is_inline_element available from utility::content
@@ -473,6 +495,34 @@ pub(crate) fn walk_node(
473
495
  );
474
496
  }
475
497
 
498
+ // Quote element routed to semantic dispatcher
499
+ "q" => {
500
+ crate::converter::semantic::dispatch_semantic_handler(
501
+ &tag_name,
502
+ node_handle,
503
+ parser,
504
+ output,
505
+ options,
506
+ ctx,
507
+ depth,
508
+ dom_ctx,
509
+ );
510
+ }
511
+
512
+ // Figure elements routed to semantic dispatcher
513
+ "figure" | "figcaption" => {
514
+ crate::converter::semantic::dispatch_semantic_handler(
515
+ &tag_name,
516
+ node_handle,
517
+ parser,
518
+ output,
519
+ options,
520
+ ctx,
521
+ depth,
522
+ dom_ctx,
523
+ );
524
+ }
525
+
476
526
  // Semantic interactive elements routed to semantic dispatcher
477
527
  "details" | "summary" | "dialog" | "menu" => {
478
528
  crate::converter::semantic::dispatch_semantic_handler(
@@ -569,24 +619,3 @@ pub(crate) fn walk_node(
569
619
  tl::Node::Comment(_) => {}
570
620
  }
571
621
  }
572
- /// Async equivalent of `convert_html_impl` for Promise-based visitor callbacks.
573
- #[cfg(feature = "async-visitor")]
574
- #[allow(clippy::future_not_send)]
575
- pub(crate) async fn convert_html_impl_async(
576
- html: &str,
577
- options: &ConversionOptions,
578
- _inline_collector: Option<InlineCollectorHandle>,
579
- #[cfg(feature = "metadata")] _metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
580
- #[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
581
- visitor: Option<crate::visitor_helpers::AsyncVisitorHandle>,
582
- ) -> Result<String> {
583
- if visitor.is_some() {
584
- return Err(crate::error::ConversionError::ParseError(
585
- "Async visitor not yet implemented. Use AsyncToSyncVisitorBridge.".to_string(),
586
- ));
587
- }
588
- #[cfg(feature = "visitor")]
589
- return convert_html_impl(html, options, _inline_collector, _metadata_collector, None);
590
- #[cfg(not(feature = "visitor"))]
591
- return convert_html_impl(html, options, _inline_collector, _metadata_collector, ());
592
- }
@@ -145,8 +145,8 @@ pub fn extract_head_metadata(
145
145
  if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
146
146
  // Look for meta tags
147
147
  if child_tag.name().as_utf8_str().eq_ignore_ascii_case("meta")
148
- && !options.strip_tags.contains(&"meta".to_string())
149
- && !options.preserve_tags.contains(&"meta".to_string())
148
+ && !options.strip_tags.iter().any(|t| t == "meta")
149
+ && !options.preserve_tags.iter().any(|t| t == "meta")
150
150
  {
151
151
  if let (Some(name), Some(content)) = (
152
152
  child_tag.attributes().get("name").flatten(),
@@ -168,8 +168,8 @@ pub fn extract_head_metadata(
168
168
  }
169
169
  // Look for title tag
170
170
  if child_tag.name().as_utf8_str().eq_ignore_ascii_case("title")
171
- && !options.strip_tags.contains(&"title".to_string())
172
- && !options.preserve_tags.contains(&"title".to_string())
171
+ && !options.strip_tags.iter().any(|t| t == "title")
172
+ && !options.preserve_tags.iter().any(|t| t == "title")
173
173
  {
174
174
  // Extract text content from title tag
175
175
  let mut title_content = String::new();
@@ -284,66 +284,3 @@ pub fn is_inline_element(tag_name: &str) -> bool {
284
284
  | "meter"
285
285
  )
286
286
  }
287
-
288
- /// Handle hOCR document conversion, returning true if handled, false if not hOCR.
289
- pub fn handle_hocr_document(
290
- dom: &tl::VDom<'_>,
291
- parser: &tl::Parser<'_>,
292
- options: &ConversionOptions,
293
- output: &mut String,
294
- ) -> bool {
295
- use crate::converter::utility::attributes::{is_hocr_document, may_be_hocr};
296
- use crate::hocr::{convert_to_markdown_with_options as convert_hocr_to_markdown, extract_hocr_document};
297
-
298
- let preprocessed = dom.outer_html();
299
- if !may_be_hocr(preprocessed.as_ref()) {
300
- return false;
301
- }
302
-
303
- let mut is_hocr = false;
304
- for child_handle in dom.children() {
305
- if is_hocr_document(*child_handle, parser) {
306
- is_hocr = true;
307
- break;
308
- }
309
- }
310
-
311
- if !is_hocr {
312
- return false;
313
- }
314
-
315
- let (elements, metadata) = extract_hocr_document(dom);
316
-
317
- if options.extract_metadata && !options.convert_as_inline {
318
- let mut metadata_map = BTreeMap::new();
319
- if let Some(system) = metadata.ocr_system {
320
- metadata_map.insert("ocr-system".to_string(), system);
321
- }
322
- if !metadata.ocr_capabilities.is_empty() {
323
- metadata_map.insert("ocr-capabilities".to_string(), metadata.ocr_capabilities.join(", "));
324
- }
325
- if let Some(pages) = metadata.ocr_number_of_pages {
326
- metadata_map.insert("ocr-number-of-pages".to_string(), pages.to_string());
327
- }
328
- if !metadata.ocr_langs.is_empty() {
329
- metadata_map.insert("ocr-langs".to_string(), metadata.ocr_langs.join(", "));
330
- }
331
- if !metadata.ocr_scripts.is_empty() {
332
- metadata_map.insert("ocr-scripts".to_string(), metadata.ocr_scripts.join(", "));
333
- }
334
-
335
- if !metadata_map.is_empty() {
336
- output.push_str(&format_metadata_frontmatter(&metadata_map));
337
- }
338
- }
339
-
340
- let mut markdown = convert_hocr_to_markdown(&elements, true, options.hocr_spatial_tables);
341
-
342
- if !markdown.trim().is_empty() {
343
- markdown.truncate(markdown.trim_end().len());
344
- output.push_str(&markdown);
345
- output.push('\n');
346
- }
347
-
348
- true
349
- }
@@ -12,6 +12,7 @@ use tl::{HTMLTag, NodeHandle, Parser};
12
12
 
13
13
  use crate::converter::Context;
14
14
  use crate::converter::dom_context::DomContext;
15
+ use crate::converter::main_helpers::tag_name_eq;
15
16
  use crate::options::ConversionOptions;
16
17
 
17
18
  /// Extract src attribute from media element (audio, video, iframe).
@@ -46,11 +47,6 @@ pub(crate) fn is_source_element(tag: &HTMLTag) -> bool {
46
47
  tag_name_eq(tag.name().as_utf8_str(), "source")
47
48
  }
48
49
 
49
- /// Compare tag name with needle (case-insensitive).
50
- fn tag_name_eq<'a>(name: impl AsRef<str>, needle: &str) -> bool {
51
- name.as_ref().eq_ignore_ascii_case(needle)
52
- }
53
-
54
50
  /// Determine if media should output source link in markdown.
55
51
  ///
56
52
  /// Returns true if src is non-empty.
@@ -1,41 +1,4 @@
1
1
  //! Graphic element handling (custom graphic elements with alternative source attributes).
2
-
3
- use std::borrow::Cow;
4
- use tl::HTMLTag;
5
-
6
- /// Handle custom graphic elements with multiple source attribute options.
7
- ///
8
- /// The graphic element is a custom XML element that supports multiple source attributes:
9
- /// - `url` (primary)
10
- /// - `href` (secondary)
11
- /// - `xlink:href` (SVG standard)
12
- /// - `src` (fallback)
13
- ///
14
- /// This is commonly used in publishing formats like EPUB.
15
- pub(crate) fn extract_graphic_src<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
16
- tag.attributes()
17
- .get("url")
18
- .flatten()
19
- .or_else(|| tag.attributes().get("href").flatten())
20
- .or_else(|| tag.attributes().get("xlink:href").flatten())
21
- .or_else(|| tag.attributes().get("src").flatten())
22
- .map_or_else(|| Cow::Borrowed(""), |v| v.as_utf8_str())
23
- }
24
-
25
- /// Extract alt text from graphic element with fallback to filename.
26
- pub(crate) fn extract_graphic_alt<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
27
- tag.attributes()
28
- .get("alt")
29
- .flatten()
30
- .map(|v| v.as_utf8_str())
31
- .or_else(|| tag.attributes().get("filename").flatten().map(|v| v.as_utf8_str()))
32
- .unwrap_or_else(|| Cow::Borrowed(""))
33
- }
34
-
35
- /// Get source attributes to skip during metadata collection.
36
- ///
37
- /// These attributes are handled specially and should not be included
38
- /// in the generic attributes map.
39
- pub(crate) fn should_skip_graphic_attr(key_str: &str) -> bool {
40
- matches!(key_str, "url" | "href" | "xlink:href" | "src")
41
- }
2
+ //!
3
+ //! The `<graphic>` element is a custom XML element used in publishing formats like EPUB.
4
+ //! Conversion logic lives in `crate::converter::handlers::graphic`.
@@ -171,14 +171,6 @@ pub(crate) fn handle_inline_data_image(
171
171
  collector.push_image(index, image);
172
172
  }
173
173
 
174
- /// Check if heading tag allows inline images based on configuration.
175
- pub(crate) fn heading_allows_inline_images(
176
- tag_name: &str,
177
- keep_inline_images_in: &std::collections::HashSet<String>,
178
- ) -> bool {
179
- keep_inline_images_in.contains(tag_name)
180
- }
181
-
182
174
  /// Extract non-empty trimmed string or return None.
183
175
  #[cfg(feature = "inline-images")]
184
176
  fn non_empty_trimmed(value: &str) -> Option<String> {
@@ -1,5 +1,7 @@
1
1
  //! SVG and MathML element handling with serialization and base64 encoding.
2
2
 
3
+ use crate::converter::main_helpers::tag_name_eq;
4
+ use crate::converter::utility::content::normalized_tag_name;
3
5
  #[allow(unused_imports)]
4
6
  use std::collections::BTreeMap;
5
7
  use tl::{NodeHandle, Parser};
@@ -93,7 +95,7 @@ pub(crate) fn handle_inline_svg(
93
95
  #[allow(clippy::trivially_copy_pass_by_ref)]
94
96
  pub(crate) fn serialize_element(node_handle: &NodeHandle, parser: &Parser) -> String {
95
97
  if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
96
- let tag_name = normalized_tag_name(tag.name().as_utf8_str().as_ref());
98
+ let tag_name = normalized_tag_name(tag.name().as_utf8_str());
97
99
  let mut html = String::with_capacity(256);
98
100
  html.push('<');
99
101
  html.push_str(&tag_name);
@@ -142,11 +144,6 @@ pub(crate) fn serialize_node(node_handle: &NodeHandle, parser: &Parser) -> Strin
142
144
  }
143
145
  }
144
146
 
145
- /// Normalize tag name to lowercase.
146
- fn normalized_tag_name(name: &str) -> String {
147
- name.to_ascii_lowercase()
148
- }
149
-
150
147
  /// Extract non-empty trimmed string or return None.
151
148
  #[cfg(feature = "inline-images")]
152
149
  fn non_empty_trimmed(value: &str) -> Option<String> {
@@ -158,12 +155,6 @@ fn non_empty_trimmed(value: &str) -> Option<String> {
158
155
  }
159
156
  }
160
157
 
161
- /// Encode SVG to base64 data URI.
162
- pub(crate) fn encode_svg_to_data_uri(svg_html: &str) -> String {
163
- use base64::{Engine as _, engine::general_purpose::STANDARD};
164
- STANDARD.encode(svg_html.as_bytes())
165
- }
166
-
167
158
  /// Handle SVG element conversion to Markdown.
168
159
  ///
169
160
  /// Extracts title from child elements, handles inline image collection,
@@ -179,7 +170,6 @@ pub(crate) fn handle_svg(
179
170
  _depth: usize,
180
171
  dom_ctx: &super::DomContext,
181
172
  ) {
182
- use crate::converter::main_helpers::tag_name_eq;
183
173
  use crate::converter::utility::content::get_text_content;
184
174
 
185
175
  let mut title = String::from("SVG Image");
@@ -100,7 +100,7 @@ fn handle_head(
100
100
  let json = child_tag.inner_text(parser);
101
101
  let json = json.trim();
102
102
  if !json.is_empty() {
103
- let json = decode_html_entities(json).clone();
103
+ let json = decode_html_entities(json);
104
104
  if !json.is_empty() {
105
105
  collector.borrow_mut().add_json_ld(json);
106
106
  }
@@ -68,7 +68,6 @@
68
68
  //! - Inline image extraction (`inline-images` feature)
69
69
  //! - Metadata collection (`metadata` feature)
70
70
  //! - Custom visitor callbacks (`visitor` feature)
71
- //! - Async visitor support (`async-visitor` feature)
72
71
  //!
73
72
  //! # Example Integration
74
73
  //!
@@ -120,17 +119,10 @@ pub use self::main::convert_html;
120
119
  #[cfg(feature = "visitor")]
121
120
  pub use self::main::convert_html_with_visitor;
122
121
 
123
- #[cfg(feature = "async-visitor")]
124
- pub use self::main::convert_html_with_visitor_async;
125
-
126
122
  // Import the tree walker and utility functions from main and main_helpers
127
123
  pub(crate) use self::main::{convert_html_impl, walk_node};
128
124
  pub(crate) use self::main_helpers::trim_trailing_whitespace;
129
125
 
130
- #[cfg(feature = "async-visitor")]
131
- #[allow(unused_imports)]
132
- pub(crate) use self::main::convert_html_impl_async;
133
-
134
126
  // Re-export helper functions from utility modules (migrated from converter_legacy)
135
127
  pub(crate) use crate::converter::utility::content::{chomp_inline, get_text_content, normalized_tag_name};
136
128
  #[allow(unused_imports)]
@@ -292,22 +292,47 @@ fn ensure_newline(buf: &mut String) {
292
292
  }
293
293
  }
294
294
 
295
- /// Post-process: collapse 3+ newlines to 2, trim line-end whitespace, ensure single trailing newline.
296
- fn post_process(buf: &mut String) {
297
- // Collapse runs of 3+ newlines to exactly 2
298
- while buf.contains("\n\n\n") {
299
- *buf = buf.replace("\n\n\n", "\n\n");
295
/// Collapse runs of 3 or more consecutive newlines to exactly 2 in a single pass.
///
/// Runs of one or two newlines are left untouched. All other characters are
/// copied through unchanged, including multi-byte UTF-8 sequences.
///
/// # Arguments
///
/// * `buf` - The buffer to normalize; it is replaced with the collapsed text.
fn collapse_triple_newlines(buf: &mut String) {
    // Nothing to collapse: skip the allocation and the copy entirely.
    if !buf.contains("\n\n\n") {
        return;
    }

    let mut result = String::with_capacity(buf.len());
    let mut newline_count = 0usize;
    // Iterate over `char`s rather than raw bytes: casting a single UTF-8
    // continuation/lead byte with `as char` would reinterpret it as a Latin-1
    // code point and corrupt every non-ASCII character in the buffer.
    for ch in buf.chars() {
        if ch == '\n' {
            newline_count += 1;
            // Keep only the first two newlines of any run.
            if newline_count <= 2 {
                result.push('\n');
            }
        } else {
            newline_count = 0;
            result.push(ch);
        }
    }
    *buf = result;
}
301
313
 
302
- // Trim trailing whitespace from each line collect owned strings to avoid borrow conflict
303
- let lines: Vec<String> = buf.lines().map(|line| line.trim_end().to_string()).collect();
304
- buf.clear();
305
- for (i, line) in lines.iter().enumerate() {
306
- buf.push_str(line);
307
- if i < lines.len() - 1 {
308
- buf.push('\n');
314
/// Trim trailing whitespace from every line in a buffer without allocating per-line strings.
///
/// Uses a single allocation of the same capacity, writing each line's trimmed content
/// and inserting `\n` separators between lines. Blank lines, including leading ones,
/// are preserved as empty lines.
///
/// Lines are produced by [`str::lines`], so a final line terminator is not
/// reproduced in the output.
///
/// # Arguments
///
/// * `buf` - The buffer to normalize; it is replaced with the trimmed text.
fn trim_line_ends(buf: &mut String) {
    let mut result = String::with_capacity(buf.len());
    for (index, line) in buf.lines().enumerate() {
        // Key the separator on the line index, not on whether anything has been
        // written yet: otherwise leading blank lines would be silently dropped.
        if index > 0 {
            result.push('\n');
        }
        result.push_str(line.trim_end());
    }
    *buf = result;
}
328
+
329
+ /// Post-process: collapse 3+ newlines to 2, trim line-end whitespace, ensure single trailing newline.
330
+ fn post_process(buf: &mut String) {
331
+ // Collapse runs of 3+ newlines to exactly 2
332
+ collapse_triple_newlines(buf);
333
+
334
+ // Trim trailing whitespace from each line (rebuilds the buffer in a single pass)
335
+ trim_line_ends(buf);
311
336
 
312
337
  // Trim to single trailing newline
313
338
  let keep = buf.trim_end_matches('\n').len();
@@ -15,6 +15,8 @@
15
15
  // Note: Context and DomContext are defined in converter.rs
16
16
  // walk_node is also defined there and must be called via the parent module
17
17
 
18
+ use crate::converter::utility::content::chomp_inline;
19
+
18
20
  /// Handles the `<dfn>` element.
19
21
  ///
20
22
  /// A dfn element marks a term that is being defined. The content represents
@@ -220,14 +222,9 @@ pub fn handle_q(
220
222
 
221
223
  let trimmed = content.trim();
222
224
  if !trimmed.is_empty() {
223
- if ctx.convert_as_inline {
224
- output.push_str(trimmed);
225
- } else {
226
- output.push('"');
227
- let escaped = trimmed.replace('\\', r"\\").replace('"', r#"\""#);
228
- output.push_str(&escaped);
229
- output.push('"');
230
- }
225
+ output.push('"');
226
+ output.push_str(trimmed);
227
+ output.push('"');
231
228
  }
232
229
  }
233
230
  }
@@ -256,28 +253,6 @@ pub fn handle(
256
253
  }
257
254
  }
258
255
 
259
- /// Extracts prefix, suffix, and trimmed content from inline element text.
260
- ///
261
- /// This helper function splits leading and trailing whitespace from content,
262
- /// allowing inline elements to preserve surrounding whitespace context.
263
- ///
264
- /// # Returns
265
- ///
266
- /// A tuple of `(prefix, suffix, trimmed_content)` where:
267
- /// - `prefix`: Leading whitespace (spaces, tabs, newlines)
268
- /// - `suffix`: Trailing whitespace (spaces, tabs, newlines)
269
- /// - `trimmed_content`: The content without leading/trailing whitespace
270
- fn chomp_inline(content: &str) -> (&str, &str, &str) {
271
- let trimmed = content.trim();
272
- let prefix_len = content.len() - content.trim_start().len();
273
- let suffix_len = content.len() - content.trim_end().len();
274
-
275
- let prefix = &content[..prefix_len];
276
- let suffix = &content[content.len() - suffix_len..];
277
-
278
- (prefix, suffix, trimmed)
279
- }
280
-
281
256
  /// Appends inline suffix to the output.
282
257
  ///
283
258
  /// This is a placeholder for integrating with other inline formatting systems
@@ -164,3 +164,32 @@ pub fn handle(
164
164
  _ => {}
165
165
  }
166
166
  }
167
+
168
#[cfg(test)]
mod tests {
    /// A `<figcaption>` must be rendered on a later line than the image it describes.
    #[test]
    fn figure_caption_separated_from_image() {
        let html = r#"<figure><img src="photo.jpg" alt="Photo"><figcaption>A nice photo</figcaption></figure>"#;
        let markdown = crate::convert(html, None).unwrap().content.unwrap_or_default();

        // Both pieces of the figure have to survive the conversion.
        assert!(
            markdown.contains("![Photo](photo.jpg)"),
            "image should be present: {}",
            markdown
        );
        assert!(
            markdown.contains("A nice photo"),
            "caption should be present: {}",
            markdown
        );

        // Compare positions among non-blank lines only, so blank separator
        // lines between the image and the caption do not affect the check.
        let non_blank: Vec<&str> = markdown.lines().filter(|line| !line.trim().is_empty()).collect();
        let line_index = |needle: &str| {
            non_blank
                .iter()
                .position(|line| line.contains(needle))
                .unwrap_or(999)
        };
        let image_at = line_index("![");
        let caption_at = line_index("A nice photo");
        assert!(
            caption_at > image_at,
            "caption should be on a separate line after image, lines: {:?}",
            non_blank
        );
    }
}