html-to-markdown 2.29.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -1,6 +1,13 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
10
+ use html_to_markdown_rs::ConversionOptions;
4
11
 
5
12
  #[test]
6
13
  fn test_br_inside_bold_tags() {
@@ -1,13 +1,6 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- //! `CommonMark` Specification Compliance Tests
4
- //!
5
- //! This test suite verifies that our HTML-to-Markdown converter produces
6
- //! CommonMark-compliant output by testing against the official `CommonMark` spec.
7
- //!
8
- //! The test cases are derived from <https://spec.commonmark.org>/
9
-
10
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ use html_to_markdown_rs::ConversionOptions;
11
4
  use serde::Deserialize;
12
5
 
13
6
  #[derive(Debug, Deserialize)]
@@ -289,3 +282,10 @@ fn test_commonmark_compliance() {
289
282
  fn normalize_markdown(md: &str) -> String {
290
283
  md.trim_end().to_string()
291
284
  }
285
+
286
+ fn convert(
287
+ html: &str,
288
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
289
+ ) -> html_to_markdown_rs::error::Result<String> {
290
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
291
+ }
@@ -1,6 +1,12 @@
1
- //! Tests for Djot output format support.
1
+ #![allow(missing_docs)]
2
+ fn convert(
3
+ html: &str,
4
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
5
+ ) -> html_to_markdown_rs::error::Result<String> {
6
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ }
2
8
 
3
- use html_to_markdown_rs::{ConversionOptions, OutputFormat, convert};
9
+ use html_to_markdown_rs::{ConversionOptions, OutputFormat};
4
10
 
5
11
  fn djot_options() -> ConversionOptions {
6
12
  ConversionOptions {
@@ -1,11 +1,6 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- //! Integration tests for HTML to Markdown conversion.
4
- //!
5
- //! These tests verify end-to-end conversion of various HTML elements
6
- //! to ensure correct Markdown output.
7
-
8
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ use html_to_markdown_rs::ConversionOptions;
9
4
 
10
5
  #[test]
11
6
  fn test_basic_paragraph() {
@@ -580,3 +575,25 @@ fn test_nested_bold_issue_111() {
580
575
  let result = convert(html, None).unwrap();
581
576
  assert_eq!(result, "**bolder**\n");
582
577
  }
578
+
579
+ #[test]
580
+ fn hidden_elements_stripped() {
581
+ let html = "<p>visible</p><div hidden>secret</div><p>also visible</p>";
582
+ let result = convert(html, None).unwrap();
583
+ assert!(!result.contains("secret"));
584
+ assert!(result.contains("visible"));
585
+ }
586
+
587
+ #[test]
588
+ fn q_element_produces_quotes() {
589
+ let html = "<p>He said <q>hello</q> to me</p>";
590
+ let result = convert(html, None).unwrap();
591
+ assert!(result.contains(r#""hello""#), "q element should add quotes: {result}");
592
+ }
593
+
594
+ fn convert(
595
+ html: &str,
596
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
597
+ ) -> html_to_markdown_rs::error::Result<String> {
598
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
599
+ }
@@ -1,9 +1,16 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
3
10
  use std::fs;
4
11
  use std::path::PathBuf;
5
12
 
6
- use html_to_markdown_rs::{ConversionOptions, convert};
13
+ use html_to_markdown_rs::ConversionOptions;
7
14
 
8
15
  fn fixture_path(name: &str) -> PathBuf {
9
16
  [env!("CARGO_MANIFEST_DIR"), "../../test_documents/html/issues", name]
@@ -1,11 +1,18 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
3
10
  use std::fs;
4
11
  use std::path::PathBuf;
5
12
 
6
13
  use html_to_markdown_rs::{
7
14
  CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, PreprocessingOptions,
8
- PreprocessingPreset, WhitespaceMode, convert,
15
+ PreprocessingPreset, WhitespaceMode,
9
16
  };
10
17
 
11
18
  fn fixture_path(name: &str) -> PathBuf {
@@ -27,7 +34,6 @@ fn issue_127_options() -> ConversionOptions {
27
34
  code_block_style: CodeBlockStyle::Backticks,
28
35
  strip_newlines: true,
29
36
  extract_metadata: false,
30
- hocr_spatial_tables: true,
31
37
  preprocessing: PreprocessingOptions {
32
38
  enabled: true,
33
39
  preset: PreprocessingPreset::Minimal,
@@ -1,6 +1,11 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  #[test]
6
11
  fn images_with_dimensions_render_as_markdown_links() {
@@ -1,7 +1,14 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
10
+ use html_to_markdown_rs::ConversionOptions;
3
11
  use html_to_markdown_rs::options::WhitespaceMode;
4
- use html_to_markdown_rs::{ConversionOptions, convert};
5
12
 
6
13
  #[test]
7
14
  fn link_flattens_block_children_issue_131() {
@@ -1,9 +1,16 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
3
10
  use std::fs;
4
11
  use std::path::PathBuf;
5
12
 
6
- use html_to_markdown_rs::{ConversionOptions, convert};
13
+ use html_to_markdown_rs::ConversionOptions;
7
14
 
8
15
  fn fixture_path(name: &str) -> PathBuf {
9
16
  [env!("CARGO_MANIFEST_DIR"), "../../test_documents/html/issues", name]
@@ -1,6 +1,13 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
10
+ use html_to_markdown_rs::ConversionOptions;
4
11
 
5
12
  #[test]
6
13
  fn long_multibyte_link_label_does_not_panic() {
@@ -1,9 +1,16 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
3
10
  use std::fs;
4
11
  use std::path::PathBuf;
5
12
 
6
- use html_to_markdown_rs::{ConversionOptions, convert};
13
+ use html_to_markdown_rs::ConversionOptions;
7
14
 
8
15
  fn fixture_path(name: &str) -> PathBuf {
9
16
  [env!("CARGO_MANIFEST_DIR"), "../../test_documents/html/issues", name]
@@ -1,9 +1,16 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
3
10
  use std::fs;
4
11
  use std::path::PathBuf;
5
12
 
6
- use html_to_markdown_rs::{ConversionOptions, convert};
13
+ use html_to_markdown_rs::ConversionOptions;
7
14
 
8
15
  fn fixture_path(name: &str) -> PathBuf {
9
16
  [env!("CARGO_MANIFEST_DIR"), "../../test_documents/html/issues", name]
@@ -1,12 +1,6 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- //! Issue #145 Regression Tests
4
- //!
5
- //! Tests for ensuring that `strip_newlines=True` doesn't cause excessive whitespace
6
- //! around block elements. The root cause was that newlines were converted to spaces
7
- //! BEFORE whitespace-only node detection, causing the detection to fail.
8
-
9
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ use html_to_markdown_rs::ConversionOptions;
10
4
 
11
5
  #[test]
12
6
  fn test_strip_newlines_preserves_block_spacing() {
@@ -130,3 +124,10 @@ fn test_strip_newlines_handles_nested_blocks() {
130
124
  "excessive blank lines in nested blocks: {max_consecutive_blank} consecutive blanks in:\n{result}"
131
125
  );
132
126
  }
127
+
128
+ fn convert(
129
+ html: &str,
130
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
131
+ ) -> html_to_markdown_rs::error::Result<String> {
132
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
133
+ }
@@ -1,12 +1,6 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- //! Issue #146 Regression Tests
4
- //!
5
- //! Tests for ensuring that `strip_tags` and `preserve_tags` properly prevent
6
- //! `<meta>` and `<title>` tags from appearing in YAML frontmatter when metadata
7
- //! extraction is enabled.
8
-
9
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ use html_to_markdown_rs::ConversionOptions;
10
4
 
11
5
  #[test]
12
6
  fn test_strip_tags_prevents_metadata_extraction() {
@@ -141,3 +135,10 @@ fn test_preserve_tags_prevents_metadata_extraction() {
141
135
  "meta-author should NOT be in YAML frontmatter when preserve_tags=['meta']: {result}"
142
136
  );
143
137
  }
138
+
139
+ fn convert(
140
+ html: &str,
141
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
142
+ ) -> html_to_markdown_rs::error::Result<String> {
143
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
144
+ }
@@ -4,7 +4,12 @@
4
4
 
5
5
  #[test]
6
6
  fn test_strong_blockquote_strong_newlines() {
7
- use html_to_markdown_rs::convert;
7
+ fn convert(
8
+ html: &str,
9
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
10
+ ) -> html_to_markdown_rs::error::Result<String> {
11
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
12
+ }
8
13
 
9
14
  // Test case from issue #176: strong + blockquote + strong
10
15
  let html = r"<strong>2. Point two</strong><blockquote>Option Explicit
@@ -30,7 +35,12 @@ End Function</blockquote><strong>3. Point three</strong>";
30
35
 
31
36
  #[test]
32
37
  fn test_paragraph_blockquote_paragraph_newlines() {
33
- use html_to_markdown_rs::convert;
38
+ fn convert(
39
+ html: &str,
40
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
41
+ ) -> html_to_markdown_rs::error::Result<String> {
42
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
43
+ }
34
44
 
35
45
  // Control test: p + blockquote + p should work correctly
36
46
  let html = r"<p>First paragraph</p><blockquote>A quote</blockquote><p>Second paragraph</p>";
@@ -1,9 +1,16 @@
1
1
  //! Regression coverage for issue #190.
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
3
10
  use std::fs;
4
11
  use std::path::PathBuf;
5
12
 
6
- use html_to_markdown_rs::{CodeBlockStyle, ConversionOptions, convert};
13
+ use html_to_markdown_rs::{CodeBlockStyle, ConversionOptions};
7
14
 
8
15
  fn fixture_path(name: &str) -> PathBuf {
9
16
  [
@@ -1,6 +1,11 @@
1
1
  //! Regression coverage for issue #199.
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  #[test]
6
11
  fn test_link_label_is_not_truncated() {
@@ -1,6 +1,11 @@
1
1
  //! Regression coverage for issues #200 and #214.
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  #[test]
6
11
  fn test_definition_list_spacing_consistency() {
@@ -1,6 +1,11 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- use html_to_markdown_rs::convert;
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
4
9
 
5
10
  /// Regression test for <https://github.com/kreuzberg-dev/html-to-markdown/issues/212>
6
11
  ///
@@ -7,7 +7,12 @@
7
7
  //! fresh String buffer while inheriting a parent context with `block_content_start`
8
8
  //! set by a paragraph handler, the index points into the wrong buffer.
9
9
 
10
- use html_to_markdown_rs::convert;
10
+ fn convert(
11
+ html: &str,
12
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
13
+ ) -> html_to_markdown_rs::error::Result<String> {
14
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
15
+ }
11
16
 
12
17
  /// Minimal reproducer: a <details> containing a <p> with <strong> inside.
13
18
  /// The <details> handler collects into a fresh buffer, the <p> sets
@@ -1,7 +1,5 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- use html_to_markdown_rs::metadata::MetadataConfig;
4
-
5
3
  #[test]
6
4
  fn extracts_json_ld_from_head_script() {
7
5
  let html = r#"
@@ -16,8 +14,8 @@ fn extracts_json_ld_from_head_script() {
16
14
  </html>
17
15
  "#;
18
16
 
19
- let (_markdown, metadata) = html_to_markdown_rs::convert_with_metadata(html, None, MetadataConfig::default(), None)
20
- .expect("convert_with_metadata failed");
17
+ let result = html_to_markdown_rs::convert(html, None).expect("convert failed");
18
+ let metadata = result.metadata;
21
19
 
22
20
  assert_eq!(metadata.structured_data.len(), 1);
23
21
  assert!(metadata.structured_data[0].raw_json.contains(r#""@type": "Article""#));
@@ -37,8 +35,8 @@ fn extracts_json_ld_from_body_script_and_keeps_content() {
37
35
  </html>
38
36
  "#;
39
37
 
40
- let (_markdown, metadata) = html_to_markdown_rs::convert_with_metadata(html, None, MetadataConfig::default(), None)
41
- .expect("convert_with_metadata failed");
38
+ let result = html_to_markdown_rs::convert(html, None).expect("convert failed");
39
+ let metadata = result.metadata;
42
40
 
43
41
  assert_eq!(metadata.structured_data.len(), 1);
44
42
  assert!(!metadata.structured_data[0].raw_json.trim().is_empty());
@@ -1,6 +1,13 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
10
+ use html_to_markdown_rs::ConversionOptions;
4
11
 
5
12
  #[test]
6
13
  fn test_basic_unordered_list() {
@@ -1,6 +1,12 @@
1
- //! Tests for plain text output format support.
1
+ #![allow(missing_docs)]
2
+ fn convert(
3
+ html: &str,
4
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
5
+ ) -> html_to_markdown_rs::error::Result<String> {
6
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ }
2
8
 
3
- use html_to_markdown_rs::{ConversionOptions, OutputFormat, convert};
9
+ use html_to_markdown_rs::{ConversionOptions, OutputFormat};
4
10
 
5
11
  fn plain_options() -> ConversionOptions {
6
12
  ConversionOptions {
@@ -1,6 +1,13 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
10
+ use html_to_markdown_rs::ConversionOptions;
4
11
 
5
12
  #[test]
6
13
  fn footer_without_navigation_hint_is_preserved() {
@@ -1,16 +1,6 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- //! Tests for the `skip_images` functionality.
4
- //!
5
- //! This test suite verifies that the `skip_images` option correctly omits all `<img>` tags
6
- //! from the markdown output when enabled, while preserving all other content.
7
- //!
8
- //! The `skip_images` option is useful for:
9
- //! - Text-only extraction from HTML documents
10
- //! - Filtering out visual content for accessibility or reduced bandwidth
11
- //! - Converting image-heavy documents to plain text markdown
12
-
13
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ use html_to_markdown_rs::ConversionOptions;
14
4
 
15
5
  #[test]
16
6
  fn test_skip_images_enabled() {
@@ -523,3 +513,10 @@ fn test_skip_images_preserves_links_and_formatting() {
523
513
  // Should not contain image
524
514
  assert!(!result.contains("![Ignored]"), "Should not contain image");
525
515
  }
516
+
517
+ fn convert(
518
+ html: &str,
519
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
520
+ ) -> html_to_markdown_rs::error::Result<String> {
521
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
522
+ }
@@ -1,6 +1,13 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- use html_to_markdown_rs::{ConversionOptions, convert};
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
10
+ use html_to_markdown_rs::ConversionOptions;
4
11
 
5
12
  #[test]
6
13
  fn test_basic_table() {
@@ -711,7 +718,10 @@ fn test_table_colspan_no_header_issue_233() {
711
718
  <td>Cell 2</td>
712
719
  </tr>
713
720
  </table>"#;
714
- let result = html_to_markdown_rs::convert(html, None).unwrap();
721
+ let result = html_to_markdown_rs::convert(html, None)
722
+ .unwrap()
723
+ .content
724
+ .unwrap_or_default();
715
725
  assert!(result.contains("| Cell spanning 2 columns | |"));
716
726
  assert!(result.contains("| Cell 1 | Cell 2 |"));
717
727
  }
@@ -1,9 +1,16 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
3
10
  use std::fs;
4
11
  use std::path::PathBuf;
5
12
 
6
- use html_to_markdown_rs::{ConversionOptions, convert};
13
+ use html_to_markdown_rs::ConversionOptions;
7
14
 
8
15
  fn fixture_path(name: &str) -> PathBuf {
9
16
  [env!("CARGO_MANIFEST_DIR"), "../../test_documents/html/issues", name]
@@ -1,9 +1,16 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
+ fn convert(
4
+ html: &str,
5
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
6
+ ) -> html_to_markdown_rs::error::Result<String> {
7
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
8
+ }
9
+
3
10
  use std::fs;
4
11
  use std::path::PathBuf;
5
12
 
6
- use html_to_markdown_rs::{ConversionOptions, convert};
13
+ use html_to_markdown_rs::ConversionOptions;
7
14
 
8
15
  fn fixture_path(name: &str) -> PathBuf {
9
16
  [env!("CARGO_MANIFEST_DIR"), "../../test_documents/html/issues", name]
@@ -1,12 +1,6 @@
1
1
  #![allow(missing_docs)]
2
2
 
3
- //! Tests for script and style tag stripping before parsing.
4
- //!
5
- //! This test suite verifies that script and style tags are completely removed
6
- //! from HTML before parsing, preventing the tl parser from misinterpreting
7
- //! HTML-like content inside scripts as actual tags.
8
-
9
- use html_to_markdown_rs::{ConversionOptions, MetadataConfig, convert, convert_with_metadata};
3
+ use html_to_markdown_rs::ConversionOptions;
10
4
 
11
5
  #[test]
12
6
  fn test_strip_simple_script_tag() {
@@ -117,13 +111,9 @@ fn test_preserve_json_ld_script() {
117
111
  </body>
118
112
  </html>"#;
119
113
 
120
- let options = ConversionOptions {
121
- extract_metadata: true,
122
- ..Default::default()
123
- };
124
-
125
- let (markdown, metadata) =
126
- convert_with_metadata(html, Some(options), MetadataConfig::default(), None).expect("Failed to convert");
114
+ let result = html_to_markdown_rs::convert(html, None).expect("Failed to convert");
115
+ let metadata = result.metadata;
116
+ let markdown = result.content.unwrap_or_default();
127
117
 
128
118
  println!("Markdown:\n{markdown}");
129
119
  println!("Metadata: {:?}", metadata.document.title);
@@ -174,13 +164,9 @@ fn test_multiple_script_tags() {
174
164
  </body>
175
165
  </html>"#;
176
166
 
177
- let options = ConversionOptions {
178
- extract_metadata: true,
179
- ..Default::default()
180
- };
181
-
182
- let (markdown, metadata) =
183
- convert_with_metadata(html, Some(options), MetadataConfig::default(), None).expect("Failed to convert");
167
+ let result = html_to_markdown_rs::convert(html, None).expect("Failed to convert");
168
+ let metadata = result.metadata;
169
+ let markdown = result.content.unwrap_or_default();
184
170
 
185
171
  println!("Markdown:\n{markdown}");
186
172
 
@@ -235,13 +221,9 @@ fn test_reuters_like_structure() {
235
221
  </body>
236
222
  </html>"#;
237
223
 
238
- let options = ConversionOptions {
239
- extract_metadata: true,
240
- ..Default::default()
241
- };
242
-
243
- let (markdown, metadata) =
244
- convert_with_metadata(html, Some(options), MetadataConfig::default(), None).expect("Failed to convert");
224
+ let result = html_to_markdown_rs::convert(html, None).expect("Failed to convert");
225
+ let metadata = result.metadata;
226
+ let markdown = result.content.unwrap_or_default();
245
227
 
246
228
  println!("Markdown output:\n{markdown}");
247
229
  println!("Metadata title: {:?}", metadata.document.title);
@@ -405,3 +387,10 @@ fn test_inline_script_attributes_not_affected() {
405
387
  "Should remove script tag content"
406
388
  );
407
389
  }
390
+
391
+ fn convert(
392
+ html: &str,
393
+ opts: Option<html_to_markdown_rs::ConversionOptions>,
394
+ ) -> html_to_markdown_rs::error::Result<String> {
395
+ html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
396
+ }