html-to-markdown 3.2.4 → 3.4.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. checksums.yaml +4 -4
  2. data/Steepfile +6 -0
  3. data/ext/html_to_markdown_rb/Cargo.toml +2 -2
  4. data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
  5. data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
  6. data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
  7. data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
  8. data/lib/bin/html-to-markdown +0 -0
  9. data/lib/html_to_markdown/version.rb +1 -1
  10. data/lib/html_to_markdown.rb +5 -3
  11. data/sig/types.rbs +769 -0
  12. data/vendor/Cargo.toml +2 -2
  13. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  14. data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
  15. data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
  16. data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
  17. data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
  18. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
  19. data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
  20. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
  21. data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
  22. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
  23. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
  25. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
  26. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
  27. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
  29. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
  30. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
  32. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
  38. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
  44. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
  45. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
  46. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
  47. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
  48. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
  49. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
  50. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
  52. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
  53. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
  54. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
  56. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
  57. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
  58. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
  59. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
  60. data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
  61. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
  62. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
  63. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  65. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
  66. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
  67. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
  68. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
  69. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
  70. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
  71. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
  72. data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
  73. data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
  74. data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
  75. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
  76. data/vendor/html-to-markdown-rs/src/text.rs +0 -44
  77. data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
  78. data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
  79. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
  80. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
  81. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
  82. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
  83. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
  84. data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
  85. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
  86. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
  87. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
  88. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
  89. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
  90. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
  91. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
  92. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
  93. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
  94. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
  95. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
  96. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
  97. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
  98. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
  99. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
  100. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
  101. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
  102. data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
  103. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
  104. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
  105. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
  106. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
  107. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
  108. data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
  109. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
  110. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
  111. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
  112. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
  113. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
  114. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
  115. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
  116. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
  117. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
  118. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
  119. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
  120. metadata +21 -43
  121. data/.bundle/config +0 -2
  122. data/.gitignore +0 -3
  123. data/.rubocop.yml +0 -59
  124. data/Gemfile +0 -18
  125. data/Gemfile.lock +0 -173
  126. data/README.md +0 -331
  127. data/Rakefile +0 -26
  128. data/exe/html-to-markdown +0 -6
  129. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
  130. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
  131. data/html-to-markdown-rb.gemspec +0 -99
  132. data/lib/html_to_markdown_rs.rb +0 -3
  133. data/sig/html_to_markdown.rbs +0 -149
  134. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
  135. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
  136. data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  #[test]
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  /// Regression test for <https://github.com/kreuzberg-dev/html-to-markdown/issues/212>
@@ -11,7 +11,7 @@ fn convert(
11
11
  html: &str,
12
12
  opts: Option<html_to_markdown_rs::ConversionOptions>,
13
13
  ) -> html_to_markdown_rs::error::Result<String> {
14
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
14
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
15
15
  }
16
16
 
17
17
  /// Minimal reproducer: a <details> containing a <p> with <strong> inside.
@@ -14,7 +14,7 @@ fn extracts_json_ld_from_head_script() {
14
14
  </html>
15
15
  "#;
16
16
 
17
- let result = html_to_markdown_rs::convert(html, None).expect("convert failed");
17
+ let result = html_to_markdown_rs::convert(html, None, None).expect("convert failed");
18
18
  let metadata = result.metadata;
19
19
 
20
20
  assert_eq!(metadata.structured_data.len(), 1);
@@ -35,7 +35,7 @@ fn extracts_json_ld_from_body_script_and_keeps_content() {
35
35
  </html>
36
36
  "#;
37
37
 
38
- let result = html_to_markdown_rs::convert(html, None).expect("convert failed");
38
+ let result = html_to_markdown_rs::convert(html, None, None).expect("convert failed");
39
39
  let metadata = result.metadata;
40
40
 
41
41
  assert_eq!(metadata.structured_data.len(), 1);
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use html_to_markdown_rs::ConversionOptions;
@@ -3,7 +3,7 @@ fn convert(
3
3
  html: &str,
4
4
  opts: Option<html_to_markdown_rs::ConversionOptions>,
5
5
  ) -> html_to_markdown_rs::error::Result<String> {
6
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
6
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
7
7
  }
8
8
 
9
9
  use html_to_markdown_rs::{ConversionOptions, OutputFormat};
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use html_to_markdown_rs::ConversionOptions;
@@ -3,7 +3,7 @@
3
3
  use html_to_markdown_rs::{ConversionOptions, LinkStyle};
4
4
 
5
5
  fn convert(html: &str, options: Option<ConversionOptions>) -> String {
6
- html_to_markdown_rs::convert(html, options)
6
+ html_to_markdown_rs::convert(html, options, None)
7
7
  .unwrap()
8
8
  .content
9
9
  .unwrap_or_default()
@@ -0,0 +1,137 @@
1
+ #![allow(missing_docs)]
2
+
3
+ fn convert(html: &str) -> String {
4
+ html_to_markdown_rs::convert(html, None, None)
5
+ .map(|r| r.content.unwrap_or_default())
6
+ .expect("conversion should succeed")
7
+ }
8
+
9
+ // --- header ---
10
+
11
+ #[test]
12
+ fn test_h1_inside_header() {
13
+ let html = "<header><h1>Title in header not exported???</h1></header>";
14
+ let result = convert(html);
15
+ assert_eq!(result, "# Title in header not exported???\n");
16
+ }
17
+
18
+ #[test]
19
+ fn test_paragraph_inside_header() {
20
+ let html = "<header><p>Intro text</p></header>";
21
+ let result = convert(html);
22
+ assert_eq!(result, "Intro text\n");
23
+ }
24
+
25
+ #[test]
26
+ fn test_header_with_nested_elements() {
27
+ let html = "<header><h1>Title</h1><p>Subtitle</p></header>";
28
+ let result = convert(html);
29
+ assert!(result.contains("# Title"), "Should contain h1: {result}");
30
+ assert!(result.contains("Subtitle"), "Should contain paragraph: {result}");
31
+ }
32
+
33
+ // --- footer ---
34
+
35
+ #[test]
36
+ fn test_paragraph_inside_footer() {
37
+ let html = "<footer><p>Footer content</p></footer>";
38
+ let result = convert(html);
39
+ assert_eq!(result, "Footer content\n");
40
+ }
41
+
42
+ // --- main ---
43
+
44
+ #[test]
45
+ fn test_h2_inside_main() {
46
+ let html = "<main><h2>Main heading</h2></main>";
47
+ let result = convert(html);
48
+ assert_eq!(result, "## Main heading\n");
49
+ }
50
+
51
+ // --- article ---
52
+
53
+ #[test]
54
+ fn test_article_with_header_and_section() {
55
+ let html = "<article><header><h1>Title</h1></header><section><p>Content here</p></section></article>";
56
+ let result = convert(html);
57
+ assert!(result.contains("# Title"), "Should contain heading: {result}");
58
+ assert!(result.contains("Content here"), "Should contain content: {result}");
59
+ }
60
+
61
+ // --- section ---
62
+
63
+ #[test]
64
+ fn test_heading_inside_section() {
65
+ let html = "<section><h2>Section Heading</h2><p>Section body</p></section>";
66
+ let result = convert(html);
67
+ assert!(result.contains("## Section Heading"), "Should contain h2: {result}");
68
+ assert!(result.contains("Section body"), "Should contain body: {result}");
69
+ }
70
+
71
+ // --- nav ---
72
+
73
+ #[test]
74
+ fn test_nav_dropped_by_default() {
75
+ // nav is dropped by default when remove_navigation is true (the default)
76
+ let html = r#"<nav><a href="/home">Home</a><a href="/about">About</a></nav>"#;
77
+ let result = convert(html);
78
+ assert!(result.is_empty(), "nav should be dropped by default: '{result}'");
79
+ }
80
+
81
+ #[test]
82
+ fn test_nav_preserved_when_remove_navigation_disabled() {
83
+ use html_to_markdown_rs::{ConversionOptions, PreprocessingOptions};
84
+ let opts = ConversionOptions {
85
+ preprocessing: PreprocessingOptions {
86
+ remove_navigation: false,
87
+ ..Default::default()
88
+ },
89
+ ..Default::default()
90
+ };
91
+ let html = r#"<nav><a href="/home">Home</a></nav>"#;
92
+ let result = html_to_markdown_rs::convert(html, Some(opts), None)
93
+ .map(|r| r.content.unwrap_or_default())
94
+ .expect("conversion should succeed");
95
+ assert!(
96
+ result.contains("Home"),
97
+ "nav should pass through when remove_navigation=false: '{result}'"
98
+ );
99
+ }
100
+
101
+ // --- aside ---
102
+
103
+ #[test]
104
+ fn test_paragraph_inside_aside() {
105
+ let html = "<aside><p>Side note</p></aside>";
106
+ let result = convert(html);
107
+ assert_eq!(result, "Side note\n");
108
+ }
109
+
110
+ // --- navigation-hinted header should still be dropped ---
111
+
112
+ #[test]
113
+ fn test_site_chrome_header_dropped() {
114
+ // A <header> with class="site-header" is site chrome and should be removed
115
+ let html = r#"<header class="site-header"><a href="/">Logo</a></header><p>Content</p>"#;
116
+ let result = convert(html);
117
+ assert!(
118
+ !result.contains("Logo"),
119
+ "site-chrome header should be dropped: '{result}'"
120
+ );
121
+ assert!(
122
+ result.contains("Content"),
123
+ "body content should be preserved: '{result}'"
124
+ );
125
+ }
126
+
127
+ #[test]
128
+ fn test_header_with_role_navigation_dropped() {
129
+ // A <header role="navigation"> is nav chrome and should be removed
130
+ let html = r#"<header role="navigation"><a href="/">Home</a></header><p>Body</p>"#;
131
+ let result = convert(html);
132
+ assert!(
133
+ !result.contains("Home"),
134
+ "navigation header should be dropped: '{result}'"
135
+ );
136
+ assert!(result.contains("Body"), "body content should be preserved: '{result}'");
137
+ }
@@ -518,5 +518,5 @@ fn convert(
518
518
  html: &str,
519
519
  opts: Option<html_to_markdown_rs::ConversionOptions>,
520
520
  ) -> html_to_markdown_rs::error::Result<String> {
521
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
521
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
522
522
  }
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use html_to_markdown_rs::ConversionOptions;
@@ -718,7 +718,7 @@ fn test_table_colspan_no_header_issue_233() {
718
718
  <td>Cell 2</td>
719
719
  </tr>
720
720
  </table>"#;
721
- let result = html_to_markdown_rs::convert(html, None)
721
+ let result = html_to_markdown_rs::convert(html, None, None)
722
722
  .unwrap()
723
723
  .content
724
724
  .unwrap_or_default();
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use std::fs;
@@ -7,7 +7,7 @@
7
7
 
8
8
  #![cfg(feature = "visitor")]
9
9
 
10
- use html_to_markdown_rs::convert_with_visitor;
10
+ use html_to_markdown_rs::convert;
11
11
  use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
12
12
  use std::cell::RefCell;
13
13
  use std::rc::Rc;
@@ -147,7 +147,10 @@ fn test_issue_187_content_filter() {
147
147
  "#;
148
148
 
149
149
  let visitor = Rc::new(RefCell::new(ContentFilter::default()));
150
- let result = convert_with_visitor(html, None, Some(visitor.clone())).unwrap();
150
+ let result = convert(html, None, Some(visitor.clone()))
151
+ .unwrap()
152
+ .content
153
+ .unwrap_or_default();
151
154
 
152
155
  println!("Converted Markdown:\n{result}");
153
156
  println!("\nSkipped Elements:");
@@ -9,7 +9,7 @@
9
9
  use std::cell::RefCell;
10
10
  use std::rc::Rc;
11
11
 
12
- use html_to_markdown_rs::convert_with_visitor;
12
+ use html_to_markdown_rs::convert;
13
13
  use html_to_markdown_rs::visitor::HtmlVisitor;
14
14
 
15
15
  /// Empty visitor — does nothing, just uses default implementations.
@@ -26,7 +26,7 @@ fn make_visitor() -> Rc<RefCell<dyn HtmlVisitor>> {
26
26
  fn test_cyrillic_with_tabs_between_divs_and_visitor() {
27
27
  // Exact reproduction from the issue
28
28
  let html = "<div><span>А</span></div>\t\t\t<div><span>По";
29
- let result = convert_with_visitor(html, None, Some(make_visitor()));
29
+ let result = convert(html, None, Some(make_visitor()));
30
30
  assert!(result.is_ok(), "Should not panic: {result:?}");
31
31
  }
32
32
 
@@ -40,7 +40,7 @@ fn test_multibyte_utf8_with_tabs_and_visitor() {
40
40
  ];
41
41
 
42
42
  for html in &cases {
43
- let result = convert_with_visitor(html, None, Some(make_visitor()));
43
+ let result = convert(html, None, Some(make_visitor()));
44
44
  assert!(result.is_ok(), "Should not panic for: {html}\nError: {result:?}");
45
45
  }
46
46
  }
@@ -50,7 +50,7 @@ fn test_cyrillic_with_varying_tab_counts_and_visitor() {
50
50
  for n in 1..=5 {
51
51
  let tabs = "\t".repeat(n);
52
52
  let html = format!("<div><span>А</span></div>{tabs}<div><span>По");
53
- let result = convert_with_visitor(&html, None, Some(make_visitor()));
53
+ let result = convert(&html, None, Some(make_visitor()));
54
54
  assert!(result.is_ok(), "Should not panic with {n} tabs: {result:?}");
55
55
  }
56
56
  }
@@ -0,0 +1,77 @@
1
+ #![allow(missing_docs)]
2
+
3
+ //! Regression test for issue #277: silent truncation on large HTML inputs.
4
+ //!
5
+ //! The bug was caused by `repair_with_html5ever` re-introducing `<script>` elements
6
+ //! that had already been stripped, and `preprocess_html` failing to find the closing
7
+ //! tag when script content contained unbalanced literal `<script>` strings.
8
+
9
+ fn convert(html: &str) -> String {
10
+ html_to_markdown_rs::convert(html, None, None)
11
+ .expect("conversion should not fail")
12
+ .content
13
+ .unwrap_or_default()
14
+ }
15
+
16
+ /// When custom elements trigger html5ever repair, scripts must be re-stripped.
17
+ /// Without the fix, content after a script with unbalanced `<script>` literals
18
+ /// would be silently truncated.
19
+ #[test]
20
+ fn test_no_truncation_after_repair_with_scripts() {
21
+ // Custom element triggers repair_with_html5ever
22
+ // Script content has an unbalanced literal `<script>` that confuses depth tracking
23
+ let html = r"<html>
24
+ <head>
25
+ <script>
26
+ var example = '<script>';
27
+ console.log(example);
28
+ </script>
29
+ </head>
30
+ <body>
31
+ <custom-widget>widget</custom-widget>
32
+ <p>Content before</p>
33
+ <p>Content after scripts that must not be truncated</p>
34
+ <p>Final paragraph</p>
35
+ </body>
36
+ </html>";
37
+
38
+ let result = convert(html);
39
+ assert!(
40
+ result.contains("Content before"),
41
+ "Should contain content before script region"
42
+ );
43
+ assert!(
44
+ result.contains("Content after scripts that must not be truncated"),
45
+ "Content after scripts should NOT be silently truncated. Got:\n{result}"
46
+ );
47
+ assert!(
48
+ result.contains("Final paragraph"),
49
+ "Final content should be present. Got:\n{result}"
50
+ );
51
+ }
52
+
53
+ /// Ensure `preprocess_html` doesn't truncate the rest of the document when
54
+ /// `find_closing_tag` returns None (unmatched script opening).
55
+ #[test]
56
+ fn test_preprocess_unmatched_script_preserves_remaining_content() {
57
+ // Even without custom elements, preprocess_html's unwrap_or fallback
58
+ // should not consume the entire rest of the document.
59
+ let html = r"<html><body>
60
+ <p>Before</p>
61
+ <script>var x = '<script>'; var y = '<script>';</script>
62
+ <p>After first script</p>
63
+ <script>var z = 1;</script>
64
+ <p>After second script</p>
65
+ </body></html>";
66
+
67
+ let result = convert(html);
68
+ assert!(result.contains("Before"), "Content before scripts should be present");
69
+ assert!(
70
+ result.contains("After first script"),
71
+ "Content after first script should be present. Got:\n{result}"
72
+ );
73
+ assert!(
74
+ result.contains("After second script"),
75
+ "Content after second script should be present. Got:\n{result}"
76
+ );
77
+ }
@@ -0,0 +1,82 @@
1
+ #![allow(missing_docs)]
2
+
3
+ //! Tests for the `max_depth` recursion-safety option.
4
+
5
+ use html_to_markdown_rs::ConversionOptions;
6
+
7
+ fn convert_with_options(html: &str, options: ConversionOptions) -> String {
8
+ html_to_markdown_rs::convert(html, Some(options), None)
9
+ .expect("conversion should not fail")
10
+ .content
11
+ .unwrap_or_default()
12
+ }
13
+
14
+ /// With the default `max_depth: None`, deeply nested content should be fully converted.
15
+ #[test]
16
+ fn test_max_depth_none_converts_deeply_nested() {
17
+ // Build 100 levels of nesting around a leaf text node.
18
+ let mut html = String::from("<p>deep</p>");
19
+ for _ in 0..100 {
20
+ html = format!("<div>{html}</div>");
21
+ }
22
+
23
+ let options = ConversionOptions {
24
+ extract_metadata: false,
25
+ max_depth: None,
26
+ ..Default::default()
27
+ };
28
+
29
+ let result = convert_with_options(&html, options);
30
+ assert!(
31
+ result.contains("deep"),
32
+ "Deeply nested text should be present when max_depth is None. Got:\n{result}"
33
+ );
34
+ }
35
+
36
+ /// With `max_depth: Some(3)`, nodes at depth 3 or deeper are not visited, so
37
+ /// their text content is excluded from the output.
38
+ #[test]
39
+ fn test_max_depth_truncates_at_limit() {
40
+ // Depth counting (each handler passes depth+1 to its children):
41
+ // depth 0: outer <div> — visited
42
+ // depth 1: <p> — visited, paragraph handler passes depth+1 to children
43
+ // depth 2: "shallow" — visited (2 < 3), appears in output
44
+ // depth 1: inner <div> — visited, div handler passes depth+1 to children
45
+ // depth 2: <p> — visited, paragraph handler passes depth+1 to children
46
+ // depth 3: "deep" — skipped (3 >= 3), absent from output
47
+ let html = "<div><p>shallow</p><div><p>deep</p></div></div>";
48
+
49
+ let options = ConversionOptions {
50
+ extract_metadata: false,
51
+ max_depth: Some(3),
52
+ ..Default::default()
53
+ };
54
+
55
+ let result = convert_with_options(html, options);
56
+ assert!(
57
+ result.contains("shallow"),
58
+ "Content at depth < max_depth should be present. Got:\n{result}"
59
+ );
60
+ assert!(
61
+ !result.contains("deep"),
62
+ "Content at depth >= max_depth should be absent. Got:\n{result}"
63
+ );
64
+ }
65
+
66
+ /// With `max_depth: Some(0)`, no nodes are processed and the output is empty or whitespace only.
67
+ #[test]
68
+ fn test_max_depth_zero_produces_empty() {
69
+ let html = "<p>hello</p>";
70
+
71
+ let options = ConversionOptions {
72
+ extract_metadata: false,
73
+ max_depth: Some(0),
74
+ ..Default::default()
75
+ };
76
+
77
+ let result = convert_with_options(html, options);
78
+ assert!(
79
+ result.trim().is_empty(),
80
+ "max_depth: Some(0) should produce no output. Got:\n{result}"
81
+ );
82
+ }
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use std::fs;
@@ -111,7 +111,7 @@ fn test_preserve_json_ld_script() {
111
111
  </body>
112
112
  </html>"#;
113
113
 
114
- let result = html_to_markdown_rs::convert(html, None).expect("Failed to convert");
114
+ let result = html_to_markdown_rs::convert(html, None, None).expect("Failed to convert");
115
115
  let metadata = result.metadata;
116
116
  let markdown = result.content.unwrap_or_default();
117
117
 
@@ -164,7 +164,7 @@ fn test_multiple_script_tags() {
164
164
  </body>
165
165
  </html>"#;
166
166
 
167
- let result = html_to_markdown_rs::convert(html, None).expect("Failed to convert");
167
+ let result = html_to_markdown_rs::convert(html, None, None).expect("Failed to convert");
168
168
  let metadata = result.metadata;
169
169
  let markdown = result.content.unwrap_or_default();
170
170
 
@@ -221,7 +221,7 @@ fn test_reuters_like_structure() {
221
221
  </body>
222
222
  </html>"#;
223
223
 
224
- let result = html_to_markdown_rs::convert(html, None).expect("Failed to convert");
224
+ let result = html_to_markdown_rs::convert(html, None, None).expect("Failed to convert");
225
225
  let metadata = result.metadata;
226
226
  let markdown = result.content.unwrap_or_default();
227
227
 
@@ -392,5 +392,5 @@ fn convert(
392
392
  html: &str,
393
393
  opts: Option<html_to_markdown_rs::ConversionOptions>,
394
394
  ) -> html_to_markdown_rs::error::Result<String> {
395
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
395
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
396
396
  }
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use std::fs;
@@ -1,7 +1,7 @@
1
1
  #![allow(missing_docs)]
2
2
  #![cfg(feature = "visitor")]
3
3
 
4
- use html_to_markdown_rs::convert_with_visitor;
4
+ use html_to_markdown_rs::convert;
5
5
  use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
6
6
  use std::cell::RefCell;
7
7
  use std::rc::Rc;
@@ -33,7 +33,7 @@ fn test_code_block_visitor() {
33
33
  inline_codes: vec![],
34
34
  }));
35
35
 
36
- let result = convert_with_visitor(html, None, Some(visitor.clone()));
36
+ let result = convert(html, None, Some(visitor.clone()));
37
37
  assert!(result.is_ok());
38
38
 
39
39
  let visitor_ref = visitor.borrow();
@@ -49,7 +49,7 @@ fn test_inline_code_visitor() {
49
49
  inline_codes: vec![],
50
50
  }));
51
51
 
52
- let result = convert_with_visitor(html, None, Some(visitor.clone()));
52
+ let result = convert(html, None, Some(visitor.clone()));
53
53
  assert!(result.is_ok());
54
54
 
55
55
  let visitor_ref = visitor.borrow();
@@ -71,9 +71,9 @@ fn test_code_block_skip() {
71
71
  let html = "<pre><code>skipped code</code></pre>";
72
72
  let visitor = Rc::new(RefCell::new(SkipCodeVisitor));
73
73
 
74
- let result = convert_with_visitor(html, None, Some(visitor));
74
+ let result = convert(html, None, Some(visitor));
75
75
  assert!(result.is_ok());
76
- let markdown = result.unwrap();
76
+ let markdown = result.unwrap().content.unwrap_or_default();
77
77
  assert!(!markdown.contains("skipped code"));
78
78
  }
79
79
 
@@ -97,7 +97,7 @@ fn test_code_block_language_detection() {
97
97
  inline_codes: vec![],
98
98
  }));
99
99
 
100
- let result = convert_with_visitor(html, None, Some(visitor.clone()));
100
+ let result = convert(html, None, Some(visitor.clone()));
101
101
  assert!(result.is_ok(), "Failed to convert: {html}");
102
102
 
103
103
  let visitor_ref = visitor.borrow();