html-to-markdown 3.2.4 → 3.4.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. checksums.yaml +4 -4
  2. data/Steepfile +6 -0
  3. data/ext/html_to_markdown_rb/Cargo.toml +2 -2
  4. data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
  5. data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
  6. data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
  7. data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
  8. data/lib/bin/html-to-markdown +0 -0
  9. data/lib/html_to_markdown/version.rb +1 -1
  10. data/lib/html_to_markdown.rb +5 -3
  11. data/sig/types.rbs +769 -0
  12. data/vendor/Cargo.toml +2 -2
  13. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  14. data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
  15. data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
  16. data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
  17. data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
  18. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
  19. data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
  20. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
  21. data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
  22. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
  23. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
  25. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
  26. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
  27. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
  29. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
  30. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
  32. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
  38. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
  44. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
  45. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
  46. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
  47. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
  48. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
  49. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
  50. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
  52. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
  53. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
  54. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
  56. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
  57. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
  58. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
  59. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
  60. data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
  61. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
  62. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
  63. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  65. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
  66. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
  67. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
  68. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
  69. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
  70. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
  71. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
  72. data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
  73. data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
  74. data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
  75. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
  76. data/vendor/html-to-markdown-rs/src/text.rs +0 -44
  77. data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
  78. data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
  79. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
  80. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
  81. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
  82. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
  83. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
  84. data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
  85. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
  86. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
  87. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
  88. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
  89. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
  90. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
  91. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
  92. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
  93. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
  94. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
  95. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
  96. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
  97. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
  98. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
  99. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
  100. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
  101. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
  102. data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
  103. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
  104. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
  105. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
  106. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
  107. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
  108. data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
  109. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
  110. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
  111. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
  112. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
  113. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
  114. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
  115. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
  116. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
  117. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
  118. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
  119. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
  120. metadata +21 -43
  121. data/.bundle/config +0 -2
  122. data/.gitignore +0 -3
  123. data/.rubocop.yml +0 -59
  124. data/Gemfile +0 -18
  125. data/Gemfile.lock +0 -173
  126. data/README.md +0 -331
  127. data/Rakefile +0 -26
  128. data/exe/html-to-markdown +0 -6
  129. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
  130. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
  131. data/html-to-markdown-rb.gemspec +0 -99
  132. data/lib/html_to_markdown_rs.rb +0 -3
  133. data/sig/html_to_markdown.rbs +0 -149
  134. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
  135. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
  136. data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
@@ -7,7 +7,7 @@ use crate::converter::DomContext;
7
7
 
8
8
  /// Get the tag name of the next sibling element.
9
9
  #[allow(clippy::trivially_copy_pass_by_ref)]
10
- pub(crate) fn get_next_sibling_tag<'a>(
10
+ pub fn get_next_sibling_tag<'a>(
11
11
  node_handle: &tl::NodeHandle,
12
12
  parser: &'a tl::Parser,
13
13
  dom_ctx: &'a DomContext,
@@ -17,7 +17,7 @@ pub(crate) fn get_next_sibling_tag<'a>(
17
17
 
18
18
  /// Get the tag name of the previous sibling element.
19
19
  #[allow(clippy::trivially_copy_pass_by_ref)]
20
- pub(crate) fn get_previous_sibling_tag<'a>(
20
+ pub fn get_previous_sibling_tag<'a>(
21
21
  node_handle: &tl::NodeHandle,
22
22
  parser: &tl::Parser,
23
23
  dom_ctx: &'a DomContext,
@@ -53,17 +53,13 @@ pub(crate) fn get_previous_sibling_tag<'a>(
53
53
 
54
54
  /// Check if the previous sibling is an inline tag.
55
55
  #[allow(clippy::trivially_copy_pass_by_ref)]
56
- pub(crate) fn previous_sibling_is_inline_tag(
57
- node_handle: &tl::NodeHandle,
58
- parser: &tl::Parser,
59
- dom_ctx: &DomContext,
60
- ) -> bool {
56
+ pub fn previous_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
61
57
  dom_ctx.previous_inline_like(*node_handle, parser)
62
58
  }
63
59
 
64
60
  /// Check if the next sibling is whitespace-only text.
65
61
  #[allow(clippy::trivially_copy_pass_by_ref)]
66
- pub(crate) fn next_sibling_is_whitespace_text(
62
+ pub fn next_sibling_is_whitespace_text(
67
63
  node_handle: &tl::NodeHandle,
68
64
  parser: &tl::Parser,
69
65
  dom_ctx: &DomContext,
@@ -73,11 +69,7 @@ pub(crate) fn next_sibling_is_whitespace_text(
73
69
 
74
70
  /// Check if the next sibling is an inline tag.
75
71
  #[allow(clippy::trivially_copy_pass_by_ref)]
76
- pub(crate) fn next_sibling_is_inline_tag(
77
- node_handle: &tl::NodeHandle,
78
- parser: &tl::Parser,
79
- dom_ctx: &DomContext,
80
- ) -> bool {
72
+ pub fn next_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
81
73
  dom_ctx.next_inline_like(*node_handle, parser)
82
74
  }
83
75
 
@@ -85,7 +77,7 @@ pub(crate) fn next_sibling_is_inline_tag(
85
77
  ///
86
78
  /// Avoids adding spaces before siblings that are already whitespace.
87
79
  #[allow(clippy::trivially_copy_pass_by_ref)]
88
- pub(crate) fn append_inline_suffix(
80
+ pub fn append_inline_suffix(
89
81
  output: &mut String,
90
82
  suffix: &str,
91
83
  has_core_content: bool,
@@ -163,6 +163,12 @@ pub struct InlineImage {
163
163
  pub attributes: BTreeMap<String, String>,
164
164
  }
165
165
 
166
+ impl std::fmt::Display for InlineImage {
167
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
168
+ write!(f, "{self:?}")
169
+ }
170
+ }
171
+
166
172
  /// Human-friendly warning emitted during inline image extraction.
167
173
  #[derive(Debug, Clone)]
168
174
  pub struct InlineImageWarning {
@@ -47,28 +47,29 @@
47
47
  // Module Declarations
48
48
  // ============================================================================
49
49
 
50
- pub mod converter;
51
50
  pub mod error;
52
- #[cfg(feature = "inline-images")]
53
- mod inline_images;
54
51
  #[cfg(feature = "metadata")]
55
52
  pub mod metadata;
56
53
  pub mod options;
57
- pub mod safety;
58
- pub mod text;
59
54
  pub mod types;
60
55
  #[cfg(feature = "visitor")]
61
56
  pub mod visitor;
62
- #[cfg(feature = "visitor")]
63
- pub mod visitor_helpers;
64
- pub mod wrapper;
65
57
 
66
58
  // Internal modules (not part of public API)
67
59
  mod convert_api;
60
+ #[allow(dead_code)]
61
+ pub(crate) mod converter;
68
62
  mod exports;
69
- pub mod prelude;
63
+ #[cfg(feature = "inline-images")]
64
+ mod inline_images;
65
+ pub(crate) mod prelude;
70
66
  mod rcdom;
67
+ pub(crate) mod text;
71
68
  mod validation;
69
+ #[cfg(feature = "visitor")]
70
+ #[allow(clippy::ref_option)]
71
+ pub(crate) mod visitor_helpers;
72
+ pub(crate) mod wrapper;
72
73
 
73
74
  // ============================================================================
74
75
  // Public Re-exports (from exports module)
@@ -79,6 +80,8 @@ pub use types::{
79
80
  AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent, ProcessingWarning,
80
81
  TableData, TableGrid, TextAnnotation, WarningKind,
81
82
  };
83
+ #[cfg(feature = "visitor")]
84
+ pub use visitor::{NodeContext, NodeType, VisitResult};
82
85
 
83
86
  // ============================================================================
84
87
  // Main Public API Functions
@@ -95,10 +98,6 @@ pub use convert_api::metadata_config_from_json;
95
98
  #[cfg(feature = "inline-images")]
96
99
  pub use convert_api::inline_image_config_from_json;
97
100
 
98
- #[cfg(feature = "visitor")]
99
- #[doc(hidden)]
100
- pub use convert_api::convert_with_visitor;
101
-
102
101
  // Tests
103
102
  // ============================================================================
104
103
 
@@ -109,27 +108,27 @@ mod basic_tests {
109
108
  #[test]
110
109
  fn test_binary_input_rejected() {
111
110
  let html = format!("abc{}def", "\0".repeat(20));
112
- let result = convert(&html, None);
111
+ let result = convert(&html, None, None);
113
112
  assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
114
113
  }
115
114
 
116
115
  #[test]
117
116
  fn test_binary_magic_rejected() {
118
117
  let html = "%PDF-1.7";
119
- let result = convert(html, None);
118
+ let result = convert(html, None, None);
120
119
  assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
121
120
  }
122
121
 
123
122
  #[test]
124
123
  fn test_utf16_hint_recovered() {
125
124
  let html = String::from_utf8_lossy(b"\xFF\xFE<\0h\0t\0m\0l\0>\0").to_string();
126
- let result = convert(&html, None);
125
+ let result = convert(&html, None, None);
127
126
  assert!(result.is_ok(), "UTF-16 input should be recovered instead of rejected");
128
127
  }
129
128
 
130
129
  #[test]
131
130
  fn test_plain_text_allowed() {
132
- let result = convert("Just text", None).unwrap();
131
+ let result = convert("Just text", None, None).unwrap();
133
132
  let content = result.content.unwrap_or_default();
134
133
  assert!(content.contains("Just text"));
135
134
  }
@@ -141,7 +140,7 @@ mod basic_tests {
141
140
  escape_underscores: true,
142
141
  ..ConversionOptions::default()
143
142
  };
144
- let result = convert("Text *asterisks* _underscores_", Some(options)).unwrap();
143
+ let result = convert("Text *asterisks* _underscores_", Some(options), None).unwrap();
145
144
  let content = result.content.unwrap_or_default();
146
145
  assert!(content.contains(r"\*asterisks\*"));
147
146
  assert!(content.contains(r"\_underscores\_"));
@@ -105,6 +105,21 @@ pub struct ConversionOptions {
105
105
  pub capture_svg: bool,
106
106
  /// Infer image dimensions from data.
107
107
  pub infer_dimensions: bool,
108
+ /// Maximum DOM traversal depth. `None` means unlimited.
109
+ /// When set, subtrees beyond this depth are silently truncated.
110
+ pub max_depth: Option<usize>,
111
+ /// CSS selectors for elements to exclude entirely (element + all content).
112
+ ///
113
+ /// Unlike `strip_tags` (which removes the tag wrapper but keeps children),
114
+ /// excluded elements and all their descendants are dropped from the output.
115
+ /// Supports any CSS selector that `tl` supports: tag names, `.class`,
116
+ /// `#id`, `[attribute]`, etc.
117
+ ///
118
+ /// Invalid selectors are silently skipped at conversion time.
119
+ ///
120
+ /// Example: `vec![".cookie-banner".into(), "#ad-container".into(), "[role='complementary']".into()]`
121
+ #[serde(default)]
122
+ pub exclude_selectors: Vec<String>,
108
123
  }
109
124
 
110
125
  impl Default for ConversionOptions {
@@ -148,6 +163,8 @@ impl Default for ConversionOptions {
148
163
  max_image_size: 5_242_880,
149
164
  capture_svg: false,
150
165
  infer_dimensions: true,
166
+ max_depth: None,
167
+ exclude_selectors: Vec::new(),
151
168
  }
152
169
  }
153
170
  }
@@ -255,6 +272,14 @@ impl ConversionOptionsBuilder {
255
272
  builder_setter!(max_image_size, u64);
256
273
  builder_setter!(capture_svg, bool);
257
274
  builder_setter!(infer_dimensions, bool);
275
+ builder_setter!(max_depth, Option<usize>);
276
+
277
+ /// Set the list of CSS selectors for elements to exclude entirely from output.
278
+ #[must_use]
279
+ pub fn exclude_selectors(mut self, selectors: Vec<String>) -> Self {
280
+ self.0.exclude_selectors = selectors;
281
+ self
282
+ }
258
283
 
259
284
  // Preprocessing
260
285
  /// Set the pre-processing options applied to the HTML before conversion.
@@ -368,6 +393,10 @@ pub struct ConversionOptionsUpdate {
368
393
  pub capture_svg: Option<bool>,
369
394
  /// Optional override for [`ConversionOptions::infer_dimensions`].
370
395
  pub infer_dimensions: Option<bool>,
396
+ /// Optional override for [`ConversionOptions::max_depth`].
397
+ pub max_depth: Option<Option<usize>>,
398
+ /// Optional override for [`ConversionOptions::exclude_selectors`].
399
+ pub exclude_selectors: Option<Vec<String>>,
371
400
  }
372
401
 
373
402
  impl ConversionOptions {
@@ -417,6 +446,8 @@ impl ConversionOptions {
417
446
  apply!(max_image_size);
418
447
  apply!(capture_svg);
419
448
  apply!(infer_dimensions);
449
+ apply!(max_depth);
450
+ apply!(exclude_selectors);
420
451
  if let Some(preprocessing) = update.preprocessing {
421
452
  self.preprocessing.apply_update(preprocessing);
422
453
  }
@@ -1,12 +1 @@
1
- //! Prelude module for convenient imports.
2
- //!
3
- //! Re-exports the most commonly used types and functions from the crate.
4
- //! Users can import everything they need with:
5
- //! ```
6
- //! use html_to_markdown_rs::prelude::*;
7
- //! ```
8
-
9
- pub use crate::convert;
10
- pub use crate::error::{ConversionError, Result};
11
- pub use crate::options::{ConversionOptions, HeadingStyle};
12
- pub use crate::types::ConversionResult;
1
+ //! Prelude module for convenient internal imports.
@@ -314,36 +314,6 @@ const fn is_unicode_space(ch: char) -> bool {
314
314
  )
315
315
  }
316
316
 
317
- /// Underline text with a character.
318
- #[must_use]
319
- pub fn underline(text: &str, pad_char: char) -> String {
320
- let text = text.trim_end();
321
- if text.is_empty() {
322
- return String::new();
323
- }
324
- format!("{}\n{}\n\n", text, pad_char.to_string().repeat(text.len()))
325
- }
326
-
327
- /// Indent text with a string prefix.
328
- #[must_use]
329
- pub fn indent(text: &str, level: usize, indent_str: &str) -> String {
330
- if text.is_empty() {
331
- return String::new();
332
- }
333
-
334
- let prefix = indent_str.repeat(level);
335
- text.lines()
336
- .map(|line| {
337
- if line.is_empty() {
338
- String::new()
339
- } else {
340
- format!("{prefix}{line}")
341
- }
342
- })
343
- .collect::<Vec<_>>()
344
- .join("\n")
345
- }
346
-
347
317
  #[cfg(test)]
348
318
  mod tests {
349
319
  use super::*;
@@ -385,18 +355,4 @@ mod tests {
385
355
  assert_eq!(chomp("text "), ("", " ", "text"));
386
356
  assert_eq!(chomp(""), ("", "", ""));
387
357
  }
388
-
389
- #[test]
390
- fn test_underline() {
391
- assert_eq!(underline("Title", '='), "Title\n=====\n\n");
392
- assert_eq!(underline("Subtitle", '-'), "Subtitle\n--------\n\n");
393
- assert_eq!(underline("", '='), "");
394
- }
395
-
396
- #[test]
397
- fn test_indent() {
398
- assert_eq!(indent("line1\nline2", 1, "\t"), "\tline1\n\tline2");
399
- assert_eq!(indent("text", 2, " "), " text");
400
- assert_eq!(indent("", 1, "\t"), "");
401
- }
402
358
  }
@@ -25,4 +25,6 @@ pub enum WarningKind {
25
25
  MalformedHtml,
26
26
  /// Sanitization was applied to remove potentially unsafe content.
27
27
  SanitizationApplied,
28
+ /// DOM traversal was truncated because max_depth was exceeded.
29
+ DepthLimitExceeded,
28
30
  }
@@ -12,6 +12,7 @@ use std::collections::BTreeMap;
12
12
  /// This enum categorizes all HTML elements that the converter recognizes,
13
13
  /// providing a coarse-grained classification for visitor dispatch.
14
14
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
15
+ #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
15
16
  pub enum NodeType {
16
17
  /// Text node (most frequent - 100+ per document)
17
18
  Text,
@@ -207,6 +208,7 @@ pub enum NodeType {
207
208
  /// Provides comprehensive metadata about the current node being visited,
208
209
  /// including its type, attributes, position in the DOM tree, and parent context.
209
210
  #[derive(Debug, Clone)]
211
+ #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
210
212
  pub struct NodeContext {
211
213
  /// Coarse-grained node type classification
212
214
  pub node_type: NodeType,
@@ -235,8 +237,10 @@ pub struct NodeContext {
235
237
  /// Allows visitors to control the conversion flow by either proceeding
236
238
  /// with default behavior, providing custom output, skipping elements,
237
239
  /// preserving HTML, or signaling errors.
238
- #[derive(Debug, Clone)]
240
+ #[derive(Debug, Clone, Default)]
241
+ #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
239
242
  pub enum VisitResult {
243
+ #[default]
240
244
  /// Continue with default conversion behavior
241
245
  Continue,
242
246
 
@@ -296,7 +296,10 @@ macro_rules! try_visitor {
296
296
  return Ok(String::new());
297
297
  }
298
298
  $crate::visitor_helpers::VisitorDispatch::PreserveHtml => {
299
- // TODO: Implement HTML preservation logic
299
+ // Falls through to default conversion — full HTML preservation requires
300
+ // the node handle and parser context which aren't available in this macro.
301
+ // Callers that need PreserveHtml support should match on the dispatch
302
+ // result directly and call serialize_tag_to_html.
300
303
  }
301
304
  }
302
305
  }};
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use html_to_markdown_rs::ConversionOptions;
@@ -293,5 +293,5 @@ fn convert(
293
293
  html: &str,
294
294
  opts: Option<html_to_markdown_rs::ConversionOptions>,
295
295
  ) -> html_to_markdown_rs::error::Result<String> {
296
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
296
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
297
297
  }
@@ -3,7 +3,7 @@ fn convert(
3
3
  html: &str,
4
4
  opts: Option<html_to_markdown_rs::ConversionOptions>,
5
5
  ) -> html_to_markdown_rs::error::Result<String> {
6
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
6
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
7
7
  }
8
8
 
9
9
  use html_to_markdown_rs::{ConversionOptions, OutputFormat};
@@ -0,0 +1,136 @@
1
+ #![allow(missing_docs)]
2
+
3
+ use html_to_markdown_rs::ConversionOptions;
4
+
5
+ fn convert(html: &str, opts: Option<ConversionOptions>) -> html_to_markdown_rs::error::Result<String> {
6
+ #[cfg(feature = "visitor")]
7
+ let result = html_to_markdown_rs::convert(html, opts, None);
8
+ #[cfg(not(feature = "visitor"))]
9
+ let result = html_to_markdown_rs::convert(html, opts);
10
+ result.map(|r| r.content.unwrap_or_default())
11
+ }
12
+
13
+ #[test]
14
+ fn test_exclude_selectors_drops_matching_elements() {
15
+ let html = r#"<body>
16
+ <div class="cookie-banner">Accept cookies</div>
17
+ <article><p>Main content here.</p></article>
18
+ <div id="ad-container">Buy stuff</div>
19
+ </body>"#;
20
+
21
+ let options = ConversionOptions {
22
+ exclude_selectors: vec![".cookie-banner".to_string(), "#ad-container".to_string()],
23
+ ..Default::default()
24
+ };
25
+
26
+ let result = convert(html, Some(options)).unwrap();
27
+
28
+ assert!(result.contains("Main content"), "Should keep main content");
29
+ assert!(!result.contains("cookie"), "Should drop .cookie-banner element");
30
+ assert!(!result.contains("Buy stuff"), "Should drop #ad-container element");
31
+ }
32
+
33
+ #[test]
34
+ fn test_exclude_selectors_drops_nested_content() {
35
+ let html = r#"<body>
36
+ <aside class="sidebar">
37
+ <h2>Related articles</h2>
38
+ <p>Some sidebar content</p>
39
+ </aside>
40
+ <main><p>Primary content.</p></main>
41
+ </body>"#;
42
+
43
+ let options = ConversionOptions {
44
+ exclude_selectors: vec![".sidebar".to_string()],
45
+ ..Default::default()
46
+ };
47
+
48
+ let result = convert(html, Some(options)).unwrap();
49
+
50
+ assert!(result.contains("Primary content"), "Should keep main content");
51
+ assert!(
52
+ !result.contains("Related articles"),
53
+ "Should drop heading inside excluded element"
54
+ );
55
+ assert!(
56
+ !result.contains("sidebar content"),
57
+ "Should drop paragraph inside excluded element"
58
+ );
59
+ }
60
+
61
+ #[test]
62
+ fn test_exclude_selectors_empty_list_is_noop() {
63
+ let html = r"<body><p>Hello world</p></body>";
64
+
65
+ let options = ConversionOptions {
66
+ exclude_selectors: vec![],
67
+ ..Default::default()
68
+ };
69
+
70
+ let result = convert(html, Some(options)).unwrap();
71
+ assert!(
72
+ result.contains("Hello world"),
73
+ "Empty exclude_selectors should not affect output"
74
+ );
75
+ }
76
+
77
+ #[test]
78
+ fn test_exclude_selectors_invalid_selector_is_skipped() {
79
+ let html = r"<body><p>Visible text</p></body>";
80
+
81
+ // An empty string or garbled selector should not panic or error — just be ignored.
82
+ let options = ConversionOptions {
83
+ exclude_selectors: vec![String::new(), "p".to_string()],
84
+ ..Default::default()
85
+ };
86
+
87
+ // Should not return an error; whether the paragraph is excluded depends on the
88
+ // selector, but it must not panic.
89
+ let _ = convert(html, Some(options));
90
+ }
91
+
92
+ #[test]
93
+ fn test_exclude_selectors_attribute_selector() {
94
+ let html = r#"<body>
95
+ <div role="complementary">Sidebar</div>
96
+ <p>Main text</p>
97
+ </body>"#;
98
+
99
+ let options = ConversionOptions {
100
+ exclude_selectors: vec!["[role='complementary']".to_string()],
101
+ ..Default::default()
102
+ };
103
+
104
+ let result = convert(html, Some(options)).unwrap();
105
+
106
+ assert!(result.contains("Main text"), "Should keep non-excluded content");
107
+ assert!(
108
+ !result.contains("Sidebar"),
109
+ "Should drop element matching attribute selector"
110
+ );
111
+ }
112
+
113
+ #[test]
114
+ fn test_exclude_selectors_plain_text_output() {
115
+ let html = r#"<body>
116
+ <div class="nav">Navigation links</div>
117
+ <p>Article body text.</p>
118
+ </body>"#;
119
+
120
+ let options = ConversionOptions {
121
+ exclude_selectors: vec![".nav".to_string()],
122
+ output_format: html_to_markdown_rs::OutputFormat::Plain,
123
+ ..Default::default()
124
+ };
125
+
126
+ let result = convert(html, Some(options)).unwrap();
127
+
128
+ assert!(
129
+ result.contains("Article body text"),
130
+ "Should keep body text in plain output"
131
+ );
132
+ assert!(
133
+ !result.contains("Navigation links"),
134
+ "Should drop excluded element in plain output"
135
+ );
136
+ }
@@ -619,5 +619,5 @@ fn convert(
619
619
  html: &str,
620
620
  opts: Option<html_to_markdown_rs::ConversionOptions>,
621
621
  ) -> html_to_markdown_rs::error::Result<String> {
622
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
622
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
623
623
  }
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use std::fs;
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use std::fs;
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  #[test]
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use html_to_markdown_rs::ConversionOptions;
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use std::fs;
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use html_to_markdown_rs::ConversionOptions;
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use std::fs;
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use std::fs;
@@ -129,5 +129,5 @@ fn convert(
129
129
  html: &str,
130
130
  opts: Option<html_to_markdown_rs::ConversionOptions>,
131
131
  ) -> html_to_markdown_rs::error::Result<String> {
132
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
132
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
133
133
  }
@@ -140,5 +140,5 @@ fn convert(
140
140
  html: &str,
141
141
  opts: Option<html_to_markdown_rs::ConversionOptions>,
142
142
  ) -> html_to_markdown_rs::error::Result<String> {
143
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
143
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
144
144
  }
@@ -8,7 +8,7 @@ fn test_strong_blockquote_strong_newlines() {
8
8
  html: &str,
9
9
  opts: Option<html_to_markdown_rs::ConversionOptions>,
10
10
  ) -> html_to_markdown_rs::error::Result<String> {
11
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
11
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
12
12
  }
13
13
 
14
14
  // Test case from issue #176: strong + blockquote + strong
@@ -39,7 +39,7 @@ fn test_paragraph_blockquote_paragraph_newlines() {
39
39
  html: &str,
40
40
  opts: Option<html_to_markdown_rs::ConversionOptions>,
41
41
  ) -> html_to_markdown_rs::error::Result<String> {
42
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
42
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
43
43
  }
44
44
 
45
45
  // Control test: p + blockquote + p should work correctly
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  use std::fs;
@@ -4,7 +4,7 @@ fn convert(
4
4
  html: &str,
5
5
  opts: Option<html_to_markdown_rs::ConversionOptions>,
6
6
  ) -> html_to_markdown_rs::error::Result<String> {
7
- html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
7
+ html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
8
8
  }
9
9
 
10
10
  #[test]