html-to-markdown 3.2.4 → 3.4.0.pre.rc.13

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. checksums.yaml +4 -4
  2. data/Steepfile +6 -0
  3. data/ext/html_to_markdown_rb/Cargo.toml +2 -2
  4. data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
  5. data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
  6. data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
  7. data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
  8. data/lib/bin/html-to-markdown +0 -0
  9. data/lib/html_to_markdown/version.rb +1 -1
  10. data/lib/html_to_markdown.rb +5 -3
  11. data/sig/types.rbs +769 -0
  12. data/vendor/Cargo.toml +2 -2
  13. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  14. data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
  15. data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
  16. data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
  17. data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
  18. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
  19. data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
  20. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
  21. data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
  22. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
  23. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
  25. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
  26. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
  27. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
  29. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
  30. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
  32. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
  38. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
  44. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
  45. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
  46. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
  47. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
  48. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
  49. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
  50. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
  52. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
  53. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
  54. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
  56. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
  57. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
  58. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
  59. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
  60. data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
  61. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
  62. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
  63. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  65. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
  66. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
  67. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
  68. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
  69. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
  70. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
  71. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
  72. data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
  73. data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
  74. data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
  75. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
  76. data/vendor/html-to-markdown-rs/src/text.rs +0 -44
  77. data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
  78. data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
  79. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
  80. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
  81. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
  82. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
  83. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
  84. data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
  85. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
  86. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
  87. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
  88. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
  89. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
  90. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
  91. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
  92. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
  93. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
  94. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
  95. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
  96. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
  97. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
  98. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
  99. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
  100. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
  101. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
  102. data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
  103. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
  104. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
  105. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
  106. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
  107. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
  108. data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
  109. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
  110. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
  111. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
  112. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
  113. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
  114. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
  115. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
  116. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
  117. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
  118. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
  119. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
  120. metadata +21 -43
  121. data/.bundle/config +0 -2
  122. data/.gitignore +0 -3
  123. data/.rubocop.yml +0 -59
  124. data/Gemfile +0 -18
  125. data/Gemfile.lock +0 -173
  126. data/README.md +0 -331
  127. data/Rakefile +0 -26
  128. data/exe/html-to-markdown +0 -6
  129. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
  130. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
  131. data/html-to-markdown-rb.gemspec +0 -99
  132. data/lib/html_to_markdown_rs.rb +0 -3
  133. data/sig/html_to_markdown.rbs +0 -149
  134. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
  135. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
  136. data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
@@ -27,7 +27,7 @@ use std::borrow::Cow;
27
27
  /// - **Inline mode**: Children are processed inline without block spacing
28
28
  /// - **Block mode**: Content is collected, trimmed, and wrapped with blank lines
29
29
  /// - **Empty content**: Skipped entirely
30
- pub(crate) fn handle_form(
30
+ pub fn handle_form(
31
31
  _tag_name: &str,
32
32
  node_handle: &tl::NodeHandle,
33
33
  parser: &tl::Parser,
@@ -82,7 +82,7 @@ pub(crate) fn handle_form(
82
82
  /// - **Inline mode**: Children are processed inline without block spacing
83
83
  /// - **Block mode**: Content is collected, trimmed, and wrapped with blank lines
84
84
  /// - **Empty content**: Skipped entirely
85
- pub(crate) fn handle_fieldset(
85
+ pub fn handle_fieldset(
86
86
  _tag_name: &str,
87
87
  node_handle: &tl::NodeHandle,
88
88
  parser: &tl::Parser,
@@ -137,7 +137,7 @@ pub(crate) fn handle_fieldset(
137
137
  /// - **Block mode**: Content is wrapped in strong markers (e.g., `**text**`)
138
138
  /// - **Inline mode**: Content is rendered without emphasis
139
139
  /// - Uses the configured strong/emphasis symbol from ConversionOptions
140
- pub(crate) fn handle_legend(
140
+ pub fn handle_legend(
141
141
  _tag_name: &str,
142
142
  node_handle: &tl::NodeHandle,
143
143
  parser: &tl::Parser,
@@ -198,7 +198,7 @@ pub(crate) fn handle_legend(
198
198
  /// - Content is collected from children
199
199
  /// - Non-empty content is output followed by blank lines (in block mode)
200
200
  /// - Blank lines are suppressed in inline mode
201
- pub(crate) fn handle_label(
201
+ pub fn handle_label(
202
202
  _tag_name: &str,
203
203
  node_handle: &tl::NodeHandle,
204
204
  parser: &tl::Parser,
@@ -231,7 +231,7 @@ pub(crate) fn handle_label(
231
231
  ///
232
232
  /// An input element represents a form control for user input. Since input
233
233
  /// elements typically have no text content, this handler produces no output.
234
- pub(crate) fn handle_input(
234
+ pub fn handle_input(
235
235
  _tag_name: &str,
236
236
  _node_handle: &tl::NodeHandle,
237
237
  _parser: &tl::Parser,
@@ -253,7 +253,7 @@ pub(crate) fn handle_input(
253
253
  ///
254
254
  /// - Content is collected from children
255
255
  /// - Blank lines are added after content in block mode only
256
- pub(crate) fn handle_textarea(
256
+ pub fn handle_textarea(
257
257
  _tag_name: &str,
258
258
  node_handle: &tl::NodeHandle,
259
259
  parser: &tl::Parser,
@@ -287,7 +287,7 @@ pub(crate) fn handle_textarea(
287
287
  ///
288
288
  /// - Content (options) is collected from children
289
289
  /// - A single newline is added after the select in block mode
290
- pub(crate) fn handle_select(
290
+ pub fn handle_select(
291
291
  _tag_name: &str,
292
292
  node_handle: &tl::NodeHandle,
293
293
  parser: &tl::Parser,
@@ -322,7 +322,7 @@ pub(crate) fn handle_select(
322
322
  /// - Content is collected from children
323
323
  /// - If the option has the `selected` attribute, it's prefixed with `* ` in block mode
324
324
  /// - A newline is added after each option in block mode
325
- pub(crate) fn handle_option(
325
+ pub fn handle_option(
326
326
  _tag_name: &str,
327
327
  node_handle: &tl::NodeHandle,
328
328
  parser: &tl::Parser,
@@ -365,7 +365,7 @@ pub(crate) fn handle_option(
365
365
  ///
366
366
  /// - The `label` attribute is output as strong text (if present)
367
367
  /// - Options within the group are rendered normally
368
- pub(crate) fn handle_optgroup(
368
+ pub fn handle_optgroup(
369
369
  _tag_name: &str,
370
370
  node_handle: &tl::NodeHandle,
371
371
  parser: &tl::Parser,
@@ -410,7 +410,7 @@ pub(crate) fn handle_optgroup(
410
410
  ///
411
411
  /// - Content is collected from children
412
412
  /// - Blank lines are added after content in block mode only
413
- pub(crate) fn handle_button(
413
+ pub fn handle_button(
414
414
  _tag_name: &str,
415
415
  node_handle: &tl::NodeHandle,
416
416
  parser: &tl::Parser,
@@ -444,7 +444,7 @@ pub(crate) fn handle_button(
444
444
  ///
445
445
  /// - Content is collected from children (usually empty)
446
446
  /// - Blank lines are added after content in block mode only
447
- pub(crate) fn handle_progress(
447
+ pub fn handle_progress(
448
448
  _tag_name: &str,
449
449
  node_handle: &tl::NodeHandle,
450
450
  parser: &tl::Parser,
@@ -478,7 +478,7 @@ pub(crate) fn handle_progress(
478
478
  ///
479
479
  /// - Content is collected from children (usually empty)
480
480
  /// - Blank lines are added after content in block mode only
481
- pub(crate) fn handle_meter(
481
+ pub fn handle_meter(
482
482
  _tag_name: &str,
483
483
  node_handle: &tl::NodeHandle,
484
484
  parser: &tl::Parser,
@@ -512,7 +512,7 @@ pub(crate) fn handle_meter(
512
512
  ///
513
513
  /// - Content is collected from children
514
514
  /// - Blank lines are added after content in block mode only
515
- pub(crate) fn handle_output(
515
+ pub fn handle_output(
516
516
  _tag_name: &str,
517
517
  node_handle: &tl::NodeHandle,
518
518
  parser: &tl::Parser,
@@ -546,7 +546,7 @@ pub(crate) fn handle_output(
546
546
  ///
547
547
  /// - Content (options) is collected from children
548
548
  /// - A single newline is added after the datalist in block mode
549
- pub(crate) fn handle_datalist(
549
+ pub fn handle_datalist(
550
550
  _tag_name: &str,
551
551
  node_handle: &tl::NodeHandle,
552
552
  parser: &tl::Parser,
@@ -25,7 +25,7 @@
25
25
  pub mod elements;
26
26
 
27
27
  // Re-export types from parent module for submodule access
28
- pub(crate) use super::walk_node;
28
+ pub use super::walk_node;
29
29
  pub use super::{Context, DomContext};
30
30
 
31
31
  // Re-export handler function for direct use
@@ -6,9 +6,6 @@
6
6
  mod djot;
7
7
  mod markdown;
8
8
 
9
- pub use djot::DjotRenderer;
10
- pub use markdown::MarkdownRenderer;
11
-
12
9
  /// Trait for format-specific rendering of inline elements.
13
10
  ///
14
11
  /// Implementations provide the syntax for emphasis, strong, strikethrough, etc.
@@ -35,7 +35,7 @@ type DomContext = crate::converter::DomContext;
35
35
  /// # Note
36
36
  /// This function references helper functions and `walk_node` from converter.rs
37
37
  /// which must be accessible (pub(crate)) for this module to work correctly.
38
- pub(crate) fn handle(
38
+ pub fn handle(
39
39
  tag_name: &str,
40
40
  node_handle: &NodeHandle,
41
41
  parser: &Parser,
@@ -32,7 +32,7 @@ type DomContext = crate::converter::DomContext;
32
32
  /// # Note
33
33
  /// This function references helper functions and `walk_node` from converter.rs
34
34
  /// which must be accessible (pub(crate)) for this module to work correctly.
35
- pub(crate) fn handle(
35
+ pub fn handle(
36
36
  tag_name: &str,
37
37
  node_handle: &NodeHandle,
38
38
  parser: &Parser,
@@ -44,7 +44,7 @@ type DomContext = crate::converter::DomContext;
44
44
  /// # Note
45
45
  /// This function references helper functions from converter.rs
46
46
  /// which must be accessible (pub(crate)) for this module to work correctly.
47
- pub(crate) fn handle(
47
+ pub fn handle(
48
48
  node_handle: &NodeHandle,
49
49
  parser: &Parser,
50
50
  output: &mut String,
@@ -360,7 +360,7 @@ pub(crate) fn handle(
360
360
  /// * `title` - Optional link title attribute
361
361
  /// * `raw_text` - Original unprocessed text (for default_title option)
362
362
  /// * `options` - Conversion options
363
- pub(crate) fn append_markdown_link(
363
+ pub fn append_markdown_link(
364
364
  output: &mut String,
365
365
  label: &str,
366
366
  href: &str,
@@ -31,7 +31,6 @@ pub mod ruby;
31
31
  pub mod semantic;
32
32
 
33
33
  // Re-export types from parent module for submodule access
34
- pub use super::{Context, DomContext};
35
34
 
36
35
  // Re-export handler functions for internal use by dispatcher (crate-private)
37
36
  // pub(crate) use ruby::handle as handle_ruby;
@@ -45,7 +45,7 @@ type DomContext = crate::converter::DomContext;
45
45
  /// # Note
46
46
  /// This function references `walk_node` and `normalized_tag_name` from converter.rs,
47
47
  /// which must be accessible (pub(crate)) for this module to work correctly.
48
- pub(crate) fn handle(
48
+ pub fn handle(
49
49
  tag_name: &str,
50
50
  node_handle: &NodeHandle,
51
51
  parser: &Parser,
@@ -37,7 +37,7 @@ type DomContext = crate::converter::DomContext;
37
37
  /// # Note
38
38
  /// This function references helper functions and `walk_node` from converter.rs
39
39
  /// which must be accessible (pub(crate)) for this module to work correctly.
40
- pub(crate) fn handle(
40
+ pub fn handle(
41
41
  tag_name: &str,
42
42
  node_handle: &NodeHandle,
43
43
  parser: &Parser,
@@ -15,7 +15,7 @@ type DomContext = crate::converter::DomContext;
15
15
  /// Handle definition list element (<dl>).
16
16
  ///
17
17
  /// Groups dt/dd pairs and formats them with proper Markdown separation.
18
- pub(crate) fn handle_dl(
18
+ pub fn handle_dl(
19
19
  node_handle: &tl::NodeHandle,
20
20
  parser: &tl::Parser,
21
21
  output: &mut String,
@@ -61,7 +61,7 @@ pub(crate) fn handle_dl(
61
61
  /// Handle definition term element (<dt>).
62
62
  ///
63
63
  /// Outputs the term text followed by a newline.
64
- pub(crate) fn handle_dt(
64
+ pub fn handle_dt(
65
65
  node_handle: &tl::NodeHandle,
66
66
  parser: &tl::Parser,
67
67
  output: &mut String,
@@ -96,7 +96,7 @@ pub(crate) fn handle_dt(
96
96
  /// Handle definition description element (<dd>).
97
97
  ///
98
98
  /// Outputs the description as a plain block.
99
- pub(crate) fn handle_dd(
99
+ pub fn handle_dd(
100
100
  node_handle: &tl::NodeHandle,
101
101
  parser: &tl::Parser,
102
102
  output: &mut String,
@@ -24,7 +24,7 @@ type DomContext = crate::converter::DomContext;
24
24
  /// Processes list item content with support for task lists (checkboxes),
25
25
  /// proper indentation, and block-level element detection.
26
26
  #[allow(clippy::too_many_arguments)]
27
- pub(crate) fn handle_li(
27
+ pub fn handle_li(
28
28
  node_handle: &tl::NodeHandle,
29
29
  tag: &tl::HTMLTag,
30
30
  parser: &tl::Parser,
@@ -14,7 +14,6 @@ pub mod unordered;
14
14
  pub mod utils;
15
15
 
16
16
  // Re-export types from parent module for submodule access
17
- pub use super::{Context, DomContext};
18
17
 
19
18
  // Re-export utility function needed by table builder
20
19
 
@@ -26,7 +26,7 @@ type DomContext = crate::converter::DomContext;
26
26
  /// Extracts the `start` attribute to set initial counter value,
27
27
  /// detects loose/tight list format, and processes list items.
28
28
  #[allow(clippy::too_many_arguments)]
29
- pub(crate) fn handle_ol(
29
+ pub fn handle_ol(
30
30
  node_handle: &tl::NodeHandle,
31
31
  parser: &tl::Parser,
32
32
  output: &mut String,
@@ -198,4 +198,4 @@ pub(crate) fn handle_ol(
198
198
  }
199
199
 
200
200
  /// Public alias for `handle_ol` to match the expected module interface.
201
- pub(crate) use handle_ol as handle;
201
+ pub use handle_ol as handle;
@@ -26,7 +26,7 @@ type DomContext = crate::converter::DomContext;
26
26
  /// Detects loose/tight list format, handles nested bullets,
27
27
  /// and processes list items with proper indentation.
28
28
  #[allow(clippy::too_many_arguments)]
29
- pub(crate) fn handle_ul(
29
+ pub fn handle_ul(
30
30
  node_handle: &tl::NodeHandle,
31
31
  parser: &tl::Parser,
32
32
  output: &mut String,
@@ -192,4 +192,4 @@ pub(crate) fn handle_ul(
192
192
  }
193
193
 
194
194
  /// Public alias for `handle_ul` to match the expected module interface.
195
- pub(crate) use handle_ul as handle;
195
+ pub use handle_ul as handle;
@@ -11,7 +11,7 @@
11
11
  )]
12
12
 
13
13
  use std::borrow::Cow;
14
- use std::collections::BTreeMap;
14
+ use std::collections::{BTreeMap, HashSet};
15
15
 
16
16
  use crate::converter::dom_context::DomContext;
17
17
  use crate::converter::main_helpers::{
@@ -33,26 +33,6 @@ use crate::options::ConversionOptions;
33
33
  use crate::converter::context::{Context, InlineCollectorHandle};
34
34
  use crate::types::structure_collector::StructureCollectorHandle;
35
35
 
36
- /// Converts HTML to Markdown using the provided conversion options.
37
- ///
38
- /// This is the main entry point for HTML to Markdown conversion.
39
- pub fn convert_html(html: &str, options: &ConversionOptions) -> Result<String> {
40
- convert_html_impl(html, options, None, None, None, None).map(|(md, _)| md)
41
- }
42
-
43
- /// Converts HTML to Markdown with a custom visitor for callbacks during traversal.
44
- ///
45
- /// This variant allows passing a visitor that will receive callbacks for each node
46
- /// during the tree walk, enabling custom processing or analysis.
47
- #[cfg(feature = "visitor")]
48
- pub fn convert_html_with_visitor(
49
- html: &str,
50
- options: &ConversionOptions,
51
- visitor: Option<crate::visitor::VisitorHandle>,
52
- ) -> Result<String> {
53
- convert_html_impl(html, options, None, None, visitor, None).map(|(md, _)| md)
54
- }
55
-
56
36
  /// Internal implementation of HTML to Markdown conversion.
57
37
  ///
58
38
  /// Returns `(markdown, Option<DocumentStructure>)`. The structure is populated when
@@ -62,7 +42,7 @@ pub fn convert_html_with_visitor(
62
42
  allow(unused_variables)
63
43
  )]
64
44
  #[allow(clippy::too_many_lines)]
65
- pub(crate) fn convert_html_impl(
45
+ pub fn convert_html_impl(
66
46
  html: &str,
67
47
  options: &ConversionOptions,
68
48
  inline_collector: Option<InlineCollectorHandle>,
@@ -82,7 +62,9 @@ pub(crate) fn convert_html_impl(
82
62
 
83
63
  if has_custom_element_tags(&preprocessed) {
84
64
  if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
85
- let repaired = preprocess_html(&repaired_html).into_owned();
65
+ let stripped = strip_script_and_style_tags(&repaired_html);
66
+ let stripped = strip_hidden_elements(&stripped);
67
+ let repaired = preprocess_html(&stripped).into_owned();
86
68
  preprocessed = repaired;
87
69
  preprocessed_len = preprocessed.len();
88
70
  }
@@ -93,7 +75,9 @@ pub(crate) fn convert_html_impl(
93
75
  break dom;
94
76
  }
95
77
  if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
96
- preprocessed = preprocess_html(&repaired_html).into_owned();
78
+ let stripped = strip_script_and_style_tags(&repaired_html);
79
+ let stripped = strip_hidden_elements(&stripped);
80
+ preprocessed = preprocess_html(&stripped).into_owned();
97
81
  preprocessed_len = preprocessed.len();
98
82
  continue;
99
83
  }
@@ -111,7 +95,9 @@ pub(crate) fn convert_html_impl(
111
95
  if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
112
96
  // Drop dom to release borrow on preprocessed
113
97
  drop(dom);
114
- preprocessed = preprocess_html(&repaired_html).into_owned();
98
+ let stripped = strip_script_and_style_tags(&repaired_html);
99
+ let stripped = strip_hidden_elements(&stripped);
100
+ preprocessed = preprocess_html(&stripped).into_owned();
115
101
  preprocessed_len = preprocessed.len();
116
102
  // Re-parse with repaired HTML
117
103
  dom = tl::parse(&preprocessed, parser_options)
@@ -205,7 +191,7 @@ pub(crate) fn convert_html_impl(
205
191
  };
206
192
 
207
193
  #[cfg(all(feature = "metadata", feature = "visitor"))]
208
- let ctx = Context::new(
194
+ let mut ctx = Context::new(
209
195
  options,
210
196
  inline_collector,
211
197
  metadata_collector,
@@ -214,7 +200,7 @@ pub(crate) fn convert_html_impl(
214
200
  reference_collector.as_ref().map(std::rc::Rc::clone),
215
201
  );
216
202
  #[cfg(all(feature = "metadata", not(feature = "visitor")))]
217
- let ctx = Context::new(
203
+ let mut ctx = Context::new(
218
204
  options,
219
205
  inline_collector,
220
206
  metadata_collector,
@@ -223,7 +209,7 @@ pub(crate) fn convert_html_impl(
223
209
  reference_collector.as_ref().map(std::rc::Rc::clone),
224
210
  );
225
211
  #[cfg(all(not(feature = "metadata"), feature = "visitor"))]
226
- let ctx = Context::new(
212
+ let mut ctx = Context::new(
227
213
  options,
228
214
  inline_collector,
229
215
  _metadata_collector,
@@ -232,7 +218,7 @@ pub(crate) fn convert_html_impl(
232
218
  reference_collector.as_ref().map(std::rc::Rc::clone),
233
219
  );
234
220
  #[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
235
- let ctx = Context::new(
221
+ let mut ctx = Context::new(
236
222
  options,
237
223
  inline_collector,
238
224
  _metadata_collector,
@@ -241,6 +227,20 @@ pub(crate) fn convert_html_impl(
241
227
  reference_collector.as_ref().map(std::rc::Rc::clone),
242
228
  );
243
229
 
230
+ // Pre-compute node IDs matching exclude_selectors so walk_node can skip them in O(1).
231
+ // Invalid or unsupported selectors are silently skipped.
232
+ if !options.exclude_selectors.is_empty() {
233
+ let mut excluded: HashSet<u32> = HashSet::new();
234
+ for selector in &options.exclude_selectors {
235
+ if let Some(iter) = dom.query_selector(selector) {
236
+ for handle in iter {
237
+ excluded.insert(handle.get_inner());
238
+ }
239
+ }
240
+ }
241
+ ctx.set_excluded_node_ids(excluded);
242
+ }
243
+
244
244
  for child_handle in dom.children() {
245
245
  walk_node(child_handle, parser, &mut output, options, &ctx, 0, &dom_ctx);
246
246
  }
@@ -297,7 +297,7 @@ pub(crate) fn convert_html_impl(
297
297
  #[allow(clippy::only_used_in_recursion)]
298
298
  #[allow(clippy::trivially_copy_pass_by_ref)]
299
299
  #[allow(clippy::cast_possible_truncation)]
300
- pub(crate) fn walk_node(
300
+ pub fn walk_node(
301
301
  node_handle: &tl::NodeHandle,
302
302
  parser: &tl::Parser,
303
303
  output: &mut String,
@@ -308,6 +308,12 @@ pub(crate) fn walk_node(
308
308
  ) {
309
309
  let Some(node) = node_handle.get(parser) else { return };
310
310
 
311
+ if let Some(max) = options.max_depth {
312
+ if depth >= max {
313
+ return;
314
+ }
315
+ }
316
+
311
317
  match node {
312
318
  tl::Node::Raw(bytes) => {
313
319
  let raw = bytes.as_utf8_str();
@@ -353,7 +359,13 @@ pub(crate) fn walk_node(
353
359
  }
354
360
  }
355
361
 
356
- if should_drop_for_preprocessing(node_handle, tag_name.as_ref(), tag, parser, dom_ctx, options) {
362
+ if should_drop_for_preprocessing(tag_name.as_ref(), tag, options) {
363
+ trim_trailing_whitespace(output);
364
+ return;
365
+ }
366
+
367
+ // Drop elements matching exclude_selectors, including all their descendants.
368
+ if !ctx.excluded_node_ids.is_empty() && ctx.excluded_node_ids.contains(&node_handle.get_inner()) {
357
369
  trim_trailing_whitespace(output);
358
370
  return;
359
371
  }
@@ -520,6 +532,20 @@ pub(crate) fn walk_node(
520
532
  );
521
533
  }
522
534
 
535
+ // Sectioning elements routed to semantic dispatcher
536
+ "article" | "section" | "nav" | "aside" | "header" | "footer" | "main" => {
537
+ crate::converter::semantic::dispatch_semantic_handler(
538
+ &tag_name,
539
+ node_handle,
540
+ parser,
541
+ output,
542
+ options,
543
+ ctx,
544
+ depth,
545
+ dom_ctx,
546
+ );
547
+ }
548
+
523
549
  // Quote element routed to semantic dispatcher
524
550
  "q" => {
525
551
  crate::converter::semantic::dispatch_semantic_handler(
@@ -16,7 +16,7 @@ use crate::converter::main_helpers::tag_name_eq;
16
16
  use crate::options::ConversionOptions;
17
17
 
18
18
  /// Extract src attribute from media element (audio, video, iframe).
19
- pub(crate) fn extract_media_src<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
19
+ pub fn extract_media_src<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
20
20
  tag.attributes()
21
21
  .get("src")
22
22
  .flatten()
@@ -28,7 +28,7 @@ pub(crate) fn extract_media_src<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
28
28
  ///
29
29
  /// Used by audio and video elements to extract src from child <source> elements
30
30
  /// when the parent doesn't have a src attribute.
31
- pub(crate) fn find_source_src<'a, T>(children: T, parser: &'a Parser) -> Option<Cow<'a, str>>
31
+ pub fn find_source_src<'a, T>(children: T, parser: &'a Parser) -> Option<Cow<'a, str>>
32
32
  where
33
33
  T: IntoIterator<Item = &'a NodeHandle>,
34
34
  {
@@ -43,14 +43,14 @@ where
43
43
  }
44
44
 
45
45
  /// Check if tag is a source element.
46
- pub(crate) fn is_source_element(tag: &HTMLTag) -> bool {
46
+ pub fn is_source_element(tag: &HTMLTag) -> bool {
47
47
  tag_name_eq(tag.name().as_utf8_str(), "source")
48
48
  }
49
49
 
50
50
  /// Determine if media should output source link in markdown.
51
51
  ///
52
52
  /// Returns true if src is non-empty.
53
- pub(crate) fn should_output_media_link(src: &str) -> bool {
53
+ pub fn should_output_media_link(src: &str) -> bool {
54
54
  !src.is_empty()
55
55
  }
56
56
 
@@ -58,7 +58,7 @@ pub(crate) fn should_output_media_link(src: &str) -> bool {
58
58
  ///
59
59
  /// Extracts src from audio tag or nested source elements, outputs as a link,
60
60
  /// and processes fallback content (e.g., browser compatibility text).
61
- pub(crate) fn handle_audio(
61
+ pub fn handle_audio(
62
62
  _node_handle: &NodeHandle,
63
63
  tag: &HTMLTag,
64
64
  parser: &Parser,
@@ -121,7 +121,7 @@ pub(crate) fn handle_audio(
121
121
  ///
122
122
  /// Extracts src from video tag or nested source elements, outputs as a link,
123
123
  /// and processes fallback content (e.g., browser compatibility text).
124
- pub(crate) fn handle_video(
124
+ pub fn handle_video(
125
125
  _node_handle: &NodeHandle,
126
126
  tag: &HTMLTag,
127
127
  parser: &Parser,
@@ -183,7 +183,7 @@ pub(crate) fn handle_video(
183
183
  /// Handle picture element conversion to Markdown.
184
184
  ///
185
185
  /// Finds and processes the first child img element, skipping source elements.
186
- pub(crate) fn handle_picture(
186
+ pub fn handle_picture(
187
187
  _node_handle: &NodeHandle,
188
188
  tag: &HTMLTag,
189
189
  parser: &Parser,
@@ -209,7 +209,7 @@ pub(crate) fn handle_picture(
209
209
  ///
210
210
  /// Extracts src attribute from iframe and outputs as a markdown link.
211
211
  /// iframes cannot be embedded in markdown, so we just provide a link to the source.
212
- pub(crate) fn handle_iframe(tag: &HTMLTag, output: &mut String, ctx: &Context) {
212
+ pub fn handle_iframe(tag: &HTMLTag, output: &mut String, ctx: &Context) {
213
213
  let src = tag
214
214
  .attributes()
215
215
  .get("src")
@@ -19,7 +19,7 @@ type InlineCollectorHandle = std::rc::Rc<std::cell::RefCell<InlineImageCollector
19
19
  #[cfg(feature = "inline-images")]
20
20
  #[allow(clippy::items_after_statements)]
21
21
  #[allow(clippy::manual_let_else)]
22
- pub(crate) fn handle_inline_data_image(
22
+ pub fn handle_inline_data_image(
23
23
  collector_ref: &InlineCollectorHandle,
24
24
  src: &str,
25
25
  alt: &str,
@@ -15,7 +15,7 @@ pub mod svg;
15
15
  pub use super::{Context, DomContext};
16
16
 
17
17
  #[cfg(feature = "inline-images")]
18
- pub(crate) use image::handle_inline_data_image;
18
+ pub use image::handle_inline_data_image;
19
19
 
20
20
  /// Dispatches media element handling to the appropriate handler.
21
21
  ///
@@ -23,7 +23,7 @@ type InlineCollectorHandle = std::rc::Rc<std::cell::RefCell<InlineImageCollector
23
23
  #[allow(clippy::trivially_copy_pass_by_ref)]
24
24
  #[allow(clippy::needless_pass_by_value)]
25
25
  #[allow(clippy::option_if_let_else)]
26
- pub(crate) fn handle_inline_svg(
26
+ pub fn handle_inline_svg(
27
27
  collector_ref: &InlineCollectorHandle,
28
28
  node_handle: &NodeHandle,
29
29
  parser: &Parser,
@@ -93,7 +93,7 @@ pub(crate) fn handle_inline_svg(
93
93
 
94
94
  /// Serialize an element to HTML string (for SVG and Math elements).
95
95
  #[allow(clippy::trivially_copy_pass_by_ref)]
96
- pub(crate) fn serialize_element(node_handle: &NodeHandle, parser: &Parser) -> String {
96
+ pub fn serialize_element(node_handle: &NodeHandle, parser: &Parser) -> String {
97
97
  if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
98
98
  let tag_name = normalized_tag_name(tag.name().as_utf8_str());
99
99
  let mut html = String::with_capacity(256);
@@ -132,7 +132,7 @@ pub(crate) fn serialize_element(node_handle: &NodeHandle, parser: &Parser) -> St
132
132
 
133
133
  /// Serialize a node to HTML string.
134
134
  #[allow(clippy::trivially_copy_pass_by_ref)]
135
- pub(crate) fn serialize_node(node_handle: &NodeHandle, parser: &Parser) -> String {
135
+ pub fn serialize_node(node_handle: &NodeHandle, parser: &Parser) -> String {
136
136
  if let Some(node) = node_handle.get(parser) {
137
137
  match node {
138
138
  tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
@@ -160,7 +160,7 @@ fn non_empty_trimmed(value: &str) -> Option<String> {
160
160
  /// Extracts title from child elements, handles inline image collection,
161
161
  /// and outputs either the title text (in inline mode) or a base64-encoded image.
162
162
  #[allow(clippy::too_many_arguments)]
163
- pub(crate) fn handle_svg(
163
+ pub fn handle_svg(
164
164
  node_handle: &NodeHandle,
165
165
  tag: &tl::HTMLTag,
166
166
  parser: &Parser,
@@ -230,7 +230,7 @@ pub(crate) fn handle_svg(
230
230
  ///
231
231
  /// Serializes MathML to HTML comment and outputs text content with escaping.
232
232
  #[allow(clippy::too_many_arguments)]
233
- pub(crate) fn handle_math(
233
+ pub fn handle_math(
234
234
  node_handle: &NodeHandle,
235
235
  tag: &tl::HTMLTag,
236
236
  parser: &Parser,
@@ -115,47 +115,36 @@ pub mod visitor_hooks;
115
115
  // Import and re-export public types and functions from the main module
116
116
  pub use self::context::Context;
117
117
  pub use self::dom_context::DomContext;
118
- pub use self::main::convert_html;
119
-
120
- #[cfg(feature = "visitor")]
121
- pub use self::main::convert_html_with_visitor;
122
118
 
123
119
  // Import the tree walker and utility functions from main and main_helpers
124
- pub(crate) use self::main::{convert_html_impl, walk_node};
125
- pub(crate) use self::main_helpers::trim_trailing_whitespace;
120
+ pub use self::main::{convert_html_impl, walk_node};
121
+ pub use self::main_helpers::trim_trailing_whitespace;
126
122
 
127
123
  // Re-export helper functions from utility modules (migrated from converter_legacy)
128
- pub(crate) use crate::converter::utility::content::{chomp_inline, get_text_content, normalized_tag_name};
124
+ pub use crate::converter::utility::content::{chomp_inline, get_text_content, normalized_tag_name};
129
125
  #[allow(unused_imports)]
130
- pub(crate) use crate::converter::utility::serialization::{serialize_node, serialize_node_to_html};
126
+ pub use crate::converter::utility::serialization::{serialize_node, serialize_node_to_html};
131
127
 
132
128
  // Helper functions migrated to utility modules
133
- pub(crate) use crate::converter::utility::siblings::append_inline_suffix;
129
+ pub use crate::converter::utility::siblings::append_inline_suffix;
134
130
 
135
131
  // Caching functions migrated to utility/caching
136
132
 
137
133
  // Content functions migrated to utility/content
138
134
 
139
135
  // Heading functions migrated to block/heading
140
- pub(crate) use crate::converter::block::heading::find_single_heading_child;
136
+ pub use crate::converter::block::heading::find_single_heading_child;
141
137
 
142
138
  // Link functions migrated to inline/link
143
139
 
144
140
  // Re-export dispatch functions for routing elements to handlers
145
- pub use block::dispatch_block_handler;
146
- pub use form::dispatch_form_handler;
147
- pub use inline::dispatch_inline_handler;
148
- pub use list::dispatch_list_handler;
149
- pub use semantic::dispatch_semantic_handler;
150
141
  // Media module doesn't have a dispatcher - it exports utility functions
151
142
 
152
143
  // Re-export utility submodules for public access to their types
153
144
  // NOTE: utility::preprocessing is deliberately not re-exported to avoid naming conflict
154
145
  // with preprocessing_helpers module. Users should access utility::preprocessing directly.
155
- pub use utility::{attributes, caching, content, serialization, siblings};
156
146
 
157
147
  // Re-export format renderer types
158
- pub use format::{DjotRenderer, FormatRenderer, MarkdownRenderer};
159
148
 
160
149
  // Block and inline handlers are internal - only dispatchers are exposed
161
150
  // Individual handlers are pub(crate) and not meant to be part of the public API