html-to-markdown 3.2.4 → 3.4.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. checksums.yaml +4 -4
  2. data/Steepfile +6 -0
  3. data/ext/html_to_markdown_rb/Cargo.toml +2 -2
  4. data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
  5. data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
  6. data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
  7. data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
  8. data/lib/bin/html-to-markdown +0 -0
  9. data/lib/html_to_markdown/version.rb +1 -1
  10. data/lib/html_to_markdown.rb +5 -3
  11. data/sig/types.rbs +769 -0
  12. data/vendor/Cargo.toml +2 -2
  13. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  14. data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
  15. data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
  16. data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
  17. data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
  18. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
  19. data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
  20. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
  21. data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
  22. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
  23. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
  24. data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
  25. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
  26. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
  27. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
  28. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
  29. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
  30. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
  31. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
  32. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
  33. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
  38. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  43. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
  44. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
  45. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
  46. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
  47. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
  48. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
  49. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
  50. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
  52. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
  53. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
  54. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
  56. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
  57. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
  58. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
  59. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
  60. data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
  61. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
  62. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
  63. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  65. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
  66. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
  67. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
  68. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
  69. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
  70. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
  71. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
  72. data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
  73. data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
  74. data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
  75. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
  76. data/vendor/html-to-markdown-rs/src/text.rs +0 -44
  77. data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
  78. data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
  79. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
  80. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
  81. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
  82. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
  83. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
  84. data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
  85. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
  86. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
  87. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
  88. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
  89. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
  90. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
  91. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
  92. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
  93. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
  94. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
  95. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
  96. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
  97. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
  98. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
  99. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
  100. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
  101. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
  102. data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
  103. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
  104. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
  105. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
  106. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
  107. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
  108. data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
  109. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
  110. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
  111. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
  112. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
  113. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
  114. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
  115. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
  116. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
  117. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
  118. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
  119. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
  120. metadata +21 -43
  121. data/.bundle/config +0 -2
  122. data/.gitignore +0 -3
  123. data/.rubocop.yml +0 -59
  124. data/Gemfile +0 -18
  125. data/Gemfile.lock +0 -173
  126. data/README.md +0 -331
  127. data/Rakefile +0 -26
  128. data/exe/html-to-markdown +0 -6
  129. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
  130. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
  131. data/html-to-markdown-rb.gemspec +0 -99
  132. data/lib/html_to_markdown_rs.rb +0 -3
  133. data/sig/html_to_markdown.rbs +0 -149
  134. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
  135. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
  136. data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
@@ -4,8 +4,10 @@
4
4
  //! visible text content with structural whitespace, bypassing the full
5
5
  //! Markdown/Djot conversion pipeline.
6
6
 
7
+ use std::collections::HashSet;
7
8
  use std::fmt::Write;
8
9
 
10
+ use crate::converter::preprocessing_helpers::should_drop_for_preprocessing;
9
11
  use crate::options::ConversionOptions;
10
12
  use crate::text;
11
13
 
@@ -61,12 +63,36 @@ const BLOCK_TAGS: &[&str] = &[
61
63
  /// - `<script>`, `<style>`, `<head>`, `<template>`, `<noscript>` are skipped
62
64
  /// - Tables: cells separated by tab, rows by newline
63
65
  /// - Inline elements are recursed without markers
66
+ /// - Nodes matching `excluded_node_ids` (from `exclude_selectors`) are dropped entirely
64
67
  pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
65
68
  let mut buf = String::with_capacity(1024);
66
69
  let mut list_ctx = ListContext::None;
67
70
 
71
+ // Pre-compute excluded node IDs from exclude_selectors.
72
+ let excluded_node_ids: HashSet<u32> = if options.exclude_selectors.is_empty() {
73
+ HashSet::new()
74
+ } else {
75
+ let mut ids = HashSet::new();
76
+ for selector in &options.exclude_selectors {
77
+ if let Some(iter) = dom.query_selector(selector) {
78
+ for handle in iter {
79
+ ids.insert(handle.get_inner());
80
+ }
81
+ }
82
+ }
83
+ ids
84
+ };
85
+
68
86
  for child_handle in dom.children() {
69
- walk_plain(child_handle, parser, &mut buf, options, false, &mut list_ctx);
87
+ walk_plain(
88
+ child_handle,
89
+ parser,
90
+ &mut buf,
91
+ options,
92
+ false,
93
+ &mut list_ctx,
94
+ &excluded_node_ids,
95
+ );
70
96
  }
71
97
 
72
98
  post_process(&mut buf);
@@ -81,6 +107,7 @@ fn walk_plain(
81
107
  options: &ConversionOptions,
82
108
  in_pre: bool,
83
109
  list_ctx: &mut ListContext,
110
+ excluded_node_ids: &HashSet<u32>,
84
111
  ) {
85
112
  let Some(node) = node_handle.get(parser) else {
86
113
  return;
@@ -104,6 +131,11 @@ fn walk_plain(
104
131
  }
105
132
  }
106
133
  tl::Node::Tag(tag) => {
134
+ // Drop elements matching exclude_selectors, including all their descendants.
135
+ if !excluded_node_ids.is_empty() && excluded_node_ids.contains(&node_handle.get_inner()) {
136
+ return;
137
+ }
138
+
107
139
  let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
108
140
  let tag_str = tag_name.as_str();
109
141
 
@@ -112,6 +144,12 @@ fn walk_plain(
112
144
  return;
113
145
  }
114
146
 
147
+ // Apply preprocessing: drop nav/footer/aside/noise elements
148
+ // (shared logic with the markdown path).
149
+ if should_drop_for_preprocessing(tag_str, tag, options) {
150
+ return;
151
+ }
152
+
115
153
  match tag_str {
116
154
  "br" => {
117
155
  buf.push('\n');
@@ -121,7 +159,7 @@ fn walk_plain(
121
159
  }
122
160
  "pre" => {
123
161
  ensure_blank_line(buf);
124
- walk_children(tag, parser, buf, options, true, list_ctx);
162
+ walk_children(tag, parser, buf, options, true, list_ctx, excluded_node_ids);
125
163
  ensure_blank_line(buf);
126
164
  }
127
165
  "img" => {
@@ -136,13 +174,13 @@ fn walk_plain(
136
174
  }
137
175
  "table" => {
138
176
  ensure_blank_line(buf);
139
- walk_table(tag, parser, buf, options);
177
+ walk_table(tag, parser, buf, options, excluded_node_ids);
140
178
  ensure_blank_line(buf);
141
179
  }
142
180
  "ul" => {
143
181
  ensure_newline(buf);
144
182
  let mut child_ctx = ListContext::Unordered;
145
- walk_children(tag, parser, buf, options, false, &mut child_ctx);
183
+ walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
146
184
  ensure_newline(buf);
147
185
  }
148
186
  "ol" => {
@@ -154,7 +192,7 @@ fn walk_plain(
154
192
  .unwrap_or(1);
155
193
  ensure_newline(buf);
156
194
  let mut child_ctx = ListContext::Ordered { next_index: start };
157
- walk_children(tag, parser, buf, options, false, &mut child_ctx);
195
+ walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
158
196
  ensure_newline(buf);
159
197
  }
160
198
  "li" => {
@@ -172,17 +210,17 @@ fn walk_plain(
172
210
  buf.push_str("- ");
173
211
  }
174
212
  }
175
- walk_children(tag, parser, buf, options, false, list_ctx);
213
+ walk_children(tag, parser, buf, options, false, list_ctx, excluded_node_ids);
176
214
  ensure_newline(buf);
177
215
  }
178
216
  _ if BLOCK_TAGS.contains(&tag_str) => {
179
217
  ensure_blank_line(buf);
180
- walk_children(tag, parser, buf, options, in_pre, list_ctx);
218
+ walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
181
219
  ensure_blank_line(buf);
182
220
  }
183
221
  _ => {
184
222
  // Inline elements and structural containers (html, body, etc.)
185
- walk_children(tag, parser, buf, options, in_pre, list_ctx);
223
+ walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
186
224
  }
187
225
  }
188
226
  }
@@ -198,16 +236,23 @@ fn walk_children(
198
236
  options: &ConversionOptions,
199
237
  in_pre: bool,
200
238
  list_ctx: &mut ListContext,
239
+ excluded_node_ids: &HashSet<u32>,
201
240
  ) {
202
241
  let children = tag.children();
203
242
  let top = children.top();
204
243
  for child in top.iter() {
205
- walk_plain(child, parser, buf, options, in_pre, list_ctx);
244
+ walk_plain(child, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
206
245
  }
207
246
  }
208
247
 
209
248
  /// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
210
- fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions) {
249
+ fn walk_table(
250
+ table_tag: &tl::HTMLTag,
251
+ parser: &tl::Parser,
252
+ buf: &mut String,
253
+ options: &ConversionOptions,
254
+ excluded_node_ids: &HashSet<u32>,
255
+ ) {
211
256
  // Collect all <tr> node handles by recursing into the table
212
257
  let mut row_handles = Vec::new();
213
258
  collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
@@ -240,7 +285,15 @@ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, op
240
285
  let mut cell_buf = String::new();
241
286
  if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
242
287
  let mut cell_list_ctx = ListContext::None;
243
- walk_children(cell_tag, parser, &mut cell_buf, options, false, &mut cell_list_ctx);
288
+ walk_children(
289
+ cell_tag,
290
+ parser,
291
+ &mut cell_buf,
292
+ options,
293
+ false,
294
+ &mut cell_list_ctx,
295
+ excluded_node_ids,
296
+ );
244
297
  }
245
298
  buf.push_str(cell_buf.trim());
246
299
  }
@@ -5,12 +5,12 @@
5
5
 
6
6
  use crate::converter::dom_context::DomContext;
7
7
  use crate::converter::main_helpers::is_inline_element;
8
- use crate::converter::utility::attributes::element_has_navigation_hint;
8
+ use crate::converter::utility::attributes::{attribute_matches_any, element_has_navigation_hint};
9
9
  use crate::converter::utility::content::normalized_tag_name;
10
10
  use crate::options::ConversionOptions;
11
11
 
12
12
  /// Check if an inline ancestor element is allowed to contain block-level elements.
13
- pub(crate) fn inline_ancestor_allows_block(tag_name: &str) -> bool {
13
+ pub fn inline_ancestor_allows_block(tag_name: &str) -> bool {
14
14
  matches!(tag_name, "a" | "ins" | "del")
15
15
  }
16
16
 
@@ -18,7 +18,7 @@ pub(crate) fn inline_ancestor_allows_block(tag_name: &str) -> bool {
18
18
  ///
19
19
  /// Excludes elements inside `<pre>` or `<code>` blocks, as they have special
20
20
  /// whitespace preservation rules and should not be repaired.
21
- pub(crate) fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
21
+ pub fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
22
22
  for handle in dom_ctx.node_map.iter().flatten() {
23
23
  if let Some(tl::Node::Tag(_tag)) = handle.get(parser) {
24
24
  let is_block = dom_ctx
@@ -68,43 +68,101 @@ pub(crate) fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser
68
68
  }
69
69
 
70
70
  /// Determine if a node should be dropped during preprocessing.
71
- pub(crate) fn should_drop_for_preprocessing(
72
- node_handle: &tl::NodeHandle,
73
- tag_name: &str,
74
- tag: &tl::HTMLTag,
75
- parser: &tl::Parser,
76
- dom_ctx: &DomContext,
77
- options: &ConversionOptions,
78
- ) -> bool {
79
- // If preprocessing is globally disabled, don't drop any nodes
71
+ ///
72
+ /// Behavior depends on the [`PreprocessingPreset`]:
73
+ ///
74
+ /// - **Minimal**: Only scripts/styles are stripped (handled elsewhere). This function
75
+ /// drops nothing — all structural elements are preserved.
76
+ /// - **Standard** (default): Drops `<nav>` unconditionally. Drops `<header>`, `<footer>`,
77
+ /// and `<aside>` only when they have navigation hints (class/role/aria attributes
78
+ /// indicating site chrome). Drops `<form>` when `remove_forms` is enabled.
79
+ /// - **Aggressive**: All of Standard, plus: drops `<footer>`, `<aside>`, `<noscript>`
80
+ /// unconditionally. Drops ANY element with navigation hints in class/id/role
81
+ /// (e.g. `<div class="sidebar">`). Drops elements with noise-related classes/ids.
82
+ pub fn should_drop_for_preprocessing(tag_name: &str, tag: &tl::HTMLTag, options: &ConversionOptions) -> bool {
83
+ use crate::options::PreprocessingPreset;
84
+
80
85
  if !options.preprocessing.enabled {
81
86
  return false;
82
87
  }
83
88
 
89
+ let preset = options.preprocessing.preset;
90
+
91
+ // Minimal preset: drop nothing here (scripts/styles handled in earlier pipeline stage).
92
+ if preset == PreprocessingPreset::Minimal {
93
+ return false;
94
+ }
95
+
96
+ // Form removal — applies to both Standard and Aggressive when enabled.
97
+ if options.preprocessing.remove_forms && tag_name == "form" {
98
+ return true;
99
+ }
100
+
101
+ let is_aggressive = preset == PreprocessingPreset::Aggressive;
102
+
103
+ // Aggressive: drop <noscript> — its content is fallback for no-JS browsers.
104
+ if is_aggressive && tag_name == "noscript" {
105
+ return true;
106
+ }
107
+
108
+ // Navigation removal — only when the flag is enabled.
84
109
  if !options.preprocessing.remove_navigation {
85
110
  return false;
86
111
  }
87
112
 
88
113
  let has_nav_hint = element_has_navigation_hint(tag);
89
114
 
115
+ // <nav> is always navigation — drop in both Standard and Aggressive.
90
116
  if tag_name == "nav" {
91
117
  return true;
92
118
  }
93
119
 
94
120
  if tag_name == "header" {
95
- use crate::converter::utility::attributes::has_semantic_content_ancestor;
96
- let inside_semantic_content = has_semantic_content_ancestor(node_handle, parser, dom_ctx);
97
- if !inside_semantic_content {
98
- return true;
99
- }
100
- if has_nav_hint {
101
- return true;
102
- }
103
- } else if tag_name == "footer" || tag_name == "aside" {
104
- if has_nav_hint {
121
+ // Drop <header> only with navigation hints (e.g. class="site-header",
122
+ // role="navigation"). A plain <header> often wraps article titles like
123
+ // <header><h1>Title</h1></header> — dropping it loses content.
124
+ return has_nav_hint;
125
+ }
126
+
127
+ if tag_name == "footer" || tag_name == "aside" {
128
+ // Standard: drop only with navigation hints.
129
+ // Aggressive: drop unconditionally.
130
+ return is_aggressive || has_nav_hint;
131
+ }
132
+
133
+ // Aggressive: drop ANY element that has navigation hints in class/id/role.
134
+ // This catches <div class="sidebar">, <div class="menu">, <section class="navigation">,
135
+ // and similar non-semantic navigation containers.
136
+ if is_aggressive && has_nav_hint {
137
+ return true;
138
+ }
139
+
140
+ // Aggressive: drop elements whose class or id matches a noise keyword.
141
+ if is_aggressive {
142
+ if element_has_noise_hint(tag) {
105
143
  return true;
106
144
  }
107
145
  }
108
146
 
109
147
  false
110
148
  }
149
+
150
+ /// Check if an element's `class` or `id` matches a noise keyword (ads, cookie/consent banners, social sharing, popups, newsletter sign-ups).
151
+ fn element_has_noise_hint(tag: &tl::HTMLTag) -> bool {
152
+ const NOISE_KEYWORDS: &[&str] = &[
153
+ "cookie",
154
+ "consent",
155
+ "gdpr",
156
+ "banner",
157
+ "advertisement",
158
+ "ad-container",
159
+ "advert",
160
+ "social-share",
161
+ "share-buttons",
162
+ "popup",
163
+ "modal-overlay",
164
+ "newsletter-signup",
165
+ ];
166
+
167
+ attribute_matches_any(tag, "class", NOISE_KEYWORDS) || attribute_matches_any(tag, "id", NOISE_KEYWORDS)
168
+ }
@@ -170,7 +170,7 @@ mod tests {
170
170
  #[test]
171
171
  fn figure_caption_separated_from_image() {
172
172
  let html = r#"<figure><img src="photo.jpg" alt="Photo"><figcaption>A nice photo</figcaption></figure>"#;
173
- let result = crate::convert(html, None).unwrap();
173
+ let result = crate::convert(html, None, None).unwrap();
174
174
  let content = result.content.unwrap_or_default();
175
175
  assert!(
176
176
  content.contains("![Photo](photo.jpg)"),
@@ -30,7 +30,7 @@ pub mod sectioning;
30
30
  pub mod summary;
31
31
 
32
32
  // Re-export types from parent module for submodule access
33
- pub(crate) use super::walk_node;
33
+ pub use super::walk_node;
34
34
  pub use super::{Context, DomContext};
35
35
 
36
36
  // Re-export handler functions for direct use
@@ -3,10 +3,6 @@
3
3
  //! This module provides utilities for normalizing, escaping, and processing text content
4
4
  //! extracted from HTML documents during the conversion to Markdown format.
5
5
 
6
- mod escaping;
7
- mod normalization;
8
6
  mod processing;
9
7
 
10
- pub use escaping::{escape_link_label, escape_malformed_angle_brackets};
11
- pub use normalization::{normalize_heading_text, trim_line_end_whitespace, truncate_at_char_boundary};
12
8
  pub use processing::dedent_code_block;
@@ -6,7 +6,7 @@ use crate::converter::DomContext;
6
6
  use crate::converter::utility::content::normalized_tag_name;
7
7
 
8
8
  /// Check if a tag has main content semantics based on role or class.
9
- pub(crate) fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
9
+ pub fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
10
10
  if let Some(Some(role)) = tag.attributes().get("role") {
11
11
  let lowered = role.as_utf8_str().to_ascii_lowercase();
12
12
  if matches!(lowered.as_str(), "main" | "article" | "document" | "region") {
@@ -38,7 +38,7 @@ pub(crate) fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
38
38
  }
39
39
 
40
40
  /// Check if an element has navigation-related hints in its attributes.
41
- pub(crate) fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
41
+ pub fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
42
42
  if attribute_matches_any(tag, "role", &["navigation", "menubar", "tablist", "toolbar"]) {
43
43
  return true;
44
44
  }
@@ -88,7 +88,7 @@ pub(crate) fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
88
88
  }
89
89
 
90
90
  /// Check if an attribute value matches any of the given keywords (space or custom-separator aware).
91
- pub(crate) fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
91
+ pub fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
92
92
  let Some(attr_value) = tag.attributes().get(attr) else {
93
93
  return false;
94
94
  };
@@ -113,7 +113,7 @@ pub(crate) fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&
113
113
 
114
114
  /// Check if an attribute contains any of the given keywords (substring match).
115
115
  #[allow(clippy::trivially_copy_pass_by_ref)]
116
- pub(crate) fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
116
+ pub fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
117
117
  let Some(attr_value) = tag.attributes().get(attr) else {
118
118
  return false;
119
119
  };
@@ -126,11 +126,7 @@ pub(crate) fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[
126
126
 
127
127
  /// Check if a node has a semantic content ancestor (main, article, section).
128
128
  #[allow(clippy::trivially_copy_pass_by_ref)]
129
- pub(crate) fn has_semantic_content_ancestor(
130
- node_handle: &tl::NodeHandle,
131
- parser: &tl::Parser,
132
- dom_ctx: &DomContext,
133
- ) -> bool {
129
+ pub fn has_semantic_content_ancestor(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
134
130
  let mut current_id = node_handle.get_inner();
135
131
  while let Some(parent_id) = dom_ctx.parent_of(current_id) {
136
132
  if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {
@@ -10,7 +10,7 @@ use std::num::NonZeroUsize;
10
10
  ///
11
11
  /// Pre-computes parent-child relationships, sibling indices, and caches
12
12
  /// tag information for efficient DOM navigation during conversion.
13
- pub(crate) fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len: usize) -> DomContext {
13
+ pub fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len: usize) -> DomContext {
14
14
  let cache_capacity = text_cache_capacity_for_input(input_len);
15
15
  let mut ctx = DomContext {
16
16
  parent_map: Vec::new(),
@@ -40,7 +40,7 @@ pub(crate) fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len:
40
40
  ///
41
41
  /// Returns a cache capacity between 32 and TEXT_CACHE_CAPACITY,
42
42
  /// scaled proportionally to input size (1KB = 1 slot).
43
- pub(crate) fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
43
+ pub fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
44
44
  const TEXT_CACHE_CAPACITY: usize = 256;
45
45
  // `clamp(32, TEXT_CACHE_CAPACITY)` guarantees `target >= 32 > 0`, so `new` always returns Some.
46
46
  let target = (input_len / 1024).clamp(32, TEXT_CACHE_CAPACITY);
@@ -50,7 +50,7 @@ pub(crate) fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
50
50
  /// Recursively record node hierarchy into DOM context.
51
51
  ///
52
52
  /// Builds the complete parent-child relationship map for efficient tree traversal.
53
- pub(crate) fn record_node_hierarchy(
53
+ pub fn record_node_hierarchy(
54
54
  node_handle: tl::NodeHandle,
55
55
  parent: Option<u32>,
56
56
  parser: &tl::Parser,
@@ -9,14 +9,14 @@ use std::borrow::Cow;
9
9
  use std::collections::BTreeMap;
10
10
 
11
11
  // Forward declare DomContext from parent module to avoid circular imports
12
- pub(crate) use crate::converter::DomContext;
12
+ pub use crate::converter::DomContext;
13
13
 
14
14
  /// Collect all attributes from an HTML tag as a `BTreeMap<String, String>`.
15
15
  ///
16
16
  /// Boolean attributes (those with `None` as the value) are skipped; only
17
17
  /// attributes that carry an explicit value are included.
18
18
  #[cfg(feature = "visitor")]
19
- pub(crate) fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, String> {
19
+ pub fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, String> {
20
20
  tag.attributes()
21
21
  .iter()
22
22
  .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
@@ -28,7 +28,7 @@ pub(crate) fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, Stri
28
28
  /// Similar to `text::chomp` but handles line breaks from `<br>` tags specially.
29
29
  /// Line breaks are extracted as suffix to be placed outside formatting.
30
30
  /// Returns (prefix, suffix, `trimmed_text`).
31
- pub(crate) fn chomp_inline(text: &str) -> (&str, &str, &str) {
31
+ pub fn chomp_inline(text: &str) -> (&str, &str, &str) {
32
32
  if text.is_empty() {
33
33
  return ("", "", "");
34
34
  }
@@ -59,13 +59,13 @@ pub(crate) fn chomp_inline(text: &str) -> (&str, &str, &str) {
59
59
 
60
60
  /// Get the text content of a node and its children.
61
61
  #[allow(clippy::trivially_copy_pass_by_ref)]
62
- pub(crate) fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> String {
62
+ pub fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> String {
63
63
  dom_ctx.text_content(*node_handle, parser)
64
64
  }
65
65
 
66
66
  /// Collect inline text for link labels, skipping block-level descendants.
67
67
  #[allow(clippy::match_wildcard_for_single_variants)]
68
- pub(crate) fn collect_link_label_text(
68
+ pub fn collect_link_label_text(
69
69
  children: &[tl::NodeHandle],
70
70
  parser: &tl::Parser,
71
71
  dom_ctx: &DomContext,
@@ -118,7 +118,7 @@ pub(crate) fn collect_link_label_text(
118
118
 
119
119
  /// Normalize a link label by collapsing newlines and normalizing whitespace.
120
120
  #[allow(clippy::trivially_copy_pass_by_ref)]
121
- pub(crate) fn normalize_link_label(label: &str) -> String {
121
+ pub fn normalize_link_label(label: &str) -> String {
122
122
  let mut needs_collapse = false;
123
123
  for ch in label.chars() {
124
124
  if ch == '\n' || ch == '\r' {
@@ -146,7 +146,7 @@ pub(crate) fn normalize_link_label(label: &str) -> String {
146
146
  }
147
147
 
148
148
  /// Normalize a tag name to lowercase, preserving borrowed input when possible.
149
- pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
149
+ pub fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
150
150
  if raw.as_bytes().iter().any(u8::is_ascii_uppercase) {
151
151
  let mut owned = raw.into_owned();
152
152
  owned.make_ascii_lowercase();
@@ -157,7 +157,7 @@ pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
157
157
  }
158
158
 
159
159
  /// Check if an element is block-level (not inline).
160
- pub(crate) fn is_block_level_element(tag_name: &str) -> bool {
160
+ pub fn is_block_level_element(tag_name: &str) -> bool {
161
161
  is_block_level_name(tag_name, crate::converter::main_helpers::is_inline_element(tag_name))
162
162
  }
163
163
 
@@ -191,7 +191,7 @@ pub fn floor_char_boundary(s: &str, index: usize) -> usize {
191
191
  /// Input: "[outer [inner]]"
192
192
  /// Output: "[outer [inner]]"
193
193
  /// ```
194
- pub(crate) fn escape_link_label(text: &str) -> String {
194
+ pub fn escape_link_label(text: &str) -> String {
195
195
  if text.is_empty() {
196
196
  return String::new();
197
197
  }
@@ -231,7 +231,7 @@ pub(crate) fn escape_link_label(text: &str) -> String {
231
231
  }
232
232
 
233
233
  /// Helper for block-level element detection.
234
- pub(crate) fn is_block_level_name(tag_name: &str, is_inline: bool) -> bool {
234
+ pub fn is_block_level_name(tag_name: &str, is_inline: bool) -> bool {
235
235
  !is_inline
236
236
  && matches!(
237
237
  tag_name,
@@ -7,7 +7,7 @@ use std::borrow::Cow;
7
7
  use std::str;
8
8
 
9
9
  /// Strip script and style tags and their content from HTML.
10
- pub(crate) fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
10
+ pub fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
11
11
  let bytes = input.as_bytes();
12
12
  let len = bytes.len();
13
13
 
@@ -163,7 +163,7 @@ pub(crate) fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
163
163
  /// Returns the position AFTER the closing tag (including the '>').
164
164
  /// This is highly optimized for performance and uses a fast-path scan.
165
165
  #[inline]
166
- pub(crate) fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) -> Option<usize> {
166
+ pub fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) -> Option<usize> {
167
167
  let len = bytes.len();
168
168
  let tag_len = tag.len();
169
169
 
@@ -212,7 +212,7 @@ pub(crate) fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) ->
212
212
 
213
213
  /// Compare bytes ignoring ASCII case.
214
214
  #[inline]
215
- pub(crate) fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
215
+ pub fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
216
216
  if a.len() != b.len() {
217
217
  return false;
218
218
  }
@@ -220,7 +220,7 @@ pub(crate) fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
220
220
  }
221
221
 
222
222
  /// Preprocess HTML to normalize tags and fix common issues.
223
- pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
223
+ pub fn preprocess_html(input: &str) -> Cow<'_, str> {
224
224
  const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
225
225
  const TAGS: [&[u8]; 2] = [b"script", b"style"];
226
226
  const SVG: &[u8] = b"svg";
@@ -289,7 +289,7 @@ pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
289
289
  if tag == b"script" && is_json_ld_script_open_tag(&input[idx..open_end]) {
290
290
  continue;
291
291
  }
292
- let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
292
+ let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(open_end);
293
293
  let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
294
294
  out.push_str(&input[last..idx]);
295
295
  out.push_str(&input[idx..open_end]);
@@ -379,7 +379,7 @@ pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
379
379
  }
380
380
 
381
381
  /// Check if a script tag is a JSON-LD script.
382
- pub(crate) fn is_json_ld_script_open_tag(tag: &str) -> bool {
382
+ pub fn is_json_ld_script_open_tag(tag: &str) -> bool {
383
383
  let bytes = tag.as_bytes();
384
384
  let mut idx = 0;
385
385
  while idx + 4 <= bytes.len() {
@@ -443,7 +443,7 @@ pub(crate) fn is_json_ld_script_open_tag(tag: &str) -> bool {
443
443
 
444
444
  /// Case-insensitive byte comparison for ASCII.
445
445
  #[inline]
446
- pub(crate) fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
446
+ pub fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
447
447
  if haystack.len() < needle.len() {
448
448
  return false;
449
449
  }
@@ -454,7 +454,7 @@ pub(crate) fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool
454
454
  }
455
455
 
456
456
  /// Check if bytes match a tag start pattern.
457
- pub(crate) fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
457
+ pub fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
458
458
  if start >= bytes.len() {
459
459
  return false;
460
460
  }
@@ -477,7 +477,7 @@ pub(crate) fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> b
477
477
  }
478
478
 
479
479
  /// Find the end of an HTML tag (the position of '>').
480
- pub(crate) fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
480
+ pub fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
481
481
  let len = bytes.len();
482
482
  let mut in_quote: Option<u8> = None;
483
483
 
@@ -502,7 +502,7 @@ pub(crate) fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
502
502
  }
503
503
 
504
504
  /// Find the closing tag for a given tag name.
505
- pub(crate) fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
505
+ pub fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
506
506
  let len = bytes.len();
507
507
  let mut depth = 1usize;
508
508
 
@@ -533,7 +533,7 @@ pub(crate) fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Opti
533
533
  }
534
534
 
535
535
  /// Check if bytes match an end tag pattern.
536
- pub(crate) fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
536
+ pub fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
537
537
  if start >= bytes.len() || bytes[start] != b'/' {
538
538
  return false;
539
539
  }
@@ -553,7 +553,7 @@ pub(crate) fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> b
553
553
  ///
554
554
  /// # Returns
555
555
  /// * `Cow<str>` - Either the borrowed original URL or an owned sanitized version
556
- pub(crate) fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
556
+ pub fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
557
557
  // Pattern: ...[text](actual_url) or similar markdown-like syntax
558
558
  // This handles malformed HTML where markdown syntax wasn't properly converted
559
559
  // and prevents downstream URL parsing errors (e.g., bracketed "IPv6" hosts).
@@ -585,7 +585,7 @@ pub(crate) fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
585
585
  /// Scans for opening tags containing the `hidden` attribute, finds their
586
586
  /// matching closing tag, and removes the entire element (tag + content).
587
587
  /// Self-closing tags with `hidden` are also removed.
588
- pub(crate) fn strip_hidden_elements(input: &str) -> Cow<'_, str> {
588
+ pub fn strip_hidden_elements(input: &str) -> Cow<'_, str> {
589
589
  let bytes = input.as_bytes();
590
590
  let len = bytes.len();
591
591
 
@@ -8,7 +8,7 @@ use crate::converter::utility::content::normalized_tag_name;
8
8
  /// Serialize an element to HTML string (for SVG and Math elements).
9
9
  #[allow(clippy::trivially_copy_pass_by_ref)]
10
10
  #[allow(dead_code)] // used with visitor feature
11
- pub(crate) fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
11
+ pub fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
12
12
  if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
13
13
  let tag_name = normalized_tag_name(tag.name().as_utf8_str());
14
14
  let mut html = String::with_capacity(256);
@@ -48,7 +48,7 @@ pub(crate) fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parse
48
48
  /// Serialize a node to HTML string.
49
49
  #[allow(clippy::trivially_copy_pass_by_ref)]
50
50
  #[allow(dead_code)] // used with visitor feature
51
- pub(crate) fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
51
+ pub fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
52
52
  if let Some(node) = node_handle.get(parser) {
53
53
  match node {
54
54
  tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
@@ -61,7 +61,7 @@ pub(crate) fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser)
61
61
  }
62
62
 
63
63
  /// Serialize a tag to HTML, wrapping serialize_node_to_html.
64
- pub(crate) fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
64
+ pub fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
65
65
  let mut html = String::new();
66
66
  serialize_node_to_html(handle, parser, &mut html);
67
67
  html
@@ -70,7 +70,7 @@ pub(crate) fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser
70
70
  /// Recursively serialize a node to HTML.
71
71
  #[allow(clippy::trivially_copy_pass_by_ref)]
72
72
  #[allow(dead_code)] // used with visitor feature
73
- pub(crate) fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
73
+ pub fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
74
74
  match handle.get(parser) {
75
75
  Some(tl::Node::Tag(tag)) => {
76
76
  let tag_name = normalized_tag_name(tag.name().as_utf8_str());