html-to-markdown 2.29.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +18 -41
  3. data/README.md +37 -50
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
  6. data/ext/html-to-markdown-rb/native/README.md +4 -13
  7. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
  8. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
  9. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
  10. data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
  11. data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
  12. data/lib/html_to_markdown/version.rb +1 -1
  13. data/lib/html_to_markdown.rb +13 -194
  14. data/sig/html_to_markdown.rbs +12 -373
  15. data/vendor/Cargo.toml +7 -4
  16. data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
  17. data/vendor/html-to-markdown-rs/README.md +127 -51
  18. data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
  19. data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
  20. data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
  21. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
  22. data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
  23. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
  24. data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
  25. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
  26. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
  27. data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
  28. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
  29. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
  30. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
  31. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
  32. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
  33. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
  34. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
  35. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
  36. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
  37. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
  38. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
  39. data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
  40. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
  41. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
  42. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
  43. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
  44. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
  45. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
  46. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
  47. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
  48. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
  49. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
  50. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
  51. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
  52. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
  53. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
  54. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
  55. data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
  56. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
  57. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
  58. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
  59. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
  60. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
  61. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
  62. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
  63. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
  64. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
  65. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
  66. data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
  67. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
  68. data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
  69. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
  70. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
  71. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
  72. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
  73. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
  74. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
  75. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
  76. data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
  77. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  78. data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
  79. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
  80. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
  81. data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
  82. data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
  83. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
  84. data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
  85. data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
  86. data/vendor/html-to-markdown-rs/src/text.rs +25 -14
  87. data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
  88. data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
  89. data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
  90. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
  91. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
  92. data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
  93. data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
  94. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
  95. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
  96. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
  97. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
  98. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
  99. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
  100. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
  101. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
  102. data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
  103. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
  104. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
  105. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
  106. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
  107. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
  108. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
  109. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
  110. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
  111. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
  112. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
  113. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
  114. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
  115. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
  116. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
  117. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
  118. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
  119. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
  120. data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
  121. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
  122. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
  123. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
  124. data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
  125. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
  126. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
  127. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
  128. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
  129. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
  130. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
  131. metadata +9 -37
  132. data/bin/benchmark.rb +0 -232
  133. data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
  134. data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
  135. data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
  136. data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
  137. data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
  138. data/spec/convert_spec.rb +0 -77
  139. data/spec/convert_with_tables_spec.rb +0 -194
  140. data/spec/metadata_extraction_spec.rb +0 -437
  141. data/spec/visitor_issue_187_spec.rb +0 -605
  142. data/spec/visitor_spec.rb +0 -1149
  143. data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
  144. data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
  145. data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
  146. data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
  147. data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
  148. data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
  149. data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
  150. data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
  151. data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
  152. data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
  153. data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
  154. data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
  155. data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
  156. data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
  157. data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
  158. data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
  159. data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
  160. data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
  161. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
  162. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
  163. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
  164. data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
  165. data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
  166. data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
@@ -19,42 +19,7 @@ use std::borrow::Cow;
19
19
  /// "Text \\[escaped\\]" → "Text \\[escaped\\]"
20
20
  /// ```
21
21
  pub fn escape_link_label(text: &str) -> String {
22
- if text.is_empty() {
23
- return String::new();
24
- }
25
-
26
- let mut result = String::with_capacity(text.len());
27
- let mut backslash_count = 0usize;
28
- let mut bracket_depth = 0usize;
29
-
30
- for ch in text.chars() {
31
- if ch == '\\' {
32
- result.push('\\');
33
- backslash_count += 1;
34
- continue;
35
- }
36
-
37
- let is_escaped = backslash_count % 2 == 1;
38
- backslash_count = 0;
39
-
40
- match ch {
41
- '[' if !is_escaped => {
42
- bracket_depth = bracket_depth.saturating_add(1);
43
- result.push('[');
44
- }
45
- ']' if !is_escaped => {
46
- if bracket_depth == 0 {
47
- result.push('\\');
48
- } else {
49
- bracket_depth -= 1;
50
- }
51
- result.push(']');
52
- }
53
- _ => result.push(ch),
54
- }
55
- }
56
-
57
- result
22
+ crate::converter::utility::content::escape_link_label(text)
58
23
  }
59
24
 
60
25
  /// Escape malformed angle brackets in markdown output.
@@ -8,7 +8,5 @@ mod normalization;
8
8
  mod processing;
9
9
 
10
10
  pub use escaping::{escape_link_label, escape_malformed_angle_brackets};
11
- pub use normalization::{
12
- chomp_inline, normalize_heading_text, trim_line_end_whitespace, trim_trailing_whitespace, truncate_at_char_boundary,
13
- };
11
+ pub use normalization::{normalize_heading_text, trim_line_end_whitespace, truncate_at_char_boundary};
14
12
  pub use processing::dedent_code_block;
@@ -5,59 +5,6 @@
5
5
 
6
6
  use std::borrow::Cow;
7
7
 
8
- /// Chomp whitespace from inline element content, preserving line breaks.
9
- ///
10
- /// Returns (prefix, suffix, trimmed_text) where:
11
- /// - prefix: leading whitespace (space or tab)
12
- /// - suffix: trailing whitespace (including soft breaks like " \n" or "\\\n")
13
- /// - trimmed_text: the trimmed content
14
- ///
15
- /// # Examples
16
- ///
17
- /// ```text
18
- /// " text \n" → (" ", " \n", "text")
19
- /// " text " → (" ", " ", "text")
20
- /// "text" → ("", "", "text")
21
- /// ```
22
- pub fn chomp_inline(text: &str) -> (&str, &str, &str) {
23
- if text.is_empty() {
24
- return ("", "", "");
25
- }
26
-
27
- let prefix = if text.starts_with(&[' ', '\t'][..]) { " " } else { "" };
28
-
29
- let has_trailing_linebreak = text.ends_with(" \n") || text.ends_with("\\\n");
30
-
31
- let suffix = if has_trailing_linebreak {
32
- if text.ends_with(" \n") { " \n" } else { "\\\n" }
33
- } else if text.ends_with(&[' ', '\t'][..]) {
34
- " "
35
- } else {
36
- ""
37
- };
38
-
39
- let trimmed = if has_trailing_linebreak {
40
- text.strip_suffix(" \n").map_or_else(
41
- || text.strip_suffix("\\\n").map_or_else(|| text.trim(), |s| s.trim()),
42
- |s| s.trim(),
43
- )
44
- } else {
45
- text.trim()
46
- };
47
-
48
- (prefix, suffix, trimmed)
49
- }
50
-
51
- /// Remove trailing spaces and tabs from output string.
52
- ///
53
- /// This is used before adding block separators or newlines to ensure
54
- /// clean Markdown output without spurious whitespace.
55
- pub fn trim_trailing_whitespace(output: &mut String) {
56
- while output.ends_with(' ') || output.ends_with('\t') {
57
- output.pop();
58
- }
59
- }
60
-
61
8
  /// Remove trailing spaces/tabs from every line while preserving newlines.
62
9
  pub fn trim_line_end_whitespace(output: &mut String) {
63
10
  if output.is_empty() {
@@ -50,7 +50,7 @@ pub fn process_text_node(
50
50
  let had_newlines = text_ref.contains('\n');
51
51
  let has_double_newline = text_ref.contains("\n\n") || text_ref.contains("\r\n\r\n");
52
52
 
53
- if options.strip_newlines {
53
+ if options.strip_newlines && (text.contains('\r') || text.contains('\n')) {
54
54
  text = Cow::Owned(text.replace(['\r', '\n'], " "));
55
55
  }
56
56
 
@@ -153,44 +153,3 @@ pub(crate) fn has_semantic_content_ancestor(
153
153
  }
154
154
  false
155
155
  }
156
-
157
- /// Check if a document might be an hOCR document (has relevant attributes).
158
- pub(crate) fn may_be_hocr(input: &str) -> bool {
159
- const HOCR_MARKERS: [&[u8]; 3] = [b"class=\"ocr", b"class='ocr", b"ocr_page"];
160
- HOCR_MARKERS
161
- .iter()
162
- .any(|marker| input.as_bytes().windows(marker.len()).any(|w| w == *marker))
163
- }
164
-
165
- /// Check if a node is an hOCR document by examining its root tag.
166
- pub(crate) fn is_hocr_document(node_handle: tl::NodeHandle, parser: &tl::Parser) -> bool {
167
- if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
168
- let tag_name = tag.name().as_utf8_str();
169
- if tag_name != "html" {
170
- return false;
171
- }
172
-
173
- // Check for hOCR class on root or first child
174
- if let Some(Some(class_bytes)) = tag.attributes().get("class") {
175
- let class = class_bytes.as_utf8_str();
176
- if class.contains("ocr_document") || class.contains("ocr_page") {
177
- return true;
178
- }
179
- }
180
-
181
- // Check children
182
- let children = tag.children();
183
- for child_handle in children.top().iter() {
184
- if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
185
- if let Some(Some(class_bytes)) = child_tag.attributes().get("class") {
186
- let class = class_bytes.as_utf8_str();
187
- if class.contains("ocr_document") || class.contains("ocr_page") {
188
- return true;
189
- }
190
- }
191
- }
192
- }
193
- }
194
-
195
- false
196
- }
@@ -42,8 +42,9 @@ pub(crate) fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len:
42
42
  /// scaled proportionally to input size (1KB = 1 slot).
43
43
  pub(crate) fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
44
44
  const TEXT_CACHE_CAPACITY: usize = 256;
45
+ // `clamp(32, TEXT_CACHE_CAPACITY)` guarantees `target >= 32 > 0`, so `new` always returns Some.
45
46
  let target = (input_len / 1024).clamp(32, TEXT_CACHE_CAPACITY);
46
- NonZeroUsize::new(target).unwrap_or_else(|| NonZeroUsize::new(32).unwrap())
47
+ NonZeroUsize::new(target).unwrap_or(NonZeroUsize::MIN)
47
48
  }
48
49
 
49
50
  /// Recursively record node hierarchy into DOM context.
@@ -5,10 +5,24 @@
5
5
 
6
6
  use crate::text;
7
7
  use std::borrow::Cow;
8
+ #[cfg(feature = "visitor")]
9
+ use std::collections::BTreeMap;
8
10
 
9
11
  // Forward declare DomContext from parent module to avoid circular imports
10
12
  pub(crate) use crate::converter::DomContext;
11
13
 
14
+ /// Collect all attributes from an HTML tag as a `BTreeMap<String, String>`.
15
+ ///
16
+ /// Boolean attributes (those with `None` as the value) are skipped; only
17
+ /// attributes that carry an explicit value are included.
18
+ #[cfg(feature = "visitor")]
19
+ pub(crate) fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, String> {
20
+ tag.attributes()
21
+ .iter()
22
+ .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
23
+ .collect()
24
+ }
25
+
12
26
  /// Chomp whitespace from inline element content, preserving line breaks.
13
27
  ///
14
28
  /// Similar to `text::chomp` but handles line breaks from `<br>` tags specially.
@@ -131,31 +145,6 @@ pub(crate) fn normalize_link_label(label: &str) -> String {
131
145
  normalized.as_ref().trim().to_string()
132
146
  }
133
147
 
134
- /// Check if an inline element is considered empty (no meaningful content).
135
- pub(crate) fn is_empty_inline_element(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
136
- const EMPTY_WHEN_NO_CONTENT_TAGS: &[&str] = &[
137
- "abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u",
138
- ];
139
-
140
- let tag_name: Option<Cow<'_, str>> = dom_ctx
141
- .tag_info(node_handle.get_inner(), parser)
142
- .map(|info| Cow::Borrowed(info.name.as_str()))
143
- .or_else(|| {
144
- if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
145
- Some(normalized_tag_name(tag.name().as_utf8_str()))
146
- } else {
147
- None
148
- }
149
- });
150
-
151
- if let Some(tag_name) = tag_name {
152
- if EMPTY_WHEN_NO_CONTENT_TAGS.contains(&tag_name.as_ref()) {
153
- return get_text_content(node_handle, parser, dom_ctx).trim().is_empty();
154
- }
155
- }
156
- false
157
- }
158
-
159
148
  /// Normalize a tag name to lowercase, preserving borrowed input when possible.
160
149
  pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
161
150
  if raw.as_bytes().iter().any(u8::is_ascii_uppercase) {
@@ -167,81 +156,9 @@ pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
167
156
  }
168
157
  }
169
158
 
170
- /// Check if an element is inline (not block-level).
171
- fn is_inline_element(tag_name: &str) -> bool {
172
- matches!(
173
- tag_name,
174
- "a" | "abbr"
175
- | "b"
176
- | "bdi"
177
- | "bdo"
178
- | "br"
179
- | "cite"
180
- | "code"
181
- | "data"
182
- | "dfn"
183
- | "em"
184
- | "i"
185
- | "kbd"
186
- | "mark"
187
- | "q"
188
- | "rp"
189
- | "rt"
190
- | "ruby"
191
- | "s"
192
- | "samp"
193
- | "small"
194
- | "span"
195
- | "strong"
196
- | "sub"
197
- | "sup"
198
- | "time"
199
- | "u"
200
- | "var"
201
- | "wbr"
202
- | "del"
203
- | "ins"
204
- | "img"
205
- | "map"
206
- | "area"
207
- | "audio"
208
- | "video"
209
- | "picture"
210
- | "source"
211
- | "track"
212
- | "embed"
213
- | "object"
214
- | "param"
215
- | "input"
216
- | "label"
217
- | "button"
218
- | "select"
219
- | "textarea"
220
- | "output"
221
- | "progress"
222
- | "meter"
223
- )
224
- }
225
-
226
159
  /// Check if an element is block-level (not inline).
227
160
  pub(crate) fn is_block_level_element(tag_name: &str) -> bool {
228
- is_block_level_name(tag_name, is_inline_element(tag_name))
229
- }
230
-
231
- /// Truncate a string to a maximum length at a valid UTF-8 character boundary.
232
- ///
233
- /// Ensures the string is not longer than `max_len` bytes, truncating at the last
234
- /// valid character boundary if necessary to preserve valid UTF-8.
235
- pub(crate) fn truncate_at_char_boundary(value: &mut String, max_len: usize) {
236
- if value.len() <= max_len {
237
- return;
238
- }
239
-
240
- let mut new_len = max_len.min(value.len());
241
- while new_len > 0 && !value.is_char_boundary(new_len) {
242
- new_len -= 1;
243
- }
244
- value.truncate(new_len);
161
+ is_block_level_name(tag_name, crate::converter::main_helpers::is_inline_element(tag_name))
245
162
  }
246
163
 
247
164
  /// Returns the largest valid char boundary index at or before `index`.
@@ -176,10 +176,13 @@ pub(crate) fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) ->
176
176
  const MAX_SCAN: usize = 100_000_000; // 100MB limit per tag - prevents pathological cases
177
177
 
178
178
  while idx < len && (idx - start) < MAX_SCAN {
179
- // Optimization: skip forward to next '<' quickly
179
+ // Optimization: skip forward to next '<' quickly using memchr
180
180
  if bytes[idx] != b'<' {
181
- idx += 1;
182
- continue;
181
+ if let Some(pos) = memchr::memchr(b'<', &bytes[idx..]) {
182
+ idx += pos;
183
+ } else {
184
+ break;
185
+ }
183
186
  }
184
187
 
185
188
  // Check for </ pattern
@@ -291,7 +294,11 @@ pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
291
294
  out.push_str(&input[last..idx]);
292
295
  out.push_str(&input[idx..open_end]);
293
296
  out.push_str("</");
294
- out.push_str(str::from_utf8(tag).unwrap());
297
+ // `TAGS` contains only ASCII byte literals (`b"script"`, `b"style"`),
298
+ // which are always valid UTF-8; `from_utf8` cannot fail here.
299
+ if let Ok(tag_str) = str::from_utf8(tag) {
300
+ out.push_str(tag_str);
301
+ }
295
302
  out.push('>');
296
303
 
297
304
  last = remove_end;
@@ -573,6 +580,108 @@ pub(crate) fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
573
580
  Cow::Owned(url[paren_start..paren_end].to_string())
574
581
  }
575
582
 
583
+ /// Strip elements with the `hidden` attribute from HTML.
584
+ ///
585
+ /// Scans for opening tags containing the `hidden` attribute, finds their
586
+ /// matching closing tag, and removes the entire element (tag + content).
587
+ /// Self-closing tags with `hidden` are also removed.
588
+ pub(crate) fn strip_hidden_elements(input: &str) -> Cow<'_, str> {
589
+ let bytes = input.as_bytes();
590
+ let len = bytes.len();
591
+
592
+ if len == 0 || !bytes.contains(&b'<') {
593
+ return Cow::Borrowed(input);
594
+ }
595
+
596
+ let mut idx = 0;
597
+ let mut last = 0;
598
+ let mut output: Option<String> = None;
599
+
600
+ while idx < len {
601
+ if bytes[idx] == b'<' && idx + 1 < len && bytes[idx + 1] != b'/' && bytes[idx + 1] != b'!' {
602
+ // Find the end of this opening tag
603
+ if let Some(tag_end) = find_tag_end(bytes, idx + 1) {
604
+ let tag_slice = &input[idx..tag_end];
605
+ if tag_has_hidden_attribute(tag_slice) {
606
+ // Extract the tag name
607
+ let name_start = idx + 1;
608
+ let mut name_end = name_start;
609
+ while name_end < len
610
+ && !bytes[name_end].is_ascii_whitespace()
611
+ && bytes[name_end] != b'>'
612
+ && bytes[name_end] != b'/'
613
+ {
614
+ name_end += 1;
615
+ }
616
+ let tag_name = &bytes[name_start..name_end];
617
+
618
+ // Check if it's a self-closing tag (e.g., <br hidden> or <br hidden/>)
619
+ let is_self_closing = tag_slice.ends_with("/>")
620
+ || tag_name.eq_ignore_ascii_case(b"br")
621
+ || tag_name.eq_ignore_ascii_case(b"hr")
622
+ || tag_name.eq_ignore_ascii_case(b"img")
623
+ || tag_name.eq_ignore_ascii_case(b"input");
624
+
625
+ let remove_end = if is_self_closing {
626
+ tag_end
627
+ } else {
628
+ // Find the closing tag
629
+ find_closing_tag_bytes(bytes, tag_end, tag_name).unwrap_or(tag_end)
630
+ };
631
+
632
+ let out = output.get_or_insert_with(|| String::with_capacity(len));
633
+ out.push_str(&input[last..idx]);
634
+ last = remove_end;
635
+ idx = remove_end;
636
+ continue;
637
+ }
638
+ }
639
+ }
640
+ idx += 1;
641
+ }
642
+
643
+ if let Some(mut out) = output {
644
+ if last < len {
645
+ out.push_str(&input[last..]);
646
+ }
647
+ Cow::Owned(out)
648
+ } else {
649
+ Cow::Borrowed(input)
650
+ }
651
+ }
652
+
653
+ /// Check if an opening tag string contains the `hidden` attribute.
654
+ ///
655
+ /// Handles: `hidden`, `hidden=""`, `hidden="hidden"`, `hidden="true"`.
656
+ /// Does NOT match attributes like `data-hidden` or `aria-hidden`.
657
+ fn tag_has_hidden_attribute(tag: &str) -> bool {
658
+ let bytes = tag.as_bytes();
659
+ let len = bytes.len();
660
+ let needle = b"hidden";
661
+ let nlen = needle.len();
662
+
663
+ let mut i = 0;
664
+ // Skip past the tag name
665
+ while i < len && bytes[i] != b' ' && bytes[i] != b'\t' && bytes[i] != b'\n' && bytes[i] != b'>' {
666
+ i += 1;
667
+ }
668
+
669
+ while i + nlen <= len {
670
+ if bytes[i..i + nlen].eq_ignore_ascii_case(needle) {
671
+ // Check that the character before is whitespace (attribute boundary)
672
+ let before_ok = i == 0 || bytes[i - 1].is_ascii_whitespace();
673
+ // Check that the character after is whitespace, '>', '=', or '/'
674
+ let after = bytes.get(i + nlen).copied();
675
+ let after_ok = matches!(after, None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'=' | b'/'));
676
+ if before_ok && after_ok {
677
+ return true;
678
+ }
679
+ }
680
+ i += 1;
681
+ }
682
+ false
683
+ }
684
+
576
685
  #[cfg(test)]
577
686
  mod tests {
578
687
  use super::sanitize_markdown_url;
@@ -7,6 +7,7 @@ use crate::converter::utility::content::normalized_tag_name;
7
7
 
8
8
  /// Serialize an element to HTML string (for SVG and Math elements).
9
9
  #[allow(clippy::trivially_copy_pass_by_ref)]
10
+ #[allow(dead_code)] // used with visitor feature
10
11
  pub(crate) fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
11
12
  if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
12
13
  let tag_name = normalized_tag_name(tag.name().as_utf8_str());
@@ -46,6 +47,7 @@ pub(crate) fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parse
46
47
 
47
48
  /// Serialize a node to HTML string.
48
49
  #[allow(clippy::trivially_copy_pass_by_ref)]
50
+ #[allow(dead_code)] // used with visitor feature
49
51
  pub(crate) fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
50
52
  if let Some(node) = node_handle.get(parser) {
51
53
  match node {
@@ -67,6 +69,7 @@ pub(crate) fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser
67
69
 
68
70
  /// Recursively serialize a node to HTML.
69
71
  #[allow(clippy::trivially_copy_pass_by_ref)]
72
+ #[allow(dead_code)] // used with visitor feature
70
73
  pub(crate) fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
71
74
  match handle.get(parser) {
72
75
  Some(tl::Node::Tag(tag)) => {
@@ -6,6 +6,8 @@
6
6
 
7
7
  use std::collections::BTreeMap;
8
8
 
9
+ #[cfg(feature = "visitor")]
10
+ use crate::converter::utility::content::collect_tag_attributes;
9
11
  use crate::converter::utility::content::is_block_level_element;
10
12
  use crate::visitor::{NodeContext, NodeType, VisitResult};
11
13
 
@@ -48,11 +50,7 @@ pub fn handle_visitor_element_start(
48
50
  depth: usize,
49
51
  dom_ctx: &crate::converter::DomContext,
50
52
  ) -> VisitAction {
51
- let attributes: BTreeMap<String, String> = tag
52
- .attributes()
53
- .iter()
54
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
55
- .collect();
53
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
56
54
 
57
55
  let node_id = node_handle.get_inner();
58
56
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -131,11 +129,7 @@ pub fn handle_visitor_element_end(
131
129
  return;
132
130
  }
133
131
 
134
- let attributes: BTreeMap<String, String> = tag
135
- .attributes()
136
- .iter()
137
- .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
138
- .collect();
132
+ let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
139
133
 
140
134
  let node_id = node_handle.get_inner();
141
135
  let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
@@ -13,7 +13,7 @@ pub use crate::inline_images::{
13
13
 
14
14
  #[cfg(feature = "metadata")]
15
15
  pub use crate::metadata::{
16
- DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType,
16
+ DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, HeaderMetadata, HtmlMetadata, ImageMetadata, ImageType,
17
17
  LinkMetadata, LinkType, MetadataConfig, MetadataConfigUpdate, StructuredData, StructuredDataType, TextDirection,
18
18
  };
19
19
 
@@ -21,6 +21,3 @@ pub use crate::options::{
21
21
  CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
22
22
  NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
23
23
  };
24
-
25
- #[cfg(feature = "async-visitor")]
26
- pub use crate::visitor_helpers::AsyncVisitorHandle;
@@ -172,7 +172,7 @@ pub struct InlineImageWarning {
172
172
  pub message: String,
173
173
  }
174
174
 
175
- /// Output of `convert_with_inline_images`.
175
+ /// Output containing extracted inline images from `convert()` when `extract_images` is enabled.
176
176
  #[derive(Debug, Clone)]
177
177
  pub struct HtmlExtraction {
178
178
  /// Converted markdown output.