html-to-markdown 3.4.0.pre.rc.45 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. checksums.yaml +4 -4
  2. data/ext/html_to_markdown_rb/Cargo.toml +1 -1
  3. data/lib/bin/html-to-markdown +0 -0
  4. data/lib/html_to_markdown/native.rb +3 -1
  5. data/lib/html_to_markdown/version.rb +2 -2
  6. data/vendor/Cargo.toml +1 -1
  7. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  8. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +47 -1
  9. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +87 -12
  10. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +87 -5
  11. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +4 -4
  12. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +19 -1
  13. data/vendor/html-to-markdown-rs/src/converter/main.rs +9 -1
  14. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +158 -40
  15. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +64 -31
  16. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +4 -10
  17. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +137 -1
  18. data/vendor/html-to-markdown-rs/tests/integration_test.rs +13 -5
  19. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +4 -2
  20. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +53 -0
  21. data/vendor/html-to-markdown-rs/tests/issue_336_regressions.rs +74 -0
  22. data/vendor/html-to-markdown-rs/tests/issue_347_regressions.rs +154 -0
  23. data/vendor/html-to-markdown-rs/tests/issue_348_visitor_plain.rs +93 -0
  24. data/vendor/html-to-markdown-rs/tests/tables_test.rs +39 -23
  25. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +74 -47
  26. metadata +4 -1
@@ -11,6 +11,13 @@ use crate::converter::preprocessing_helpers::should_drop_for_preprocessing;
11
11
  use crate::options::ConversionOptions;
12
12
  use crate::text;
13
13
 
14
+ #[cfg(feature = "visitor")]
15
+ use crate::converter::utility::content::{collect_tag_attributes, is_block_level_element};
16
+ #[cfg(feature = "visitor")]
17
+ use crate::visitor::{NodeContext, NodeType, VisitResult, VisitorHandle};
18
+ #[cfg(feature = "visitor")]
19
+ use std::collections::BTreeMap;
20
+
14
21
  /// Tracks list context for proper marker emission on `<li>` elements.
15
22
  #[derive(Clone, Debug)]
16
23
  enum ListContext {
@@ -53,6 +60,30 @@ const BLOCK_TAGS: &[&str] = &[
53
60
  "search",
54
61
  ];
55
62
 
63
+ /// Shared walker state threaded through all recursive calls.
64
+ ///
65
+ /// Holds the options, visitor (feature-gated), and current DOM depth.
66
+ /// Using a struct avoids feature-gated function parameters at call sites.
67
+ struct WalkState<'a> {
68
+ options: &'a ConversionOptions,
69
+ excluded_node_ids: &'a HashSet<u32>,
70
+ depth: usize,
71
+ #[cfg(feature = "visitor")]
72
+ visitor: Option<&'a VisitorHandle>,
73
+ }
74
+
75
+ impl WalkState<'_> {
76
+ fn descend(&self) -> Self {
77
+ WalkState {
78
+ options: self.options,
79
+ excluded_node_ids: self.excluded_node_ids,
80
+ depth: self.depth + 1,
81
+ #[cfg(feature = "visitor")]
82
+ visitor: self.visitor,
83
+ }
84
+ }
85
+ }
86
+
56
87
  /// Extract plain text from a parsed DOM tree.
57
88
  ///
58
89
  /// Walks the tree collecting visible text with structural whitespace:
@@ -64,6 +95,8 @@ const BLOCK_TAGS: &[&str] = &[
64
95
  /// - Tables: cells separated by tab, rows by newline
65
96
  /// - Inline elements are recursed without markers
66
97
  /// - Nodes matching `excluded_node_ids` (from `exclude_selectors`) are dropped entirely
98
+ /// - When a visitor is configured, `visit_element_start`, `visit_element_end`, and
99
+ /// `visit_text` callbacks are fired and their results are honoured.
67
100
  pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
68
101
  let mut buf = String::with_capacity(1024);
69
102
  let mut list_ctx = ListContext::None;
@@ -83,16 +116,16 @@ pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &Convers
83
116
  ids
84
117
  };
85
118
 
119
+ let state = WalkState {
120
+ options,
121
+ excluded_node_ids: &excluded_node_ids,
122
+ depth: 0,
123
+ #[cfg(feature = "visitor")]
124
+ visitor: options.visitor.as_ref(),
125
+ };
126
+
86
127
  for child_handle in dom.children() {
87
- walk_plain(
88
- child_handle,
89
- parser,
90
- &mut buf,
91
- options,
92
- false,
93
- &mut list_ctx,
94
- &excluded_node_ids,
95
- );
128
+ walk_plain(child_handle, parser, &mut buf, false, &mut list_ctx, &state);
96
129
  }
97
130
 
98
131
  post_process(&mut buf);
@@ -104,10 +137,9 @@ fn walk_plain(
104
137
  node_handle: &tl::NodeHandle,
105
138
  parser: &tl::Parser,
106
139
  buf: &mut String,
107
- options: &ConversionOptions,
108
140
  in_pre: bool,
109
141
  list_ctx: &mut ListContext,
110
- excluded_node_ids: &HashSet<u32>,
142
+ state: &WalkState<'_>,
111
143
  ) {
112
144
  let Some(node) = node_handle.get(parser) else {
113
145
  return;
@@ -117,6 +149,30 @@ fn walk_plain(
117
149
  tl::Node::Raw(bytes) => {
118
150
  let raw = bytes.as_utf8_str();
119
151
  let decoded = text::decode_html_entities_cow(raw.as_ref());
152
+
153
+ #[cfg(feature = "visitor")]
154
+ if let Some(visitor_handle) = state.visitor {
155
+ let text_str: &str = &decoded;
156
+ let node_ctx = NodeContext {
157
+ node_type: NodeType::Text,
158
+ tag_name: String::new(),
159
+ attributes: BTreeMap::new(),
160
+ depth: state.depth,
161
+ index_in_parent: 0,
162
+ parent_tag: None,
163
+ is_inline: true,
164
+ };
165
+ let result = visitor_handle.borrow_mut().visit_text(&node_ctx, text_str);
166
+ match result {
167
+ VisitResult::Skip => return,
168
+ VisitResult::Custom(custom) => {
169
+ buf.push_str(&custom);
170
+ return;
171
+ }
172
+ _ => {}
173
+ }
174
+ }
175
+
120
176
  if in_pre {
121
177
  buf.push_str(&decoded);
122
178
  } else {
@@ -132,7 +188,7 @@ fn walk_plain(
132
188
  }
133
189
  tl::Node::Tag(tag) => {
134
190
  // Drop elements matching exclude_selectors, including all their descendants.
135
- if !excluded_node_ids.is_empty() && excluded_node_ids.contains(&node_handle.get_inner()) {
191
+ if !state.excluded_node_ids.is_empty() && state.excluded_node_ids.contains(&node_handle.get_inner()) {
136
192
  return;
137
193
  }
138
194
 
@@ -146,10 +202,55 @@ fn walk_plain(
146
202
 
147
203
  // Apply preprocessing: drop nav/footer/aside/noise elements
148
204
  // (shared logic with the markdown path).
149
- if should_drop_for_preprocessing(tag_str, tag, options) {
205
+ if should_drop_for_preprocessing(tag_str, tag, state.options) {
150
206
  return;
151
207
  }
152
208
 
209
+ // --- visitor: element start ---
210
+ #[cfg(feature = "visitor")]
211
+ if let Some(visitor_handle) = state.visitor {
212
+ let attributes = collect_tag_attributes(tag);
213
+ let node_ctx = NodeContext {
214
+ node_type: NodeType::Element,
215
+ tag_name: tag_str.to_string(),
216
+ attributes,
217
+ depth: state.depth,
218
+ index_in_parent: 0,
219
+ parent_tag: None,
220
+ is_inline: !is_block_level_element(tag_str),
221
+ };
222
+ let result = visitor_handle.borrow_mut().visit_element_start(&node_ctx);
223
+ match result {
224
+ VisitResult::Skip => return,
225
+ VisitResult::Custom(custom) => {
226
+ buf.push_str(&custom);
227
+ // Still call visit_element_end with the custom content as context.
228
+ let end_result = visitor_handle.borrow_mut().visit_element_end(&node_ctx, &custom);
229
+ match end_result {
230
+ VisitResult::Custom(replacement) => {
231
+ let trim_len = buf.len() - custom.len();
232
+ buf.truncate(trim_len);
233
+ buf.push_str(&replacement);
234
+ }
235
+ VisitResult::Skip => {
236
+ let trim_len = buf.len() - custom.len();
237
+ buf.truncate(trim_len);
238
+ }
239
+ _ => {}
240
+ }
241
+ return;
242
+ }
243
+ _ => {}
244
+ }
245
+ }
246
+
247
+ // Record the buf position before this element's content so visit_element_end
248
+ // can truncate back to it for Custom/Skip results.
249
+ #[cfg(feature = "visitor")]
250
+ let element_output_start = buf.len();
251
+
252
+ let child_state = state.descend();
253
+
153
254
  match tag_str {
154
255
  "br" => {
155
256
  buf.push('\n');
@@ -159,11 +260,11 @@ fn walk_plain(
159
260
  }
160
261
  "pre" => {
161
262
  ensure_blank_line(buf);
162
- walk_children(tag, parser, buf, options, true, list_ctx, excluded_node_ids);
263
+ walk_children(tag, parser, buf, true, list_ctx, &child_state);
163
264
  ensure_blank_line(buf);
164
265
  }
165
266
  "img" => {
166
- if !options.skip_images {
267
+ if !state.options.skip_images {
167
268
  if let Some(Some(alt)) = tag.attributes().get("alt") {
168
269
  let alt_text = alt.as_utf8_str();
169
270
  if !alt_text.is_empty() {
@@ -174,13 +275,13 @@ fn walk_plain(
174
275
  }
175
276
  "table" => {
176
277
  ensure_blank_line(buf);
177
- walk_table(tag, parser, buf, options, excluded_node_ids);
278
+ walk_table(tag, parser, buf, &child_state);
178
279
  ensure_blank_line(buf);
179
280
  }
180
281
  "ul" => {
181
282
  ensure_newline(buf);
182
283
  let mut child_ctx = ListContext::Unordered;
183
- walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
284
+ walk_children(tag, parser, buf, false, &mut child_ctx, &child_state);
184
285
  ensure_newline(buf);
185
286
  }
186
287
  "ol" => {
@@ -192,7 +293,7 @@ fn walk_plain(
192
293
  .unwrap_or(1);
193
294
  ensure_newline(buf);
194
295
  let mut child_ctx = ListContext::Ordered { next_index: start };
195
- walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
296
+ walk_children(tag, parser, buf, false, &mut child_ctx, &child_state);
196
297
  ensure_newline(buf);
197
298
  }
198
299
  "li" => {
@@ -210,17 +311,48 @@ fn walk_plain(
210
311
  buf.push_str("- ");
211
312
  }
212
313
  }
213
- walk_children(tag, parser, buf, options, false, list_ctx, excluded_node_ids);
314
+ walk_children(tag, parser, buf, false, list_ctx, &child_state);
214
315
  ensure_newline(buf);
215
316
  }
216
317
  _ if BLOCK_TAGS.contains(&tag_str) => {
217
318
  ensure_blank_line(buf);
218
- walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
319
+ walk_children(tag, parser, buf, in_pre, list_ctx, &child_state);
219
320
  ensure_blank_line(buf);
220
321
  }
221
322
  _ => {
222
323
  // Inline elements and structural containers (html, body, etc.)
223
- walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
324
+ walk_children(tag, parser, buf, in_pre, list_ctx, &child_state);
325
+ }
326
+ }
327
+
328
+ // --- visitor: element end ---
329
+ #[cfg(feature = "visitor")]
330
+ if let Some(visitor_handle) = state.visitor {
331
+ let attributes = collect_tag_attributes(tag);
332
+ let node_ctx = NodeContext {
333
+ node_type: NodeType::Element,
334
+ tag_name: tag_str.to_string(),
335
+ attributes,
336
+ depth: state.depth,
337
+ index_in_parent: 0,
338
+ parent_tag: None,
339
+ is_inline: !is_block_level_element(tag_str),
340
+ };
341
+ // Clamp safe_start in case children truncated the buffer.
342
+ let safe_start = element_output_start.min(buf.len());
343
+ let element_content = &buf[safe_start..];
344
+ let result = visitor_handle
345
+ .borrow_mut()
346
+ .visit_element_end(&node_ctx, element_content);
347
+ match result {
348
+ VisitResult::Custom(custom) => {
349
+ buf.truncate(safe_start);
350
+ buf.push_str(&custom);
351
+ }
352
+ VisitResult::Skip => {
353
+ buf.truncate(safe_start);
354
+ }
355
+ _ => {}
224
356
  }
225
357
  }
226
358
  }
@@ -233,26 +365,19 @@ fn walk_children(
233
365
  tag: &tl::HTMLTag,
234
366
  parser: &tl::Parser,
235
367
  buf: &mut String,
236
- options: &ConversionOptions,
237
368
  in_pre: bool,
238
369
  list_ctx: &mut ListContext,
239
- excluded_node_ids: &HashSet<u32>,
370
+ state: &WalkState<'_>,
240
371
  ) {
241
372
  let children = tag.children();
242
373
  let top = children.top();
243
374
  for child in top.iter() {
244
- walk_plain(child, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
375
+ walk_plain(child, parser, buf, in_pre, list_ctx, state);
245
376
  }
246
377
  }
247
378
 
248
379
  /// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
249
- fn walk_table(
250
- table_tag: &tl::HTMLTag,
251
- parser: &tl::Parser,
252
- buf: &mut String,
253
- options: &ConversionOptions,
254
- excluded_node_ids: &HashSet<u32>,
255
- ) {
380
+ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, state: &WalkState<'_>) {
256
381
  // Collect all <tr> node handles by recursing into the table
257
382
  let mut row_handles = Vec::new();
258
383
  collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
@@ -278,6 +403,7 @@ fn walk_table(
278
403
  }
279
404
  }
280
405
 
406
+ let cell_state = state.descend();
281
407
  for (cell_idx, cell_handle) in cell_handles.iter().enumerate() {
282
408
  if cell_idx > 0 {
283
409
  buf.push('\t');
@@ -285,15 +411,7 @@ fn walk_table(
285
411
  let mut cell_buf = String::new();
286
412
  if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
287
413
  let mut cell_list_ctx = ListContext::None;
288
- walk_children(
289
- cell_tag,
290
- parser,
291
- &mut cell_buf,
292
- options,
293
- false,
294
- &mut cell_list_ctx,
295
- excluded_node_ids,
296
- );
414
+ walk_children(cell_tag, parser, &mut cell_buf, false, &mut cell_list_ctx, &cell_state);
297
415
  }
298
416
  buf.push_str(cell_buf.trim());
299
417
  }
@@ -18,48 +18,61 @@ pub fn inline_ancestor_allows_block(tag_name: &str) -> bool {
18
18
  ///
19
19
  /// Excludes elements inside `<pre>` or `<code>` blocks, as they have special
20
20
  /// whitespace preservation rules and should not be repaired.
21
+ ///
22
+ /// Also detects table structural elements (`td`, `tr`, `th`) nested under `<p>` —
23
+ /// a structural impossibility in valid HTML that signals the `tl` parser absorbed
24
+ /// a table into a paragraph because of an unclosed `<p>` (common in Word/Outlook
25
+ /// HTML such as `<p class='MsoNormal'>` cells). Issue #336.
21
26
  pub fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
22
27
  for handle in dom_ctx.node_map.iter().flatten() {
23
28
  if let Some(tl::Node::Tag(_tag)) = handle.get(parser) {
24
- let is_block = dom_ctx
25
- .tag_info(handle.get_inner(), parser)
26
- .map(|info| info.is_block)
27
- .unwrap_or(false);
28
- if is_block {
29
- // Check if this block element or any ancestor is pre/code
30
- let mut check_parent = Some(handle.get_inner());
31
- let mut inside_preformatted = false;
32
- while let Some(node_id) = check_parent {
33
- if let Some(info) = dom_ctx.tag_info(node_id, parser) {
34
- if matches!(info.name.as_str(), "pre" | "code") {
35
- inside_preformatted = true;
36
- break;
37
- }
29
+ let node_id = handle.get_inner();
30
+ let Some(info) = dom_ctx.tag_info(node_id, parser) else {
31
+ continue;
32
+ };
33
+
34
+ // Table elements under <p>: tl misparsed an unclosed <p> in <td>.
35
+ if matches!(info.name.as_str(), "td" | "tr" | "th") && has_p_ancestor(dom_ctx, parser, node_id) {
36
+ return true;
37
+ }
38
+
39
+ if !info.is_block {
40
+ continue;
41
+ }
42
+
43
+ // Check if this block element or any ancestor is pre/code
44
+ let mut check_parent = Some(node_id);
45
+ let mut inside_preformatted = false;
46
+ while let Some(check_id) = check_parent {
47
+ if let Some(info) = dom_ctx.tag_info(check_id, parser) {
48
+ if matches!(info.name.as_str(), "pre" | "code") {
49
+ inside_preformatted = true;
50
+ break;
38
51
  }
39
- check_parent = dom_ctx.parent_of(node_id);
40
52
  }
53
+ check_parent = dom_ctx.parent_of(check_id);
54
+ }
41
55
 
42
- // Skip misnesting check for elements inside pre/code blocks
43
- if inside_preformatted {
44
- continue;
45
- }
56
+ // Skip misnesting check for elements inside pre/code blocks
57
+ if inside_preformatted {
58
+ continue;
59
+ }
46
60
 
47
- let mut current = dom_ctx.parent_of(handle.get_inner());
48
- while let Some(parent_id) = current {
49
- if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {
50
- if is_inline_element(&parent_info.name) && !inline_ancestor_allows_block(&parent_info.name) {
61
+ let mut current = dom_ctx.parent_of(node_id);
62
+ while let Some(parent_id) = current {
63
+ if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {
64
+ if is_inline_element(&parent_info.name) && !inline_ancestor_allows_block(&parent_info.name) {
65
+ return true;
66
+ }
67
+ } else if let Some(parent_handle) = dom_ctx.node_handle(parent_id) {
68
+ if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
69
+ let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
70
+ if is_inline_element(&parent_name) && !inline_ancestor_allows_block(&parent_name) {
51
71
  return true;
52
72
  }
53
- } else if let Some(parent_handle) = dom_ctx.node_handle(parent_id) {
54
- if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
55
- let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
56
- if is_inline_element(&parent_name) && !inline_ancestor_allows_block(&parent_name) {
57
- return true;
58
- }
59
- }
60
73
  }
61
- current = dom_ctx.parent_of(parent_id);
62
74
  }
75
+ current = dom_ctx.parent_of(parent_id);
63
76
  }
64
77
  }
65
78
  }
@@ -67,6 +80,26 @@ pub fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bo
67
80
  false
68
81
  }
69
82
 
83
+ /// Walk ancestors of `node_id` looking for a `<p>` element.
84
+ ///
85
+ /// Stops ascending once it leaves the table hierarchy (`table`/`body`/`html`)
86
+ /// to avoid false positives where a `<p>` legitimately wraps a `<table>`.
87
+ fn has_p_ancestor(dom_ctx: &DomContext, parser: &tl::Parser, node_id: u32) -> bool {
88
+ let mut current = dom_ctx.parent_of(node_id);
89
+ while let Some(parent_id) = current {
90
+ if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {
91
+ if parent_info.name == "p" {
92
+ return true;
93
+ }
94
+ if matches!(parent_info.name.as_str(), "table" | "body" | "html") {
95
+ return false;
96
+ }
97
+ }
98
+ current = dom_ctx.parent_of(parent_id);
99
+ }
100
+ false
101
+ }
102
+
70
103
  /// Determine if a node should be dropped during preprocessing.
71
104
  ///
72
105
  /// Behavior depends on the [`PreprocessingPreset`]:
@@ -114,25 +114,19 @@ pub fn process_text_node(
114
114
  let processed_text = if ctx.in_code || ctx.in_ruby {
115
115
  text.into_owned()
116
116
  } else if ctx.in_table_cell {
117
+ // Always escape * and _ in table cells to prevent unintended emphasis.
117
118
  let escaped = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
118
119
  let normalized_text = text::normalize_whitespace_cow(text.as_ref());
119
120
  let escaped_result = text::escape(
120
121
  normalized_text.as_ref(),
121
122
  options.escape_misc,
122
- options.escape_asterisks,
123
- options.escape_underscores,
123
+ true,
124
+ true,
124
125
  options.escape_ascii,
125
126
  );
126
127
  escaped_result.into_owned()
127
128
  } else {
128
- text::escape(
129
- text.as_ref(),
130
- options.escape_misc,
131
- options.escape_asterisks,
132
- options.escape_underscores,
133
- options.escape_ascii,
134
- )
135
- .into_owned()
129
+ text::escape(text.as_ref(), options.escape_misc, true, true, options.escape_ascii).into_owned()
136
130
  };
137
131
  if options.escape_misc {
138
132
  escaped
@@ -323,6 +323,96 @@ pub fn normalize_bogus_comment_endings(input: &str) -> Cow<'_, str> {
323
323
  }
324
324
  }
325
325
 
326
/// Normalize closing tags whose `>` appears on a subsequent line.
///
/// Some HTML formatters (JSX-style) write closing tags as:
///
/// ```html
/// </a
/// >
/// ```
///
/// The `tl` parser does not handle end-tags with a line break before the closing
/// `>`, leaving the element unclosed so all subsequent siblings become children
/// of the open element. This pass collapses such patterns to a single-line
/// closing tag (`</a>`) before the document reaches `tl`.
///
/// A closing tag is rewritten only when the whitespace between the tag name and
/// the `>` contains a line break (`\n` or `\r`). Closing tags with plain
/// spaces or tabs (`</a >`) are left as they are, and so is everything else in
/// the document.
///
/// Returns `Cow::Borrowed(input)` when nothing had to be rewritten, and an
/// owned, rewritten copy otherwise.
pub fn normalize_split_closing_tags(input: &str) -> Cow<'_, str> {
    let bytes = input.as_bytes();
    let len = bytes.len();

    // Fast path: the shortest rewritable input is `</a`, one line break and `>`
    // (5 bytes), and a rewrite needs at least one line-break byte. The scan
    // below accepts both `\n` and `\r`, so the fast path must look for either;
    // testing for `\n` alone would leave CR-only input unnormalised.
    if len < 5 || !bytes.iter().any(|&b| matches!(b, b'\n' | b'\r')) {
        return Cow::Borrowed(input);
    }

    let mut idx = 0;
    // Start of the input region not yet copied into `output`.
    let mut last = 0;
    // Allocated lazily on the first rewrite so unchanged input stays borrowed.
    let mut output: Option<String> = None;

    while idx + 2 < len {
        // Look for `</`.
        if bytes[idx] != b'<' || bytes[idx + 1] != b'/' {
            idx += 1;
            continue;
        }

        // Tag name: ASCII letters, digits and hyphens (custom elements use hyphens).
        let name_start = idx + 2;
        let name_len = bytes[name_start..]
            .iter()
            .take_while(|b| b.is_ascii_alphanumeric() || **b == b'-')
            .count();
        if name_len == 0 {
            // No tag name, so this is not a closing tag we care about.
            idx += 1;
            continue;
        }
        let name_end = name_start + name_len;

        // Whitespace run between the tag name and whatever follows it.
        let mut ws_end = name_end;
        let mut has_line_break = false;
        while ws_end < len && bytes[ws_end].is_ascii_whitespace() {
            has_line_break |= matches!(bytes[ws_end], b'\n' | b'\r');
            ws_end += 1;
        }

        if !has_line_break || ws_end >= len || bytes[ws_end] != b'>' {
            // Either no line break in the whitespace, or `>` does not follow it.
            idx += 1;
            continue;
        }

        // `</name` + whitespace with a line break + `>`: copy everything up to
        // the end of the tag name (all cut points are ASCII bytes, hence valid
        // char boundaries), then close the tag immediately.
        let out = output.get_or_insert_with(|| String::with_capacity(len));
        out.push_str(&input[last..name_end]);
        out.push('>');

        idx = ws_end + 1; // continue after the original `>`
        last = idx;
    }

    match output {
        Some(mut out) => {
            out.push_str(&input[last..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
415
+
326
416
  /// Preprocess HTML to normalize tags and fix common issues.
327
417
  pub fn preprocess_html(input: &str) -> Cow<'_, str> {
328
418
  const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
@@ -788,7 +878,7 @@ fn tag_has_hidden_attribute(tag: &str) -> bool {
788
878
 
789
879
  #[cfg(test)]
790
880
  mod tests {
791
- use super::{normalize_bogus_comment_endings, sanitize_markdown_url};
881
+ use super::{normalize_bogus_comment_endings, normalize_split_closing_tags, sanitize_markdown_url};
792
882
 
793
883
  // ── normalize_bogus_comment_endings ───────────────────────────────────────
794
884
 
@@ -841,6 +931,52 @@ mod tests {
841
931
  assert_eq!(result.as_ref(), "");
842
932
  }
843
933
 
934
+ // ── normalize_split_closing_tags ──────────────────────────────────────────
935
+
936
#[test]
fn normalize_split_closing_tags_collapses_newline_before_close_bracket() {
    // The `>` sits directly at the start of the next line.
    let normalized = normalize_split_closing_tags("<a href=\"#x\">text</a\n>");
    assert_eq!(&*normalized, "<a href=\"#x\">text</a>");
}

#[test]
fn normalize_split_closing_tags_collapses_indented_newline_before_close_bracket() {
    // The `>` on the next line is preceded by indentation.
    let normalized = normalize_split_closing_tags("<a href=\"#x\">text</a\n >");
    assert_eq!(&*normalized, "<a href=\"#x\">text</a>");
}

#[test]
fn normalize_split_closing_tags_leaves_well_formed_closing_tags_unchanged() {
    let html = "<a href=\"#x\">text</a>";
    assert_eq!(&*normalize_split_closing_tags(html), html);
}

#[test]
fn normalize_split_closing_tags_handles_multiple_split_closing_tags() {
    // Both closing tags are rewritten; the newline between the links survives.
    let normalized = normalize_split_closing_tags("<li><a href=\"#a\">A</a\n >\n<a href=\"#b\">B</a\n>");
    assert_eq!(&*normalized, "<li><a href=\"#a\">A</a>\n<a href=\"#b\">B</a>");
}

#[test]
fn normalize_split_closing_tags_does_not_collapse_inline_whitespace() {
    // A plain space before `>` is valid HTML that tl handles fine, so only
    // line breaks may trigger the rewrite; this input must come back untouched.
    let html = "<a href=\"#x\">text</a >";
    assert_eq!(&*normalize_split_closing_tags(html), html);
}

#[test]
fn normalize_split_closing_tags_empty_input() {
    assert_eq!(&*normalize_split_closing_tags(""), "");
}
979
+
844
980
  // ── sanitize_markdown_url ─────────────────────────────────────────────────
845
981
 
846
982
  #[test]
@@ -198,9 +198,13 @@ fn test_strikethrough() {
198
198
  fn test_simple_table() {
199
199
  let html = "<table><tr><th>Header</th></tr><tr><td>Cell</td></tr></table>";
200
200
  let result = convert(html, None).unwrap();
201
- assert!(result.contains("| Header |"));
202
- assert!(result.contains("| --- |"));
203
- assert!(result.contains("| Cell |"));
201
+ assert!(result.contains("| Header |"), "header row missing: {result}");
202
+ // Separator uses at least as many dashes as the widest cell ("Header" = 6).
203
+ assert!(
204
+ result.lines().any(|l| l.starts_with("| ----")),
205
+ "separator row missing: {result}"
206
+ );
207
+ assert!(result.contains("| Cell"), "cell row missing: {result}");
204
208
  }
205
209
 
206
210
  #[test]
@@ -221,7 +225,10 @@ fn test_table_rowspan() {
221
225
  ..Default::default()
222
226
  };
223
227
  let result = convert(html, Some(options)).unwrap();
224
- let expected = "\n\n| Header 1 | Header 2 |\n| --- | --- |\n| Spanning cell | First row content<br>Second line |\n| | Next row<br>More content |\n";
228
+ // Columns are padded to the widest cell per column (rowspan accounted):
229
+ // col 0: max("Header 1"=8, "Spanning cell"=13, ""=0) = 13
230
+ // col 1: max("Header 2"=8, "First row content<br>Second line"=32, "Next row<br>More content"=24) = 32
231
+ let expected = "| Header 1 | Header 2 |\n| ------------- | -------------------------------- |\n| Spanning cell | First row content<br>Second line |\n| | Next row<br>More content |\n";
225
232
  assert_eq!(result, expected);
226
233
  }
227
234
 
@@ -534,7 +541,8 @@ fn test_ordered_list_with_heading_and_table() {
534
541
  ";
535
542
 
536
543
  let result = convert(html, None).unwrap();
537
- let expected = "1. ### h3\n2. *table*\n\n | blah |\n | --- |\n";
544
+ // Separator dashes match the column width ("blah" = 4 chars → 4 dashes).
545
+ let expected = "1. ### h3\n2. *table*\n\n | blah |\n | ---- |\n";
538
546
  assert_eq!(result, expected);
539
547
  }
540
548