RubyGems - html-to-markdown - Versions diffs - 3.4.0.pre.rc.45 → 3.4.0 - Mend

html-to-markdown 3.4.0.pre.rc.45 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

data/vendor/html-to-markdown-rs/src/converter/plain_text.rs CHANGED Viewed

@@ -11,6 +11,13 @@ use crate::converter::preprocessing_helpers::should_drop_for_preprocessing;
 use crate::options::ConversionOptions;
 use crate::text;
+#[cfg(feature = "visitor")]
+use crate::converter::utility::content::{collect_tag_attributes, is_block_level_element};
+#[cfg(feature = "visitor")]
+use crate::visitor::{NodeContext, NodeType, VisitResult, VisitorHandle};
+#[cfg(feature = "visitor")]
+use std::collections::BTreeMap;
 /// Tracks list context for proper marker emission on `<li>` elements.
 #[derive(Clone, Debug)]
 enum ListContext {
@@ -53,6 +60,30 @@ const BLOCK_TAGS: &[&str] = &[
     "search",
 ];
+/// Read-only context handed to every recursive step of the plain-text walker.
+///
+/// Bundles the conversion options, the set of excluded node ids, the current
+/// nesting depth and, only when the `visitor` feature is enabled, the optional
+/// visitor handle. Passing a single struct keeps call sites free of
+/// feature-gated function parameters.
+struct WalkState<'a> {
+    /// Conversion options in effect for this walk.
+    options: &'a ConversionOptions,
+    /// Ids of nodes that are dropped together with all their descendants.
+    excluded_node_ids: &'a HashSet<u32>,
+    /// Nesting depth of the node being processed; the walk starts at 0.
+    depth: usize,
+    /// Visitor to notify, if one was configured.
+    #[cfg(feature = "visitor")]
+    visitor: Option<&'a VisitorHandle>,
+}
+impl WalkState<'_> {
+    /// Returns the state for one level deeper: every field is carried over
+    /// unchanged except `depth`, which is incremented by one.
+    fn descend(&self) -> Self {
+        let depth = self.depth + 1;
+        Self {
+            depth,
+            options: self.options,
+            excluded_node_ids: self.excluded_node_ids,
+            #[cfg(feature = "visitor")]
+            visitor: self.visitor,
+        }
+    }
+}
 /// Extract plain text from a parsed DOM tree.
 ///
 /// Walks the tree collecting visible text with structural whitespace:
@@ -64,6 +95,8 @@ const BLOCK_TAGS: &[&str] = &[
 /// - Tables: cells separated by tab, rows by newline
 /// - Inline elements are recursed without markers
 /// - Nodes matching `excluded_node_ids` (from `exclude_selectors`) are dropped entirely
+/// - When a visitor is configured, `visit_element_start`, `visit_element_end`, and
+///   `visit_text` callbacks are fired and their results are honoured.
 pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
     let mut buf = String::with_capacity(1024);
     let mut list_ctx = ListContext::None;
@@ -83,16 +116,16 @@ pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &Convers
         ids
     };
+    let state = WalkState {
+        options,
+        excluded_node_ids: &excluded_node_ids,
+        depth: 0,
+        #[cfg(feature = "visitor")]
+        visitor: options.visitor.as_ref(),
+    };
     for child_handle in dom.children() {
-        walk_plain(
-            child_handle,
-            parser,
-            &mut buf,
-            options,
-            false,
-            &mut list_ctx,
-            &excluded_node_ids,
-        );
+        walk_plain(child_handle, parser, &mut buf, false, &mut list_ctx, &state);
     }
     post_process(&mut buf);
@@ -104,10 +137,9 @@ fn walk_plain(
     node_handle: &tl::NodeHandle,
     parser: &tl::Parser,
     buf: &mut String,
-    options: &ConversionOptions,
     in_pre: bool,
     list_ctx: &mut ListContext,
-    excluded_node_ids: &HashSet<u32>,
+    state: &WalkState<'_>,
 ) {
     let Some(node) = node_handle.get(parser) else {
         return;
@@ -117,6 +149,30 @@ fn walk_plain(
         tl::Node::Raw(bytes) => {
             let raw = bytes.as_utf8_str();
             let decoded = text::decode_html_entities_cow(raw.as_ref());
+            #[cfg(feature = "visitor")]
+            if let Some(visitor_handle) = state.visitor {
+                let text_str: &str = &decoded;
+                let node_ctx = NodeContext {
+                    node_type: NodeType::Text,
+                    tag_name: String::new(),
+                    attributes: BTreeMap::new(),
+                    depth: state.depth,
+                    index_in_parent: 0,
+                    parent_tag: None,
+                    is_inline: true,
+                };
+                let result = visitor_handle.borrow_mut().visit_text(&node_ctx, text_str);
+                match result {
+                    VisitResult::Skip => return,
+                    VisitResult::Custom(custom) => {
+                        buf.push_str(&custom);
+                        return;
+                    }
+                    _ => {}
+                }
+            }
             if in_pre {
                 buf.push_str(&decoded);
             } else {
@@ -132,7 +188,7 @@ fn walk_plain(
         }
         tl::Node::Tag(tag) => {
             // Drop elements matching exclude_selectors, including all their descendants.
-            if !excluded_node_ids.is_empty() && excluded_node_ids.contains(&node_handle.get_inner()) {
+            if !state.excluded_node_ids.is_empty() && state.excluded_node_ids.contains(&node_handle.get_inner()) {
                 return;
             }
@@ -146,10 +202,55 @@ fn walk_plain(
             // Apply preprocessing: drop nav/footer/aside/noise elements
             // (shared logic with the markdown path).
-            if should_drop_for_preprocessing(tag_str, tag, options) {
+            if should_drop_for_preprocessing(tag_str, tag, state.options) {
                 return;
             }
+            // --- visitor: element start ---
+            #[cfg(feature = "visitor")]
+            if let Some(visitor_handle) = state.visitor {
+                let attributes = collect_tag_attributes(tag);
+                let node_ctx = NodeContext {
+                    node_type: NodeType::Element,
+                    tag_name: tag_str.to_string(),
+                    attributes,
+                    depth: state.depth,
+                    index_in_parent: 0,
+                    parent_tag: None,
+                    is_inline: !is_block_level_element(tag_str),
+                };
+                let result = visitor_handle.borrow_mut().visit_element_start(&node_ctx);
+                match result {
+                    VisitResult::Skip => return,
+                    VisitResult::Custom(custom) => {
+                        buf.push_str(&custom);
+                        // Still call visit_element_end with the custom content as context.
+                        let end_result = visitor_handle.borrow_mut().visit_element_end(&node_ctx, &custom);
+                        match end_result {
+                            VisitResult::Custom(replacement) => {
+                                let trim_len = buf.len() - custom.len();
+                                buf.truncate(trim_len);
+                                buf.push_str(&replacement);
+                            }
+                            VisitResult::Skip => {
+                                let trim_len = buf.len() - custom.len();
+                                buf.truncate(trim_len);
+                            }
+                            _ => {}
+                        }
+                        return;
+                    }
+                    _ => {}
+                }
+            }
+            // Record the buf position before this element's content so visit_element_end
+            // can truncate back to it for Custom/Skip results.
+            #[cfg(feature = "visitor")]
+            let element_output_start = buf.len();
+            let child_state = state.descend();
             match tag_str {
                 "br" => {
                     buf.push('\n');
@@ -159,11 +260,11 @@ fn walk_plain(
                 }
                 "pre" => {
                     ensure_blank_line(buf);
-                    walk_children(tag, parser, buf, options, true, list_ctx, excluded_node_ids);
+                    walk_children(tag, parser, buf, true, list_ctx, &child_state);
                     ensure_blank_line(buf);
                 }
                 "img" => {
-                    if !options.skip_images {
+                    if !state.options.skip_images {
                         if let Some(Some(alt)) = tag.attributes().get("alt") {
                             let alt_text = alt.as_utf8_str();
                             if !alt_text.is_empty() {
@@ -174,13 +275,13 @@ fn walk_plain(
                 }
                 "table" => {
                     ensure_blank_line(buf);
-                    walk_table(tag, parser, buf, options, excluded_node_ids);
+                    walk_table(tag, parser, buf, &child_state);
                     ensure_blank_line(buf);
                 }
                 "ul" => {
                     ensure_newline(buf);
                     let mut child_ctx = ListContext::Unordered;
-                    walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
+                    walk_children(tag, parser, buf, false, &mut child_ctx, &child_state);
                     ensure_newline(buf);
                 }
                 "ol" => {
@@ -192,7 +293,7 @@ fn walk_plain(
                         .unwrap_or(1);
                     ensure_newline(buf);
                     let mut child_ctx = ListContext::Ordered { next_index: start };
-                    walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
+                    walk_children(tag, parser, buf, false, &mut child_ctx, &child_state);
                     ensure_newline(buf);
                 }
                 "li" => {
@@ -210,17 +311,48 @@ fn walk_plain(
                             buf.push_str("- ");
                         }
                     }
-                    walk_children(tag, parser, buf, options, false, list_ctx, excluded_node_ids);
+                    walk_children(tag, parser, buf, false, list_ctx, &child_state);
                     ensure_newline(buf);
                 }
                 _ if BLOCK_TAGS.contains(&tag_str) => {
                     ensure_blank_line(buf);
-                    walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
+                    walk_children(tag, parser, buf, in_pre, list_ctx, &child_state);
                     ensure_blank_line(buf);
                 }
                 _ => {
                     // Inline elements and structural containers (html, body, etc.)
-                    walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
+                    walk_children(tag, parser, buf, in_pre, list_ctx, &child_state);
+                }
+            }
+            // --- visitor: element end ---
+            #[cfg(feature = "visitor")]
+            if let Some(visitor_handle) = state.visitor {
+                let attributes = collect_tag_attributes(tag);
+                let node_ctx = NodeContext {
+                    node_type: NodeType::Element,
+                    tag_name: tag_str.to_string(),
+                    attributes,
+                    depth: state.depth,
+                    index_in_parent: 0,
+                    parent_tag: None,
+                    is_inline: !is_block_level_element(tag_str),
+                };
+                // Clamp safe_start in case children truncated the buffer.
+                let safe_start = element_output_start.min(buf.len());
+                let element_content = &buf[safe_start..];
+                let result = visitor_handle
+                    .borrow_mut()
+                    .visit_element_end(&node_ctx, element_content);
+                match result {
+                    VisitResult::Custom(custom) => {
+                        buf.truncate(safe_start);
+                        buf.push_str(&custom);
+                    }
+                    VisitResult::Skip => {
+                        buf.truncate(safe_start);
+                    }
+                    _ => {}
                 }
             }
         }
@@ -233,26 +365,19 @@ fn walk_children(
     tag: &tl::HTMLTag,
     parser: &tl::Parser,
     buf: &mut String,
-    options: &ConversionOptions,
     in_pre: bool,
     list_ctx: &mut ListContext,
-    excluded_node_ids: &HashSet<u32>,
+    state: &WalkState<'_>,
 ) {
     let children = tag.children();
     let top = children.top();
     for child in top.iter() {
-        walk_plain(child, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
+        walk_plain(child, parser, buf, in_pre, list_ctx, state);
     }
 }
 /// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
-fn walk_table(
-    table_tag: &tl::HTMLTag,
-    parser: &tl::Parser,
-    buf: &mut String,
-    options: &ConversionOptions,
-    excluded_node_ids: &HashSet<u32>,
-) {
+fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, state: &WalkState<'_>) {
     // Collect all <tr> node handles by recursing into the table
     let mut row_handles = Vec::new();
     collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
@@ -278,6 +403,7 @@ fn walk_table(
             }
         }
+        let cell_state = state.descend();
         for (cell_idx, cell_handle) in cell_handles.iter().enumerate() {
             if cell_idx > 0 {
                 buf.push('\t');
@@ -285,15 +411,7 @@ fn walk_table(
             let mut cell_buf = String::new();
             if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
                 let mut cell_list_ctx = ListContext::None;
-                walk_children(
-                    cell_tag,
-                    parser,
-                    &mut cell_buf,
-                    options,
-                    false,
-                    &mut cell_list_ctx,
-                    excluded_node_ids,
-                );
+                walk_children(cell_tag, parser, &mut cell_buf, false, &mut cell_list_ctx, &cell_state);
             }
             buf.push_str(cell_buf.trim());
         }

data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs CHANGED Viewed

@@ -18,48 +18,61 @@ pub fn inline_ancestor_allows_block(tag_name: &str) -> bool {
 ///
 /// Excludes elements inside `<pre>` or `<code>` blocks, as they have special
 /// whitespace preservation rules and should not be repaired.
+///
+/// Also detects table structural elements (`td`, `tr`, `th`) nested under `<p>` —
+/// a structural impossibility in valid HTML that signals the `tl` parser absorbed
+/// a table into a paragraph because of an unclosed `<p>` (common in Word/Outlook
+/// HTML such as `<p class='MsoNormal'>` cells). Issue #336.
 pub fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
     for handle in dom_ctx.node_map.iter().flatten() {
         if let Some(tl::Node::Tag(_tag)) = handle.get(parser) {
-            let is_block = dom_ctx
-                .tag_info(handle.get_inner(), parser)
-                .map(|info| info.is_block)
-                .unwrap_or(false);
-            if is_block {
-                // Check if this block element or any ancestor is pre/code
-                let mut check_parent = Some(handle.get_inner());
-                let mut inside_preformatted = false;
-                while let Some(node_id) = check_parent {
-                    if let Some(info) = dom_ctx.tag_info(node_id, parser) {
-                        if matches!(info.name.as_str(), "pre" | "code") {
-                            inside_preformatted = true;
-                            break;
-                        }
+            let node_id = handle.get_inner();
+            let Some(info) = dom_ctx.tag_info(node_id, parser) else {
+                continue;
+            };
+            // Table elements under <p>: tl misparsed an unclosed <p> in <td>.
+            if matches!(info.name.as_str(), "td" | "tr" | "th") && has_p_ancestor(dom_ctx, parser, node_id) {
+                return true;
+            }
+            if !info.is_block {
+                continue;
+            }
+            // Check if this block element or any ancestor is pre/code
+            let mut check_parent = Some(node_id);
+            let mut inside_preformatted = false;
+            while let Some(check_id) = check_parent {
+                if let Some(info) = dom_ctx.tag_info(check_id, parser) {
+                    if matches!(info.name.as_str(), "pre" | "code") {
+                        inside_preformatted = true;
+                        break;
                     }
-                    check_parent = dom_ctx.parent_of(node_id);
                 }
+                check_parent = dom_ctx.parent_of(check_id);
+            }
-                // Skip misnesting check for elements inside pre/code blocks
-                if inside_preformatted {
-                    continue;
-                }
+            // Skip misnesting check for elements inside pre/code blocks
+            if inside_preformatted {
+                continue;
+            }
-                let mut current = dom_ctx.parent_of(handle.get_inner());
-                while let Some(parent_id) = current {
-                    if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {
-                        if is_inline_element(&parent_info.name) && !inline_ancestor_allows_block(&parent_info.name) {
+            let mut current = dom_ctx.parent_of(node_id);
+            while let Some(parent_id) = current {
+                if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {
+                    if is_inline_element(&parent_info.name) && !inline_ancestor_allows_block(&parent_info.name) {
+                        return true;
+                    }
+                } else if let Some(parent_handle) = dom_ctx.node_handle(parent_id) {
+                    if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
+                        let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
+                        if is_inline_element(&parent_name) && !inline_ancestor_allows_block(&parent_name) {
                             return true;
                         }
-                    } else if let Some(parent_handle) = dom_ctx.node_handle(parent_id) {
-                        if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
-                            let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
-                            if is_inline_element(&parent_name) && !inline_ancestor_allows_block(&parent_name) {
-                                return true;
-                            }
-                        }
                     }
-                    current = dom_ctx.parent_of(parent_id);
                 }
+                current = dom_ctx.parent_of(parent_id);
             }
         }
     }
@@ -67,6 +80,26 @@ pub fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bo
     false
 }
+/// Report whether a `<p>` element is found among the ancestors of `node_id`.
+///
+/// The search starts at the parent of `node_id` and climbs one parent at a
+/// time. It answers `true` on the first `p` ancestor and gives up with `false`
+/// as soon as a `table`, `body` or `html` ancestor is reached, so that a `<p>`
+/// which legitimately wraps a whole `<table>` is not reported. Ancestors for
+/// which no tag info is available are stepped over. Running out of ancestors
+/// also yields `false`.
+fn has_p_ancestor(dom_ctx: &DomContext, parser: &tl::Parser, node_id: u32) -> bool {
+    let mut cursor = dom_ctx.parent_of(node_id);
+    loop {
+        let Some(ancestor_id) = cursor else {
+            return false;
+        };
+        if let Some(ancestor_info) = dom_ctx.tag_info(ancestor_id, parser) {
+            match ancestor_info.name.as_str() {
+                "p" => return true,
+                // Boundary of the table hierarchy: stop looking further up.
+                "table" | "body" | "html" => return false,
+                _ => {}
+            }
+        }
+        cursor = dom_ctx.parent_of(ancestor_id);
+    }
+}
 /// Determine if a node should be dropped during preprocessing.
 ///
 /// Behavior depends on the [`PreprocessingPreset`]:

data/vendor/html-to-markdown-rs/src/converter/text_node.rs CHANGED Viewed

@@ -114,25 +114,19 @@ pub fn process_text_node(
     let processed_text = if ctx.in_code || ctx.in_ruby {
         text.into_owned()
     } else if ctx.in_table_cell {
+        // Always escape * and _ in table cells to prevent unintended emphasis.
         let escaped = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
             let normalized_text = text::normalize_whitespace_cow(text.as_ref());
             let escaped_result = text::escape(
                 normalized_text.as_ref(),
                 options.escape_misc,
-                options.escape_asterisks,
-                options.escape_underscores,
+                true,
+                true,
                 options.escape_ascii,
             );
             escaped_result.into_owned()
         } else {
-            text::escape(
-                text.as_ref(),
-                options.escape_misc,
-                options.escape_asterisks,
-                options.escape_underscores,
-                options.escape_ascii,
-            )
-            .into_owned()
+            text::escape(text.as_ref(), options.escape_misc, true, true, options.escape_ascii).into_owned()
         };
         if options.escape_misc {
             escaped

data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs CHANGED Viewed

@@ -323,6 +323,96 @@ pub fn normalize_bogus_comment_endings(input: &str) -> Cow<'_, str> {
     }
 }
+/// Normalize closing tags whose `>` appears on a subsequent line.
+///
+/// Some HTML formatters (JSX-style) write closing tags as:
+///
+/// ```html
+/// </a
+/// >
+/// ```
+///
+/// The `tl` parser does not handle end-tags with a newline before the closing
+/// `>`, leaving the element unclosed so all subsequent siblings become children
+/// of the open element.  This pass collapses such patterns to a single-line
+/// closing tag (`</a>`) before the document reaches `tl`.
+///
+/// A line break is either `\n` or `\r`, so LF, CRLF and CR-only documents are
+/// all handled. Whitespace that contains no line break (`</a >`) is left alone.
+///
+/// Only the whitespace between the tag name and the closing `>` is normalised;
+/// the rest of the document is untouched.
+///
+/// Returns `Cow::Borrowed(input)` when nothing had to be rewritten, otherwise
+/// an owned copy with the split closing tags collapsed.
+pub fn normalize_split_closing_tags(input: &str) -> Cow<'_, str> {
+    let bytes = input.as_bytes();
+    let len = bytes.len();
+    // Fast path: a rewrite needs a line break somewhere in the input. Both
+    // `\n` and `\r` count, mirroring the whitespace scan below; checking only
+    // `\n` would skip CR-only documents.
+    if len < 4 || !bytes.iter().any(|&b| b == b'\n' || b == b'\r') {
+        return Cow::Borrowed(input);
+    }
+    let mut idx = 0;
+    // Start of the input region not yet copied into `output`.
+    let mut last = 0;
+    // Allocated lazily on the first rewrite so untouched input stays borrowed.
+    let mut output: Option<String> = None;
+    while idx + 2 < len {
+        // Look for `</`
+        if bytes[idx] != b'<' || bytes[idx + 1] != b'/' {
+            idx += 1;
+            continue;
+        }
+        // Scan tag name: ASCII letters, digits, hyphens (HTML5 allows hyphens in custom elements)
+        let name_start = idx + 2;
+        let mut name_end = name_start;
+        while name_end < len && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-') {
+            name_end += 1;
+        }
+        if name_end == name_start {
+            // No tag name — not a closing tag we care about.
+            idx += 1;
+            continue;
+        }
+        // After the tag name, skip any whitespace.  If there is a line break in
+        // that whitespace before the `>`, we need to rewrite.
+        let mut ws_end = name_end;
+        let mut has_newline = false;
+        while ws_end < len && bytes[ws_end].is_ascii_whitespace() {
+            has_newline |= matches!(bytes[ws_end], b'\n' | b'\r');
+            ws_end += 1;
+        }
+        if !has_newline || bytes.get(ws_end) != Some(&b'>') {
+            // Either no line break in the whitespace, or the `>` is not the next char.
+            idx += 1;
+            continue;
+        }
+        // We have `</tagname [whitespace-with-newline]>` — rewrite to `</tagname>`.
+        // All slice boundaries sit next to ASCII bytes, so they are valid UTF-8 boundaries.
+        let out = output.get_or_insert_with(|| String::with_capacity(len));
+        out.push_str(&input[last..idx]);
+        out.push_str("</");
+        out.push_str(&input[name_start..name_end]);
+        out.push('>');
+        idx = ws_end + 1; // advance past the `>`
+        last = idx;
+    }
+    match output {
+        Some(mut out) => {
+            // Copy whatever follows the last rewritten tag.
+            out.push_str(&input[last..]);
+            Cow::Owned(out)
+        }
+        None => Cow::Borrowed(input),
+    }
+}
 /// Preprocess HTML to normalize tags and fix common issues.
 pub fn preprocess_html(input: &str) -> Cow<'_, str> {
     const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
@@ -788,7 +878,7 @@ fn tag_has_hidden_attribute(tag: &str) -> bool {
 #[cfg(test)]
 mod tests {
-    use super::{normalize_bogus_comment_endings, sanitize_markdown_url};
+    use super::{normalize_bogus_comment_endings, normalize_split_closing_tags, sanitize_markdown_url};
     // ── normalize_bogus_comment_endings ───────────────────────────────────────
@@ -841,6 +931,52 @@ mod tests {
         assert_eq!(result.as_ref(), "");
     }
+    // ── normalize_split_closing_tags ──────────────────────────────────────────
+    #[test]
+    fn normalize_split_closing_tags_collapses_newline_before_close_bracket() {
+        // A bare newline between the tag name and `>` is removed.
+        let normalized = normalize_split_closing_tags("<a href=\"#x\">text</a\n>");
+        assert_eq!(&*normalized, "<a href=\"#x\">text</a>");
+    }
+    #[test]
+    fn normalize_split_closing_tags_collapses_indented_newline_before_close_bracket() {
+        // The newline may be followed by indentation before the `>`.
+        let normalized = normalize_split_closing_tags("<a href=\"#x\">text</a\n  >");
+        assert_eq!(&*normalized, "<a href=\"#x\">text</a>");
+    }
+    #[test]
+    fn normalize_split_closing_tags_leaves_well_formed_closing_tags_unchanged() {
+        // Input without a split closing tag must come back verbatim.
+        let well_formed = "<a href=\"#x\">text</a>";
+        let normalized = normalize_split_closing_tags(well_formed);
+        assert_eq!(&*normalized, well_formed);
+    }
+    #[test]
+    fn normalize_split_closing_tags_handles_multiple_split_closing_tags() {
+        // Every split closing tag is collapsed; the newline between the two
+        // links is ordinary content and is preserved.
+        let normalized = normalize_split_closing_tags("<li><a href=\"#a\">A</a\n  >\n<a href=\"#b\">B</a\n>");
+        assert_eq!(&*normalized, "<li><a href=\"#a\">A</a>\n<a href=\"#b\">B</a>");
+    }
+    #[test]
+    fn normalize_split_closing_tags_does_not_collapse_inline_whitespace() {
+        // Spaces alone must not trigger the normalisation; only a newline does.
+        // A space before `>` is valid HTML that tl handles fine, so rewriting
+        // it would be over-normalising.
+        let spaced = "<a href=\"#x\">text</a >";
+        let normalized = normalize_split_closing_tags(spaced);
+        assert_eq!(&*normalized, spaced);
+    }
+    #[test]
+    fn normalize_split_closing_tags_empty_input() {
+        // The empty string is returned as-is.
+        let normalized = normalize_split_closing_tags("");
+        assert_eq!(&*normalized, "");
+    }
     // ── sanitize_markdown_url ─────────────────────────────────────────────────
     #[test]

data/vendor/html-to-markdown-rs/tests/integration_test.rs CHANGED Viewed

@@ -198,9 +198,13 @@ fn test_strikethrough() {
 fn test_simple_table() {
     let html = "<table><tr><th>Header</th></tr><tr><td>Cell</td></tr></table>";
     let result = convert(html, None).unwrap();
-    assert!(result.contains("| Header |"));
-    assert!(result.contains("| --- |"));
-    assert!(result.contains("| Cell |"));
+    assert!(result.contains("| Header |"), "header row missing: {result}");
+    // Separator uses at least as many dashes as the widest cell ("Header" = 6).
+    assert!(
+        result.lines().any(|l| l.starts_with("| ----")),
+        "separator row missing: {result}"
+    );
+    assert!(result.contains("| Cell"), "cell row missing: {result}");
 }
 #[test]
@@ -221,7 +225,10 @@ fn test_table_rowspan() {
         ..Default::default()
     };
     let result = convert(html, Some(options)).unwrap();
-    let expected = "\n\n| Header 1 | Header 2 |\n| --- | --- |\n| Spanning cell | First row content<br>Second line |\n|  | Next row<br>More content |\n";
+    // Columns are padded to the widest cell per column (rowspan accounted):
+    //   col 0: max("Header 1"=8, "Spanning cell"=13, ""=0) = 13
+    //   col 1: max("Header 2"=8, "First row content<br>Second line"=32, "Next row<br>More content"=24) = 32
+    let expected = "| Header 1      | Header 2                         |\n| ------------- | -------------------------------- |\n| Spanning cell | First row content<br>Second line |\n|               | Next row<br>More content         |\n";
     assert_eq!(result, expected);
 }
@@ -534,7 +541,8 @@ fn test_ordered_list_with_heading_and_table() {
 ";
     let result = convert(html, None).unwrap();
-    let expected = "1. ### h3\n2. *table*\n\n    | blah |\n    | --- |\n";
+    // Separator dashes match the column width ("blah" = 4 chars → 4 dashes).
+    let expected = "1. ### h3\n2. *table*\n\n    | blah |\n    | ---- |\n";
     assert_eq!(result, expected);
 }