RubyGems - html-to-markdown - Versions diffs - 3.2.4 → 3.4.0.pre.rc.13 - Mend

html-to-markdown 3.2.4 → 3.4.0.pre.rc.13

This diff shows the differences between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.

Files changed (136)

data/vendor/html-to-markdown-rs/src/converter/plain_text.rs CHANGED Viewed

@@ -4,8 +4,10 @@
 //! visible text content with structural whitespace, bypassing the full
 //! Markdown/Djot conversion pipeline.
+use std::collections::HashSet;
 use std::fmt::Write;
+use crate::converter::preprocessing_helpers::should_drop_for_preprocessing;
 use crate::options::ConversionOptions;
 use crate::text;
@@ -61,12 +63,36 @@ const BLOCK_TAGS: &[&str] = &[
 /// - `<script>`, `<style>`, `<head>`, `<template>`, `<noscript>` are skipped
 /// - Tables: cells separated by tab, rows by newline
 /// - Inline elements are recursed without markers
+/// - Nodes matching `excluded_node_ids` (from `exclude_selectors`) are dropped entirely
 pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
     let mut buf = String::with_capacity(1024);
     let mut list_ctx = ListContext::None;
+    // Pre-compute excluded node IDs from exclude_selectors.
+    let excluded_node_ids: HashSet<u32> = if options.exclude_selectors.is_empty() {
+        HashSet::new()
+    } else {
+        let mut ids = HashSet::new();
+        for selector in &options.exclude_selectors {
+            if let Some(iter) = dom.query_selector(selector) {
+                for handle in iter {
+                    ids.insert(handle.get_inner());
+                }
+            }
+        }
+        ids
+    };
     for child_handle in dom.children() {
-        walk_plain(child_handle, parser, &mut buf, options, false, &mut list_ctx);
+        walk_plain(
+            child_handle,
+            parser,
+            &mut buf,
+            options,
+            false,
+            &mut list_ctx,
+            &excluded_node_ids,
+        );
     }
     post_process(&mut buf);
@@ -81,6 +107,7 @@ fn walk_plain(
     options: &ConversionOptions,
     in_pre: bool,
     list_ctx: &mut ListContext,
+    excluded_node_ids: &HashSet<u32>,
 ) {
     let Some(node) = node_handle.get(parser) else {
         return;
@@ -104,6 +131,11 @@ fn walk_plain(
             }
         }
         tl::Node::Tag(tag) => {
+            // Drop elements matching exclude_selectors, including all their descendants.
+            if !excluded_node_ids.is_empty() && excluded_node_ids.contains(&node_handle.get_inner()) {
+                return;
+            }
             let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
             let tag_str = tag_name.as_str();
@@ -112,6 +144,12 @@ fn walk_plain(
                 return;
             }
+            // Apply preprocessing: drop nav/footer/aside/noise elements
+            // (shared logic with the markdown path).
+            if should_drop_for_preprocessing(tag_str, tag, options) {
+                return;
+            }
             match tag_str {
                 "br" => {
                     buf.push('\n');
@@ -121,7 +159,7 @@ fn walk_plain(
                 }
                 "pre" => {
                     ensure_blank_line(buf);
-                    walk_children(tag, parser, buf, options, true, list_ctx);
+                    walk_children(tag, parser, buf, options, true, list_ctx, excluded_node_ids);
                     ensure_blank_line(buf);
                 }
                 "img" => {
@@ -136,13 +174,13 @@ fn walk_plain(
                 }
                 "table" => {
                     ensure_blank_line(buf);
-                    walk_table(tag, parser, buf, options);
+                    walk_table(tag, parser, buf, options, excluded_node_ids);
                     ensure_blank_line(buf);
                 }
                 "ul" => {
                     ensure_newline(buf);
                     let mut child_ctx = ListContext::Unordered;
-                    walk_children(tag, parser, buf, options, false, &mut child_ctx);
+                    walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
                     ensure_newline(buf);
                 }
                 "ol" => {
@@ -154,7 +192,7 @@ fn walk_plain(
                         .unwrap_or(1);
                     ensure_newline(buf);
                     let mut child_ctx = ListContext::Ordered { next_index: start };
-                    walk_children(tag, parser, buf, options, false, &mut child_ctx);
+                    walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
                     ensure_newline(buf);
                 }
                 "li" => {
@@ -172,17 +210,17 @@ fn walk_plain(
                             buf.push_str("- ");
                         }
                     }
-                    walk_children(tag, parser, buf, options, false, list_ctx);
+                    walk_children(tag, parser, buf, options, false, list_ctx, excluded_node_ids);
                     ensure_newline(buf);
                 }
                 _ if BLOCK_TAGS.contains(&tag_str) => {
                     ensure_blank_line(buf);
-                    walk_children(tag, parser, buf, options, in_pre, list_ctx);
+                    walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
                     ensure_blank_line(buf);
                 }
                 _ => {
                     // Inline elements and structural containers (html, body, etc.)
-                    walk_children(tag, parser, buf, options, in_pre, list_ctx);
+                    walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
                 }
             }
         }
@@ -198,16 +236,23 @@ fn walk_children(
     options: &ConversionOptions,
     in_pre: bool,
     list_ctx: &mut ListContext,
+    excluded_node_ids: &HashSet<u32>,
 ) {
     let children = tag.children();
     let top = children.top();
     for child in top.iter() {
-        walk_plain(child, parser, buf, options, in_pre, list_ctx);
+        walk_plain(child, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
     }
 }
 /// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
-fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions) {
+fn walk_table(
+    table_tag: &tl::HTMLTag,
+    parser: &tl::Parser,
+    buf: &mut String,
+    options: &ConversionOptions,
+    excluded_node_ids: &HashSet<u32>,
+) {
     // Collect all <tr> node handles by recursing into the table
     let mut row_handles = Vec::new();
     collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
@@ -240,7 +285,15 @@ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, op
             let mut cell_buf = String::new();
             if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
                 let mut cell_list_ctx = ListContext::None;
-                walk_children(cell_tag, parser, &mut cell_buf, options, false, &mut cell_list_ctx);
+                walk_children(
+                    cell_tag,
+                    parser,
+                    &mut cell_buf,
+                    options,
+                    false,
+                    &mut cell_list_ctx,
+                    excluded_node_ids,
+                );
             }
             buf.push_str(cell_buf.trim());
         }

data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs CHANGED Viewed

@@ -5,12 +5,12 @@
 use crate::converter::dom_context::DomContext;
 use crate::converter::main_helpers::is_inline_element;
-use crate::converter::utility::attributes::element_has_navigation_hint;
+use crate::converter::utility::attributes::{attribute_matches_any, element_has_navigation_hint};
 use crate::converter::utility::content::normalized_tag_name;
 use crate::options::ConversionOptions;
 /// Check if an inline ancestor element is allowed to contain block-level elements.
-pub(crate) fn inline_ancestor_allows_block(tag_name: &str) -> bool {
+pub fn inline_ancestor_allows_block(tag_name: &str) -> bool {
     matches!(tag_name, "a" | "ins" | "del")
 }
@@ -18,7 +18,7 @@ pub(crate) fn inline_ancestor_allows_block(tag_name: &str) -> bool {
 ///
 /// Excludes elements inside `<pre>` or `<code>` blocks, as they have special
 /// whitespace preservation rules and should not be repaired.
-pub(crate) fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
+pub fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
     for handle in dom_ctx.node_map.iter().flatten() {
         if let Some(tl::Node::Tag(_tag)) = handle.get(parser) {
             let is_block = dom_ctx
@@ -68,43 +68,101 @@ pub(crate) fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser
 }
 /// Determine if a node should be dropped during preprocessing.
-pub(crate) fn should_drop_for_preprocessing(
-    node_handle: &tl::NodeHandle,
-    tag_name: &str,
-    tag: &tl::HTMLTag,
-    parser: &tl::Parser,
-    dom_ctx: &DomContext,
-    options: &ConversionOptions,
-) -> bool {
-    // If preprocessing is globally disabled, don't drop any nodes
+///
+/// Behavior depends on the [`PreprocessingPreset`]:
+///
+/// - **Minimal**: Only scripts/styles are stripped (handled elsewhere). This function
+///   drops nothing — all structural elements are preserved.
+/// - **Standard** (default): Drops `<nav>` unconditionally. Drops `<header>`, `<footer>`,
+///   and `<aside>` only when they have navigation hints (class/role/aria attributes
+///   indicating site chrome). Drops `<form>` when `remove_forms` is enabled.
+/// - **Aggressive**: All of Standard, plus: drops `<footer>`, `<aside>`, `<noscript>`
+///   unconditionally. Drops ANY element with navigation hints in class/id/role
+///   (e.g. `<div class="sidebar">`). Drops elements with noise-related classes/roles.
+pub fn should_drop_for_preprocessing(tag_name: &str, tag: &tl::HTMLTag, options: &ConversionOptions) -> bool {
+    use crate::options::PreprocessingPreset;
     if !options.preprocessing.enabled {
         return false;
     }
+    let preset = options.preprocessing.preset;
+    // Minimal preset: drop nothing here (scripts/styles handled in earlier pipeline stage).
+    if preset == PreprocessingPreset::Minimal {
+        return false;
+    }
+    // Form removal — applies to both Standard and Aggressive when enabled.
+    if options.preprocessing.remove_forms && tag_name == "form" {
+        return true;
+    }
+    let is_aggressive = preset == PreprocessingPreset::Aggressive;
+    // Aggressive: drop <noscript> — its content is fallback for no-JS browsers.
+    if is_aggressive && tag_name == "noscript" {
+        return true;
+    }
+    // Navigation removal — only when the flag is enabled.
     if !options.preprocessing.remove_navigation {
         return false;
     }
     let has_nav_hint = element_has_navigation_hint(tag);
+    // <nav> is always navigation — drop in both Standard and Aggressive.
     if tag_name == "nav" {
         return true;
     }
     if tag_name == "header" {
-        use crate::converter::utility::attributes::has_semantic_content_ancestor;
-        let inside_semantic_content = has_semantic_content_ancestor(node_handle, parser, dom_ctx);
-        if !inside_semantic_content {
-            return true;
-        }
-        if has_nav_hint {
-            return true;
-        }
-    } else if tag_name == "footer" || tag_name == "aside" {
-        if has_nav_hint {
+        // Drop <header> only with navigation hints (e.g. class="site-header",
+        // role="navigation"). A plain <header> often wraps article titles like
+        // <header><h1>Title</h1></header> — dropping it loses content.
+        return has_nav_hint;
+    }
+    if tag_name == "footer" || tag_name == "aside" {
+        // Standard: drop only with navigation hints.
+        // Aggressive: drop unconditionally.
+        return is_aggressive || has_nav_hint;
+    }
+    // Aggressive: drop ANY element that has navigation hints in class/id/role.
+    // This catches <div class="sidebar">, <div class="menu">, <section class="navigation">,
+    // and similar non-semantic navigation containers.
+    if is_aggressive && has_nav_hint {
+        return true;
+    }
+    // Aggressive: drop elements with noise-related roles.
+    if is_aggressive {
+        if element_has_noise_hint(tag) {
             return true;
         }
     }
     false
 }
+/// Check if an element has noise-related hints (ads, cookie banners, social sharing).
+fn element_has_noise_hint(tag: &tl::HTMLTag) -> bool {
+    const NOISE_KEYWORDS: &[&str] = &[
+        "cookie",
+        "consent",
+        "gdpr",
+        "banner",
+        "advertisement",
+        "ad-container",
+        "advert",
+        "social-share",
+        "share-buttons",
+        "popup",
+        "modal-overlay",
+        "newsletter-signup",
+    ];
+    attribute_matches_any(tag, "class", NOISE_KEYWORDS) || attribute_matches_any(tag, "id", NOISE_KEYWORDS)
+}

data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs CHANGED Viewed

@@ -170,7 +170,7 @@ mod tests {
     #[test]
     fn figure_caption_separated_from_image() {
         let html = r#"<figure><img src="photo.jpg" alt="Photo"><figcaption>A nice photo</figcaption></figure>"#;
-        let result = crate::convert(html, None).unwrap();
+        let result = crate::convert(html, None, None).unwrap();
         let content = result.content.unwrap_or_default();
         assert!(
             content.contains("![Photo](photo.jpg)"),

data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs CHANGED Viewed

@@ -30,7 +30,7 @@ pub mod sectioning;
 pub mod summary;
 // Re-export types from parent module for submodule access
-pub(crate) use super::walk_node;
+pub use super::walk_node;
 pub use super::{Context, DomContext};
 // Re-export handler functions for direct use

data/vendor/html-to-markdown-rs/src/converter/text/mod.rs CHANGED Viewed

@@ -3,10 +3,6 @@
 //! This module provides utilities for normalizing, escaping, and processing text content
 //! extracted from HTML documents during the conversion to Markdown format.
-mod escaping;
-mod normalization;
 mod processing;
-pub use escaping::{escape_link_label, escape_malformed_angle_brackets};
-pub use normalization::{normalize_heading_text, trim_line_end_whitespace, truncate_at_char_boundary};
 pub use processing::dedent_code_block;

data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs CHANGED Viewed

@@ -6,7 +6,7 @@ use crate::converter::DomContext;
 use crate::converter::utility::content::normalized_tag_name;
 /// Check if a tag has main content semantics based on role or class.
-pub(crate) fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
+pub fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
     if let Some(Some(role)) = tag.attributes().get("role") {
         let lowered = role.as_utf8_str().to_ascii_lowercase();
         if matches!(lowered.as_str(), "main" | "article" | "document" | "region") {
@@ -38,7 +38,7 @@ pub(crate) fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
 }
 /// Check if an element has navigation-related hints in its attributes.
-pub(crate) fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
+pub fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
     if attribute_matches_any(tag, "role", &["navigation", "menubar", "tablist", "toolbar"]) {
         return true;
     }
@@ -88,7 +88,7 @@ pub(crate) fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
 }
 /// Check if an attribute value matches any of the given keywords (space or custom-separator aware).
-pub(crate) fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
+pub fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
     let Some(attr_value) = tag.attributes().get(attr) else {
         return false;
     };
@@ -113,7 +113,7 @@ pub(crate) fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&
 /// Check if an attribute contains any of the given keywords (substring match).
 #[allow(clippy::trivially_copy_pass_by_ref)]
-pub(crate) fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
+pub fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
     let Some(attr_value) = tag.attributes().get(attr) else {
         return false;
     };
@@ -126,11 +126,7 @@ pub(crate) fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[
 /// Check if a node has a semantic content ancestor (main, article, section).
 #[allow(clippy::trivially_copy_pass_by_ref)]
-pub(crate) fn has_semantic_content_ancestor(
-    node_handle: &tl::NodeHandle,
-    parser: &tl::Parser,
-    dom_ctx: &DomContext,
-) -> bool {
+pub fn has_semantic_content_ancestor(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
     let mut current_id = node_handle.get_inner();
     while let Some(parent_id) = dom_ctx.parent_of(current_id) {
         if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {

data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs CHANGED Viewed

@@ -10,7 +10,7 @@ use std::num::NonZeroUsize;
 ///
 /// Pre-computes parent-child relationships, sibling indices, and caches
 /// tag information for efficient DOM navigation during conversion.
-pub(crate) fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len: usize) -> DomContext {
+pub fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len: usize) -> DomContext {
     let cache_capacity = text_cache_capacity_for_input(input_len);
     let mut ctx = DomContext {
         parent_map: Vec::new(),
@@ -40,7 +40,7 @@ pub(crate) fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len:
 ///
 /// Returns a cache capacity between 32 and TEXT_CACHE_CAPACITY,
 /// scaled proportionally to input size (1KB = 1 slot).
-pub(crate) fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
+pub fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
     const TEXT_CACHE_CAPACITY: usize = 256;
     // `clamp(32, TEXT_CACHE_CAPACITY)` guarantees `target >= 32 > 0`, so `new` always returns Some.
     let target = (input_len / 1024).clamp(32, TEXT_CACHE_CAPACITY);
@@ -50,7 +50,7 @@ pub(crate) fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
 /// Recursively record node hierarchy into DOM context.
 ///
 /// Builds the complete parent-child relationship map for efficient tree traversal.
-pub(crate) fn record_node_hierarchy(
+pub fn record_node_hierarchy(
     node_handle: tl::NodeHandle,
     parent: Option<u32>,
     parser: &tl::Parser,

data/vendor/html-to-markdown-rs/src/converter/utility/content.rs CHANGED Viewed

@@ -9,14 +9,14 @@ use std::borrow::Cow;
 use std::collections::BTreeMap;
 // Forward declare DomContext from parent module to avoid circular imports
-pub(crate) use crate::converter::DomContext;
+pub use crate::converter::DomContext;
 /// Collect all attributes from an HTML tag as a `BTreeMap<String, String>`.
 ///
 /// Boolean attributes (those with `None` as the value) are skipped; only
 /// attributes that carry an explicit value are included.
 #[cfg(feature = "visitor")]
-pub(crate) fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, String> {
+pub fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, String> {
     tag.attributes()
         .iter()
         .filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
@@ -28,7 +28,7 @@ pub(crate) fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, Stri
 /// Similar to `text::chomp` but handles line breaks from `<br>` tags specially.
 /// Line breaks are extracted as suffix to be placed outside formatting.
 /// Returns (prefix, suffix, `trimmed_text`).
-pub(crate) fn chomp_inline(text: &str) -> (&str, &str, &str) {
+pub fn chomp_inline(text: &str) -> (&str, &str, &str) {
     if text.is_empty() {
         return ("", "", "");
     }
@@ -59,13 +59,13 @@ pub(crate) fn chomp_inline(text: &str) -> (&str, &str, &str) {
 /// Get the text content of a node and its children.
 #[allow(clippy::trivially_copy_pass_by_ref)]
-pub(crate) fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> String {
+pub fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> String {
     dom_ctx.text_content(*node_handle, parser)
 }
 /// Collect inline text for link labels, skipping block-level descendants.
 #[allow(clippy::match_wildcard_for_single_variants)]
-pub(crate) fn collect_link_label_text(
+pub fn collect_link_label_text(
     children: &[tl::NodeHandle],
     parser: &tl::Parser,
     dom_ctx: &DomContext,
@@ -118,7 +118,7 @@ pub(crate) fn collect_link_label_text(
 /// Normalize a link label by collapsing newlines and normalizing whitespace.
 #[allow(clippy::trivially_copy_pass_by_ref)]
-pub(crate) fn normalize_link_label(label: &str) -> String {
+pub fn normalize_link_label(label: &str) -> String {
     let mut needs_collapse = false;
     for ch in label.chars() {
         if ch == '\n' || ch == '\r' {
@@ -146,7 +146,7 @@ pub(crate) fn normalize_link_label(label: &str) -> String {
 }
 /// Normalize a tag name to lowercase, preserving borrowed input when possible.
-pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
+pub fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
     if raw.as_bytes().iter().any(u8::is_ascii_uppercase) {
         let mut owned = raw.into_owned();
         owned.make_ascii_lowercase();
@@ -157,7 +157,7 @@ pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
 }
 /// Check if an element is block-level (not inline).
-pub(crate) fn is_block_level_element(tag_name: &str) -> bool {
+pub fn is_block_level_element(tag_name: &str) -> bool {
     is_block_level_name(tag_name, crate::converter::main_helpers::is_inline_element(tag_name))
 }
@@ -191,7 +191,7 @@ pub fn floor_char_boundary(s: &str, index: usize) -> usize {
 /// Input:  "[outer [inner]]"
 /// Output: "[outer [inner]]"
 /// ```
-pub(crate) fn escape_link_label(text: &str) -> String {
+pub fn escape_link_label(text: &str) -> String {
     if text.is_empty() {
         return String::new();
     }
@@ -231,7 +231,7 @@ pub(crate) fn escape_link_label(text: &str) -> String {
 }
 /// Helper for block-level element detection.
-pub(crate) fn is_block_level_name(tag_name: &str, is_inline: bool) -> bool {
+pub fn is_block_level_name(tag_name: &str, is_inline: bool) -> bool {
     !is_inline
         && matches!(
             tag_name,

data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs CHANGED Viewed

@@ -7,7 +7,7 @@ use std::borrow::Cow;
 use std::str;
 /// Strip script and style tags and their content from HTML.
-pub(crate) fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
+pub fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
     let bytes = input.as_bytes();
     let len = bytes.len();
@@ -163,7 +163,7 @@ pub(crate) fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
 /// Returns the position AFTER the closing tag (including the '>').
 /// This is highly optimized for performance and uses a fast-path scan.
 #[inline]
-pub(crate) fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) -> Option<usize> {
+pub fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) -> Option<usize> {
     let len = bytes.len();
     let tag_len = tag.len();
@@ -212,7 +212,7 @@ pub(crate) fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) ->
 /// Compare bytes ignoring ASCII case.
 #[inline]
-pub(crate) fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
+pub fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
     if a.len() != b.len() {
         return false;
     }
@@ -220,7 +220,7 @@ pub(crate) fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
 }
 /// Preprocess HTML to normalize tags and fix common issues.
-pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
+pub fn preprocess_html(input: &str) -> Cow<'_, str> {
     const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
     const TAGS: [&[u8]; 2] = [b"script", b"style"];
     const SVG: &[u8] = b"svg";
@@ -289,7 +289,7 @@ pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
                             if tag == b"script" && is_json_ld_script_open_tag(&input[idx..open_end]) {
                                 continue;
                             }
-                            let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
+                            let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(open_end);
                             let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
                             out.push_str(&input[last..idx]);
                             out.push_str(&input[idx..open_end]);
@@ -379,7 +379,7 @@ pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
 }
 /// Check if a script tag is a JSON-LD script.
-pub(crate) fn is_json_ld_script_open_tag(tag: &str) -> bool {
+pub fn is_json_ld_script_open_tag(tag: &str) -> bool {
     let bytes = tag.as_bytes();
     let mut idx = 0;
     while idx + 4 <= bytes.len() {
@@ -443,7 +443,7 @@ pub(crate) fn is_json_ld_script_open_tag(tag: &str) -> bool {
 /// Case-insensitive byte comparison for ASCII.
 #[inline]
-pub(crate) fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
+pub fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
     if haystack.len() < needle.len() {
         return false;
     }
@@ -454,7 +454,7 @@ pub(crate) fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool
 }
 /// Check if bytes match a tag start pattern.
-pub(crate) fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
+pub fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
     if start >= bytes.len() {
         return false;
     }
@@ -477,7 +477,7 @@ pub(crate) fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> b
 }
 /// Find the end of an HTML tag (the position of '>').
-pub(crate) fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
+pub fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
     let len = bytes.len();
     let mut in_quote: Option<u8> = None;
@@ -502,7 +502,7 @@ pub(crate) fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
 }
 /// Find the closing tag for a given tag name.
-pub(crate) fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
+pub fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
     let len = bytes.len();
     let mut depth = 1usize;
@@ -533,7 +533,7 @@ pub(crate) fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Opti
 }
 /// Check if bytes match an end tag pattern.
-pub(crate) fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
+pub fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
     if start >= bytes.len() || bytes[start] != b'/' {
         return false;
     }
@@ -553,7 +553,7 @@ pub(crate) fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> b
 ///
 /// # Returns
 /// * `Cow<str>` - Either the borrowed original URL or an owned sanitized version
-pub(crate) fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
+pub fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
     // Pattern: ...[text](actual_url) or similar markdown-like syntax
     // This handles malformed HTML where markdown syntax wasn't properly converted
     // and prevents downstream URL parsing errors (e.g., bracketed "IPv6" hosts).
@@ -585,7 +585,7 @@ pub(crate) fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
 /// Scans for opening tags containing the `hidden` attribute, finds their
 /// matching closing tag, and removes the entire element (tag + content).
 /// Self-closing tags with `hidden` are also removed.
-pub(crate) fn strip_hidden_elements(input: &str) -> Cow<'_, str> {
+pub fn strip_hidden_elements(input: &str) -> Cow<'_, str> {
     let bytes = input.as_bytes();
     let len = bytes.len();

data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs CHANGED Viewed

@@ -8,7 +8,7 @@ use crate::converter::utility::content::normalized_tag_name;
 /// Serialize an element to HTML string (for SVG and Math elements).
 #[allow(clippy::trivially_copy_pass_by_ref)]
 #[allow(dead_code)] // used with visitor feature
-pub(crate) fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
+pub fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
     if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
         let tag_name = normalized_tag_name(tag.name().as_utf8_str());
         let mut html = String::with_capacity(256);
@@ -48,7 +48,7 @@ pub(crate) fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parse
 /// Serialize a node to HTML string.
 #[allow(clippy::trivially_copy_pass_by_ref)]
 #[allow(dead_code)] // used with visitor feature
-pub(crate) fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
+pub fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
     if let Some(node) = node_handle.get(parser) {
         match node {
             tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
@@ -61,7 +61,7 @@ pub(crate) fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser)
 }
 /// Serialize a tag to HTML, wrapping serialize_node_to_html.
-pub(crate) fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
+pub fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
     let mut html = String::new();
     serialize_node_to_html(handle, parser, &mut html);
     html
@@ -70,7 +70,7 @@ pub(crate) fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser
 /// Recursively serialize a node to HTML.
 #[allow(clippy::trivially_copy_pass_by_ref)]
 #[allow(dead_code)] // used with visitor feature
-pub(crate) fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
+pub fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
     match handle.get(parser) {
         Some(tl::Node::Tag(tag)) => {
             let tag_name = normalized_tag_name(tag.name().as_utf8_str());