PyPI - html-to-markdown - Versions diffs - 2.4.2__tar.gz → 2.5.0__tar.gz - Mend

html-to-markdown 2.4.2__tar.gz → 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of html-to-markdown might be problematic. Click here for more details.

Files changed (55) hide show

{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/Cargo.lock RENAMED Viewed

@@ -157,9 +157,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
 [[package]]
 name = "cc"
-version = "1.2.41"
+version = "1.2.42"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7"
+checksum = "81bbf3b3619004ad9bd139f62a9ab5cfe467f307455a0d307b0cf58bf070feaa"
 dependencies = [
  "find-msvc-tools",
  "shlex",
@@ -425,9 +425,9 @@ dependencies = [
 [[package]]
 name = "doc-comment"
-version = "0.3.3"
+version = "0.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
+checksum = "780955b8b195a21ab8e4ac6b60dd1dbdcec1dc6c51c0617964b08c81785e12c9"
 [[package]]
 name = "dtoa"
@@ -550,11 +550,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
 dependencies = [
  "cfg-if",
- "js-sys",
  "libc",
  "r-efi",
  "wasip2",
- "wasm-bindgen",
 ]
 [[package]]
@@ -595,7 +593,7 @@ dependencies = [
 [[package]]
 name = "html-to-markdown-cli"
-version = "2.4.2"
+version = "2.5.0"
 dependencies = [
  "assert_cmd",
  "clap",
@@ -609,7 +607,7 @@ dependencies = [
 [[package]]
 name = "html-to-markdown-node"
-version = "2.4.2"
+version = "2.5.0"
 dependencies = [
  "html-to-markdown-rs",
  "mimalloc-rust",
@@ -620,7 +618,7 @@ dependencies = [
 [[package]]
 name = "html-to-markdown-py"
-version = "2.4.2"
+version = "2.5.0"
 dependencies = [
  "base64",
  "html-to-markdown-rs",
@@ -630,7 +628,7 @@ dependencies = [
 [[package]]
 name = "html-to-markdown-rs"
-version = "2.4.2"
+version = "2.5.0"
 dependencies = [
  "ammonia",
  "base64",
@@ -647,10 +645,9 @@ dependencies = [
 [[package]]
 name = "html-to-markdown-wasm"
-version = "2.4.2"
+version = "2.5.0"
 dependencies = [
  "console_error_panic_hook",
- "getrandom",
  "html-to-markdown-rs",
  "js-sys",
  "serde",

{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/Cargo.toml RENAMED Viewed

@@ -3,7 +3,7 @@ resolver = "2"
 members = ["crates/html-to-markdown-py"]
 [workspace.package]
-version = "2.4.2"
+version = "2.5.0"
 edition = "2021"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
 license = "MIT"
@@ -15,7 +15,7 @@ rust-version = "1.80"
 [workspace.dependencies]
 # Core library
-html-to-markdown-rs = { version = "2.4.2", path = "crates/html-to-markdown" }
+html-to-markdown-rs = { version = "2.5.0", path = "crates/html-to-markdown" }
 # HTML parsing and sanitization
 tl = "0.7"

{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: html-to-markdown
-Version: 2.4.2
+Version: 2.5.0
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: Console
 Classifier: Intended Audience :: Developers

{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/README.md RENAMED Viewed

@@ -60,6 +60,41 @@ let options = ConversionOptions {
 let markdown = convert(html, Some(options))?;
 ```
+### Preserving HTML Tags
+The `preserve_tags` option allows you to keep specific HTML tags in their original form instead of converting them to Markdown. This is useful for complex elements like tables that may not convert well:
+```rust
+use html_to_markdown_rs::{convert, ConversionOptions};
+let html = r#"
+<p>Before table</p>
+<table class="data">
+    <tr><th>Name</th><th>Value</th></tr>
+    <tr><td>Item 1</td><td>100</td></tr>
+</table>
+<p>After table</p>
+"#;
+let options = ConversionOptions {
+    preserve_tags: vec!["table".to_string()],
+    ..Default::default()
+};
+let markdown = convert(html, Some(options))?;
+// Result: "Before table\n\n<table class=\"data\">...</table>\n\nAfter table\n"
+```
+You can preserve multiple tag types and combine with `strip_tags`:
+```rust
+let options = ConversionOptions {
+    preserve_tags: vec!["table".to_string(), "form".to_string()],
+    strip_tags: vec!["script".to_string(), "style".to_string()],
+    ..Default::default()
+};
+```
 ## Web Scraping with Preprocessing
 ```rust

{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/converter.rs RENAMED Viewed

@@ -1159,6 +1159,75 @@ fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
     }
 }
+/// Serialize a tag and its children back to HTML.
+///
+/// This is used for the preserve_tags feature to output original HTML for specific elements.
+fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
+    let mut html = String::new();
+    serialize_node_to_html(handle, parser, &mut html);
+    html
+}
+/// Recursively serialize a node to HTML.
+fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
+    match handle.get(parser) {
+        Some(tl::Node::Tag(tag)) => {
+            let tag_name = tag.name().as_utf8_str();
+            // Opening tag
+            output.push('<');
+            output.push_str(&tag_name);
+            // Attributes
+            for (key, value) in tag.attributes().iter() {
+                output.push(' ');
+                output.push_str(&key);
+                if let Some(val) = value {
+                    output.push_str("=\"");
+                    output.push_str(&val);
+                    output.push('"');
+                }
+            }
+            output.push('>');
+            // Children
+            let children = tag.children();
+            for child_handle in children.top().iter() {
+                serialize_node_to_html(child_handle, parser, output);
+            }
+            // Closing tag (skip for self-closing tags)
+            if !matches!(
+                tag_name.as_ref(),
+                "br" | "hr"
+                    | "img"
+                    | "input"
+                    | "meta"
+                    | "link"
+                    | "area"
+                    | "base"
+                    | "col"
+                    | "embed"
+                    | "param"
+                    | "source"
+                    | "track"
+                    | "wbr"
+            ) {
+                output.push_str("</");
+                output.push_str(&tag_name);
+                output.push('>');
+            }
+        }
+        Some(tl::Node::Raw(bytes)) => {
+            if let Ok(text) = std::str::from_utf8(bytes.as_bytes()) {
+                output.push_str(text);
+            }
+        }
+        _ => {}
+    }
+}
 fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
     const TAGS: [&[u8]; 2] = [b"script", b"style"];
     const SVG: &[u8] = b"svg";
@@ -1557,6 +1626,13 @@ fn walk_node(
                 return;
             }
+            // Preserve tags: output original HTML
+            if options.preserve_tags.iter().any(|t| t.as_str() == tag_name) {
+                let html = serialize_tag_to_html(node_handle, parser);
+                output.push_str(&html);
+                return;
+            }
             match tag_name.as_ref() {
                 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                     let level = tag_name.chars().last().and_then(|c| c.to_digit(10)).unwrap_or(1) as usize;
@@ -4398,4 +4474,82 @@ mod tests {
             result
         );
     }
+    #[test]
+    fn test_preserve_tags_simple_table() {
+        let html = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
+        let mut options = ConversionOptions::default();
+        options.preserve_tags = vec!["table".to_string()];
+        let result = convert_html(html, &options).unwrap();
+        assert!(result.contains("<table>"), "Should preserve table tag");
+        assert!(result.contains("</table>"), "Should have closing table tag");
+        assert!(result.contains("<tr>"), "Should preserve tr tag");
+        assert!(result.contains("<td>"), "Should preserve td tag");
+        assert!(result.contains("Text"), "Should convert other elements");
+    }
+    #[test]
+    fn test_preserve_tags_with_attributes() {
+        let html = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
+        let mut options = ConversionOptions::default();
+        options.preserve_tags = vec!["table".to_string()];
+        let result = convert_html(html, &options).unwrap();
+        assert!(result.contains("<table"), "Should preserve table tag");
+        assert!(result.contains("class="), "Should preserve class attribute");
+        assert!(result.contains("id="), "Should preserve id attribute");
+        assert!(result.contains("</table>"), "Should have closing tag");
+    }
+    #[test]
+    fn test_preserve_tags_multiple_tags() {
+        let html = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
+        let mut options = ConversionOptions::default();
+        options.preserve_tags = vec!["table".to_string(), "form".to_string()];
+        let result = convert_html(html, &options).unwrap();
+        assert!(result.contains("<table>"), "Should preserve table");
+        assert!(result.contains("<form>"), "Should preserve form");
+        assert!(result.contains("Text"), "Should convert paragraph");
+    }
+    #[test]
+    fn test_preserve_tags_nested_content() {
+        let html = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
+        let mut options = ConversionOptions::default();
+        options.preserve_tags = vec!["table".to_string()];
+        let result = convert_html(html, &options).unwrap();
+        assert!(result.contains("<thead>"), "Should preserve nested thead");
+        assert!(result.contains("<tbody>"), "Should preserve nested tbody");
+        assert!(result.contains("<th>"), "Should preserve th tag");
+        assert!(result.contains("Header"), "Should preserve text content");
+    }
+    #[test]
+    fn test_preserve_tags_empty_list() {
+        let html = r#"<table><tr><td>Cell</td></tr></table>"#;
+        let options = ConversionOptions::default(); // No preserve_tags
+        let result = convert_html(html, &options).unwrap();
+        // Should convert to markdown table (or at least not preserve HTML)
+        assert!(
+            !result.contains("<table>"),
+            "Should not preserve table without preserve_tags"
+        );
+    }
+    #[test]
+    fn test_preserve_tags_vs_strip_tags() {
+        let html = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
+        let mut options = ConversionOptions::default();
+        options.preserve_tags = vec!["table".to_string()];
+        options.strip_tags = vec!["span".to_string()];
+        let result = convert_html(html, &options).unwrap();
+        assert!(result.contains("<table>"), "Should preserve table");
+        assert!(!result.contains("<span>"), "Should strip span tag");
+        assert!(result.contains("Text"), "Should keep span text content");
+    }
 }

{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/converter.rs RENAMED Viewed

@@ -237,9 +237,22 @@ fn convert_element(
         // Words - join with space
         HocrElementType::OcrxWord => {
+            // Ensure space before this word if output doesn't end with whitespace or markdown formatting
+            if !output.is_empty()
+                && !output.ends_with(' ')
+                && !output.ends_with('\t')
+                && !output.ends_with('\n')
+                && !output.ends_with('*')  // Don't add space after italic/bold markers
+                && !output.ends_with('`')  // Don't add space after code markers
+                && !output.ends_with('_')  // Don't add space after underline markers
+                && !output.ends_with('[')
+            // Don't add space after opening bracket (link/image alt)
+            {
+                output.push(' ');
+            }
             if !element.text.is_empty() {
                 output.push_str(&element.text);
-                output.push(' ');
             }
         }

{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/hocr/extractor.rs RENAMED Viewed

@@ -78,69 +78,58 @@ fn collect_hocr_elements(
     }
 }
-/// Extract hOCR metadata from HTML head
+/// Extract hOCR metadata from HTML head (or from orphaned meta tags after sanitization)
 fn extract_metadata(dom: &tl::VDom) -> HocrMetadata {
     let mut metadata = HocrMetadata::default();
     let parser = dom.parser();
-    // Recursively search for head element
-    fn find_head_and_extract<'a>(
-        node_handle: &tl::NodeHandle,
-        parser: &'a tl::Parser<'a>,
-        metadata: &mut HocrMetadata,
-    ) {
+    // Helper function to extract metadata from a single meta tag
+    fn extract_from_meta_tag(meta_tag: &tl::HTMLTag, metadata: &mut HocrMetadata) {
+        let attrs = meta_tag.attributes();
+        if let (Some(name), Some(content)) = (attrs.get("name").flatten(), attrs.get("content").flatten()) {
+            let name_str = name.as_utf8_str();
+            let content_str = content.as_utf8_str().to_string();
+            match name_str.as_ref() {
+                "ocr-system" => metadata.ocr_system = Some(content_str),
+                "ocr-capabilities" => {
+                    metadata.ocr_capabilities = content_str.split_whitespace().map(|s| s.to_string()).collect();
+                }
+                "ocr-number-of-pages" => {
+                    metadata.ocr_number_of_pages = content_str.parse().ok();
+                }
+                "ocr-langs" => {
+                    metadata.ocr_langs = content_str.split_whitespace().map(|s| s.to_string()).collect();
+                }
+                "ocr-scripts" => {
+                    metadata.ocr_scripts = content_str.split_whitespace().map(|s| s.to_string()).collect();
+                }
+                _ => {}
+            }
+        }
+    }
+    // Recursively search for meta tags (either inside head or as orphans after sanitization)
+    fn find_meta_tags<'a>(node_handle: &tl::NodeHandle, parser: &'a tl::Parser<'a>, metadata: &mut HocrMetadata) {
         if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
             let tag_name = tag.name().as_utf8_str();
-            if tag_name == "head" {
-                // Found head, extract meta tags
-                let children = tag.children();
-                for meta_handle in children.top().iter() {
-                    if let Some(tl::Node::Tag(meta_tag)) = meta_handle.get(parser) {
-                        if meta_tag.name().as_utf8_str() == "meta" {
-                            let attrs = meta_tag.attributes();
-                            if let (Some(name), Some(content)) =
-                                (attrs.get("name").flatten(), attrs.get("content").flatten())
-                            {
-                                let name_str = name.as_utf8_str();
-                                let content_str = content.as_utf8_str().to_string();
-                                match name_str.as_ref() {
-                                    "ocr-system" => metadata.ocr_system = Some(content_str),
-                                    "ocr-capabilities" => {
-                                        metadata.ocr_capabilities =
-                                            content_str.split_whitespace().map(|s| s.to_string()).collect();
-                                    }
-                                    "ocr-number-of-pages" => {
-                                        metadata.ocr_number_of_pages = content_str.parse().ok();
-                                    }
-                                    "ocr-langs" => {
-                                        metadata.ocr_langs =
-                                            content_str.split_whitespace().map(|s| s.to_string()).collect();
-                                    }
-                                    "ocr-scripts" => {
-                                        metadata.ocr_scripts =
-                                            content_str.split_whitespace().map(|s| s.to_string()).collect();
-                                    }
-                                    _ => {}
-                                }
-                            }
-                        }
-                    }
-                }
-            } else {
-                // Keep searching in children
-                let children = tag.children();
-                for child_handle in children.top().iter() {
-                    find_head_and_extract(child_handle, parser, metadata);
-                }
+            // Extract from meta tags directly (handles both meta inside head and orphaned meta)
+            if tag_name == "meta" {
+                extract_from_meta_tag(tag, metadata);
+            }
+            // Recursively search children
+            let children = tag.children();
+            for child_handle in children.top().iter() {
+                find_meta_tags(child_handle, parser, metadata);
             }
         }
     }
     // Search from root
     for child_handle in dom.children().iter() {
-        find_head_and_extract(child_handle, parser, &mut metadata);
+        find_meta_tags(child_handle, parser, &mut metadata);
     }
     metadata

{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/lib.rs RENAMED Viewed

@@ -49,7 +49,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String>
     let normalized_html = html.replace("\r\n", "\n").replace('\r', "\n");
     let clean_html = if options.preprocessing.enabled {
-        sanitizer::sanitize(&normalized_html, &options.preprocessing)?
+        sanitizer::sanitize(&normalized_html, &options.preprocessing, &options.preserve_tags)?
     } else {
         normalized_html
     };
@@ -86,7 +86,7 @@ pub fn convert_with_inline_images(
     let normalized_html = html.replace("\r\n", "\n").replace('\r', "\n");
     let clean_html = if options.preprocessing.enabled {
-        sanitizer::sanitize(&normalized_html, &options.preprocessing)?
+        sanitizer::sanitize(&normalized_html, &options.preprocessing, &options.preserve_tags)?
     } else {
         normalized_html
     };

{html_to_markdown-2.4.2 → html_to_markdown-2.5.0}/crates/html-to-markdown/src/options.rs RENAMED Viewed

@@ -200,6 +200,10 @@ pub struct ConversionOptions {
     /// List of HTML tags to strip (output only text content, no markdown conversion)
     pub strip_tags: Vec<String>,
+    /// List of HTML tags to preserve as-is in the output (keep original HTML)
+    /// Useful for complex elements like tables that don't convert well to Markdown
+    pub preserve_tags: Vec<String>,
 }
 impl Default for ConversionOptions {
@@ -235,6 +239,7 @@ impl Default for ConversionOptions {
             encoding: "utf-8".to_string(),
             debug: false,
             strip_tags: Vec::new(),
+            preserve_tags: Vec::new(),
         }
     }
 }

html-to-markdown 2.4.2__tar.gz → 2.5.0__tar.gz

Potentially problematic release.

html-to-markdown 2.4.2__tar.gz → 2.5.0__tar.gz