RubyGems - html-to-markdown - Versions diffs - 3.4.0.pre.rc.18 → 3.4.0.pre.rc.23 - Mend

html-to-markdown 3.4.0.pre.rc.18 → 3.4.0.pre.rc.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml +4 -4
data/ext/html_to_markdown_rb/Cargo.toml +1 -1
data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +2 -2
data/ext/html_to_markdown_rb/src/html-to-markdown.rb +1 -1
data/ext/html_to_markdown_rb/src/lib.rs +1 -1
data/lib/bin/html-to-markdown +0 -0
data/lib/html_to_markdown/version.rb +1 -1
data/lib/html_to_markdown.rb +6 -8
data/sig/types.rbs +1 -1
data/vendor/Cargo.toml +1 -1
data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
data/vendor/html-to-markdown-rs/src/convert_api.rs +24 -6
data/vendor/html-to-markdown-rs/src/converter/context.rs +3 -1
data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +1 -0
data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +1 -0
data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +1 -0
data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +1 -0
data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +167 -1
data/vendor/html-to-markdown-rs/src/converter/metadata.rs +4 -1
data/vendor/html-to-markdown-rs/src/lib.rs +1 -1
data/vendor/html-to-markdown-rs/src/options/conversion.rs +1 -1
data/vendor/html-to-markdown-rs/src/types/document.rs +26 -21
data/vendor/html-to-markdown-rs/src/types/result.rs +4 -2
data/vendor/html-to-markdown-rs/src/types/tables.rs +11 -6
data/vendor/html-to-markdown-rs/src/types/warnings.rs +6 -3
data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +94 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: cc8a1d8786db49bcecc159f716e78029598e74c19bb88d099b72173cdbc63d8e
-  data.tar.gz: 2b418868a160d659ba96dacbb6ba34cd4850b45e5544c7ec063fe50936f93a04
+  metadata.gz: e6c2c4a533b89d2eb7db4322b77ae6eec7821d2f2babfbd903b5ba3a354425af
+  data.tar.gz: 997c6b7c90856c0554a3876565dd0605ace185736aa7ed614e124b8c2789947b
 SHA512:
-  metadata.gz: c692e682b321b6476f4fbc3d06093929c875e56a6cab3b5ee5b18bf84e8e0f81b9974a025c0a2efff75705c590611843e66b0b8e841efb2879eb6de1ddcf8776
-  data.tar.gz: 5c7f2ad84af0a91cff09d6930c6919d909f9deed7366c6a132f47be59bdef119cb2f3e7a757e6ba559f215b45483029f4cee28f14fd903f0edd0191193e2c625
+  metadata.gz: 196b2bced138d1ee74b1b4972f60cea01a2cf06772efbe13d44ff90222e4093766a661dcde5271dab1d79d9fb671a29365d20b189952ed76f8646f8f0ad1a01f
+  data.tar.gz: 1ebb5633c1a86cc427045fb3db1678b4b9ae720fb1d66bafdfdc8b61d013d01a9d1eccc7959d90463b212ac4ba6bcddea3bfadb5a66244c1e8b76c7e310a9181

data/ext/html_to_markdown_rb/Cargo.toml CHANGED Viewed

@@ -2,7 +2,7 @@
 [package]
 name = "html-to-markdown-rb"
-version = "3.4.0-rc.18"
+version = "3.4.0-rc.23"
 edition = "2024"
 license = "MIT"

data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb CHANGED Viewed

@@ -1,10 +1,10 @@
 # This file is auto-generated by alef — DO NOT EDIT.
-# alef:hash:da62d212c87c1035f7f8160829c65d5c432997a1e928234741a7cea0f0529931
+# alef:hash:048c75ae74b430ffa33441a8dd7241b1bfe520e31a66eb84709b5ede993ee4c8
 # To regenerate: alef generate
 # To verify freshness: alef verify --exit-code
 # Issues & docs: https://github.com/kreuzberg-dev/alef
 # frozen_string_literal: true
 module HtmlToMarkdown
-  VERSION = "3.4.0.pre.rc.18"
+  VERSION = '3.4.0.pre.rc.23'
 end

data/ext/html_to_markdown_rb/src/html-to-markdown.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # This file is auto-generated by alef — DO NOT EDIT.
-# alef:hash:de7c621ce0da78b37e21fdb1d38bbbf5c3259509f57cb0f671732eb28b2b7e56
+# alef:hash:2ddad3a0e4196d0f7824563b5eca866af2d2a475750704444e5ecc0336f8baa6
 # To regenerate: alef generate
 # To verify freshness: alef verify --exit-code
 # Issues & docs: https://github.com/kreuzberg-dev/alef

data/ext/html_to_markdown_rb/src/lib.rs CHANGED Viewed

@@ -1,5 +1,5 @@
 // This file is auto-generated by alef. DO NOT EDIT.
-// alef:hash:05410d54dbc3bf180f287036de010a1a0a1160595b540211d172b1cdd9bb6dff
+// alef:hash:bb6faa37da8d32b19ccd60b267ae8b20d43326587e859f396a9826ab0925d398
 // Re-generate with: alef generate
 #![allow(dead_code, unused_imports, unused_variables)]
 #![allow(

data/lib/bin/html-to-markdown CHANGED Viewed

Binary file

data/lib/html_to_markdown/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module HtmlToMarkdown
-  VERSION = '3.4.0.pre.rc.18'
+  VERSION = '3.4.0.pre.rc.23'
 end

data/lib/html_to_markdown.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require_relative 'html_to_markdown/version'
 require 'html_to_markdown_rb'
+require 'json'
 # High-performance HTML to Markdown conversion.
 #
@@ -29,14 +30,11 @@ module HtmlToMarkdown
   #   (and more, matching ConversionOptions fields)
   # @return [String] The converted Markdown content.
   def self.convert(html, options = {}, visitor = nil)
-    opts = if options.is_a?(HtmlToMarkdownRs::ConversionOptions)
-             options
-           elsif options.nil? || options.empty?
-             nil
-           else
-             HtmlToMarkdownRs::ConversionOptions.new(options)
-           end
-    result = HtmlToMarkdownRs.convert(html, opts, visitor)
+    # The Rust FFI expects options as a JSON string; serialise the hash here
+    # rather than constructing a ConversionOptions object, which the generated
+    # FFI layer cannot coerce back to String (see issue #334).
+    opts_json = options.nil? || options.empty? ? nil : options.to_json
+    result = HtmlToMarkdownRs.convert(html, opts_json, visitor)
     result.content || ''
   end
 end

data/sig/types.rbs CHANGED Viewed

@@ -1,5 +1,5 @@
 # This file is auto-generated by alef — DO NOT EDIT.
-# alef:hash:fa557708df795d5b42dd32042603884cf4e9e96a2609974ffb238997cf8b32b3
+# alef:hash:f0d66ccd989cb158aa2206dc4fc0596d3e4060cbb323372db1418e22598b6c21
 # To regenerate: alef generate
 # To verify freshness: alef verify --exit-code
 # Issues & docs: https://github.com/kreuzberg-dev/alef

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
 resolver = "2"
 [workspace.package]
-version = "3.4.0-rc.18"
+version = "3.4.0-rc.23"
 edition = "2024"
 rust-version = "1.85"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]

data/vendor/html-to-markdown-rs/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "html-to-markdown-rs"
-version = "3.4.0-rc.18"
+version = "3.4.0-rc.23"
 edition = "2024"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
 license = "MIT"

data/vendor/html-to-markdown-rs/src/convert_api.rs CHANGED Viewed

@@ -6,12 +6,26 @@ use std::borrow::Cow;
 use crate::error::Result;
 use crate::options::{ConversionOptions, WhitespaceMode};
+/// The visitor parameter type accepted by [`convert`].
+///
+/// When the `visitor` feature is enabled, this is the full `VisitorHandle`
+/// (a shared reference-counted dyn `HtmlVisitor`). When the feature is off
+/// it degrades to a unit type so that callers can keep a stable 3-arity
+/// `convert(html, options, None)` call signature regardless of feature flags.
+#[cfg(feature = "visitor")]
+pub type VisitorParam = crate::visitor::VisitorHandle;
+#[cfg(not(feature = "visitor"))]
+pub type VisitorParam = ();
+#[cfg(any(feature = "serde", feature = "metadata", feature = "inline-images"))]
+use crate::ConversionError;
+#[cfg(any(feature = "serde", feature = "metadata"))]
+use crate::ConversionOptionsUpdate;
 use crate::text;
 use crate::types::ConversionResult;
 use crate::validation::{Utf16Encoding, detect_utf16_encoding, validate_input};
-use crate::{ConversionError, ConversionOptionsUpdate};
-#[cfg(feature = "inline-images")]
+#[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
 use crate::InlineImageConfig;
 #[cfg(feature = "metadata")]
 use crate::{HtmlMetadata, MetadataConfig};
@@ -40,9 +54,11 @@ use crate::{HtmlMetadata, MetadataConfig};
 pub fn convert(
     html: &str,
     options: Option<ConversionOptions>,
-    #[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
+    visitor: Option<VisitorParam>,
 ) -> Result<ConversionResult> {
+    #[cfg(any(feature = "metadata", feature = "inline-images"))]
     use std::cell::RefCell;
+    #[cfg(any(feature = "metadata", feature = "inline-images"))]
     use std::rc::Rc;
     let options = options.unwrap_or_default();
@@ -100,10 +116,12 @@ pub fn convert(
             None
         };
-    // When the visitor feature is not enabled, there is no visitor parameter.
-    // convert_html_impl expects `Option<()>` in the non-visitor slot.
+    // `convert_html_impl` expects the visitor slot to be `Option<()>` when the visitor
+    // feature is off. We accept `Option<VisitorParam>` (a feature-gated alias) at the
+    // public API — when the feature is off it's `Option<()>`, so `visitor` already has
+    // the right type and we don't need to override it.
     #[cfg(not(feature = "visitor"))]
-    let visitor: Option<()> = None;
+    let _ = visitor.is_some();
     // Run the conversion pipeline.
     // Pass structure_collector by value — convert_html_impl will consume it via Rc::try_unwrap

data/vendor/html-to-markdown-rs/src/converter/context.rs CHANGED Viewed

@@ -6,7 +6,9 @@
 #[cfg(any(feature = "inline-images", feature = "visitor"))]
 use std::cell::RefCell;
-use std::collections::{BTreeMap, HashSet};
+#[cfg(feature = "metadata")]
+use std::collections::BTreeMap;
+use std::collections::HashSet;
 use std::rc::Rc;
 #[cfg(feature = "inline-images")]

data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs CHANGED Viewed

@@ -7,6 +7,7 @@
 //! - Visitor callback integration
 use std::borrow::Cow;
+#[cfg(any(feature = "metadata", feature = "visitor"))]
 use std::collections::BTreeMap;
 use crate::converter::Context;

data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs CHANGED Viewed

@@ -7,6 +7,7 @@
 //! - Visitor callback integration
 use std::borrow::Cow;
+#[cfg(any(feature = "metadata", feature = "inline-images", feature = "visitor"))]
 use std::collections::BTreeMap;
 use crate::converter::Context;

data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs CHANGED Viewed

@@ -8,6 +8,7 @@
 //! - Visitor callback integration
 //! - Link metadata collection
+#[cfg(any(feature = "metadata", feature = "visitor"))]
 use std::collections::BTreeMap;
 use crate::converter::Context;

data/vendor/html-to-markdown-rs/src/converter/inline/link.rs CHANGED Viewed

@@ -14,6 +14,7 @@ use crate::converter::utility::content::collect_tag_attributes;
 use crate::converter::utility::content::{collect_link_label_text, escape_link_label, normalize_link_label};
 use crate::converter::utility::preprocessing::sanitize_markdown_url;
 use crate::options::ConversionOptions;
+#[cfg(any(feature = "metadata", feature = "visitor"))]
 use std::collections::BTreeMap;
 use tl::{NodeHandle, Parser};

data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs CHANGED Viewed

@@ -97,17 +97,183 @@ pub fn has_custom_element_tags(html: &str) -> bool {
     false
 }
+/// HTML5 void elements, which never take an end tag and must NOT be expanded.
+///
+/// For these elements `<br />` is equivalent to `<br>`, so
+/// [`expand_xml_self_closing_tags`] leaves them exactly as written; giving them
+/// an explicit `</br>`-style close tag would be wrong.  Names are stored in
+/// lowercase and compared ASCII case-insensitively.
+const HTML5_VOID_ELEMENTS: &[&str] = &[
+    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr",
+];
+/// Expand XML-style self-closing tags to explicit open+close pairs.
+///
+/// HTML5 does not honour the `/>` self-close syntax for non-void elements.  When
+/// `repair_with_html5ever` re-parses content that contains custom / namespaced tags
+/// written as `<ac:parameter name="foo" />`, the HTML5 parser treats the `/>` as `>`
+/// and leaves the element open.  Subsequent siblings then nest inside it, breaking
+/// visitor pre-order/post-order start/end pairing.
+///
+/// This function scans the input byte-by-byte and rewrites any `<tag ... />` whose
+/// name is not in [`HTML5_VOID_ELEMENTS`] into `<tag ... ></tag>`.  Everything else
+/// is copied through unchanged.
+///
+/// # What is left untouched
+/// - Void elements (`<br/>`, `<IMG ... />`, ...), matched case-insensitively.
+/// - End tags (`</closing>`).
+/// - Comments (`<!-- ... -->`): skipped up to the first `-->`, so a commented-out
+///   `<foo />` is not rewritten.  An unterminated comment swallows the rest of
+///   the input.
+/// - Other `<!...>` declarations (doctype, CDATA) and `<?...>` processing
+///   instructions: skipped up to the first `>`.
+/// - A `<` that is not followed by an ASCII letter (for example the `<` in
+///   `1 < 2`): it is literal text, matching the HTML5 tokenizer, and does not
+///   start a tag.
+///
+/// # Correctness notes
+/// - Only ASCII bytes are ever treated as structural, and every slice boundary
+///   sits next to an ASCII byte, so multi-byte UTF-8 sequences pass through
+///   unmodified and `&input[a..b]` slicing cannot split a character.
+/// - `/>` inside a single- or double-quoted attribute value is ignored.
+/// - The contents of raw-text elements (`<script>`, `<style>`, ...) are NOT
+///   special-cased: markup-looking text inside them is scanned like any other.
+/// - The function is pure and always returns a freshly allocated `String`, even
+///   when no substitution was needed.
+pub fn expand_xml_self_closing_tags(input: &str) -> String {
+    let bytes = input.as_bytes();
+    let len = bytes.len();
+    let mut output = String::with_capacity(len);
+    // Start of the pending span of unmodified input that has not been copied to
+    // `output` yet.  It is flushed lazily: right before an expansion is written,
+    // and once more at the very end.
+    let mut copy_start = 0usize;
+    let mut i = 0;
+    while i < len {
+        if bytes[i] != b'<' {
+            i += 1;
+            continue;
+        }
+        // Step over `<`; `i` now indexes the byte that decides what kind of
+        // markup (if any) this is.  `<` is ASCII, so `i` is a char boundary.
+        i += 1;
+        // Comment: skip to just past the first `-->`.  The search starts at the
+        // `!` so the degenerate forms `<!-->` and `<!--->` terminate correctly.
+        if bytes[i..].starts_with(b"!--") {
+            i = input[i..].find("-->").map_or(len, |pos| i + pos + 3);
+            continue;
+        }
+        match bytes.get(i) {
+            // End tags, doctype/CDATA declarations and processing instructions
+            // are never rewritten: skip to just past the next `>`.
+            Some(b'/' | b'!' | b'?') => {
+                while i < len && bytes[i] != b'>' {
+                    i += 1;
+                }
+                if i < len {
+                    i += 1; // consume `>`
+                }
+                continue;
+            }
+            // A start tag must begin with an ASCII letter.
+            Some(first) if first.is_ascii_alphabetic() => {}
+            // Anything else (whitespace, digit, end of input, ...) means the `<`
+            // was literal text; resume scanning right after it.
+            _ => continue,
+        }
+        // Collect the tag name: everything up to whitespace, `/` or `>`.
+        let name_start = i;
+        while i < len && !matches!(bytes[i], b'>' | b'/') && !bytes[i].is_ascii_whitespace() {
+            i += 1;
+        }
+        let tag_name = &input[name_start..i];
+        let is_void = HTML5_VOID_ELEMENTS.iter().any(|v| v.eq_ignore_ascii_case(tag_name));
+        // Scan the rest of the tag for an unquoted `/>` or `>`.
+        let mut in_single_quote = false;
+        let mut in_double_quote = false;
+        let mut self_closing = false;
+        while i < len {
+            match bytes[i] {
+                b'"' if !in_single_quote => in_double_quote = !in_double_quote,
+                b'\'' if !in_double_quote => in_single_quote = !in_single_quote,
+                b'/' if !in_single_quote && !in_double_quote && bytes.get(i + 1) == Some(&b'>') => {
+                    // Stop ON the `/` so `i` marks the end of the attributes.
+                    self_closing = true;
+                    break;
+                }
+                b'>' if !in_single_quote && !in_double_quote => break,
+                _ => {}
+            }
+            i += 1;
+        }
+        if self_closing {
+            if !is_void {
+                // Copy everything pending, including `<tag attrs` up to (not
+                // including) the `/`, then close the start tag and append an
+                // explicit end tag: `<tag attrs/>` → `<tag attrs></tag>`.
+                output.push_str(&input[copy_start..i]);
+                output.push_str("></");
+                output.push_str(tag_name);
+                output.push('>');
+                copy_start = i + 2;
+            }
+            i += 2; // consume `/>`
+        } else if i < len {
+            i += 1; // consume `>`
+        }
+    }
+    // Flush the remaining unchanged tail.
+    output.push_str(&input[copy_start..]);
+    output
+}
 /// Try to repair HTML using html5ever parser.
 ///
 /// Returns Some(repaired_html) if repair was successful, None otherwise.
+///
+/// Before feeding the input to the HTML5 parser, XML-style self-closing tags on
+/// non-void elements (e.g. `<ac:parameter name="foo" />`) are expanded to explicit
+/// open+close pairs.  This preserves the intended document structure because HTML5
+/// semantics do not honour `/>` on unknown elements — without the expansion, the
+/// element would be left open and subsequent siblings would nest inside it, breaking
+/// visitor start/end event pairing (issue #331).
 pub fn repair_with_html5ever(input: &str) -> Option<String> {
     use crate::rcdom::{RcDom, SerializableHandle};
     use html5ever::serialize::{SerializeOpts, serialize};
     use html5ever::tendril::TendrilSink;
+    // Expand XML-style self-closing on non-void elements before the HTML5 parse so
+    // that `<ac:parameter ... />` is not silently left open by the HTML5 parser.
+    let expanded = expand_xml_self_closing_tags(input);
     let dom = html5ever::parse_document(RcDom::default(), Default::default())
         .from_utf8()
-        .read_from(&mut input.as_bytes())
+        .read_from(&mut expanded.as_bytes())
         .ok()?;
     let mut buf = Vec::with_capacity(input.len());

data/vendor/html-to-markdown-rs/src/converter/metadata.rs CHANGED Viewed

@@ -8,7 +8,9 @@
 use crate::converter::media::svg::serialize_element;
 use crate::options::ConversionOptions;
-use crate::text::{decode_html_entities, escape};
+#[cfg(feature = "metadata")]
+use crate::text::decode_html_entities;
+use crate::text::escape;
 use tl::{NodeHandle, Parser};
 // Type aliases for Context and DomContext to avoid circular imports
@@ -125,6 +127,7 @@ fn handle_head(
 ///
 /// Script elements are processed to extract JSON-LD structured data when
 /// the type is "application/ld+json" and metadata collection is enabled.
+#[cfg_attr(not(feature = "metadata"), allow(unused_variables))]
 fn handle_script(
     node_handle: &NodeHandle,
     parser: &Parser,

data/vendor/html-to-markdown-rs/src/lib.rs CHANGED Viewed

@@ -95,7 +95,7 @@ pub use convert_api::{conversion_options_from_json, conversion_options_update_fr
 #[cfg(feature = "metadata")]
 pub use convert_api::metadata_config_from_json;
-#[cfg(feature = "inline-images")]
+#[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
 pub use convert_api::inline_image_config_from_json;
 // Tests

data/vendor/html-to-markdown-rs/src/options/conversion.rs CHANGED Viewed

@@ -118,7 +118,7 @@ pub struct ConversionOptions {
     /// Invalid selectors are silently skipped at conversion time.
     ///
     /// Example: `vec![".cookie-banner".into(), "#ad-container".into(), "[role='complementary']".into()]`
-    #[serde(default)]
+    #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default))]
     pub exclude_selectors: Vec<String>,
 }

data/vendor/html-to-markdown-rs/src/types/document.rs CHANGED Viewed

@@ -2,6 +2,7 @@
 use std::collections::HashMap;
+#[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 use super::tables::TableGrid;
@@ -9,41 +10,44 @@ use super::tables::TableGrid;
 /// A structured document tree representing the semantic content of an HTML document.
 ///
 /// Uses a flat node array with index-based parent/child references for efficient traversal.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct DocumentStructure {
     /// All nodes in document reading order.
     pub nodes: Vec<DocumentNode>,
     /// The source format (always "html" for this crate).
-    #[serde(skip_serializing_if = "Option::is_none")]
+    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
     pub source_format: Option<String>,
 }
 /// A single node in the document tree.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct DocumentNode {
     /// Deterministic node identifier.
     pub id: String,
     /// The semantic content of this node.
     pub content: NodeContent,
     /// Index of the parent node (None for root nodes).
-    #[serde(skip_serializing_if = "Option::is_none")]
+    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
     pub parent: Option<u32>,
     /// Indices of child nodes in reading order.
-    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Vec::is_empty", default))]
     pub children: Vec<u32>,
     /// Inline formatting annotations (bold, italic, links, etc.) with byte offsets into the text.
-    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Vec::is_empty", default))]
     pub annotations: Vec<TextAnnotation>,
     /// Format-specific attributes (e.g. class, id, data-* attributes).
-    #[serde(skip_serializing_if = "Option::is_none")]
+    #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
     pub attributes: Option<HashMap<String, String>>,
 }
 /// The semantic content type of a document node.
 ///
 /// Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "node_type", rename_all = "snake_case")]
+#[derive(Debug, Clone, PartialEq, Eq)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "serde", serde(tag = "node_type", rename_all = "snake_case"))]
 pub enum NodeContent {
     /// A heading element (h1-h6).
     Heading {
@@ -75,13 +79,13 @@ pub enum NodeContent {
     /// An image element.
     Image {
         /// Alt text or caption.
-        #[serde(skip_serializing_if = "Option::is_none")]
+        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
         description: Option<String>,
         /// Image source URL.
-        #[serde(skip_serializing_if = "Option::is_none")]
+        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
         src: Option<String>,
         /// Index into `ConversionResult.images` when image extraction is enabled.
-        #[serde(skip_serializing_if = "Option::is_none")]
+        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
         image_index: Option<u32>,
     },
     /// A code block or inline code.
@@ -89,7 +93,7 @@ pub enum NodeContent {
         /// The code text content.
         text: String,
         /// Programming language (from class="language-*" or similar).
-        #[serde(skip_serializing_if = "Option::is_none")]
+        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
         language: Option<String>,
     },
     /// A block quote container.
@@ -118,13 +122,13 @@ pub enum NodeContent {
     /// A section grouping container (auto-generated from heading hierarchy).
     Group {
         /// Optional section label.
-        #[serde(skip_serializing_if = "Option::is_none")]
+        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
         label: Option<String>,
         /// The heading level that created this group.
-        #[serde(skip_serializing_if = "Option::is_none")]
+        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
         heading_level: Option<u8>,
         /// The heading text that created this group.
-        #[serde(skip_serializing_if = "Option::is_none")]
+        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
         heading_text: Option<String>,
     },
 }
@@ -132,7 +136,8 @@ pub enum NodeContent {
 /// An inline text annotation with byte-range offsets.
 ///
 /// Annotations describe formatting (bold, italic, etc.) and links within a node's text content.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct TextAnnotation {
     /// Start byte offset (inclusive) into the parent node's text.
     pub start: u32,
@@ -145,9 +150,9 @@ pub struct TextAnnotation {
 /// The type of an inline text annotation.
 ///
 /// Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "annotation_type", rename_all = "snake_case")]
-#[derive(Default)]
+#[derive(Debug, Clone, PartialEq, Eq, Default)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "serde", serde(tag = "annotation_type", rename_all = "snake_case"))]
 pub enum AnnotationKind {
     /// Bold / strong emphasis.
     #[default]
@@ -171,7 +176,7 @@ pub enum AnnotationKind {
         /// The link URL.
         url: String,
         /// Optional link title attribute.
-        #[serde(skip_serializing_if = "Option::is_none")]
+        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
         title: Option<String>,
     },
 }

data/vendor/html-to-markdown-rs/src/types/result.rs CHANGED Viewed

@@ -1,5 +1,6 @@
 //! The primary result type for HTML conversion and extraction.
+#[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 use super::document::DocumentStructure;
@@ -20,7 +21,8 @@ use super::warnings::ProcessingWarning;
 /// assert!(result.content.is_some());
 /// assert!(result.warnings.is_empty());
 /// ```
-#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[derive(Debug, Clone, Default)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct ConversionResult {
     /// Converted text output (markdown, djot, or plain text).
     ///
@@ -44,7 +46,7 @@ pub struct ConversionResult {
     ///
     /// Populated when `extract_images` is `true` in options.
     #[cfg(feature = "inline-images")]
-    #[serde(skip)]
+    #[cfg_attr(feature = "serde", serde(skip))]
     pub images: Vec<crate::inline_images::InlineImage>,
     /// Non-fatal processing warnings.

data/vendor/html-to-markdown-rs/src/types/tables.rs CHANGED Viewed

@@ -1,9 +1,11 @@
 //! Structured table types aligned with kreuzberg's `TableGrid`.
+#[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 /// A structured table grid with cell-level data including spans.
-#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct TableGrid {
     /// Number of rows.
     pub rows: u32,
@@ -14,7 +16,8 @@ pub struct TableGrid {
 }
 /// A single cell in a table grid.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct GridCell {
     /// The text content of the cell.
     pub content: String,
@@ -23,22 +26,24 @@ pub struct GridCell {
     /// 0-indexed column position.
     pub col: u32,
     /// Number of rows this cell spans (default 1).
-    #[serde(default = "default_span")]
+    #[cfg_attr(feature = "serde", serde(default = "default_span"))]
     pub row_span: u32,
     /// Number of columns this cell spans (default 1).
-    #[serde(default = "default_span")]
+    #[cfg_attr(feature = "serde", serde(default = "default_span"))]
     pub col_span: u32,
     /// Whether this is a header cell (`<th>`).
-    #[serde(default)]
+    #[cfg_attr(feature = "serde", serde(default))]
     pub is_header: bool,
 }
+#[cfg(feature = "serde")]
 fn default_span() -> u32 {
     1
 }
 /// A top-level extracted table with both structured data and markdown representation.
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct TableData {
     /// The structured table grid.
     pub grid: TableGrid,

data/vendor/html-to-markdown-rs/src/types/warnings.rs CHANGED Viewed

@@ -1,9 +1,11 @@
 //! Processing warning types for non-fatal issues during conversion.
+#[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 /// A non-fatal warning generated during HTML processing.
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub struct ProcessingWarning {
     /// Human-readable warning message.
     pub message: String,
@@ -12,8 +14,9 @@ pub struct ProcessingWarning {
 }
 /// Categories of processing warnings.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(rename_all = "snake_case")]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))]
 pub enum WarningKind {
     /// An image could not be extracted (e.g. invalid data URI, unsupported format).
     ImageExtractionFailed,

data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs CHANGED Viewed

@@ -1002,3 +1002,97 @@ fn test_element_end_replacement_with_metadata_preserves_subsequent_content() {
         "content after replaced element should not be lost"
     );
 }
+/// Regression test for issue #331: start/end visitor events must stay properly
+/// paired when a hyphenated custom element contains an XML-style self-closing child.
+///
+/// The hyphen in the outer tag name triggers the `repair_with_html5ever` fallback,
+/// which re-parses with HTML5 semantics.  HTML5 does NOT honour `/>` on unknown
+/// elements, so `<ac:parameter ac:name="foo" />` used to be left open and the
+/// following sibling was nested inside it.  The visitor then saw two consecutive
+/// `visit_element_start("ac:parameter")` calls followed by both ends in reversed
+/// order, instead of start/end, start/end.
+#[test]
+fn test_issue_331_hyphenated_tags_xml_self_closing_visitor_events() {
+    /// Visitor that logs every element start/end callback in call order.
+    #[derive(Debug, Default)]
+    struct EventRecorder {
+        events: Vec<String>,
+    }
+    impl HtmlVisitor for EventRecorder {
+        fn visit_element_start(&mut self, ctx: &NodeContext) -> VisitResult {
+            self.events.push(format!("start({})", ctx.tag_name));
+            VisitResult::Continue
+        }
+        fn visit_element_end(&mut self, ctx: &NodeContext, _output: &str) -> VisitResult {
+            self.events.push(format!("end({})", ctx.tag_name));
+            VisitResult::Continue
+        }
+    }
+    let html = r#"
+<structured-macro>
+  <ac:parameter ac:name="foo" />
+  <ac:parameter ac:name="quux">lalaland</ac:parameter>
+</structured-macro>
+"#;
+    let recorder = Rc::new(RefCell::new(EventRecorder::default()));
+    let result = convert(html, None, Some(recorder.clone()));
+    assert!(result.is_ok(), "conversion should succeed: {:?}", result.err());
+    let events = recorder.borrow().events.clone();
+    // Expected order with correct self-closing handling:
+    //   start[foo] → end[foo] → start[quux] → end[quux]
+    // Order produced by the bug (`/>` treated as a plain open tag):
+    //   start[foo] → start[quux] → end[quux] → end[foo]
+    //
+    // Indices (into `events`) of every event whose text begins with `prefix`.
+    let positions_of = |prefix: &str| -> Vec<usize> {
+        events
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, event)| event.starts_with(prefix).then_some(idx))
+            .collect()
+    };
+    let ac_param_starts = positions_of("start(ac:parameter)");
+    let ac_param_ends = positions_of("end(ac:parameter)");
+    assert_eq!(
+        ac_param_starts.len(),
+        2,
+        "expected exactly 2 ac:parameter start events, got: {events:?}"
+    );
+    assert_eq!(
+        ac_param_ends.len(),
+        2,
+        "expected exactly 2 ac:parameter end events, got: {events:?}"
+    );
+    // Required strict ordering: start[0] < end[0] < start[1] < end[1].
+    let (first_start, second_start) = (ac_param_starts[0], ac_param_starts[1]);
+    let (first_end, second_end) = (ac_param_ends[0], ac_param_ends[1]);
+    assert!(
+        first_start < first_end,
+        "first ac:parameter: start must precede end (got start@{}, end@{}); events: {events:?}",
+        first_start,
+        first_end,
+    );
+    assert!(
+        first_end < second_start,
+        "first ac:parameter end must precede second ac:parameter start (got end@{}, start@{}); events: {events:?}",
+        first_end,
+        second_start,
+    );
+    assert!(
+        second_start < second_end,
+        "second ac:parameter: start must precede end (got start@{}, end@{}); events: {events:?}",
+        second_start,
+        second_end,
+    );
+}

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html-to-markdown
 version: !ruby/object:Gem::Version
-  version: 3.4.0.pre.rc.18
+  version: 3.4.0.pre.rc.23
 platform: ruby
 authors:
 - Kreuzberg Team
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-04-29 00:00:00.000000000 Z
+date: 2026-05-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys