html-to-markdown 3.4.0.pre.rc.18 → 3.4.0.pre.rc.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cc8a1d8786db49bcecc159f716e78029598e74c19bb88d099b72173cdbc63d8e
4
- data.tar.gz: 2b418868a160d659ba96dacbb6ba34cd4850b45e5544c7ec063fe50936f93a04
3
+ metadata.gz: e6c2c4a533b89d2eb7db4322b77ae6eec7821d2f2babfbd903b5ba3a354425af
4
+ data.tar.gz: 997c6b7c90856c0554a3876565dd0605ace185736aa7ed614e124b8c2789947b
5
5
  SHA512:
6
- metadata.gz: c692e682b321b6476f4fbc3d06093929c875e56a6cab3b5ee5b18bf84e8e0f81b9974a025c0a2efff75705c590611843e66b0b8e841efb2879eb6de1ddcf8776
7
- data.tar.gz: 5c7f2ad84af0a91cff09d6930c6919d909f9deed7366c6a132f47be59bdef119cb2f3e7a757e6ba559f215b45483029f4cee28f14fd903f0edd0191193e2c625
6
+ metadata.gz: 196b2bced138d1ee74b1b4972f60cea01a2cf06772efbe13d44ff90222e4093766a661dcde5271dab1d79d9fb671a29365d20b189952ed76f8646f8f0ad1a01f
7
+ data.tar.gz: 1ebb5633c1a86cc427045fb3db1678b4b9ae720fb1d66bafdfdc8b61d013d01a9d1eccc7959d90463b212ac4ba6bcddea3bfadb5a66244c1e8b76c7e310a9181
@@ -2,7 +2,7 @@
2
2
 
3
3
  [package]
4
4
  name = "html-to-markdown-rb"
5
- version = "3.4.0-rc.18"
5
+ version = "3.4.0-rc.23"
6
6
  edition = "2024"
7
7
  license = "MIT"
8
8
 
@@ -1,10 +1,10 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:da62d212c87c1035f7f8160829c65d5c432997a1e928234741a7cea0f0529931
2
+ # alef:hash:048c75ae74b430ffa33441a8dd7241b1bfe520e31a66eb84709b5ede993ee4c8
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # Issues & docs: https://github.com/kreuzberg-dev/alef
6
6
  # frozen_string_literal: true
7
7
 
8
8
  module HtmlToMarkdown
9
- VERSION = "3.4.0.pre.rc.18"
9
+ VERSION = '3.4.0.pre.rc.23'
10
10
  end
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:de7c621ce0da78b37e21fdb1d38bbbf5c3259509f57cb0f671732eb28b2b7e56
2
+ # alef:hash:2ddad3a0e4196d0f7824563b5eca866af2d2a475750704444e5ecc0336f8baa6
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # Issues & docs: https://github.com/kreuzberg-dev/alef
@@ -1,5 +1,5 @@
1
1
  // This file is auto-generated by alef. DO NOT EDIT.
2
- // alef:hash:05410d54dbc3bf180f287036de010a1a0a1160595b540211d172b1cdd9bb6dff
2
+ // alef:hash:bb6faa37da8d32b19ccd60b267ae8b20d43326587e859f396a9826ab0925d398
3
3
  // Re-generate with: alef generate
4
4
  #![allow(dead_code, unused_imports, unused_variables)]
5
5
  #![allow(
Binary file
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '3.4.0.pre.rc.18'
4
+ VERSION = '3.4.0.pre.rc.23'
5
5
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative 'html_to_markdown/version'
4
4
  require 'html_to_markdown_rb'
5
+ require 'json'
5
6
 
6
7
  # High-performance HTML to Markdown conversion.
7
8
  #
@@ -29,14 +30,11 @@ module HtmlToMarkdown
29
30
  # (and more, matching ConversionOptions fields)
30
31
  # @return [String] The converted Markdown content.
31
32
  def self.convert(html, options = {}, visitor = nil)
32
- opts = if options.is_a?(HtmlToMarkdownRs::ConversionOptions)
33
- options
34
- elsif options.nil? || options.empty?
35
- nil
36
- else
37
- HtmlToMarkdownRs::ConversionOptions.new(options)
38
- end
39
- result = HtmlToMarkdownRs.convert(html, opts, visitor)
33
+ # The Rust FFI expects options as a JSON string; serialise the hash here
34
+ # rather than constructing a ConversionOptions object, which the generated
35
+ # FFI layer cannot coerce back to String (see issue #334).
36
+ opts_json = options.nil? || options.empty? ? nil : options.to_json
37
+ result = HtmlToMarkdownRs.convert(html, opts_json, visitor)
40
38
  result.content || ''
41
39
  end
42
40
  end
data/sig/types.rbs CHANGED
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:fa557708df795d5b42dd32042603884cf4e9e96a2609974ffb238997cf8b32b3
2
+ # alef:hash:f0d66ccd989cb158aa2206dc4fc0596d3e4060cbb323372db1418e22598b6c21
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # Issues & docs: https://github.com/kreuzberg-dev/alef
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "3.4.0-rc.18"
6
+ version = "3.4.0-rc.23"
7
7
  edition = "2024"
8
8
  rust-version = "1.85"
9
9
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "3.4.0-rc.18"
3
+ version = "3.4.0-rc.23"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -6,12 +6,26 @@ use std::borrow::Cow;
6
6
 
7
7
  use crate::error::Result;
8
8
  use crate::options::{ConversionOptions, WhitespaceMode};
9
+
10
+ /// The visitor parameter type accepted by [`convert`].
11
+ ///
12
+ /// When the `visitor` feature is enabled, this is the full `VisitorHandle`
13
+ /// (a shared reference-counted dyn `HtmlVisitor`). When the feature is off
14
+ /// it degrades to a unit type so that callers can keep a stable 3-arity
15
+ /// `convert(html, options, None)` call signature regardless of feature flags.
16
+ #[cfg(feature = "visitor")]
17
+ pub type VisitorParam = crate::visitor::VisitorHandle;
18
+ #[cfg(not(feature = "visitor"))]
19
+ pub type VisitorParam = ();
20
+ #[cfg(any(feature = "serde", feature = "metadata", feature = "inline-images"))]
21
+ use crate::ConversionError;
22
+ #[cfg(any(feature = "serde", feature = "metadata"))]
23
+ use crate::ConversionOptionsUpdate;
9
24
  use crate::text;
10
25
  use crate::types::ConversionResult;
11
26
  use crate::validation::{Utf16Encoding, detect_utf16_encoding, validate_input};
12
- use crate::{ConversionError, ConversionOptionsUpdate};
13
27
 
14
- #[cfg(feature = "inline-images")]
28
+ #[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
15
29
  use crate::InlineImageConfig;
16
30
  #[cfg(feature = "metadata")]
17
31
  use crate::{HtmlMetadata, MetadataConfig};
@@ -40,9 +54,11 @@ use crate::{HtmlMetadata, MetadataConfig};
40
54
  pub fn convert(
41
55
  html: &str,
42
56
  options: Option<ConversionOptions>,
43
- #[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
57
+ visitor: Option<VisitorParam>,
44
58
  ) -> Result<ConversionResult> {
59
+ #[cfg(any(feature = "metadata", feature = "inline-images"))]
45
60
  use std::cell::RefCell;
61
+ #[cfg(any(feature = "metadata", feature = "inline-images"))]
46
62
  use std::rc::Rc;
47
63
 
48
64
  let options = options.unwrap_or_default();
@@ -100,10 +116,12 @@ pub fn convert(
100
116
  None
101
117
  };
102
118
 
103
- // When the visitor feature is not enabled, there is no visitor parameter.
104
- // convert_html_impl expects `Option<()>` in the non-visitor slot.
119
+ // `convert_html_impl` expects the visitor slot to be `Option<()>` when the visitor
120
+ // feature is off. We accept `Option<VisitorParam>` (a feature-gated alias) at the
121
+ // public API — when the feature is off it's `Option<()>`, so `visitor` already has
122
+ // the right type and we don't need to override it.
105
123
  #[cfg(not(feature = "visitor"))]
106
- let visitor: Option<()> = None;
124
+ let _ = visitor.is_some();
107
125
 
108
126
  // Run the conversion pipeline.
109
127
  // Pass structure_collector by value — convert_html_impl will consume it via Rc::try_unwrap
@@ -6,7 +6,9 @@
6
6
 
7
7
  #[cfg(any(feature = "inline-images", feature = "visitor"))]
8
8
  use std::cell::RefCell;
9
- use std::collections::{BTreeMap, HashSet};
9
+ #[cfg(feature = "metadata")]
10
+ use std::collections::BTreeMap;
11
+ use std::collections::HashSet;
10
12
  use std::rc::Rc;
11
13
 
12
14
  #[cfg(feature = "inline-images")]
@@ -7,6 +7,7 @@
7
7
  //! - Visitor callback integration
8
8
 
9
9
  use std::borrow::Cow;
10
+ #[cfg(any(feature = "metadata", feature = "visitor"))]
10
11
  use std::collections::BTreeMap;
11
12
 
12
13
  use crate::converter::Context;
@@ -7,6 +7,7 @@
7
7
  //! - Visitor callback integration
8
8
 
9
9
  use std::borrow::Cow;
10
+ #[cfg(any(feature = "metadata", feature = "inline-images", feature = "visitor"))]
10
11
  use std::collections::BTreeMap;
11
12
 
12
13
  use crate::converter::Context;
@@ -8,6 +8,7 @@
8
8
  //! - Visitor callback integration
9
9
  //! - Link metadata collection
10
10
 
11
+ #[cfg(any(feature = "metadata", feature = "visitor"))]
11
12
  use std::collections::BTreeMap;
12
13
 
13
14
  use crate::converter::Context;
@@ -14,6 +14,7 @@ use crate::converter::utility::content::collect_tag_attributes;
14
14
  use crate::converter::utility::content::{collect_link_label_text, escape_link_label, normalize_link_label};
15
15
  use crate::converter::utility::preprocessing::sanitize_markdown_url;
16
16
  use crate::options::ConversionOptions;
17
+ #[cfg(any(feature = "metadata", feature = "visitor"))]
17
18
  use std::collections::BTreeMap;
18
19
  use tl::{NodeHandle, Parser};
19
20
 
@@ -97,17 +97,183 @@ pub fn has_custom_element_tags(html: &str) -> bool {
97
97
  false
98
98
  }
99
99
 
/// HTML5 void elements that are self-closing by spec and must NOT be expanded.
///
/// These elements never have an end tag in HTML5, so `<br />` is equivalent to
/// `<br>`. They are left untouched by [`expand_xml_self_closing_tags`] because
/// adding an explicit `</br>`-style close tag would change the document.
const HTML5_VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr",
];

/// Position of the scanner inside the attribute list of a start tag.
///
/// Mirrors the parts of the HTML5 tokenizer that decide whether a `/` or `>`
/// byte is structural or merely part of an attribute value.
#[derive(Clone, Copy, PartialEq, Eq)]
enum AttrScan {
    /// Between attributes or inside an attribute name.
    Outside,
    /// Just after `=`, waiting for the attribute value to begin.
    BeforeValue,
    /// Inside an unquoted value; only whitespace or `>` terminates it.
    Unquoted,
    /// Inside a quoted value delimited by the stored quote byte.
    Quoted(u8),
}

/// Returns the index just past the first `target` byte at or after `from`,
/// or `bytes.len()` when `target` does not occur.
fn skip_past(bytes: &[u8], from: usize, target: u8) -> usize {
    bytes[from..]
        .iter()
        .position(|&b| b == target)
        .map_or(bytes.len(), |offset| from + offset + 1)
}

/// Locates the end of a start tag whose attribute list begins at `start`.
///
/// Returns `(index, self_closing)`:
/// - `(i, true)`  — `bytes[i..i + 2]` is the structural `/>` that ends the tag;
/// - `(i, false)` — `bytes[i]` is the `>` that ends the tag, or `i == bytes.len()`
///   when the tag is never terminated.
///
/// `/` and `>` inside quoted values, and `/` inside unquoted values, are not
/// structural and are skipped.
fn find_tag_end(bytes: &[u8], start: usize) -> (usize, bool) {
    let mut state = AttrScan::Outside;
    let mut i = start;

    while i < bytes.len() {
        let b = bytes[i];
        state = match state {
            AttrScan::Quoted(quote) => {
                if b == quote {
                    AttrScan::Outside
                } else {
                    state
                }
            }
            AttrScan::Unquoted => match b {
                b'>' => return (i, false),
                _ if b.is_ascii_whitespace() => AttrScan::Outside,
                // Includes `/`: in an unquoted value it is ordinary data.
                _ => state,
            },
            AttrScan::BeforeValue => match b {
                b'>' => return (i, false),
                b'"' | b'\'' => AttrScan::Quoted(b),
                _ if b.is_ascii_whitespace() => state,
                _ => AttrScan::Unquoted,
            },
            AttrScan::Outside => match b {
                b'>' => return (i, false),
                b'/' if bytes.get(i + 1) == Some(&b'>') => return (i, true),
                b'=' => AttrScan::BeforeValue,
                _ => state,
            },
        };
        i += 1;
    }

    (bytes.len(), false)
}

/// Expand XML-style self-closing tags to explicit open+close pairs.
///
/// HTML5 does not honour the `/>` self-close syntax for non-void elements: a
/// tag written as `<ac:parameter name="foo" />` is tokenized as a plain start
/// tag and the element stays open, so following siblings nest inside it. This
/// function rewrites every `<tag ... />` whose name is not in
/// [`HTML5_VOID_ELEMENTS`] into `<tag ...></tag>` before the input is handed to
/// an HTML5 parser. The bytes between the tag name and `/>` are preserved
/// verbatim, so `<x a="1" />` becomes `<x a="1" ></x>`.
///
/// # What is left untouched
/// - Void elements (`<br/>`, `<img ... />`, …), matched case-insensitively.
/// - End tags (`</name>`).
/// - Comments (`<!-- ... -->`), other `<!...>` declarations and `<?...>`
///   processing instructions: their content is never scanned for tags.
/// - A `<` that is not immediately followed by an ASCII letter (for example
///   `1 < 2`): HTML5 treats it as literal text, and so does this function.
/// - Tags whose trailing `/` belongs to an unquoted attribute value
///   (`<a href=/docs/>`): in HTML5 that tag is not self-closing.
/// - `/>` and `>` inside quoted attribute values.
///
/// # Notes
/// - Scanning is byte-wise, but every slice boundary falls on an ASCII byte, so
///   multi-byte UTF-8 sequences are copied through unmodified.
/// - The content of raw-text elements such as `<script>` is not special-cased;
///   tag-like text inside them is processed like any other markup.
/// - A new `String` is always returned, even when nothing was rewritten.
pub fn expand_xml_self_closing_tags(input: &str) -> String {
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut output = String::with_capacity(len);
    // Start of the input span that has not been copied to `output` yet.
    let mut copy_start = 0usize;
    let mut i = 0usize;

    while i < len {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }

        let tag_open = i;
        i += 1;

        match bytes.get(i).copied() {
            // End tag: never rewritten, skip to its `>`.
            Some(b'/') => {
                i = skip_past(bytes, i, b'>');
                continue;
            }
            // Comment, declaration or processing instruction: skip it whole so
            // that tag-like text inside it is not rewritten.
            Some(b'!') | Some(b'?') => {
                i = if bytes[tag_open..].starts_with(b"<!--") {
                    // Search from the opener's own dashes so the degenerate
                    // forms `<!-->` and `<!--->` terminate immediately. An
                    // unterminated comment swallows the rest of the input.
                    let body_start = tag_open + 2;
                    bytes[body_start..]
                        .windows(3)
                        .position(|w| w == b"-->")
                        .map_or(len, |offset| body_start + offset + 3)
                } else {
                    skip_past(bytes, i, b'>')
                };
                continue;
            }
            // A start tag must begin with an ASCII letter.
            Some(first) if first.is_ascii_alphabetic() => {}
            // Literal `<` (followed by whitespace, a digit, end of input, …):
            // leave it alone and resume scanning right after it.
            _ => continue,
        }

        // Tag name: runs until whitespace, `/` or `>`. Both ends sit next to
        // ASCII bytes, so slicing `input` here cannot split a UTF-8 sequence.
        let name_start = i;
        while i < len && !matches!(bytes[i], b'>' | b'/') && !bytes[i].is_ascii_whitespace() {
            i += 1;
        }
        let tag_name = &input[name_start..i];
        let is_void = HTML5_VOID_ELEMENTS.iter().any(|v| v.eq_ignore_ascii_case(tag_name));

        let (tag_end, self_closing) = find_tag_end(bytes, i);

        if self_closing && !is_void {
            // Copy everything up to (not including) `/>`, i.e. pending
            // unmodified input plus `<name attrs`, then close explicitly.
            output.push_str(&input[copy_start..tag_end]);
            output.push_str("></");
            output.push_str(tag_name);
            output.push('>');

            i = tag_end + 2; // consume `/>`
            copy_start = i;
        } else if self_closing {
            // Void element written as `<br/>`: keep verbatim.
            i = tag_end + 2;
        } else {
            // Ordinary start tag (or unterminated tag at end of input).
            i = (tag_end + 1).min(len);
        }
    }

    // Flush the remaining unchanged tail.
    output.push_str(&input[copy_start..]);
    output
}
254
+
100
255
  /// Try to repair HTML using html5ever parser.
101
256
  ///
102
257
  /// Returns Some(repaired_html) if repair was successful, None otherwise.
258
+ ///
259
+ /// Before feeding the input to the HTML5 parser, XML-style self-closing tags on
260
+ /// non-void elements (e.g. `<ac:parameter name="foo" />`) are expanded to explicit
261
+ /// open+close pairs. This preserves the intended document structure because HTML5
262
+ /// semantics do not honour `/>` on unknown elements — without the expansion, the
263
+ /// element would be left open and subsequent siblings would nest inside it, breaking
264
+ /// visitor start/end event pairing (issue #331).
103
265
  pub fn repair_with_html5ever(input: &str) -> Option<String> {
104
266
  use crate::rcdom::{RcDom, SerializableHandle};
105
267
  use html5ever::serialize::{SerializeOpts, serialize};
106
268
  use html5ever::tendril::TendrilSink;
107
269
 
270
+ // Expand XML-style self-closing on non-void elements before the HTML5 parse so
271
+ // that `<ac:parameter ... />` is not silently left open by the HTML5 parser.
272
+ let expanded = expand_xml_self_closing_tags(input);
273
+
108
274
  let dom = html5ever::parse_document(RcDom::default(), Default::default())
109
275
  .from_utf8()
110
- .read_from(&mut input.as_bytes())
276
+ .read_from(&mut expanded.as_bytes())
111
277
  .ok()?;
112
278
 
113
279
  let mut buf = Vec::with_capacity(input.len());
@@ -8,7 +8,9 @@
8
8
 
9
9
  use crate::converter::media::svg::serialize_element;
10
10
  use crate::options::ConversionOptions;
11
- use crate::text::{decode_html_entities, escape};
11
+ #[cfg(feature = "metadata")]
12
+ use crate::text::decode_html_entities;
13
+ use crate::text::escape;
12
14
  use tl::{NodeHandle, Parser};
13
15
 
14
16
  // Type aliases for Context and DomContext to avoid circular imports
@@ -125,6 +127,7 @@ fn handle_head(
125
127
  ///
126
128
  /// Script elements are processed to extract JSON-LD structured data when
127
129
  /// the type is "application/ld+json" and metadata collection is enabled.
130
+ #[cfg_attr(not(feature = "metadata"), allow(unused_variables))]
128
131
  fn handle_script(
129
132
  node_handle: &NodeHandle,
130
133
  parser: &Parser,
@@ -95,7 +95,7 @@ pub use convert_api::{conversion_options_from_json, conversion_options_update_fr
95
95
  #[cfg(feature = "metadata")]
96
96
  pub use convert_api::metadata_config_from_json;
97
97
 
98
- #[cfg(feature = "inline-images")]
98
+ #[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
99
99
  pub use convert_api::inline_image_config_from_json;
100
100
 
101
101
  // Tests
@@ -118,7 +118,7 @@ pub struct ConversionOptions {
118
118
  /// Invalid selectors are silently skipped at conversion time.
119
119
  ///
120
120
  /// Example: `vec![".cookie-banner".into(), "#ad-container".into(), "[role='complementary']".into()]`
121
- #[serde(default)]
121
+ #[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default))]
122
122
  pub exclude_selectors: Vec<String>,
123
123
  }
124
124
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  use std::collections::HashMap;
4
4
 
5
+ #[cfg(feature = "serde")]
5
6
  use serde::{Deserialize, Serialize};
6
7
 
7
8
  use super::tables::TableGrid;
@@ -9,41 +10,44 @@ use super::tables::TableGrid;
9
10
  /// A structured document tree representing the semantic content of an HTML document.
10
11
  ///
11
12
  /// Uses a flat node array with index-based parent/child references for efficient traversal.
12
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
13
+ #[derive(Debug, Clone, PartialEq, Eq)]
14
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
13
15
  pub struct DocumentStructure {
14
16
  /// All nodes in document reading order.
15
17
  pub nodes: Vec<DocumentNode>,
16
18
  /// The source format (always "html" for this crate).
17
- #[serde(skip_serializing_if = "Option::is_none")]
19
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
18
20
  pub source_format: Option<String>,
19
21
  }
20
22
 
21
23
  /// A single node in the document tree.
22
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
24
+ #[derive(Debug, Clone, PartialEq, Eq)]
25
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
23
26
  pub struct DocumentNode {
24
27
  /// Deterministic node identifier.
25
28
  pub id: String,
26
29
  /// The semantic content of this node.
27
30
  pub content: NodeContent,
28
31
  /// Index of the parent node (None for root nodes).
29
- #[serde(skip_serializing_if = "Option::is_none")]
32
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
30
33
  pub parent: Option<u32>,
31
34
  /// Indices of child nodes in reading order.
32
- #[serde(skip_serializing_if = "Vec::is_empty", default)]
35
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Vec::is_empty", default))]
33
36
  pub children: Vec<u32>,
34
37
  /// Inline formatting annotations (bold, italic, links, etc.) with byte offsets into the text.
35
- #[serde(skip_serializing_if = "Vec::is_empty", default)]
38
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Vec::is_empty", default))]
36
39
  pub annotations: Vec<TextAnnotation>,
37
40
  /// Format-specific attributes (e.g. class, id, data-* attributes).
38
- #[serde(skip_serializing_if = "Option::is_none")]
41
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
39
42
  pub attributes: Option<HashMap<String, String>>,
40
43
  }
41
44
 
42
45
  /// The semantic content type of a document node.
43
46
  ///
44
47
  /// Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
45
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
46
- #[serde(tag = "node_type", rename_all = "snake_case")]
48
+ #[derive(Debug, Clone, PartialEq, Eq)]
49
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
50
+ #[cfg_attr(feature = "serde", serde(tag = "node_type", rename_all = "snake_case"))]
47
51
  pub enum NodeContent {
48
52
  /// A heading element (h1-h6).
49
53
  Heading {
@@ -75,13 +79,13 @@ pub enum NodeContent {
75
79
  /// An image element.
76
80
  Image {
77
81
  /// Alt text or caption.
78
- #[serde(skip_serializing_if = "Option::is_none")]
82
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
79
83
  description: Option<String>,
80
84
  /// Image source URL.
81
- #[serde(skip_serializing_if = "Option::is_none")]
85
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
82
86
  src: Option<String>,
83
87
  /// Index into `ConversionResult.images` when image extraction is enabled.
84
- #[serde(skip_serializing_if = "Option::is_none")]
88
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
85
89
  image_index: Option<u32>,
86
90
  },
87
91
  /// A code block or inline code.
@@ -89,7 +93,7 @@ pub enum NodeContent {
89
93
  /// The code text content.
90
94
  text: String,
91
95
  /// Programming language (from class="language-*" or similar).
92
- #[serde(skip_serializing_if = "Option::is_none")]
96
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
93
97
  language: Option<String>,
94
98
  },
95
99
  /// A block quote container.
@@ -118,13 +122,13 @@ pub enum NodeContent {
118
122
  /// A section grouping container (auto-generated from heading hierarchy).
119
123
  Group {
120
124
  /// Optional section label.
121
- #[serde(skip_serializing_if = "Option::is_none")]
125
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
122
126
  label: Option<String>,
123
127
  /// The heading level that created this group.
124
- #[serde(skip_serializing_if = "Option::is_none")]
128
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
125
129
  heading_level: Option<u8>,
126
130
  /// The heading text that created this group.
127
- #[serde(skip_serializing_if = "Option::is_none")]
131
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
128
132
  heading_text: Option<String>,
129
133
  },
130
134
  }
@@ -132,7 +136,8 @@ pub enum NodeContent {
132
136
  /// An inline text annotation with byte-range offsets.
133
137
  ///
134
138
  /// Annotations describe formatting (bold, italic, etc.) and links within a node's text content.
135
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
139
+ #[derive(Debug, Clone, PartialEq, Eq)]
140
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
136
141
  pub struct TextAnnotation {
137
142
  /// Start byte offset (inclusive) into the parent node's text.
138
143
  pub start: u32,
@@ -145,9 +150,9 @@ pub struct TextAnnotation {
145
150
  /// The type of an inline text annotation.
146
151
  ///
147
152
  /// Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
148
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
149
- #[serde(tag = "annotation_type", rename_all = "snake_case")]
150
- #[derive(Default)]
153
+ #[derive(Debug, Clone, PartialEq, Eq, Default)]
154
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
155
+ #[cfg_attr(feature = "serde", serde(tag = "annotation_type", rename_all = "snake_case"))]
151
156
  pub enum AnnotationKind {
152
157
  /// Bold / strong emphasis.
153
158
  #[default]
@@ -171,7 +176,7 @@ pub enum AnnotationKind {
171
176
  /// The link URL.
172
177
  url: String,
173
178
  /// Optional link title attribute.
174
- #[serde(skip_serializing_if = "Option::is_none")]
179
+ #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
175
180
  title: Option<String>,
176
181
  },
177
182
  }
@@ -1,5 +1,6 @@
1
1
  //! The primary result type for HTML conversion and extraction.
2
2
 
3
+ #[cfg(feature = "serde")]
3
4
  use serde::{Deserialize, Serialize};
4
5
 
5
6
  use super::document::DocumentStructure;
@@ -20,7 +21,8 @@ use super::warnings::ProcessingWarning;
20
21
  /// assert!(result.content.is_some());
21
22
  /// assert!(result.warnings.is_empty());
22
23
  /// ```
23
- #[derive(Debug, Clone, Default, Serialize, Deserialize)]
24
+ #[derive(Debug, Clone, Default)]
25
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
24
26
  pub struct ConversionResult {
25
27
  /// Converted text output (markdown, djot, or plain text).
26
28
  ///
@@ -44,7 +46,7 @@ pub struct ConversionResult {
44
46
  ///
45
47
  /// Populated when `extract_images` is `true` in options.
46
48
  #[cfg(feature = "inline-images")]
47
- #[serde(skip)]
49
+ #[cfg_attr(feature = "serde", serde(skip))]
48
50
  pub images: Vec<crate::inline_images::InlineImage>,
49
51
 
50
52
  /// Non-fatal processing warnings.
@@ -1,9 +1,11 @@
1
1
  //! Structured table types aligned with kreuzberg's `TableGrid`.
2
2
 
3
+ #[cfg(feature = "serde")]
3
4
  use serde::{Deserialize, Serialize};
4
5
 
5
6
  /// A structured table grid with cell-level data including spans.
6
- #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
7
+ #[derive(Debug, Clone, Default, PartialEq, Eq)]
8
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
7
9
  pub struct TableGrid {
8
10
  /// Number of rows.
9
11
  pub rows: u32,
@@ -14,7 +16,8 @@ pub struct TableGrid {
14
16
  }
15
17
 
16
18
  /// A single cell in a table grid.
17
- #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
19
+ #[derive(Debug, Clone, PartialEq, Eq)]
20
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
18
21
  pub struct GridCell {
19
22
  /// The text content of the cell.
20
23
  pub content: String,
@@ -23,22 +26,24 @@ pub struct GridCell {
23
26
  /// 0-indexed column position.
24
27
  pub col: u32,
25
28
  /// Number of rows this cell spans (default 1).
26
- #[serde(default = "default_span")]
29
+ #[cfg_attr(feature = "serde", serde(default = "default_span"))]
27
30
  pub row_span: u32,
28
31
  /// Number of columns this cell spans (default 1).
29
- #[serde(default = "default_span")]
32
+ #[cfg_attr(feature = "serde", serde(default = "default_span"))]
30
33
  pub col_span: u32,
31
34
  /// Whether this is a header cell (`<th>`).
32
- #[serde(default)]
35
+ #[cfg_attr(feature = "serde", serde(default))]
33
36
  pub is_header: bool,
34
37
  }
35
38
 
39
+ #[cfg(feature = "serde")]
36
40
  fn default_span() -> u32 {
37
41
  1
38
42
  }
39
43
 
40
44
  /// A top-level extracted table with both structured data and markdown representation.
41
- #[derive(Debug, Clone, Serialize, Deserialize)]
45
+ #[derive(Debug, Clone)]
46
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
42
47
  pub struct TableData {
43
48
  /// The structured table grid.
44
49
  pub grid: TableGrid,
@@ -1,9 +1,11 @@
1
1
  //! Processing warning types for non-fatal issues during conversion.
2
2
 
3
+ #[cfg(feature = "serde")]
3
4
  use serde::{Deserialize, Serialize};
4
5
 
5
6
  /// A non-fatal warning generated during HTML processing.
6
- #[derive(Debug, Clone, Serialize, Deserialize)]
7
+ #[derive(Debug, Clone)]
8
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
7
9
  pub struct ProcessingWarning {
8
10
  /// Human-readable warning message.
9
11
  pub message: String,
@@ -12,8 +14,9 @@ pub struct ProcessingWarning {
12
14
  }
13
15
 
14
16
  /// Categories of processing warnings.
15
- #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
16
- #[serde(rename_all = "snake_case")]
17
+ #[derive(Debug, Clone, Copy, PartialEq, Eq)]
18
+ #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
19
+ #[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))]
17
20
  pub enum WarningKind {
18
21
  /// An image could not be extracted (e.g. invalid data URI, unsupported format).
19
22
  ImageExtractionFailed,
@@ -1002,3 +1002,97 @@ fn test_element_end_replacement_with_metadata_preserves_subsequent_content() {
1002
1002
  "content after replaced element should not be lost"
1003
1003
  );
1004
1004
  }
1005
+
1006
+ /// Regression test for issue #331: visitor receives mismatched start/end events for
1007
+ /// hyphenated tag names that contain XML-style self-closing children.
1008
+ ///
1009
+ /// When `<ac:parameter ac:name="foo" />` appears inside a hyphenated custom element, the
1010
+ /// `repair_with_html5ever` fallback (triggered because the outer tag contains a hyphen) used
1011
+ /// to re-parse with HTML5 semantics. HTML5 does NOT honour XML-style self-closing on unknown
1012
+ /// elements, so `<ac:parameter ... />` was treated as an open tag and subsequent siblings were
1013
+ /// nested inside it. That caused `visit_element_start("ac:parameter")` for "foo" to be
1014
+ /// followed by `visit_element_start("ac:parameter")` for "quux", then both ends in reversed
1015
+ /// order — violating the expected pre-order/post-order pairing.
1016
+ #[test]
1017
+ fn test_issue_331_hyphenated_tags_xml_self_closing_visitor_events() {
1018
+ #[derive(Debug, Default)]
1019
+ struct EventRecorder {
1020
+ events: Vec<String>,
1021
+ }
1022
+
1023
+ impl HtmlVisitor for EventRecorder {
1024
+ fn visit_element_start(&mut self, ctx: &NodeContext) -> VisitResult {
1025
+ self.events.push(format!("start({})", ctx.tag_name));
1026
+ VisitResult::Continue
1027
+ }
1028
+
1029
+ fn visit_element_end(&mut self, ctx: &NodeContext, _output: &str) -> VisitResult {
1030
+ self.events.push(format!("end({})", ctx.tag_name));
1031
+ VisitResult::Continue
1032
+ }
1033
+ }
1034
+
1035
+ let html = r#"
1036
+ <structured-macro>
1037
+ <ac:parameter ac:name="foo" />
1038
+ <ac:parameter ac:name="quux">lalaland</ac:parameter>
1039
+ </structured-macro>
1040
+ "#;
1041
+
1042
+ let visitor = Rc::new(RefCell::new(EventRecorder::default()));
1043
+ let result = convert(html, None, Some(visitor.clone()));
1044
+ assert!(result.is_ok(), "conversion should succeed: {:?}", result.err());
1045
+
1046
+ let events = visitor.borrow().events.clone();
1047
+
1048
+ // Find the indices of start/end pairs for the two ac:parameter elements.
1049
+ // With correct XML self-closing handling:
1050
+ // start(ac:parameter)[foo] → end(ac:parameter)[foo] → start(ac:parameter)[quux] → end(ac:parameter)[quux]
1051
+ // With the bug (html5ever treats `/>` as open tag):
1052
+ // start(ac:parameter)[foo] → start(ac:parameter)[quux] → end(ac:parameter)[quux] → end(ac:parameter)[foo]
1053
+
1054
+ // Collect positions of start/end events for ac:parameter
1055
+ let ac_param_starts: Vec<usize> = events
1056
+ .iter()
1057
+ .enumerate()
1058
+ .filter(|(_, e)| e.starts_with("start(ac:parameter)"))
1059
+ .map(|(i, _)| i)
1060
+ .collect();
1061
+ let ac_param_ends: Vec<usize> = events
1062
+ .iter()
1063
+ .enumerate()
1064
+ .filter(|(_, e)| e.starts_with("end(ac:parameter)"))
1065
+ .map(|(i, _)| i)
1066
+ .collect();
1067
+
1068
+ assert_eq!(
1069
+ ac_param_starts.len(),
1070
+ 2,
1071
+ "expected exactly 2 ac:parameter start events, got: {events:?}"
1072
+ );
1073
+ assert_eq!(
1074
+ ac_param_ends.len(),
1075
+ 2,
1076
+ "expected exactly 2 ac:parameter end events, got: {events:?}"
1077
+ );
1078
+
1079
+ // Each start must come before the corresponding end: start[0] < end[0] < start[1] < end[1]
1080
+ assert!(
1081
+ ac_param_starts[0] < ac_param_ends[0],
1082
+ "first ac:parameter: start must precede end (got start@{}, end@{}); events: {events:?}",
1083
+ ac_param_starts[0],
1084
+ ac_param_ends[0],
1085
+ );
1086
+ assert!(
1087
+ ac_param_ends[0] < ac_param_starts[1],
1088
+ "first ac:parameter end must precede second ac:parameter start (got end@{}, start@{}); events: {events:?}",
1089
+ ac_param_ends[0],
1090
+ ac_param_starts[1],
1091
+ );
1092
+ assert!(
1093
+ ac_param_starts[1] < ac_param_ends[1],
1094
+ "second ac:parameter: start must precede end (got start@{}, end@{}); events: {events:?}",
1095
+ ac_param_starts[1],
1096
+ ac_param_ends[1],
1097
+ );
1098
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.4.0.pre.rc.18
4
+ version: 3.4.0.pre.rc.23
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kreuzberg Team
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-04-29 00:00:00.000000000 Z
11
+ date: 2026-05-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys