html-to-markdown 3.4.0.pre.rc.18 → 3.4.0.pre.rc.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/html_to_markdown_rb/Cargo.toml +1 -1
- data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +2 -2
- data/ext/html_to_markdown_rb/src/html-to-markdown.rb +1 -1
- data/ext/html_to_markdown_rb/src/lib.rs +1 -1
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +6 -8
- data/sig/types.rbs +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +24 -6
- data/vendor/html-to-markdown-rs/src/converter/context.rs +3 -1
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +1 -0
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +1 -0
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +1 -0
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +1 -0
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +167 -1
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +4 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +1 -1
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +1 -1
- data/vendor/html-to-markdown-rs/src/types/document.rs +26 -21
- data/vendor/html-to-markdown-rs/src/types/result.rs +4 -2
- data/vendor/html-to-markdown-rs/src/types/tables.rs +11 -6
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +6 -3
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +94 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e6c2c4a533b89d2eb7db4322b77ae6eec7821d2f2babfbd903b5ba3a354425af
|
|
4
|
+
data.tar.gz: 997c6b7c90856c0554a3876565dd0605ace185736aa7ed614e124b8c2789947b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 196b2bced138d1ee74b1b4972f60cea01a2cf06772efbe13d44ff90222e4093766a661dcde5271dab1d79d9fb671a29365d20b189952ed76f8646f8f0ad1a01f
|
|
7
|
+
data.tar.gz: 1ebb5633c1a86cc427045fb3db1678b4b9ae720fb1d66bafdfdc8b61d013d01a9d1eccc7959d90463b212ac4ba6bcddea3bfadb5a66244c1e8b76c7e310a9181
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:048c75ae74b430ffa33441a8dd7241b1bfe520e31a66eb84709b5ede993ee4c8
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
6
6
|
# frozen_string_literal: true
|
|
7
7
|
|
|
8
8
|
module HtmlToMarkdown
|
|
9
|
-
VERSION =
|
|
9
|
+
VERSION = '3.4.0.pre.rc.23'
|
|
10
10
|
end
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:2ddad3a0e4196d0f7824563b5eca866af2d2a475750704444e5ecc0336f8baa6
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
// This file is auto-generated by alef. DO NOT EDIT.
|
|
2
|
-
// alef:hash:
|
|
2
|
+
// alef:hash:bb6faa37da8d32b19ccd60b267ae8b20d43326587e859f396a9826ab0925d398
|
|
3
3
|
// Re-generate with: alef generate
|
|
4
4
|
#![allow(dead_code, unused_imports, unused_variables)]
|
|
5
5
|
#![allow(
|
data/lib/bin/html-to-markdown
CHANGED
|
Binary file
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require_relative 'html_to_markdown/version'
|
|
4
4
|
require 'html_to_markdown_rb'
|
|
5
|
+
require 'json'
|
|
5
6
|
|
|
6
7
|
# High-performance HTML to Markdown conversion.
|
|
7
8
|
#
|
|
@@ -29,14 +30,11 @@ module HtmlToMarkdown
|
|
|
29
30
|
# (and more, matching ConversionOptions fields)
|
|
30
31
|
# @return [String] The converted Markdown content.
|
|
31
32
|
def self.convert(html, options = {}, visitor = nil)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
HtmlToMarkdownRs::ConversionOptions.new(options)
|
|
38
|
-
end
|
|
39
|
-
result = HtmlToMarkdownRs.convert(html, opts, visitor)
|
|
33
|
+
# The Rust FFI expects options as a JSON string; serialise the hash here
|
|
34
|
+
# rather than constructing a ConversionOptions object, which the generated
|
|
35
|
+
# FFI layer cannot coerce back to String (see issue #334).
|
|
36
|
+
opts_json = options.nil? || options.empty? ? nil : options.to_json
|
|
37
|
+
result = HtmlToMarkdownRs.convert(html, opts_json, visitor)
|
|
40
38
|
result.content || ''
|
|
41
39
|
end
|
|
42
40
|
end
|
data/sig/types.rbs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:f0d66ccd989cb158aa2206dc4fc0596d3e4060cbb323372db1418e22598b6c21
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
data/vendor/Cargo.toml
CHANGED
|
@@ -6,12 +6,26 @@ use std::borrow::Cow;
|
|
|
6
6
|
|
|
7
7
|
use crate::error::Result;
|
|
8
8
|
use crate::options::{ConversionOptions, WhitespaceMode};
|
|
9
|
+
|
|
10
|
+
/// The visitor parameter type accepted by [`convert`].
|
|
11
|
+
///
|
|
12
|
+
/// When the `visitor` feature is enabled, this is the full `VisitorHandle`
|
|
13
|
+
/// (a shared reference-counted dyn `HtmlVisitor`). When the feature is off
|
|
14
|
+
/// it degrades to a unit type so that callers can keep a stable 3-arity
|
|
15
|
+
/// `convert(html, options, None)` call signature regardless of feature flags.
|
|
16
|
+
#[cfg(feature = "visitor")]
|
|
17
|
+
pub type VisitorParam = crate::visitor::VisitorHandle;
|
|
18
|
+
#[cfg(not(feature = "visitor"))]
|
|
19
|
+
pub type VisitorParam = ();
|
|
20
|
+
#[cfg(any(feature = "serde", feature = "metadata", feature = "inline-images"))]
|
|
21
|
+
use crate::ConversionError;
|
|
22
|
+
#[cfg(any(feature = "serde", feature = "metadata"))]
|
|
23
|
+
use crate::ConversionOptionsUpdate;
|
|
9
24
|
use crate::text;
|
|
10
25
|
use crate::types::ConversionResult;
|
|
11
26
|
use crate::validation::{Utf16Encoding, detect_utf16_encoding, validate_input};
|
|
12
|
-
use crate::{ConversionError, ConversionOptionsUpdate};
|
|
13
27
|
|
|
14
|
-
#[cfg(feature = "inline-images")]
|
|
28
|
+
#[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
|
|
15
29
|
use crate::InlineImageConfig;
|
|
16
30
|
#[cfg(feature = "metadata")]
|
|
17
31
|
use crate::{HtmlMetadata, MetadataConfig};
|
|
@@ -40,9 +54,11 @@ use crate::{HtmlMetadata, MetadataConfig};
|
|
|
40
54
|
pub fn convert(
|
|
41
55
|
html: &str,
|
|
42
56
|
options: Option<ConversionOptions>,
|
|
43
|
-
|
|
57
|
+
visitor: Option<VisitorParam>,
|
|
44
58
|
) -> Result<ConversionResult> {
|
|
59
|
+
#[cfg(any(feature = "metadata", feature = "inline-images"))]
|
|
45
60
|
use std::cell::RefCell;
|
|
61
|
+
#[cfg(any(feature = "metadata", feature = "inline-images"))]
|
|
46
62
|
use std::rc::Rc;
|
|
47
63
|
|
|
48
64
|
let options = options.unwrap_or_default();
|
|
@@ -100,10 +116,12 @@ pub fn convert(
|
|
|
100
116
|
None
|
|
101
117
|
};
|
|
102
118
|
|
|
103
|
-
//
|
|
104
|
-
//
|
|
119
|
+
// `convert_html_impl` expects the visitor slot to be `Option<()>` when the visitor
|
|
120
|
+
// feature is off. We accept `Option<VisitorParam>` (a feature-gated alias) at the
|
|
121
|
+
// public API — when the feature is off it's `Option<()>`, so `visitor` already has
|
|
122
|
+
// the right type and we don't need to override it.
|
|
105
123
|
#[cfg(not(feature = "visitor"))]
|
|
106
|
-
let visitor
|
|
124
|
+
let _ = visitor.is_some();
|
|
107
125
|
|
|
108
126
|
// Run the conversion pipeline.
|
|
109
127
|
// Pass structure_collector by value — convert_html_impl will consume it via Rc::try_unwrap
|
|
@@ -6,7 +6,9 @@
|
|
|
6
6
|
|
|
7
7
|
#[cfg(any(feature = "inline-images", feature = "visitor"))]
|
|
8
8
|
use std::cell::RefCell;
|
|
9
|
-
|
|
9
|
+
#[cfg(feature = "metadata")]
|
|
10
|
+
use std::collections::BTreeMap;
|
|
11
|
+
use std::collections::HashSet;
|
|
10
12
|
use std::rc::Rc;
|
|
11
13
|
|
|
12
14
|
#[cfg(feature = "inline-images")]
|
|
@@ -14,6 +14,7 @@ use crate::converter::utility::content::collect_tag_attributes;
|
|
|
14
14
|
use crate::converter::utility::content::{collect_link_label_text, escape_link_label, normalize_link_label};
|
|
15
15
|
use crate::converter::utility::preprocessing::sanitize_markdown_url;
|
|
16
16
|
use crate::options::ConversionOptions;
|
|
17
|
+
#[cfg(any(feature = "metadata", feature = "visitor"))]
|
|
17
18
|
use std::collections::BTreeMap;
|
|
18
19
|
use tl::{NodeHandle, Parser};
|
|
19
20
|
|
|
@@ -97,17 +97,183 @@ pub fn has_custom_element_tags(html: &str) -> bool {
|
|
|
97
97
|
false
|
|
98
98
|
}
|
|
99
99
|
|
|
100
|
+
/// HTML5 void elements that are self-closing by spec and must NOT be expanded.
///
/// These elements never have an end tag in HTML5, so `<br />` is equivalent to
/// `<br>`. They are left as-is when pre-processing XML-style self-closing syntax
/// so that the HTML5 parser sees them unchanged.
const HTML5_VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr",
];

/// Expand XML-style self-closing tags to explicit open+close pairs.
///
/// HTML5 does not honour the `/>` self-close syntax for non-void elements. When
/// content containing custom / namespaced tags written as
/// `<ac:parameter name="foo" />` is re-parsed with HTML5 semantics, the `/>` is
/// treated as `>` and the element is left open. Subsequent siblings then nest
/// inside it, breaking visitor pre-order/post-order start/end pairing.
///
/// This function scans the input byte-by-byte and rewrites any `<tag ... />` where
/// `tag` is not a known HTML5 void element into `<tag ...></tag>`. Known void
/// elements are left unchanged because they must not receive an explicit close tag.
///
/// # Correctness guarantees
/// - Non-ASCII bytes are never interpreted as structural characters; every slice
///   boundary used here sits next to an ASCII byte, so multi-byte UTF-8 sequences
///   pass through unmodified.
/// - Attribute values containing `/>` are skipped correctly (the scanner tracks
///   whether it is inside a single- or double-quoted attribute value).
/// - `</closing>` tags are never modified.
/// - Comments (`<!-- ... -->`), other markup declarations (`<!DOCTYPE ...>`,
///   `<![CDATA[ ... >`) and processing instructions (`<?...>`) are never modified.
///   Without this, a comment containing `/>` would receive an injected `</!-->`,
///   which terminates the comment early and leaks the remainder as text.
/// - The function is pure and returns a new `String`; the allocation is performed
///   even when no substitution is needed.
///
/// # Known limitations
/// - The content of raw-text elements (e.g. `<script>`) is scanned like any other
///   markup; a `<x />` sequence inside it is expanded as well.
/// - Whitespace between `<` and the tag name is tolerated and dropped when the tag
///   is rewritten (`< foo />` becomes `<foo ></foo>`).
pub fn expand_xml_self_closing_tags(input: &str) -> String {
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut output = String::with_capacity(len);
    // `copy_start` marks the beginning of the span of unmodified input that has
    // not yet been copied to `output`.
    let mut copy_start = 0usize;
    let mut i = 0usize;

    while i < len {
        if bytes[i] != b'<' {
            i += 1;
            continue;
        }

        let tag_open = i;
        i += 1;

        // Comments: skip through the terminating `-->`. The search starts right
        // after the `!` so that the abruptly closed forms `<!-->` and `<!--->`
        // end where the HTML5 tokenizer ends them. An unterminated comment
        // swallows the rest of the input.
        if bytes[i..].starts_with(b"!--") {
            let search_from = i + 1;
            i = bytes[search_from..]
                .windows(3)
                .position(|window| window == b"-->")
                .map_or(len, |pos| search_from + pos + 3);
            continue;
        }

        // Closing tags, markup declarations and processing instructions are
        // never rewritten: skip to (and past) the next `>`.
        if i < len && matches!(bytes[i], b'/' | b'!' | b'?') {
            while i < len && bytes[i] != b'>' {
                i += 1;
            }
            if i < len {
                i += 1; // consume `>`
            }
            continue;
        }

        // Skip leading whitespace after `<` (unusual but tolerated).
        while i < len && bytes[i].is_ascii_whitespace() {
            i += 1;
        }

        // Collect the tag name. It is delimited by ASCII bytes on both sides, so
        // slicing `input` at these offsets always lands on char boundaries.
        let name_start = i;
        while i < len && !(bytes[i] == b'>' || bytes[i] == b'/' || bytes[i].is_ascii_whitespace()) {
            i += 1;
        }
        let name_end = i;

        // Empty tag name: nothing to rewrite; the bytes stay in the pending span.
        if name_start == name_end {
            continue;
        }
        let tag_name = &input[name_start..name_end];

        // Known HTML5 void elements (matched case-insensitively) keep their `/>`.
        let is_void = HTML5_VOID_ELEMENTS
            .iter()
            .any(|void_name| void_name.eq_ignore_ascii_case(tag_name));

        // Scan the rest of the tag for an unquoted `/>` or `>`.
        let attrs_start = i;
        let mut in_single_quote = false;
        let mut in_double_quote = false;
        let mut self_closing = false;

        while i < len {
            match bytes[i] {
                b'"' if !in_single_quote => {
                    in_double_quote = !in_double_quote;
                    i += 1;
                }
                b'\'' if !in_double_quote => {
                    in_single_quote = !in_single_quote;
                    i += 1;
                }
                b'/' if !in_single_quote && !in_double_quote => {
                    if i + 1 < len && bytes[i + 1] == b'>' {
                        self_closing = true;
                        break;
                    }
                    i += 1;
                }
                b'>' if !in_single_quote && !in_double_quote => {
                    break;
                }
                _ => {
                    i += 1;
                }
            }
        }

        if !self_closing {
            // Either the tag ended at an unquoted `>` or the input ran out.
            if i < len {
                i += 1; // consume `>`
            }
            continue;
        }

        if is_void {
            i += 2; // keep `/>` on void elements, just step over it
            continue;
        }

        // Non-void self-closing tag: flush the untouched input before it, then
        // emit `<tag attrs></tag>`. `i` points at the `/` of `/>`, so the
        // attribute text is everything between the tag name and that `/`.
        output.push_str(&input[copy_start..tag_open]);
        output.push('<');
        output.push_str(tag_name);
        output.push_str(&input[attrs_start..i]);
        output.push_str("></");
        output.push_str(tag_name);
        output.push('>');

        i += 2; // consume `/>`
        copy_start = i;
    }

    // Flush the remaining unchanged tail.
    output.push_str(&input[copy_start..]);
    output
}
|
|
254
|
+
|
|
100
255
|
/// Try to repair HTML using html5ever parser.
|
|
101
256
|
///
|
|
102
257
|
/// Returns Some(repaired_html) if repair was successful, None otherwise.
|
|
258
|
+
///
|
|
259
|
+
/// Before feeding the input to the HTML5 parser, XML-style self-closing tags on
|
|
260
|
+
/// non-void elements (e.g. `<ac:parameter name="foo" />`) are expanded to explicit
|
|
261
|
+
/// open+close pairs. This preserves the intended document structure because HTML5
|
|
262
|
+
/// semantics do not honour `/>` on unknown elements — without the expansion, the
|
|
263
|
+
/// element would be left open and subsequent siblings would nest inside it, breaking
|
|
264
|
+
/// visitor start/end event pairing (issue #331).
|
|
103
265
|
pub fn repair_with_html5ever(input: &str) -> Option<String> {
|
|
104
266
|
use crate::rcdom::{RcDom, SerializableHandle};
|
|
105
267
|
use html5ever::serialize::{SerializeOpts, serialize};
|
|
106
268
|
use html5ever::tendril::TendrilSink;
|
|
107
269
|
|
|
270
|
+
// Expand XML-style self-closing on non-void elements before the HTML5 parse so
|
|
271
|
+
// that `<ac:parameter ... />` is not silently left open by the HTML5 parser.
|
|
272
|
+
let expanded = expand_xml_self_closing_tags(input);
|
|
273
|
+
|
|
108
274
|
let dom = html5ever::parse_document(RcDom::default(), Default::default())
|
|
109
275
|
.from_utf8()
|
|
110
|
-
.read_from(&mut
|
|
276
|
+
.read_from(&mut expanded.as_bytes())
|
|
111
277
|
.ok()?;
|
|
112
278
|
|
|
113
279
|
let mut buf = Vec::with_capacity(input.len());
|
|
@@ -8,7 +8,9 @@
|
|
|
8
8
|
|
|
9
9
|
use crate::converter::media::svg::serialize_element;
|
|
10
10
|
use crate::options::ConversionOptions;
|
|
11
|
-
|
|
11
|
+
#[cfg(feature = "metadata")]
|
|
12
|
+
use crate::text::decode_html_entities;
|
|
13
|
+
use crate::text::escape;
|
|
12
14
|
use tl::{NodeHandle, Parser};
|
|
13
15
|
|
|
14
16
|
// Type aliases for Context and DomContext to avoid circular imports
|
|
@@ -125,6 +127,7 @@ fn handle_head(
|
|
|
125
127
|
///
|
|
126
128
|
/// Script elements are processed to extract JSON-LD structured data when
|
|
127
129
|
/// the type is "application/ld+json" and metadata collection is enabled.
|
|
130
|
+
#[cfg_attr(not(feature = "metadata"), allow(unused_variables))]
|
|
128
131
|
fn handle_script(
|
|
129
132
|
node_handle: &NodeHandle,
|
|
130
133
|
parser: &Parser,
|
|
@@ -95,7 +95,7 @@ pub use convert_api::{conversion_options_from_json, conversion_options_update_fr
|
|
|
95
95
|
#[cfg(feature = "metadata")]
|
|
96
96
|
pub use convert_api::metadata_config_from_json;
|
|
97
97
|
|
|
98
|
-
#[cfg(feature = "inline-images")]
|
|
98
|
+
#[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
|
|
99
99
|
pub use convert_api::inline_image_config_from_json;
|
|
100
100
|
|
|
101
101
|
// Tests
|
|
@@ -118,7 +118,7 @@ pub struct ConversionOptions {
|
|
|
118
118
|
/// Invalid selectors are silently skipped at conversion time.
|
|
119
119
|
///
|
|
120
120
|
/// Example: `vec![".cookie-banner".into(), "#ad-container".into(), "[role='complementary']".into()]`
|
|
121
|
-
#[serde(default)]
|
|
121
|
+
#[cfg_attr(any(feature = "serde", feature = "metadata"), serde(default))]
|
|
122
122
|
pub exclude_selectors: Vec<String>,
|
|
123
123
|
}
|
|
124
124
|
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
use std::collections::HashMap;
|
|
4
4
|
|
|
5
|
+
#[cfg(feature = "serde")]
|
|
5
6
|
use serde::{Deserialize, Serialize};
|
|
6
7
|
|
|
7
8
|
use super::tables::TableGrid;
|
|
@@ -9,41 +10,44 @@ use super::tables::TableGrid;
|
|
|
9
10
|
/// A structured document tree representing the semantic content of an HTML document.
|
|
10
11
|
///
|
|
11
12
|
/// Uses a flat node array with index-based parent/child references for efficient traversal.
|
|
12
|
-
#[derive(Debug, Clone, PartialEq, Eq
|
|
13
|
+
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
14
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
13
15
|
pub struct DocumentStructure {
|
|
14
16
|
/// All nodes in document reading order.
|
|
15
17
|
pub nodes: Vec<DocumentNode>,
|
|
16
18
|
/// The source format (always "html" for this crate).
|
|
17
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
19
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
18
20
|
pub source_format: Option<String>,
|
|
19
21
|
}
|
|
20
22
|
|
|
21
23
|
/// A single node in the document tree.
|
|
22
|
-
#[derive(Debug, Clone, PartialEq, Eq
|
|
24
|
+
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
25
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
23
26
|
pub struct DocumentNode {
|
|
24
27
|
/// Deterministic node identifier.
|
|
25
28
|
pub id: String,
|
|
26
29
|
/// The semantic content of this node.
|
|
27
30
|
pub content: NodeContent,
|
|
28
31
|
/// Index of the parent node (None for root nodes).
|
|
29
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
32
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
30
33
|
pub parent: Option<u32>,
|
|
31
34
|
/// Indices of child nodes in reading order.
|
|
32
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
35
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Vec::is_empty", default))]
|
|
33
36
|
pub children: Vec<u32>,
|
|
34
37
|
/// Inline formatting annotations (bold, italic, links, etc.) with byte offsets into the text.
|
|
35
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
38
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Vec::is_empty", default))]
|
|
36
39
|
pub annotations: Vec<TextAnnotation>,
|
|
37
40
|
/// Format-specific attributes (e.g. class, id, data-* attributes).
|
|
38
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
41
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
39
42
|
pub attributes: Option<HashMap<String, String>>,
|
|
40
43
|
}
|
|
41
44
|
|
|
42
45
|
/// The semantic content type of a document node.
|
|
43
46
|
///
|
|
44
47
|
/// Uses internally tagged representation (`"node_type": "heading"`) for JSON serialization.
|
|
45
|
-
#[derive(Debug, Clone, PartialEq, Eq
|
|
46
|
-
#[
|
|
48
|
+
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
49
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
50
|
+
#[cfg_attr(feature = "serde", serde(tag = "node_type", rename_all = "snake_case"))]
|
|
47
51
|
pub enum NodeContent {
|
|
48
52
|
/// A heading element (h1-h6).
|
|
49
53
|
Heading {
|
|
@@ -75,13 +79,13 @@ pub enum NodeContent {
|
|
|
75
79
|
/// An image element.
|
|
76
80
|
Image {
|
|
77
81
|
/// Alt text or caption.
|
|
78
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
82
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
79
83
|
description: Option<String>,
|
|
80
84
|
/// Image source URL.
|
|
81
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
85
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
82
86
|
src: Option<String>,
|
|
83
87
|
/// Index into `ConversionResult.images` when image extraction is enabled.
|
|
84
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
88
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
85
89
|
image_index: Option<u32>,
|
|
86
90
|
},
|
|
87
91
|
/// A code block or inline code.
|
|
@@ -89,7 +93,7 @@ pub enum NodeContent {
|
|
|
89
93
|
/// The code text content.
|
|
90
94
|
text: String,
|
|
91
95
|
/// Programming language (from class="language-*" or similar).
|
|
92
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
96
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
93
97
|
language: Option<String>,
|
|
94
98
|
},
|
|
95
99
|
/// A block quote container.
|
|
@@ -118,13 +122,13 @@ pub enum NodeContent {
|
|
|
118
122
|
/// A section grouping container (auto-generated from heading hierarchy).
|
|
119
123
|
Group {
|
|
120
124
|
/// Optional section label.
|
|
121
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
125
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
122
126
|
label: Option<String>,
|
|
123
127
|
/// The heading level that created this group.
|
|
124
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
128
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
125
129
|
heading_level: Option<u8>,
|
|
126
130
|
/// The heading text that created this group.
|
|
127
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
131
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
128
132
|
heading_text: Option<String>,
|
|
129
133
|
},
|
|
130
134
|
}
|
|
@@ -132,7 +136,8 @@ pub enum NodeContent {
|
|
|
132
136
|
/// An inline text annotation with byte-range offsets.
|
|
133
137
|
///
|
|
134
138
|
/// Annotations describe formatting (bold, italic, etc.) and links within a node's text content.
|
|
135
|
-
#[derive(Debug, Clone, PartialEq, Eq
|
|
139
|
+
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
140
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
136
141
|
pub struct TextAnnotation {
|
|
137
142
|
/// Start byte offset (inclusive) into the parent node's text.
|
|
138
143
|
pub start: u32,
|
|
@@ -145,9 +150,9 @@ pub struct TextAnnotation {
|
|
|
145
150
|
/// The type of an inline text annotation.
|
|
146
151
|
///
|
|
147
152
|
/// Uses internally tagged representation (`"annotation_type": "bold"`) for JSON serialization.
|
|
148
|
-
#[derive(Debug, Clone, PartialEq, Eq,
|
|
149
|
-
#[
|
|
150
|
-
#[
|
|
153
|
+
#[derive(Debug, Clone, PartialEq, Eq, Default)]
|
|
154
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
155
|
+
#[cfg_attr(feature = "serde", serde(tag = "annotation_type", rename_all = "snake_case"))]
|
|
151
156
|
pub enum AnnotationKind {
|
|
152
157
|
/// Bold / strong emphasis.
|
|
153
158
|
#[default]
|
|
@@ -171,7 +176,7 @@ pub enum AnnotationKind {
|
|
|
171
176
|
/// The link URL.
|
|
172
177
|
url: String,
|
|
173
178
|
/// Optional link title attribute.
|
|
174
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
179
|
+
#[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
|
|
175
180
|
title: Option<String>,
|
|
176
181
|
},
|
|
177
182
|
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
//! The primary result type for HTML conversion and extraction.
|
|
2
2
|
|
|
3
|
+
#[cfg(feature = "serde")]
|
|
3
4
|
use serde::{Deserialize, Serialize};
|
|
4
5
|
|
|
5
6
|
use super::document::DocumentStructure;
|
|
@@ -20,7 +21,8 @@ use super::warnings::ProcessingWarning;
|
|
|
20
21
|
/// assert!(result.content.is_some());
|
|
21
22
|
/// assert!(result.warnings.is_empty());
|
|
22
23
|
/// ```
|
|
23
|
-
#[derive(Debug, Clone, Default
|
|
24
|
+
#[derive(Debug, Clone, Default)]
|
|
25
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
24
26
|
pub struct ConversionResult {
|
|
25
27
|
/// Converted text output (markdown, djot, or plain text).
|
|
26
28
|
///
|
|
@@ -44,7 +46,7 @@ pub struct ConversionResult {
|
|
|
44
46
|
///
|
|
45
47
|
/// Populated when `extract_images` is `true` in options.
|
|
46
48
|
#[cfg(feature = "inline-images")]
|
|
47
|
-
#[serde(skip)]
|
|
49
|
+
#[cfg_attr(feature = "serde", serde(skip))]
|
|
48
50
|
pub images: Vec<crate::inline_images::InlineImage>,
|
|
49
51
|
|
|
50
52
|
/// Non-fatal processing warnings.
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
//! Structured table types aligned with kreuzberg's `TableGrid`.
|
|
2
2
|
|
|
3
|
+
#[cfg(feature = "serde")]
|
|
3
4
|
use serde::{Deserialize, Serialize};
|
|
4
5
|
|
|
5
6
|
/// A structured table grid with cell-level data including spans.
|
|
6
|
-
#[derive(Debug, Clone, Default, PartialEq, Eq
|
|
7
|
+
#[derive(Debug, Clone, Default, PartialEq, Eq)]
|
|
8
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
7
9
|
pub struct TableGrid {
|
|
8
10
|
/// Number of rows.
|
|
9
11
|
pub rows: u32,
|
|
@@ -14,7 +16,8 @@ pub struct TableGrid {
|
|
|
14
16
|
}
|
|
15
17
|
|
|
16
18
|
/// A single cell in a table grid.
|
|
17
|
-
#[derive(Debug, Clone, PartialEq, Eq
|
|
19
|
+
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
20
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
18
21
|
pub struct GridCell {
|
|
19
22
|
/// The text content of the cell.
|
|
20
23
|
pub content: String,
|
|
@@ -23,22 +26,24 @@ pub struct GridCell {
|
|
|
23
26
|
/// 0-indexed column position.
|
|
24
27
|
pub col: u32,
|
|
25
28
|
/// Number of rows this cell spans (default 1).
|
|
26
|
-
#[serde(default = "default_span")]
|
|
29
|
+
#[cfg_attr(feature = "serde", serde(default = "default_span"))]
|
|
27
30
|
pub row_span: u32,
|
|
28
31
|
/// Number of columns this cell spans (default 1).
|
|
29
|
-
#[serde(default = "default_span")]
|
|
32
|
+
#[cfg_attr(feature = "serde", serde(default = "default_span"))]
|
|
30
33
|
pub col_span: u32,
|
|
31
34
|
/// Whether this is a header cell (`<th>`).
|
|
32
|
-
#[serde(default)]
|
|
35
|
+
#[cfg_attr(feature = "serde", serde(default))]
|
|
33
36
|
pub is_header: bool,
|
|
34
37
|
}
|
|
35
38
|
|
|
39
|
+
#[cfg(feature = "serde")]
|
|
36
40
|
fn default_span() -> u32 {
|
|
37
41
|
1
|
|
38
42
|
}
|
|
39
43
|
|
|
40
44
|
/// A top-level extracted table with both structured data and markdown representation.
|
|
41
|
-
#[derive(Debug, Clone
|
|
45
|
+
#[derive(Debug, Clone)]
|
|
46
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
42
47
|
pub struct TableData {
|
|
43
48
|
/// The structured table grid.
|
|
44
49
|
pub grid: TableGrid,
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
//! Processing warning types for non-fatal issues during conversion.
|
|
2
2
|
|
|
3
|
+
#[cfg(feature = "serde")]
|
|
3
4
|
use serde::{Deserialize, Serialize};
|
|
4
5
|
|
|
5
6
|
/// A non-fatal warning generated during HTML processing.
|
|
6
|
-
#[derive(Debug, Clone
|
|
7
|
+
#[derive(Debug, Clone)]
|
|
8
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
7
9
|
pub struct ProcessingWarning {
|
|
8
10
|
/// Human-readable warning message.
|
|
9
11
|
pub message: String,
|
|
@@ -12,8 +14,9 @@ pub struct ProcessingWarning {
|
|
|
12
14
|
}
|
|
13
15
|
|
|
14
16
|
/// Categories of processing warnings.
|
|
15
|
-
#[derive(Debug, Clone, Copy, PartialEq, Eq
|
|
16
|
-
#[
|
|
17
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
18
|
+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
|
19
|
+
#[cfg_attr(feature = "serde", serde(rename_all = "snake_case"))]
|
|
17
20
|
pub enum WarningKind {
|
|
18
21
|
/// An image could not be extracted (e.g. invalid data URI, unsupported format).
|
|
19
22
|
ImageExtractionFailed,
|
|
@@ -1002,3 +1002,97 @@ fn test_element_end_replacement_with_metadata_preserves_subsequent_content() {
|
|
|
1002
1002
|
"content after replaced element should not be lost"
|
|
1003
1003
|
);
|
|
1004
1004
|
}
|
|
1005
|
+
|
|
1006
|
+
/// Regression test for issue #331: visitor start/end events must stay correctly
/// paired when a hyphenated custom element contains an XML-style self-closing child.
///
/// The input places `<ac:parameter ac:name="foo" />` followed by a second, regular
/// `<ac:parameter>` element inside `<structured-macro>`. If the self-closing tag is
/// left open during re-parsing, the second element nests inside the first and the
/// visitor sees `start, start, end, end` instead of `start, end, start, end`.
#[test]
fn test_issue_331_hyphenated_tags_xml_self_closing_visitor_events() {
    /// Visitor that logs every element start/end as `start(tag)` / `end(tag)`.
    #[derive(Debug, Default)]
    struct EventRecorder {
        events: Vec<String>,
    }

    impl HtmlVisitor for EventRecorder {
        fn visit_element_start(&mut self, ctx: &NodeContext) -> VisitResult {
            self.events.push(format!("start({})", ctx.tag_name));
            VisitResult::Continue
        }

        fn visit_element_end(&mut self, ctx: &NodeContext, _output: &str) -> VisitResult {
            self.events.push(format!("end({})", ctx.tag_name));
            VisitResult::Continue
        }
    }

    let html = r#"
<structured-macro>
<ac:parameter ac:name="foo" />
<ac:parameter ac:name="quux">lalaland</ac:parameter>
</structured-macro>
"#;

    let visitor = Rc::new(RefCell::new(EventRecorder::default()));
    let result = convert(html, None, Some(visitor.clone()));
    assert!(result.is_ok(), "conversion should succeed: {:?}", result.err());

    let events = visitor.borrow().events.clone();

    // Indices (within the full event log) of every event beginning with `prefix`.
    let positions_of = |prefix: &str| -> Vec<usize> {
        events
            .iter()
            .enumerate()
            .filter_map(|(idx, event)| if event.starts_with(prefix) { Some(idx) } else { None })
            .collect()
    };
    let starts = positions_of("start(ac:parameter)");
    let ends = positions_of("end(ac:parameter)");

    assert_eq!(
        starts.len(),
        2,
        "expected exactly 2 ac:parameter start events, got: {events:?}"
    );
    assert_eq!(
        ends.len(),
        2,
        "expected exactly 2 ac:parameter end events, got: {events:?}"
    );

    // Required ordering: start[0] < end[0] < start[1] < end[1].
    let (first_start, second_start) = (starts[0], starts[1]);
    let (first_end, second_end) = (ends[0], ends[1]);

    assert!(
        first_start < first_end,
        "first ac:parameter: start must precede end (got start@{}, end@{}); events: {events:?}",
        first_start,
        first_end,
    );
    assert!(
        first_end < second_start,
        "first ac:parameter end must precede second ac:parameter start (got end@{}, start@{}); events: {events:?}",
        first_end,
        second_start,
    );
    assert!(
        second_start < second_end,
        "second ac:parameter: start must precede end (got start@{}, end@{}); events: {events:?}",
        second_start,
        second_end,
    );
}
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.4.0.pre.rc.
|
|
4
|
+
version: 3.4.0.pre.rc.23
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kreuzberg Team
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-05-01 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|