html-to-markdown 2.29.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -41
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +7 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +127 -51
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -15,14 +15,14 @@ use std::collections::BTreeMap;
|
|
|
15
15
|
|
|
16
16
|
use crate::converter::dom_context::DomContext;
|
|
17
17
|
use crate::converter::main_helpers::{
|
|
18
|
-
extract_head_metadata, format_metadata_frontmatter,
|
|
19
|
-
|
|
18
|
+
extract_head_metadata, format_metadata_frontmatter, has_custom_element_tags, repair_with_html5ever,
|
|
19
|
+
trim_line_end_whitespace, trim_trailing_whitespace,
|
|
20
20
|
};
|
|
21
21
|
use crate::converter::plain_text::extract_plain_text;
|
|
22
22
|
use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing};
|
|
23
23
|
use crate::converter::utility::caching::build_dom_context;
|
|
24
24
|
use crate::converter::utility::content::normalized_tag_name;
|
|
25
|
-
use crate::converter::utility::preprocessing::{preprocess_html, strip_script_and_style_tags};
|
|
25
|
+
use crate::converter::utility::preprocessing::{preprocess_html, strip_hidden_elements, strip_script_and_style_tags};
|
|
26
26
|
use crate::converter::utility::serialization::serialize_tag_to_html;
|
|
27
27
|
use crate::options::OutputFormat;
|
|
28
28
|
|
|
@@ -31,12 +31,13 @@ use crate::error::Result;
|
|
|
31
31
|
use crate::options::ConversionOptions;
|
|
32
32
|
|
|
33
33
|
use crate::converter::context::{Context, InlineCollectorHandle};
|
|
34
|
+
use crate::types::structure_collector::StructureCollectorHandle;
|
|
34
35
|
|
|
35
36
|
/// Converts HTML to Markdown using the provided conversion options.
|
|
36
37
|
///
|
|
37
38
|
/// This is the main entry point for HTML to Markdown conversion.
|
|
38
39
|
pub fn convert_html(html: &str, options: &ConversionOptions) -> Result<String> {
|
|
39
|
-
convert_html_impl(html, options, None, None, None)
|
|
40
|
+
convert_html_impl(html, options, None, None, None, None).map(|(md, _)| md)
|
|
40
41
|
}
|
|
41
42
|
|
|
42
43
|
/// Converts HTML to Markdown with a custom visitor for callbacks during traversal.
|
|
@@ -49,26 +50,13 @@ pub fn convert_html_with_visitor(
|
|
|
49
50
|
options: &ConversionOptions,
|
|
50
51
|
visitor: Option<crate::visitor::VisitorHandle>,
|
|
51
52
|
) -> Result<String> {
|
|
52
|
-
convert_html_impl(html, options, None, None, visitor)
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
/// Converts HTML to Markdown with an async visitor for callbacks during traversal.
|
|
56
|
-
///
|
|
57
|
-
/// Async variant with async visitor callbacks for Promise-based bindings.
|
|
58
|
-
#[cfg(feature = "async-visitor")]
|
|
59
|
-
#[allow(clippy::future_not_send)]
|
|
60
|
-
pub async fn convert_html_with_visitor_async(
|
|
61
|
-
html: &str,
|
|
62
|
-
options: &ConversionOptions,
|
|
63
|
-
visitor: Option<crate::visitor_helpers::AsyncVisitorHandle>,
|
|
64
|
-
) -> Result<String> {
|
|
65
|
-
convert_html_impl_async(html, options, None, None, visitor).await
|
|
53
|
+
convert_html_impl(html, options, None, None, visitor, None).map(|(md, _)| md)
|
|
66
54
|
}
|
|
67
55
|
|
|
68
56
|
/// Internal implementation of HTML to Markdown conversion.
|
|
69
57
|
///
|
|
70
|
-
///
|
|
71
|
-
///
|
|
58
|
+
/// Returns `(markdown, Option<DocumentStructure>)`. The structure is populated when
|
|
59
|
+
/// `options.include_document_structure == true` and a `structure_collector` handle is provided.
|
|
72
60
|
#[cfg_attr(
|
|
73
61
|
any(not(feature = "inline-images"), not(feature = "metadata"), not(feature = "visitor")),
|
|
74
62
|
allow(unused_variables)
|
|
@@ -82,10 +70,13 @@ pub(crate) fn convert_html_impl(
|
|
|
82
70
|
#[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
|
|
83
71
|
#[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
|
|
84
72
|
#[cfg(not(feature = "visitor"))] _visitor: Option<()>,
|
|
85
|
-
|
|
73
|
+
structure_collector: Option<StructureCollectorHandle>,
|
|
74
|
+
) -> Result<(String, Option<crate::types::DocumentStructure>)> {
|
|
86
75
|
// Strip script and style tags completely to prevent parser confusion from HTML-like content
|
|
87
76
|
// inside script/style elements. This preserves JSON-LD for metadata extraction.
|
|
88
77
|
let stripped = strip_script_and_style_tags(html);
|
|
78
|
+
// Strip elements with the `hidden` attribute before parsing.
|
|
79
|
+
let stripped = strip_hidden_elements(&stripped);
|
|
89
80
|
let mut preprocessed = preprocess_html(&stripped).into_owned();
|
|
90
81
|
let mut preprocessed_len = preprocessed.len();
|
|
91
82
|
|
|
@@ -113,11 +104,6 @@ pub(crate) fn convert_html_impl(
|
|
|
113
104
|
let mut parser = dom.parser();
|
|
114
105
|
let mut output = String::with_capacity(preprocessed_len.saturating_add(preprocessed_len / 4));
|
|
115
106
|
|
|
116
|
-
// Check and handle hOCR documents
|
|
117
|
-
if handle_hocr_document(&dom, parser, options, &mut output) {
|
|
118
|
-
return Ok(output);
|
|
119
|
-
}
|
|
120
|
-
|
|
121
107
|
let mut dom_ctx = build_dom_context(&dom, parser, preprocessed_len);
|
|
122
108
|
|
|
123
109
|
// Check for inline-block misnesting and repair if needed
|
|
@@ -211,13 +197,37 @@ pub(crate) fn convert_html_impl(
|
|
|
211
197
|
}
|
|
212
198
|
|
|
213
199
|
#[cfg(all(feature = "metadata", feature = "visitor"))]
|
|
214
|
-
let ctx = Context::new(
|
|
200
|
+
let ctx = Context::new(
|
|
201
|
+
options,
|
|
202
|
+
inline_collector,
|
|
203
|
+
metadata_collector,
|
|
204
|
+
visitor,
|
|
205
|
+
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
206
|
+
);
|
|
215
207
|
#[cfg(all(feature = "metadata", not(feature = "visitor")))]
|
|
216
|
-
let ctx = Context::new(
|
|
208
|
+
let ctx = Context::new(
|
|
209
|
+
options,
|
|
210
|
+
inline_collector,
|
|
211
|
+
metadata_collector,
|
|
212
|
+
_visitor,
|
|
213
|
+
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
214
|
+
);
|
|
217
215
|
#[cfg(all(not(feature = "metadata"), feature = "visitor"))]
|
|
218
|
-
let ctx = Context::new(
|
|
216
|
+
let ctx = Context::new(
|
|
217
|
+
options,
|
|
218
|
+
inline_collector,
|
|
219
|
+
_metadata_collector,
|
|
220
|
+
visitor,
|
|
221
|
+
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
222
|
+
);
|
|
219
223
|
#[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
|
|
220
|
-
let ctx = Context::new(
|
|
224
|
+
let ctx = Context::new(
|
|
225
|
+
options,
|
|
226
|
+
inline_collector,
|
|
227
|
+
_metadata_collector,
|
|
228
|
+
_visitor,
|
|
229
|
+
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
230
|
+
);
|
|
221
231
|
|
|
222
232
|
for child_handle in dom.children() {
|
|
223
233
|
walk_node(child_handle, parser, &mut output, options, &ctx, 0, &dom_ctx);
|
|
@@ -228,20 +238,32 @@ pub(crate) fn convert_html_impl(
|
|
|
228
238
|
return Err(crate::error::ConversionError::Visitor(err.clone()));
|
|
229
239
|
}
|
|
230
240
|
|
|
241
|
+
// Drop ctx before unwrapping the structure collector Rc — ctx holds a cloned Rc
|
|
242
|
+
// reference to the same collector, and Rc::try_unwrap requires exactly one reference.
|
|
243
|
+
drop(ctx);
|
|
244
|
+
|
|
231
245
|
// If plain text was requested, discard the markdown output and return plain text.
|
|
232
246
|
// The full pipeline was still run above so that metadata + visitor callbacks fire.
|
|
233
247
|
if is_plain_text {
|
|
234
248
|
let plain = extract_plain_text(&dom, parser, options);
|
|
235
|
-
|
|
249
|
+
let document =
|
|
250
|
+
structure_collector.and_then(|sc| std::rc::Rc::try_unwrap(sc).ok().map(|cell| cell.into_inner().finish()));
|
|
251
|
+
return Ok((plain, document));
|
|
236
252
|
}
|
|
237
253
|
|
|
238
254
|
trim_line_end_whitespace(&mut output);
|
|
239
255
|
let trimmed = output.trim_end_matches('\n');
|
|
240
|
-
if trimmed.is_empty() {
|
|
241
|
-
|
|
256
|
+
let markdown = if trimmed.is_empty() {
|
|
257
|
+
String::new()
|
|
242
258
|
} else {
|
|
243
|
-
|
|
244
|
-
}
|
|
259
|
+
format!("{trimmed}\n")
|
|
260
|
+
};
|
|
261
|
+
|
|
262
|
+
// Finish the structure collector if present.
|
|
263
|
+
let document =
|
|
264
|
+
structure_collector.and_then(|sc| std::rc::Rc::try_unwrap(sc).ok().map(|cell| cell.into_inner().finish()));
|
|
265
|
+
|
|
266
|
+
Ok((markdown, document))
|
|
245
267
|
}
|
|
246
268
|
// has_more_than_one_char moved to main_helpers
|
|
247
269
|
// is_inline_element available from utility::content
|
|
@@ -473,6 +495,34 @@ pub(crate) fn walk_node(
|
|
|
473
495
|
);
|
|
474
496
|
}
|
|
475
497
|
|
|
498
|
+
// Quote element routed to semantic dispatcher
|
|
499
|
+
"q" => {
|
|
500
|
+
crate::converter::semantic::dispatch_semantic_handler(
|
|
501
|
+
&tag_name,
|
|
502
|
+
node_handle,
|
|
503
|
+
parser,
|
|
504
|
+
output,
|
|
505
|
+
options,
|
|
506
|
+
ctx,
|
|
507
|
+
depth,
|
|
508
|
+
dom_ctx,
|
|
509
|
+
);
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
// Figure elements routed to semantic dispatcher
|
|
513
|
+
"figure" | "figcaption" => {
|
|
514
|
+
crate::converter::semantic::dispatch_semantic_handler(
|
|
515
|
+
&tag_name,
|
|
516
|
+
node_handle,
|
|
517
|
+
parser,
|
|
518
|
+
output,
|
|
519
|
+
options,
|
|
520
|
+
ctx,
|
|
521
|
+
depth,
|
|
522
|
+
dom_ctx,
|
|
523
|
+
);
|
|
524
|
+
}
|
|
525
|
+
|
|
476
526
|
// Semantic interactive elements routed to semantic dispatcher
|
|
477
527
|
"details" | "summary" | "dialog" | "menu" => {
|
|
478
528
|
crate::converter::semantic::dispatch_semantic_handler(
|
|
@@ -569,24 +619,3 @@ pub(crate) fn walk_node(
|
|
|
569
619
|
tl::Node::Comment(_) => {}
|
|
570
620
|
}
|
|
571
621
|
}
|
|
572
|
-
/// Async equivalent of `convert_html_impl` for Promise-based visitor callbacks.
|
|
573
|
-
#[cfg(feature = "async-visitor")]
|
|
574
|
-
#[allow(clippy::future_not_send)]
|
|
575
|
-
pub(crate) async fn convert_html_impl_async(
|
|
576
|
-
html: &str,
|
|
577
|
-
options: &ConversionOptions,
|
|
578
|
-
_inline_collector: Option<InlineCollectorHandle>,
|
|
579
|
-
#[cfg(feature = "metadata")] _metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
|
|
580
|
-
#[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
|
|
581
|
-
visitor: Option<crate::visitor_helpers::AsyncVisitorHandle>,
|
|
582
|
-
) -> Result<String> {
|
|
583
|
-
if visitor.is_some() {
|
|
584
|
-
return Err(crate::error::ConversionError::ParseError(
|
|
585
|
-
"Async visitor not yet implemented. Use AsyncToSyncVisitorBridge.".to_string(),
|
|
586
|
-
));
|
|
587
|
-
}
|
|
588
|
-
#[cfg(feature = "visitor")]
|
|
589
|
-
return convert_html_impl(html, options, _inline_collector, _metadata_collector, None);
|
|
590
|
-
#[cfg(not(feature = "visitor"))]
|
|
591
|
-
return convert_html_impl(html, options, _inline_collector, _metadata_collector, ());
|
|
592
|
-
}
|
|
@@ -145,8 +145,8 @@ pub fn extract_head_metadata(
|
|
|
145
145
|
if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
|
|
146
146
|
// Look for meta tags
|
|
147
147
|
if child_tag.name().as_utf8_str().eq_ignore_ascii_case("meta")
|
|
148
|
-
&& !options.strip_tags.
|
|
149
|
-
&& !options.preserve_tags.
|
|
148
|
+
&& !options.strip_tags.iter().any(|t| t == "meta")
|
|
149
|
+
&& !options.preserve_tags.iter().any(|t| t == "meta")
|
|
150
150
|
{
|
|
151
151
|
if let (Some(name), Some(content)) = (
|
|
152
152
|
child_tag.attributes().get("name").flatten(),
|
|
@@ -168,8 +168,8 @@ pub fn extract_head_metadata(
|
|
|
168
168
|
}
|
|
169
169
|
// Look for title tag
|
|
170
170
|
if child_tag.name().as_utf8_str().eq_ignore_ascii_case("title")
|
|
171
|
-
&& !options.strip_tags.
|
|
172
|
-
&& !options.preserve_tags.
|
|
171
|
+
&& !options.strip_tags.iter().any(|t| t == "title")
|
|
172
|
+
&& !options.preserve_tags.iter().any(|t| t == "title")
|
|
173
173
|
{
|
|
174
174
|
// Extract text content from title tag
|
|
175
175
|
let mut title_content = String::new();
|
|
@@ -284,66 +284,3 @@ pub fn is_inline_element(tag_name: &str) -> bool {
|
|
|
284
284
|
| "meter"
|
|
285
285
|
)
|
|
286
286
|
}
|
|
287
|
-
|
|
288
|
-
/// Handle hOCR document conversion, returning true if handled, false if not hOCR.
|
|
289
|
-
pub fn handle_hocr_document(
|
|
290
|
-
dom: &tl::VDom<'_>,
|
|
291
|
-
parser: &tl::Parser<'_>,
|
|
292
|
-
options: &ConversionOptions,
|
|
293
|
-
output: &mut String,
|
|
294
|
-
) -> bool {
|
|
295
|
-
use crate::converter::utility::attributes::{is_hocr_document, may_be_hocr};
|
|
296
|
-
use crate::hocr::{convert_to_markdown_with_options as convert_hocr_to_markdown, extract_hocr_document};
|
|
297
|
-
|
|
298
|
-
let preprocessed = dom.outer_html();
|
|
299
|
-
if !may_be_hocr(preprocessed.as_ref()) {
|
|
300
|
-
return false;
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
let mut is_hocr = false;
|
|
304
|
-
for child_handle in dom.children() {
|
|
305
|
-
if is_hocr_document(*child_handle, parser) {
|
|
306
|
-
is_hocr = true;
|
|
307
|
-
break;
|
|
308
|
-
}
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
if !is_hocr {
|
|
312
|
-
return false;
|
|
313
|
-
}
|
|
314
|
-
|
|
315
|
-
let (elements, metadata) = extract_hocr_document(dom);
|
|
316
|
-
|
|
317
|
-
if options.extract_metadata && !options.convert_as_inline {
|
|
318
|
-
let mut metadata_map = BTreeMap::new();
|
|
319
|
-
if let Some(system) = metadata.ocr_system {
|
|
320
|
-
metadata_map.insert("ocr-system".to_string(), system);
|
|
321
|
-
}
|
|
322
|
-
if !metadata.ocr_capabilities.is_empty() {
|
|
323
|
-
metadata_map.insert("ocr-capabilities".to_string(), metadata.ocr_capabilities.join(", "));
|
|
324
|
-
}
|
|
325
|
-
if let Some(pages) = metadata.ocr_number_of_pages {
|
|
326
|
-
metadata_map.insert("ocr-number-of-pages".to_string(), pages.to_string());
|
|
327
|
-
}
|
|
328
|
-
if !metadata.ocr_langs.is_empty() {
|
|
329
|
-
metadata_map.insert("ocr-langs".to_string(), metadata.ocr_langs.join(", "));
|
|
330
|
-
}
|
|
331
|
-
if !metadata.ocr_scripts.is_empty() {
|
|
332
|
-
metadata_map.insert("ocr-scripts".to_string(), metadata.ocr_scripts.join(", "));
|
|
333
|
-
}
|
|
334
|
-
|
|
335
|
-
if !metadata_map.is_empty() {
|
|
336
|
-
output.push_str(&format_metadata_frontmatter(&metadata_map));
|
|
337
|
-
}
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
let mut markdown = convert_hocr_to_markdown(&elements, true, options.hocr_spatial_tables);
|
|
341
|
-
|
|
342
|
-
if !markdown.trim().is_empty() {
|
|
343
|
-
markdown.truncate(markdown.trim_end().len());
|
|
344
|
-
output.push_str(&markdown);
|
|
345
|
-
output.push('\n');
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
true
|
|
349
|
-
}
|
|
@@ -12,6 +12,7 @@ use tl::{HTMLTag, NodeHandle, Parser};
|
|
|
12
12
|
|
|
13
13
|
use crate::converter::Context;
|
|
14
14
|
use crate::converter::dom_context::DomContext;
|
|
15
|
+
use crate::converter::main_helpers::tag_name_eq;
|
|
15
16
|
use crate::options::ConversionOptions;
|
|
16
17
|
|
|
17
18
|
/// Extract src attribute from media element (audio, video, iframe).
|
|
@@ -46,11 +47,6 @@ pub(crate) fn is_source_element(tag: &HTMLTag) -> bool {
|
|
|
46
47
|
tag_name_eq(tag.name().as_utf8_str(), "source")
|
|
47
48
|
}
|
|
48
49
|
|
|
49
|
-
/// Compare tag name with needle (case-insensitive).
|
|
50
|
-
fn tag_name_eq<'a>(name: impl AsRef<str>, needle: &str) -> bool {
|
|
51
|
-
name.as_ref().eq_ignore_ascii_case(needle)
|
|
52
|
-
}
|
|
53
|
-
|
|
54
50
|
/// Determine if media should output source link in markdown.
|
|
55
51
|
///
|
|
56
52
|
/// Returns true if src is non-empty.
|
|
@@ -1,41 +1,4 @@
|
|
|
1
1
|
//! Graphic element handling (custom graphic elements with alternative source attributes).
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
/// Handle custom graphic elements with multiple source attribute options.
|
|
7
|
-
///
|
|
8
|
-
/// The graphic element is a custom XML element that supports multiple source attributes:
|
|
9
|
-
/// - `url` (primary)
|
|
10
|
-
/// - `href` (secondary)
|
|
11
|
-
/// - `xlink:href` (SVG standard)
|
|
12
|
-
/// - `src` (fallback)
|
|
13
|
-
///
|
|
14
|
-
/// This is commonly used in publishing formats like EPUB.
|
|
15
|
-
pub(crate) fn extract_graphic_src<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
|
|
16
|
-
tag.attributes()
|
|
17
|
-
.get("url")
|
|
18
|
-
.flatten()
|
|
19
|
-
.or_else(|| tag.attributes().get("href").flatten())
|
|
20
|
-
.or_else(|| tag.attributes().get("xlink:href").flatten())
|
|
21
|
-
.or_else(|| tag.attributes().get("src").flatten())
|
|
22
|
-
.map_or_else(|| Cow::Borrowed(""), |v| v.as_utf8_str())
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
/// Extract alt text from graphic element with fallback to filename.
|
|
26
|
-
pub(crate) fn extract_graphic_alt<'a>(tag: &'a HTMLTag<'a>) -> Cow<'a, str> {
|
|
27
|
-
tag.attributes()
|
|
28
|
-
.get("alt")
|
|
29
|
-
.flatten()
|
|
30
|
-
.map(|v| v.as_utf8_str())
|
|
31
|
-
.or_else(|| tag.attributes().get("filename").flatten().map(|v| v.as_utf8_str()))
|
|
32
|
-
.unwrap_or_else(|| Cow::Borrowed(""))
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
/// Get source attributes to skip during metadata collection.
|
|
36
|
-
///
|
|
37
|
-
/// These attributes are handled specially and should not be included
|
|
38
|
-
/// in the generic attributes map.
|
|
39
|
-
pub(crate) fn should_skip_graphic_attr(key_str: &str) -> bool {
|
|
40
|
-
matches!(key_str, "url" | "href" | "xlink:href" | "src")
|
|
41
|
-
}
|
|
2
|
+
//!
|
|
3
|
+
//! The `<graphic>` element is a custom XML element used in publishing formats like EPUB.
|
|
4
|
+
//! Conversion logic lives in `crate::converter::handlers::graphic`.
|
|
@@ -171,14 +171,6 @@ pub(crate) fn handle_inline_data_image(
|
|
|
171
171
|
collector.push_image(index, image);
|
|
172
172
|
}
|
|
173
173
|
|
|
174
|
-
/// Check if heading tag allows inline images based on configuration.
|
|
175
|
-
pub(crate) fn heading_allows_inline_images(
|
|
176
|
-
tag_name: &str,
|
|
177
|
-
keep_inline_images_in: &std::collections::HashSet<String>,
|
|
178
|
-
) -> bool {
|
|
179
|
-
keep_inline_images_in.contains(tag_name)
|
|
180
|
-
}
|
|
181
|
-
|
|
182
174
|
/// Extract non-empty trimmed string or return None.
|
|
183
175
|
#[cfg(feature = "inline-images")]
|
|
184
176
|
fn non_empty_trimmed(value: &str) -> Option<String> {
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
//! SVG and MathML element handling with serialization and base64 encoding.
|
|
2
2
|
|
|
3
|
+
use crate::converter::main_helpers::tag_name_eq;
|
|
4
|
+
use crate::converter::utility::content::normalized_tag_name;
|
|
3
5
|
#[allow(unused_imports)]
|
|
4
6
|
use std::collections::BTreeMap;
|
|
5
7
|
use tl::{NodeHandle, Parser};
|
|
@@ -93,7 +95,7 @@ pub(crate) fn handle_inline_svg(
|
|
|
93
95
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
94
96
|
pub(crate) fn serialize_element(node_handle: &NodeHandle, parser: &Parser) -> String {
|
|
95
97
|
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
|
|
96
|
-
let tag_name = normalized_tag_name(tag.name().as_utf8_str()
|
|
98
|
+
let tag_name = normalized_tag_name(tag.name().as_utf8_str());
|
|
97
99
|
let mut html = String::with_capacity(256);
|
|
98
100
|
html.push('<');
|
|
99
101
|
html.push_str(&tag_name);
|
|
@@ -142,11 +144,6 @@ pub(crate) fn serialize_node(node_handle: &NodeHandle, parser: &Parser) -> Strin
|
|
|
142
144
|
}
|
|
143
145
|
}
|
|
144
146
|
|
|
145
|
-
/// Normalize tag name to lowercase.
|
|
146
|
-
fn normalized_tag_name(name: &str) -> String {
|
|
147
|
-
name.to_ascii_lowercase()
|
|
148
|
-
}
|
|
149
|
-
|
|
150
147
|
/// Extract non-empty trimmed string or return None.
|
|
151
148
|
#[cfg(feature = "inline-images")]
|
|
152
149
|
fn non_empty_trimmed(value: &str) -> Option<String> {
|
|
@@ -158,12 +155,6 @@ fn non_empty_trimmed(value: &str) -> Option<String> {
|
|
|
158
155
|
}
|
|
159
156
|
}
|
|
160
157
|
|
|
161
|
-
/// Encode SVG to base64 data URI.
|
|
162
|
-
pub(crate) fn encode_svg_to_data_uri(svg_html: &str) -> String {
|
|
163
|
-
use base64::{Engine as _, engine::general_purpose::STANDARD};
|
|
164
|
-
STANDARD.encode(svg_html.as_bytes())
|
|
165
|
-
}
|
|
166
|
-
|
|
167
158
|
/// Handle SVG element conversion to Markdown.
|
|
168
159
|
///
|
|
169
160
|
/// Extracts title from child elements, handles inline image collection,
|
|
@@ -179,7 +170,6 @@ pub(crate) fn handle_svg(
|
|
|
179
170
|
_depth: usize,
|
|
180
171
|
dom_ctx: &super::DomContext,
|
|
181
172
|
) {
|
|
182
|
-
use crate::converter::main_helpers::tag_name_eq;
|
|
183
173
|
use crate::converter::utility::content::get_text_content;
|
|
184
174
|
|
|
185
175
|
let mut title = String::from("SVG Image");
|
|
@@ -100,7 +100,7 @@ fn handle_head(
|
|
|
100
100
|
let json = child_tag.inner_text(parser);
|
|
101
101
|
let json = json.trim();
|
|
102
102
|
if !json.is_empty() {
|
|
103
|
-
let json = decode_html_entities(json)
|
|
103
|
+
let json = decode_html_entities(json);
|
|
104
104
|
if !json.is_empty() {
|
|
105
105
|
collector.borrow_mut().add_json_ld(json);
|
|
106
106
|
}
|
|
@@ -68,7 +68,6 @@
|
|
|
68
68
|
//! - Inline image extraction (`inline-images` feature)
|
|
69
69
|
//! - Metadata collection (`metadata` feature)
|
|
70
70
|
//! - Custom visitor callbacks (`visitor` feature)
|
|
71
|
-
//! - Async visitor support (`async-visitor` feature)
|
|
72
71
|
//!
|
|
73
72
|
//! # Example Integration
|
|
74
73
|
//!
|
|
@@ -120,17 +119,10 @@ pub use self::main::convert_html;
|
|
|
120
119
|
#[cfg(feature = "visitor")]
|
|
121
120
|
pub use self::main::convert_html_with_visitor;
|
|
122
121
|
|
|
123
|
-
#[cfg(feature = "async-visitor")]
|
|
124
|
-
pub use self::main::convert_html_with_visitor_async;
|
|
125
|
-
|
|
126
122
|
// Import the tree walker and utility functions from main and main_helpers
|
|
127
123
|
pub(crate) use self::main::{convert_html_impl, walk_node};
|
|
128
124
|
pub(crate) use self::main_helpers::trim_trailing_whitespace;
|
|
129
125
|
|
|
130
|
-
#[cfg(feature = "async-visitor")]
|
|
131
|
-
#[allow(unused_imports)]
|
|
132
|
-
pub(crate) use self::main::convert_html_impl_async;
|
|
133
|
-
|
|
134
126
|
// Re-export helper functions from utility modules (migrated from converter_legacy)
|
|
135
127
|
pub(crate) use crate::converter::utility::content::{chomp_inline, get_text_content, normalized_tag_name};
|
|
136
128
|
#[allow(unused_imports)]
|
|
@@ -292,22 +292,47 @@ fn ensure_newline(buf: &mut String) {
|
|
|
292
292
|
}
|
|
293
293
|
}
|
|
294
294
|
|
|
295
|
-
///
|
|
296
|
-
fn
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
295
|
+
/// Collapse runs of 3 or more consecutive newlines to exactly 2 in a single pass.
|
|
296
|
+
fn collapse_triple_newlines(buf: &mut String) {
|
|
297
|
+
let bytes = buf.as_bytes();
|
|
298
|
+
let mut result = String::with_capacity(buf.len());
|
|
299
|
+
let mut newline_count = 0usize;
|
|
300
|
+
for &b in bytes {
|
|
301
|
+
if b == b'\n' {
|
|
302
|
+
newline_count += 1;
|
|
303
|
+
if newline_count <= 2 {
|
|
304
|
+
result.push('\n');
|
|
305
|
+
}
|
|
306
|
+
} else {
|
|
307
|
+
newline_count = 0;
|
|
308
|
+
result.push(b as char);
|
|
309
|
+
}
|
|
300
310
|
}
|
|
311
|
+
*buf = result;
|
|
312
|
+
}
|
|
301
313
|
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
314
|
+
/// Trim trailing whitespace from every line in a buffer without allocating per-line strings.
|
|
315
|
+
///
|
|
316
|
+
/// Uses a single allocation of the same capacity, writing each line's trimmed content
|
|
317
|
+
/// and inserting newline separators directly.
|
|
318
|
+
fn trim_line_ends(buf: &mut String) {
|
|
319
|
+
let mut result = String::with_capacity(buf.len());
|
|
320
|
+
for line in buf.lines() {
|
|
321
|
+
if !result.is_empty() {
|
|
322
|
+
result.push('\n');
|
|
309
323
|
}
|
|
324
|
+
result.push_str(line.trim_end());
|
|
310
325
|
}
|
|
326
|
+
*buf = result;
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
/// Post-process: collapse 3+ newlines to 2, trim line-end whitespace, ensure single trailing newline.
|
|
330
|
+
fn post_process(buf: &mut String) {
|
|
331
|
+
// Collapse runs of 3+ newlines to exactly 2
|
|
332
|
+
collapse_triple_newlines(buf);
|
|
333
|
+
|
|
334
|
+
// Trim trailing whitespace from each line in-place
|
|
335
|
+
trim_line_ends(buf);
|
|
311
336
|
|
|
312
337
|
// Trim to single trailing newline
|
|
313
338
|
let keep = buf.trim_end_matches('\n').len();
|
|
@@ -15,6 +15,8 @@
|
|
|
15
15
|
// Note: Context and DomContext are defined in converter.rs
|
|
16
16
|
// walk_node is also defined there and must be called via the parent module
|
|
17
17
|
|
|
18
|
+
use crate::converter::utility::content::chomp_inline;
|
|
19
|
+
|
|
18
20
|
/// Handles the `<dfn>` element.
|
|
19
21
|
///
|
|
20
22
|
/// A dfn element marks a term that is being defined. The content represents
|
|
@@ -220,14 +222,9 @@ pub fn handle_q(
|
|
|
220
222
|
|
|
221
223
|
let trimmed = content.trim();
|
|
222
224
|
if !trimmed.is_empty() {
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
output.push('"');
|
|
227
|
-
let escaped = trimmed.replace('\\', r"\\").replace('"', r#"\""#);
|
|
228
|
-
output.push_str(&escaped);
|
|
229
|
-
output.push('"');
|
|
230
|
-
}
|
|
225
|
+
output.push('"');
|
|
226
|
+
output.push_str(trimmed);
|
|
227
|
+
output.push('"');
|
|
231
228
|
}
|
|
232
229
|
}
|
|
233
230
|
}
|
|
@@ -256,28 +253,6 @@ pub fn handle(
|
|
|
256
253
|
}
|
|
257
254
|
}
|
|
258
255
|
|
|
259
|
-
/// Extracts prefix, suffix, and trimmed content from inline element text.
|
|
260
|
-
///
|
|
261
|
-
/// This helper function splits leading and trailing whitespace from content,
|
|
262
|
-
/// allowing inline elements to preserve surrounding whitespace context.
|
|
263
|
-
///
|
|
264
|
-
/// # Returns
|
|
265
|
-
///
|
|
266
|
-
/// A tuple of `(prefix, suffix, trimmed_content)` where:
|
|
267
|
-
/// - `prefix`: Leading whitespace (spaces, tabs, newlines)
|
|
268
|
-
/// - `suffix`: Trailing whitespace (spaces, tabs, newlines)
|
|
269
|
-
/// - `trimmed_content`: The content without leading/trailing whitespace
|
|
270
|
-
fn chomp_inline(content: &str) -> (&str, &str, &str) {
|
|
271
|
-
let trimmed = content.trim();
|
|
272
|
-
let prefix_len = content.len() - content.trim_start().len();
|
|
273
|
-
let suffix_len = content.len() - content.trim_end().len();
|
|
274
|
-
|
|
275
|
-
let prefix = &content[..prefix_len];
|
|
276
|
-
let suffix = &content[content.len() - suffix_len..];
|
|
277
|
-
|
|
278
|
-
(prefix, suffix, trimmed)
|
|
279
|
-
}
|
|
280
|
-
|
|
281
256
|
/// Appends inline suffix to the output.
|
|
282
257
|
///
|
|
283
258
|
/// This is a placeholder for integrating with other inline formatting systems
|
|
@@ -164,3 +164,32 @@ pub fn handle(
|
|
|
164
164
|
_ => {}
|
|
165
165
|
}
|
|
166
166
|
}
|
|
167
|
+
|
|
168
|
+
#[cfg(test)]
|
|
169
|
+
mod tests {
|
|
170
|
+
#[test]
|
|
171
|
+
fn figure_caption_separated_from_image() {
|
|
172
|
+
let html = r#"<figure><img src="photo.jpg" alt="Photo"><figcaption>A nice photo</figcaption></figure>"#;
|
|
173
|
+
let result = crate::convert(html, None).unwrap();
|
|
174
|
+
let content = result.content.unwrap_or_default();
|
|
175
|
+
assert!(
|
|
176
|
+
content.contains("![Photo](photo.jpg)"),
|
|
177
|
+
"image should be present: {}",
|
|
178
|
+
content
|
|
179
|
+
);
|
|
180
|
+
assert!(
|
|
181
|
+
content.contains("A nice photo"),
|
|
182
|
+
"caption should be present: {}",
|
|
183
|
+
content
|
|
184
|
+
);
|
|
185
|
+
// Image and caption should not be on the same line
|
|
186
|
+
let lines: Vec<&str> = content.lines().filter(|l| !l.trim().is_empty()).collect();
|
|
187
|
+
let img_line = lines.iter().position(|l| l.contains("![")).unwrap_or(999);
|
|
188
|
+
let cap_line = lines.iter().position(|l| l.contains("A nice photo")).unwrap_or(999);
|
|
189
|
+
assert!(
|
|
190
|
+
cap_line > img_line,
|
|
191
|
+
"caption should be on a separate line after image, lines: {:?}",
|
|
192
|
+
lines
|
|
193
|
+
);
|
|
194
|
+
}
|
|
195
|
+
}
|