html-to-markdown 3.2.3 → 3.4.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Steepfile +6 -0
- data/ext/html_to_markdown_rb/Cargo.toml +2 -2
- data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
- data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +5 -3
- data/sig/types.rbs +769 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
- data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
- data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
- data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
- data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
- data/vendor/html-to-markdown-rs/src/text.rs +0 -44
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
- data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
- data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
- data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
- metadata +21 -43
- data/.bundle/config +0 -2
- data/.gitignore +0 -3
- data/.rubocop.yml +0 -59
- data/Gemfile +0 -18
- data/Gemfile.lock +0 -173
- data/README.md +0 -331
- data/Rakefile +0 -26
- data/exe/html-to-markdown +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
- data/html-to-markdown-rb.gemspec +0 -99
- data/lib/html_to_markdown_rs.rb +0 -3
- data/sig/html_to_markdown.rbs +0 -149
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
- data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "3.2.3"
|
|
6
|
+
version = "3.4.0-rc.13"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.85"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -18,7 +18,7 @@ clap = { version = "4.6", features = ["derive"] }
|
|
|
18
18
|
clap_complete = "4.6"
|
|
19
19
|
clap_mangen = "0.3"
|
|
20
20
|
encoding_rs = "0.8"
|
|
21
|
-
ext-php-rs = "0.15.
|
|
21
|
+
ext-php-rs = "0.15.12"
|
|
22
22
|
html5ever = "0.39.0"
|
|
23
23
|
once_cell = "1.21"
|
|
24
24
|
pyo3 = { version = "0.28.3", features = ["abi3-py310"] }
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
fn main() {
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
fn main() {
|
|
@@ -7,6 +7,6 @@ fn main() {
|
|
|
7
7
|
let opts: ConversionOptions = serde_json::from_str(json).unwrap();
|
|
8
8
|
println!("code_block_style: {:?}", opts.code_block_style);
|
|
9
9
|
|
|
10
|
-
let result = html_to_markdown_rs::convert("<pre><code>some code</code></pre>", Some(opts)).unwrap();
|
|
10
|
+
let result = html_to_markdown_rs::convert("<pre><code>some code</code></pre>", Some(opts), None).unwrap();
|
|
11
11
|
println!("result: {:?}", result.content);
|
|
12
12
|
}
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
fn main() {
|
|
@@ -3,7 +3,7 @@ fn convert(
|
|
|
3
3
|
html: &str,
|
|
4
4
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
5
5
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
6
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
6
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
7
7
|
}
|
|
8
8
|
|
|
9
9
|
use html_to_markdown_rs::ConversionOptions;
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
fn main() {
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
fn main() {
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
fn main() {
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
fn main() {
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
fn main() {
|
|
@@ -30,14 +30,18 @@ use crate::{HtmlMetadata, MetadataConfig};
|
|
|
30
30
|
/// use html_to_markdown_rs::{convert, ConversionOptions};
|
|
31
31
|
///
|
|
32
32
|
/// let html = "<h1>Hello World</h1>";
|
|
33
|
-
/// let result = convert(html, None).unwrap();
|
|
33
|
+
/// let result = convert(html, None, None).unwrap();
|
|
34
34
|
/// assert!(result.content.as_deref().unwrap_or("").contains("Hello World"));
|
|
35
35
|
/// ```
|
|
36
36
|
///
|
|
37
37
|
/// # Errors
|
|
38
38
|
///
|
|
39
39
|
/// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
|
|
40
|
-
pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<ConversionResult> {
|
|
40
|
+
pub fn convert(
|
|
41
|
+
html: &str,
|
|
42
|
+
options: Option<ConversionOptions>,
|
|
43
|
+
#[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
|
|
44
|
+
) -> Result<ConversionResult> {
|
|
41
45
|
use std::cell::RefCell;
|
|
42
46
|
use std::rc::Rc;
|
|
43
47
|
|
|
@@ -96,6 +100,11 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
|
|
|
96
100
|
None
|
|
97
101
|
};
|
|
98
102
|
|
|
103
|
+
// When the visitor feature is not enabled, there is no visitor parameter.
|
|
104
|
+
// convert_html_impl expects `Option<()>` in the non-visitor slot.
|
|
105
|
+
#[cfg(not(feature = "visitor"))]
|
|
106
|
+
let visitor: Option<()> = None;
|
|
107
|
+
|
|
99
108
|
// Run the conversion pipeline.
|
|
100
109
|
// Pass structure_collector by value — convert_html_impl will consume it via Rc::try_unwrap
|
|
101
110
|
// to return the finished DocumentStructure. We must not hold a second Rc reference.
|
|
@@ -107,7 +116,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
|
|
|
107
116
|
&options,
|
|
108
117
|
image_collector.as_ref().map(Rc::clone),
|
|
109
118
|
metadata_collector.as_ref().map(Rc::clone),
|
|
110
|
-
|
|
119
|
+
visitor,
|
|
111
120
|
structure_collector,
|
|
112
121
|
)?
|
|
113
122
|
}
|
|
@@ -118,7 +127,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
|
|
|
118
127
|
&options,
|
|
119
128
|
None,
|
|
120
129
|
metadata_collector.as_ref().map(Rc::clone),
|
|
121
|
-
|
|
130
|
+
visitor,
|
|
122
131
|
structure_collector,
|
|
123
132
|
)?
|
|
124
133
|
}
|
|
@@ -129,7 +138,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
|
|
|
129
138
|
&options,
|
|
130
139
|
image_collector.as_ref().map(Rc::clone),
|
|
131
140
|
None,
|
|
132
|
-
|
|
141
|
+
visitor,
|
|
133
142
|
structure_collector,
|
|
134
143
|
)?
|
|
135
144
|
}
|
|
@@ -140,7 +149,7 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
|
|
|
140
149
|
&options,
|
|
141
150
|
None,
|
|
142
151
|
None,
|
|
143
|
-
|
|
152
|
+
visitor,
|
|
144
153
|
structure_collector,
|
|
145
154
|
)?
|
|
146
155
|
}
|
|
@@ -201,25 +210,6 @@ pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<Convers
|
|
|
201
210
|
})
|
|
202
211
|
}
|
|
203
212
|
|
|
204
|
-
/// Internal: convert with visitor support. Used by FFI crate.
|
|
205
|
-
/// Will be removed when convert() accepts visitor parameter directly.
|
|
206
|
-
#[cfg(feature = "visitor")]
|
|
207
|
-
#[doc(hidden)]
|
|
208
|
-
pub fn convert_with_visitor(
|
|
209
|
-
html: &str,
|
|
210
|
-
options: Option<ConversionOptions>,
|
|
211
|
-
visitor: Option<crate::visitor::VisitorHandle>,
|
|
212
|
-
) -> Result<String> {
|
|
213
|
-
let options = options.unwrap_or_default();
|
|
214
|
-
let normalized_html = normalize_input(html)?;
|
|
215
|
-
let markdown = crate::converter::convert_html_with_visitor(normalized_html.as_ref(), &options, visitor)?;
|
|
216
|
-
if options.wrap {
|
|
217
|
-
Ok(crate::wrapper::wrap_markdown(&markdown, &options))
|
|
218
|
-
} else {
|
|
219
|
-
Ok(markdown)
|
|
220
|
-
}
|
|
221
|
-
}
|
|
222
|
-
|
|
223
213
|
/// Validate and normalize HTML input for conversion.
|
|
224
214
|
fn normalize_input(html: &str) -> Result<Cow<'_, str>> {
|
|
225
215
|
let decoded = decode_utf16_if_needed(html);
|
|
@@ -22,7 +22,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
22
22
|
///
|
|
23
23
|
/// Processes blockquote content, applies `> ` prefix to each line,
|
|
24
24
|
/// handles optional `cite` attribution, and manages spacing.
|
|
25
|
-
pub
|
|
25
|
+
pub fn handle(
|
|
26
26
|
node_handle: &NodeHandle,
|
|
27
27
|
parser: &Parser,
|
|
28
28
|
output: &mut String,
|
|
@@ -28,7 +28,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
28
28
|
/// * `ctx` - Current conversion context
|
|
29
29
|
/// * `depth` - Current recursion depth
|
|
30
30
|
/// * `dom_ctx` - DOM context for tracking relationships
|
|
31
|
-
pub(crate) fn handle_structural_container(
|
|
31
|
+
pub fn handle_structural_container(
|
|
32
32
|
node_handle: &NodeHandle,
|
|
33
33
|
parser: &Parser,
|
|
34
34
|
output: &mut String,
|
|
@@ -64,7 +64,7 @@ pub(crate) fn handle_structural_container(
|
|
|
64
64
|
/// * `ctx` - Current conversion context
|
|
65
65
|
/// * `depth` - Current recursion depth
|
|
66
66
|
/// * `dom_ctx` - DOM context for tracking relationships
|
|
67
|
-
pub(crate) fn handle_passthrough(
|
|
67
|
+
pub fn handle_passthrough(
|
|
68
68
|
node_handle: &NodeHandle,
|
|
69
69
|
parser: &Parser,
|
|
70
70
|
output: &mut String,
|
|
@@ -101,7 +101,7 @@ pub(crate) fn handle_passthrough(
|
|
|
101
101
|
/// * `_depth` - Current recursion depth (unused)
|
|
102
102
|
/// * `_dom_ctx` - DOM context (unused)
|
|
103
103
|
#[inline]
|
|
104
|
-
pub(crate) fn handle_noop(
|
|
104
|
+
pub fn handle_noop(
|
|
105
105
|
_node_handle: &NodeHandle,
|
|
106
106
|
_parser: &Parser,
|
|
107
107
|
_output: &mut String,
|
|
@@ -25,7 +25,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
25
25
|
/// # Note
|
|
26
26
|
/// This function references `walk_node` and helper functions from converter.rs
|
|
27
27
|
/// which must be accessible (pub(crate)) for this module to work correctly.
|
|
28
|
-
pub
|
|
28
|
+
pub fn handle(
|
|
29
29
|
node_handle: &NodeHandle,
|
|
30
30
|
parser: &Parser,
|
|
31
31
|
output: &mut String,
|
|
@@ -27,7 +27,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
27
27
|
/// # Note
|
|
28
28
|
/// This function references `walk_node` from converter.rs which must be
|
|
29
29
|
/// accessible (pub(crate)) for this module to work correctly.
|
|
30
|
-
pub(crate) fn handle(
|
|
30
|
+
pub fn handle(
|
|
31
31
|
tag_name: &str,
|
|
32
32
|
node_handle: &NodeHandle,
|
|
33
33
|
parser: &Parser,
|
|
@@ -149,7 +149,7 @@ pub(crate) fn handle(
|
|
|
149
149
|
}
|
|
150
150
|
|
|
151
151
|
/// Determine if a heading element should allow inline images.
|
|
152
|
-
pub(crate) fn heading_allows_inline_images(
|
|
152
|
+
pub fn heading_allows_inline_images(
|
|
153
153
|
tag_name: &str,
|
|
154
154
|
keep_inline_images_in: &std::rc::Rc<std::collections::HashSet<String>>,
|
|
155
155
|
) -> bool {
|
|
@@ -189,7 +189,7 @@ fn normalize_heading_text(text: &str) -> Cow<'_, str> {
|
|
|
189
189
|
}
|
|
190
190
|
|
|
191
191
|
/// Format heading output with appropriate markdown syntax.
|
|
192
|
-
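pub(crate) fn push_heading(output: &mut String, ctx: &Context, options: &ConversionOptions, level: usize, text: &str) {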
|
|
192
|
+
pub fn push_heading(output: &mut String, ctx: &Context, options: &ConversionOptions, level: usize, text: &str) {
|
|
193
193
|
if text.is_empty() {
|
|
194
194
|
return;
|
|
195
195
|
}
|
|
@@ -374,7 +374,7 @@ fn visitor_heading_output(
|
|
|
374
374
|
/// - Multiple headings are found
|
|
375
375
|
/// - Non-whitespace non-heading content exists
|
|
376
376
|
/// - Non-text comments exist
|
|
377
|
-
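pub(crate) fn find_single_heading_child(node_handle: NodeHandle, parser: &Parser) -> Option<(usize, NodeHandle)> {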
|
|
377
|
+
pub fn find_single_heading_child(node_handle: NodeHandle, parser: &Parser) -> Option<(usize, NodeHandle)> {
|
|
378
378
|
let node = node_handle.get(parser)?;
|
|
379
379
|
|
|
380
380
|
let tl::Node::Tag(tag) = node else {
|
|
@@ -397,13 +397,12 @@ pub(crate) fn find_single_heading_child(node_handle: NodeHandle, parser: &Parser
|
|
|
397
397
|
}
|
|
398
398
|
tl::Node::Tag(child_tag) => {
|
|
399
399
|
let name = crate::converter::utility::content::normalized_tag_name(child_tag.name().as_utf8_str());
|
|
400
|
-
|
|
400
|
+
{
|
|
401
|
+
let level = heading_level_from_name(name.as_ref())?;
|
|
401
402
|
if heading_data.is_some() {
|
|
402
403
|
return None;
|
|
403
404
|
}
|
|
404
405
|
heading_data = Some((level, *child_handle));
|
|
405
|
-
} else {
|
|
406
|
-
return None;
|
|
407
406
|
}
|
|
408
407
|
}
|
|
409
408
|
tl::Node::Comment(_) => return None,
|
|
@@ -15,7 +15,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
15
15
|
///
|
|
16
16
|
/// Converts to Markdown horizontal rule (---) with appropriate blank line
|
|
17
17
|
/// spacing based on context and previous siblings.
|
|
18
|
-
pub
|
|
18
|
+
pub fn handle(
|
|
19
19
|
node_handle: &NodeHandle,
|
|
20
20
|
parser: &Parser,
|
|
21
21
|
output: &mut String,
|
|
@@ -15,7 +15,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
15
15
|
///
|
|
16
16
|
/// Converts to appropriate Markdown line break syntax based on the configured
|
|
17
17
|
/// newline style and current context (e.g., in headings).
|
|
18
|
-
pub
|
|
18
|
+
pub fn handle(
|
|
19
19
|
_node_handle: &NodeHandle,
|
|
20
20
|
_parser: &Parser,
|
|
21
21
|
output: &mut String,
|
|
@@ -1,22 +1,3 @@
|
|
|
1
|
-
//! Block element handlers for HTML to Markdown conversion.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides specialized handlers for block-level HTML elements:
|
|
4
|
-
//! - Headings (h1-h6)
|
|
5
|
-
//! - Paragraphs (p)
|
|
6
|
-
//! - Blockquotes (blockquote)
|
|
7
|
-
//! - Preformatted code (pre)
|
|
8
|
-
//! - Tables (table, thead, tbody, tfoot, tr, th, td)
|
|
9
|
-
//!
|
|
10
|
-
//! These handlers are designed to be extracted from the main `converter.rs`
|
|
11
|
-
//! file and integrated once the converter module is refactored.
|
|
12
|
-
//!
|
|
13
|
-
//! **Note on Current Integration:**
|
|
14
|
-
//! This module cannot currently be fully integrated into converter.rs due to
|
|
15
|
-
//! Rust's module system rules (cannot have both converter.rs and converter/mod.rs).
|
|
16
|
-
//! Once converter.rs is refactored to use converter/main.rs or similar pattern,
|
|
17
|
-
//! these handlers should be exposed through converter/mod.rs and used in the
|
|
18
|
-
//! main walk_node function via the dispatch_block_handler function below.
|
|
19
|
-
|
|
20
1
|
pub mod blockquote;
|
|
21
2
|
pub mod container;
|
|
22
3
|
pub mod div;
|
|
@@ -27,92 +8,3 @@ pub mod paragraph;
|
|
|
27
8
|
pub mod preformatted;
|
|
28
9
|
pub mod table;
|
|
29
10
|
pub mod unknown;
|
|
30
|
-
|
|
31
|
-
// Re-export types from parent module for submodule access
|
|
32
|
-
pub use super::{Context, DomContext};
|
|
33
|
-
|
|
34
|
-
// Re-export for internal use by dispatcher (crate-private)
|
|
35
|
-
|
|
36
|
-
/// Dispatches block element handling to the appropriate handler.
|
|
37
|
-
///
|
|
38
|
-
/// This function is designed to be called from the main walk_node function
|
|
39
|
-
/// in converter.rs once the module is refactored. It returns `true` if the
|
|
40
|
-
/// element was handled, `false` otherwise.
|
|
41
|
-
///
|
|
42
|
-
/// # Usage in converter.rs
|
|
43
|
-
/// ```text
|
|
44
|
-
/// if crate::converter::block::dispatch_block_handler(
|
|
45
|
-
/// &tag_name,
|
|
46
|
-
/// node_handle,
|
|
47
|
-
/// parser,
|
|
48
|
-
/// output,
|
|
49
|
-
/// options,
|
|
50
|
-
/// ctx,
|
|
51
|
-
/// depth,
|
|
52
|
-
/// dom_ctx,
|
|
53
|
-
/// ) {
|
|
54
|
-
/// return; // Element was handled
|
|
55
|
-
/// }
|
|
56
|
-
/// ```
|
|
57
|
-
pub fn dispatch_block_handler(
|
|
58
|
-
tag_name: &str,
|
|
59
|
-
node_handle: &tl::NodeHandle,
|
|
60
|
-
parser: &tl::Parser,
|
|
61
|
-
output: &mut String,
|
|
62
|
-
options: &crate::options::ConversionOptions,
|
|
63
|
-
ctx: &super::Context,
|
|
64
|
-
depth: usize,
|
|
65
|
-
dom_ctx: &super::DomContext,
|
|
66
|
-
) -> bool {
|
|
67
|
-
match tag_name {
|
|
68
|
-
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
|
|
69
|
-
heading::handle(tag_name, node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
70
|
-
true
|
|
71
|
-
}
|
|
72
|
-
"p" => {
|
|
73
|
-
paragraph::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
74
|
-
true
|
|
75
|
-
}
|
|
76
|
-
"blockquote" => {
|
|
77
|
-
blockquote::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
78
|
-
true
|
|
79
|
-
}
|
|
80
|
-
"pre" => {
|
|
81
|
-
preformatted::handle_pre(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
82
|
-
true
|
|
83
|
-
}
|
|
84
|
-
"br" => {
|
|
85
|
-
line_break::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
86
|
-
true
|
|
87
|
-
}
|
|
88
|
-
"hr" => {
|
|
89
|
-
horizontal_rule::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
90
|
-
true
|
|
91
|
-
}
|
|
92
|
-
"div" => {
|
|
93
|
-
div::handle(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
94
|
-
true
|
|
95
|
-
}
|
|
96
|
-
"table" => {
|
|
97
|
-
table::handle_table(node_handle, parser, output, options, ctx, dom_ctx, depth);
|
|
98
|
-
true
|
|
99
|
-
}
|
|
100
|
-
"caption" => {
|
|
101
|
-
table::handle_caption(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
102
|
-
true
|
|
103
|
-
}
|
|
104
|
-
"body" | "html" => {
|
|
105
|
-
container::handle_structural_container(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
106
|
-
true
|
|
107
|
-
}
|
|
108
|
-
"time" | "data" => {
|
|
109
|
-
container::handle_passthrough(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
110
|
-
true
|
|
111
|
-
}
|
|
112
|
-
"wbr" | "source" | "thead" | "tbody" | "tfoot" | "tr" | "th" | "td" => {
|
|
113
|
-
container::handle_noop(node_handle, parser, output, options, ctx, depth, dom_ctx);
|
|
114
|
-
true
|
|
115
|
-
}
|
|
116
|
-
_ => false,
|
|
117
|
-
}
|
|
118
|
-
}
|
|
@@ -18,7 +18,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
18
18
|
///
|
|
19
19
|
/// Processes children with proper context, manages spacing,
|
|
20
20
|
/// and handles special cases for table cells and list items.
|
|
21
|
-
pub
|
|
21
|
+
pub fn handle(
|
|
22
22
|
node_handle: &NodeHandle,
|
|
23
23
|
parser: &Parser,
|
|
24
24
|
output: &mut String,
|
|
@@ -19,7 +19,7 @@ type Context = crate::converter::Context;
|
|
|
19
19
|
type DomContext = crate::converter::DomContext;
|
|
20
20
|
|
|
21
21
|
/// Handle preformatted code blocks (pre element).
|
|
22
|
-
pub
|
|
22
|
+
pub fn handle_pre(
|
|
23
23
|
node_handle: &NodeHandle,
|
|
24
24
|
parser: &Parser,
|
|
25
25
|
output: &mut String,
|
|
@@ -400,7 +400,7 @@ mod tests {
|
|
|
400
400
|
#[test]
|
|
401
401
|
fn single_nested_table_stays_as_table() {
|
|
402
402
|
let html = r"<table><tr><td>Label</td><td><table><tr><td>A</td><td>B</td></tr></table></td></tr></table>";
|
|
403
|
-
let result = crate::convert(html, None).unwrap();
|
|
403
|
+
let result = crate::convert(html, None, None).unwrap();
|
|
404
404
|
let content = result.content.unwrap_or_default();
|
|
405
405
|
assert!(content.contains('|'), "should produce pipe table, not list");
|
|
406
406
|
}
|
|
@@ -185,7 +185,7 @@ mod tests {
|
|
|
185
185
|
#[test]
|
|
186
186
|
fn rich_formatting_preserved_in_cells() {
|
|
187
187
|
let html = "<table><tr><th>H</th></tr><tr><td><strong>Bold</strong> and <em>italic</em></td></tr></table>";
|
|
188
|
-
let result = crate::convert(html, None).unwrap();
|
|
188
|
+
let result = crate::convert(html, None, None).unwrap();
|
|
189
189
|
let content = result.content.unwrap_or_default();
|
|
190
190
|
assert!(
|
|
191
191
|
content.contains("**Bold**") || content.contains("__Bold__"),
|
|
@@ -20,11 +20,9 @@ pub mod scanner;
|
|
|
20
20
|
pub(super) mod utils;
|
|
21
21
|
|
|
22
22
|
// Re-export types from parent module for submodule access
|
|
23
|
-
pub use super::super::{Context, DomContext};
|
|
24
23
|
|
|
25
24
|
// Re-export for use in converter.rs
|
|
26
|
-
pub
|
|
27
|
-
pub(crate) use caption::handle_caption;
|
|
25
|
+
pub use caption::handle_caption;
|
|
28
26
|
|
|
29
27
|
/// Dispatches table element handling to the main convert_table function.
|
|
30
28
|
///
|
|
@@ -84,7 +82,7 @@ pub fn dispatch_table_handler(
|
|
|
84
82
|
/// * `ctx` - Conversion context (includes list state)
|
|
85
83
|
/// * `dom_ctx` - DOM context for tree structure info
|
|
86
84
|
/// * `depth` - Current nesting depth
|
|
87
|
-
pub
|
|
85
|
+
pub fn handle_table_with_context(
|
|
88
86
|
node_handle: &tl::NodeHandle,
|
|
89
87
|
parser: &tl::Parser,
|
|
90
88
|
output: &mut String,
|
|
@@ -32,7 +32,7 @@ type DomContext = crate::converter::DomContext;
|
|
|
32
32
|
/// # Code Block Detection
|
|
33
33
|
/// Code blocks (identified by markdown formatting) are always preserved,
|
|
34
34
|
/// even if they appear "empty" according to trim().
|
|
35
|
-
pub
|
|
35
|
+
pub fn handle(
|
|
36
36
|
node_handle: &NodeHandle,
|
|
37
37
|
parser: &Parser,
|
|
38
38
|
output: &mut String,
|
|
@@ -80,6 +80,8 @@ pub struct Context {
|
|
|
80
80
|
pub(crate) preserve_tags: Rc<HashSet<String>>,
|
|
81
81
|
/// Tag names that allow inline images inside headings.
|
|
82
82
|
pub(crate) keep_inline_images_in: Rc<HashSet<String>>,
|
|
83
|
+
/// Node IDs matching `exclude_selectors` — these nodes and all descendants are dropped.
|
|
84
|
+
pub(crate) excluded_node_ids: Rc<HashSet<u32>>,
|
|
83
85
|
#[cfg(feature = "inline-images")]
|
|
84
86
|
/// Shared collector for inline images when enabled.
|
|
85
87
|
pub(crate) inline_collector: Option<InlineCollectorHandle>,
|
|
@@ -111,6 +113,13 @@ pub struct Context {
|
|
|
111
113
|
}
|
|
112
114
|
|
|
113
115
|
impl Context {
|
|
116
|
+
/// Set the pre-computed set of node IDs that match `exclude_selectors`.
|
|
117
|
+
///
|
|
118
|
+
/// Called in `convert_html_impl` after DOM parsing, before the walk starts.
|
|
119
|
+
pub(crate) fn set_excluded_node_ids(&mut self, ids: HashSet<u32>) {
|
|
120
|
+
self.excluded_node_ids = Rc::new(ids);
|
|
121
|
+
}
|
|
122
|
+
|
|
114
123
|
/// Create a new conversion context from options and optional collectors.
|
|
115
124
|
#[allow(clippy::too_many_arguments)]
|
|
116
125
|
#[cfg_attr(
|
|
@@ -171,6 +180,7 @@ impl Context {
|
|
|
171
180
|
strip_tags: Rc::new(options.strip_tags.iter().cloned().collect()),
|
|
172
181
|
preserve_tags: Rc::new(options.preserve_tags.iter().cloned().collect()),
|
|
173
182
|
keep_inline_images_in: Rc::new(options.keep_inline_images_in.iter().cloned().collect()),
|
|
183
|
+
excluded_node_ids: Rc::new(HashSet::new()),
|
|
174
184
|
#[cfg(feature = "inline-images")]
|
|
175
185
|
inline_collector,
|
|
176
186
|
#[cfg(feature = "metadata")]
|
|
@@ -15,7 +15,7 @@ use crate::text;
|
|
|
15
15
|
///
|
|
16
16
|
/// This struct stores pre-computed information about tag elements to avoid
|
|
17
17
|
/// repeated parsing during tree traversal.
|
|
18
|
-
pub
|
|
18
|
+
pub struct TagInfo {
|
|
19
19
|
/// The normalized (lowercase) tag name.
|
|
20
20
|
pub(crate) name: String,
|
|
21
21
|
/// Whether this element behaves like an inline element (including script/style).
|