html-to-markdown 3.2.4 → 3.4.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Steepfile +6 -0
- data/ext/html_to_markdown_rb/Cargo.toml +2 -2
- data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
- data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +5 -3
- data/sig/types.rbs +769 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
- data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
- data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
- data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
- data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
- data/vendor/html-to-markdown-rs/src/text.rs +0 -44
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
- data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
- data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
- data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
- metadata +21 -43
- data/.bundle/config +0 -2
- data/.gitignore +0 -3
- data/.rubocop.yml +0 -59
- data/Gemfile +0 -18
- data/Gemfile.lock +0 -173
- data/README.md +0 -331
- data/Rakefile +0 -26
- data/exe/html-to-markdown +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
- data/html-to-markdown-rb.gemspec +0 -99
- data/lib/html_to_markdown_rs.rb +0 -3
- data/sig/html_to_markdown.rbs +0 -149
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
- data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
|
@@ -7,7 +7,7 @@ use crate::converter::DomContext;
|
|
|
7
7
|
|
|
8
8
|
/// Get the tag name of the next sibling element.
|
|
9
9
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
10
|
-
pub(crate) fn get_next_sibling_tag<'a>(
|
|
10
|
+
pub fn get_next_sibling_tag<'a>(
|
|
11
11
|
node_handle: &tl::NodeHandle,
|
|
12
12
|
parser: &'a tl::Parser,
|
|
13
13
|
dom_ctx: &'a DomContext,
|
|
@@ -17,7 +17,7 @@ pub(crate) fn get_next_sibling_tag<'a>(
|
|
|
17
17
|
|
|
18
18
|
/// Get the tag name of the previous sibling element.
|
|
19
19
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
20
|
-
pub(crate) fn get_previous_sibling_tag<'a>(
|
|
20
|
+
pub fn get_previous_sibling_tag<'a>(
|
|
21
21
|
node_handle: &tl::NodeHandle,
|
|
22
22
|
parser: &tl::Parser,
|
|
23
23
|
dom_ctx: &'a DomContext,
|
|
@@ -53,17 +53,13 @@ pub(crate) fn get_previous_sibling_tag<'a>(
|
|
|
53
53
|
|
|
54
54
|
/// Check if the previous sibling is an inline tag.
|
|
55
55
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
56
|
-
pub(crate) fn previous_sibling_is_inline_tag(
|
|
57
|
-
node_handle: &tl::NodeHandle,
|
|
58
|
-
parser: &tl::Parser,
|
|
59
|
-
dom_ctx: &DomContext,
|
|
60
|
-
) -> bool {
|
|
56
|
+
pub fn previous_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
|
|
61
57
|
dom_ctx.previous_inline_like(*node_handle, parser)
|
|
62
58
|
}
|
|
63
59
|
|
|
64
60
|
/// Check if the next sibling is whitespace-only text.
|
|
65
61
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
66
|
-
pub(crate) fn next_sibling_is_whitespace_text(
|
|
62
|
+
pub fn next_sibling_is_whitespace_text(
|
|
67
63
|
node_handle: &tl::NodeHandle,
|
|
68
64
|
parser: &tl::Parser,
|
|
69
65
|
dom_ctx: &DomContext,
|
|
@@ -73,11 +69,7 @@ pub(crate) fn next_sibling_is_whitespace_text(
|
|
|
73
69
|
|
|
74
70
|
/// Check if the next sibling is an inline tag.
|
|
75
71
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
76
|
-
pub(crate) fn next_sibling_is_inline_tag(
|
|
77
|
-
node_handle: &tl::NodeHandle,
|
|
78
|
-
parser: &tl::Parser,
|
|
79
|
-
dom_ctx: &DomContext,
|
|
80
|
-
) -> bool {
|
|
72
|
+
pub fn next_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
|
|
81
73
|
dom_ctx.next_inline_like(*node_handle, parser)
|
|
82
74
|
}
|
|
83
75
|
|
|
@@ -85,7 +77,7 @@ pub(crate) fn next_sibling_is_inline_tag(
|
|
|
85
77
|
///
|
|
86
78
|
/// Avoids adding spaces before siblings that are already whitespace.
|
|
87
79
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
88
|
-
pub(crate) fn append_inline_suffix(
|
|
80
|
+
pub fn append_inline_suffix(
|
|
89
81
|
output: &mut String,
|
|
90
82
|
suffix: &str,
|
|
91
83
|
has_core_content: bool,
|
|
@@ -163,6 +163,12 @@ pub struct InlineImage {
|
|
|
163
163
|
pub attributes: BTreeMap<String, String>,
|
|
164
164
|
}
|
|
165
165
|
|
|
166
|
+
impl std::fmt::Display for InlineImage {
|
|
167
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
168
|
+
write!(f, "{self:?}")
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
166
172
|
/// Human-friendly warning emitted during inline image extraction.
|
|
167
173
|
#[derive(Debug, Clone)]
|
|
168
174
|
pub struct InlineImageWarning {
|
|
@@ -47,28 +47,29 @@
|
|
|
47
47
|
// Module Declarations
|
|
48
48
|
// ============================================================================
|
|
49
49
|
|
|
50
|
-
pub mod converter;
|
|
51
50
|
pub mod error;
|
|
52
|
-
#[cfg(feature = "inline-images")]
|
|
53
|
-
mod inline_images;
|
|
54
51
|
#[cfg(feature = "metadata")]
|
|
55
52
|
pub mod metadata;
|
|
56
53
|
pub mod options;
|
|
57
|
-
pub mod safety;
|
|
58
|
-
pub mod text;
|
|
59
54
|
pub mod types;
|
|
60
55
|
#[cfg(feature = "visitor")]
|
|
61
56
|
pub mod visitor;
|
|
62
|
-
#[cfg(feature = "visitor")]
|
|
63
|
-
pub mod visitor_helpers;
|
|
64
|
-
pub mod wrapper;
|
|
65
57
|
|
|
66
58
|
// Internal modules (not part of public API)
|
|
67
59
|
mod convert_api;
|
|
60
|
+
#[allow(dead_code)]
|
|
61
|
+
pub(crate) mod converter;
|
|
68
62
|
mod exports;
|
|
69
|
-
|
|
63
|
+
#[cfg(feature = "inline-images")]
|
|
64
|
+
mod inline_images;
|
|
65
|
+
pub(crate) mod prelude;
|
|
70
66
|
mod rcdom;
|
|
67
|
+
pub(crate) mod text;
|
|
71
68
|
mod validation;
|
|
69
|
+
#[cfg(feature = "visitor")]
|
|
70
|
+
#[allow(clippy::ref_option)]
|
|
71
|
+
pub(crate) mod visitor_helpers;
|
|
72
|
+
pub(crate) mod wrapper;
|
|
72
73
|
|
|
73
74
|
// ============================================================================
|
|
74
75
|
// Public Re-exports (from exports module)
|
|
@@ -79,6 +80,8 @@ pub use types::{
|
|
|
79
80
|
AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent, ProcessingWarning,
|
|
80
81
|
TableData, TableGrid, TextAnnotation, WarningKind,
|
|
81
82
|
};
|
|
83
|
+
#[cfg(feature = "visitor")]
|
|
84
|
+
pub use visitor::{NodeContext, NodeType, VisitResult};
|
|
82
85
|
|
|
83
86
|
// ============================================================================
|
|
84
87
|
// Main Public API Functions
|
|
@@ -95,10 +98,6 @@ pub use convert_api::metadata_config_from_json;
|
|
|
95
98
|
#[cfg(feature = "inline-images")]
|
|
96
99
|
pub use convert_api::inline_image_config_from_json;
|
|
97
100
|
|
|
98
|
-
#[cfg(feature = "visitor")]
|
|
99
|
-
#[doc(hidden)]
|
|
100
|
-
pub use convert_api::convert_with_visitor;
|
|
101
|
-
|
|
102
101
|
// Tests
|
|
103
102
|
// ============================================================================
|
|
104
103
|
|
|
@@ -109,27 +108,27 @@ mod basic_tests {
|
|
|
109
108
|
#[test]
|
|
110
109
|
fn test_binary_input_rejected() {
|
|
111
110
|
let html = format!("abc{}def", "\0".repeat(20));
|
|
112
|
-
let result = convert(&html, None);
|
|
111
|
+
let result = convert(&html, None, None);
|
|
113
112
|
assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
|
|
114
113
|
}
|
|
115
114
|
|
|
116
115
|
#[test]
|
|
117
116
|
fn test_binary_magic_rejected() {
|
|
118
117
|
let html = "%PDF-1.7";
|
|
119
|
-
let result = convert(html, None);
|
|
118
|
+
let result = convert(html, None, None);
|
|
120
119
|
assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
|
|
121
120
|
}
|
|
122
121
|
|
|
123
122
|
#[test]
|
|
124
123
|
fn test_utf16_hint_recovered() {
|
|
125
124
|
let html = String::from_utf8_lossy(b"\xFF\xFE<\0h\0t\0m\0l\0>\0").to_string();
|
|
126
|
-
let result = convert(&html, None);
|
|
125
|
+
let result = convert(&html, None, None);
|
|
127
126
|
assert!(result.is_ok(), "UTF-16 input should be recovered instead of rejected");
|
|
128
127
|
}
|
|
129
128
|
|
|
130
129
|
#[test]
|
|
131
130
|
fn test_plain_text_allowed() {
|
|
132
|
-
let result = convert("Just text", None).unwrap();
|
|
131
|
+
let result = convert("Just text", None, None).unwrap();
|
|
133
132
|
let content = result.content.unwrap_or_default();
|
|
134
133
|
assert!(content.contains("Just text"));
|
|
135
134
|
}
|
|
@@ -141,7 +140,7 @@ mod basic_tests {
|
|
|
141
140
|
escape_underscores: true,
|
|
142
141
|
..ConversionOptions::default()
|
|
143
142
|
};
|
|
144
|
-
let result = convert("Text *asterisks* _underscores_", Some(options)).unwrap();
|
|
143
|
+
let result = convert("Text *asterisks* _underscores_", Some(options), None).unwrap();
|
|
145
144
|
let content = result.content.unwrap_or_default();
|
|
146
145
|
assert!(content.contains(r"\*asterisks\*"));
|
|
147
146
|
assert!(content.contains(r"\_underscores\_"));
|
|
@@ -105,6 +105,21 @@ pub struct ConversionOptions {
|
|
|
105
105
|
pub capture_svg: bool,
|
|
106
106
|
/// Infer image dimensions from data.
|
|
107
107
|
pub infer_dimensions: bool,
|
|
108
|
+
/// Maximum DOM traversal depth. `None` means unlimited.
|
|
109
|
+
/// When set, subtrees beyond this depth are silently truncated.
|
|
110
|
+
pub max_depth: Option<usize>,
|
|
111
|
+
/// CSS selectors for elements to exclude entirely (element + all content).
|
|
112
|
+
///
|
|
113
|
+
/// Unlike `strip_tags` (which removes the tag wrapper but keeps children),
|
|
114
|
+
/// excluded elements and all their descendants are dropped from the output.
|
|
115
|
+
/// Supports any CSS selector that `tl` supports: tag names, `.class`,
|
|
116
|
+
/// `#id`, `[attribute]`, etc.
|
|
117
|
+
///
|
|
118
|
+
/// Invalid selectors are silently skipped at conversion time.
|
|
119
|
+
///
|
|
120
|
+
/// Example: `vec![".cookie-banner".into(), "#ad-container".into(), "[role='complementary']".into()]`
|
|
121
|
+
#[serde(default)]
|
|
122
|
+
pub exclude_selectors: Vec<String>,
|
|
108
123
|
}
|
|
109
124
|
|
|
110
125
|
impl Default for ConversionOptions {
|
|
@@ -148,6 +163,8 @@ impl Default for ConversionOptions {
|
|
|
148
163
|
max_image_size: 5_242_880,
|
|
149
164
|
capture_svg: false,
|
|
150
165
|
infer_dimensions: true,
|
|
166
|
+
max_depth: None,
|
|
167
|
+
exclude_selectors: Vec::new(),
|
|
151
168
|
}
|
|
152
169
|
}
|
|
153
170
|
}
|
|
@@ -255,6 +272,14 @@ impl ConversionOptionsBuilder {
|
|
|
255
272
|
builder_setter!(max_image_size, u64);
|
|
256
273
|
builder_setter!(capture_svg, bool);
|
|
257
274
|
builder_setter!(infer_dimensions, bool);
|
|
275
|
+
builder_setter!(max_depth, Option<usize>);
|
|
276
|
+
|
|
277
|
+
/// Set the list of CSS selectors for elements to exclude entirely from output.
|
|
278
|
+
#[must_use]
|
|
279
|
+
pub fn exclude_selectors(mut self, selectors: Vec<String>) -> Self {
|
|
280
|
+
self.0.exclude_selectors = selectors;
|
|
281
|
+
self
|
|
282
|
+
}
|
|
258
283
|
|
|
259
284
|
// Preprocessing
|
|
260
285
|
/// Set the pre-processing options applied to the HTML before conversion.
|
|
@@ -368,6 +393,10 @@ pub struct ConversionOptionsUpdate {
|
|
|
368
393
|
pub capture_svg: Option<bool>,
|
|
369
394
|
/// Optional override for [`ConversionOptions::infer_dimensions`].
|
|
370
395
|
pub infer_dimensions: Option<bool>,
|
|
396
|
+
/// Optional override for [`ConversionOptions::max_depth`].
|
|
397
|
+
pub max_depth: Option<Option<usize>>,
|
|
398
|
+
/// Optional override for [`ConversionOptions::exclude_selectors`].
|
|
399
|
+
pub exclude_selectors: Option<Vec<String>>,
|
|
371
400
|
}
|
|
372
401
|
|
|
373
402
|
impl ConversionOptions {
|
|
@@ -417,6 +446,8 @@ impl ConversionOptions {
|
|
|
417
446
|
apply!(max_image_size);
|
|
418
447
|
apply!(capture_svg);
|
|
419
448
|
apply!(infer_dimensions);
|
|
449
|
+
apply!(max_depth);
|
|
450
|
+
apply!(exclude_selectors);
|
|
420
451
|
if let Some(preprocessing) = update.preprocessing {
|
|
421
452
|
self.preprocessing.apply_update(preprocessing);
|
|
422
453
|
}
|
|
@@ -1,12 +1 @@
|
|
|
1
|
-
//! Prelude module for convenient imports.
|
|
2
|
-
//!
|
|
3
|
-
//! Re-exports the most commonly used types and functions from the crate.
|
|
4
|
-
//! Users can import everything they need with:
|
|
5
|
-
//! ```
|
|
6
|
-
//! use html_to_markdown_rs::prelude::*;
|
|
7
|
-
//! ```
|
|
8
|
-
|
|
9
|
-
pub use crate::convert;
|
|
10
|
-
pub use crate::error::{ConversionError, Result};
|
|
11
|
-
pub use crate::options::{ConversionOptions, HeadingStyle};
|
|
12
|
-
pub use crate::types::ConversionResult;
|
|
1
|
+
//! Prelude module for convenient internal imports.
|
|
@@ -314,36 +314,6 @@ const fn is_unicode_space(ch: char) -> bool {
|
|
|
314
314
|
)
|
|
315
315
|
}
|
|
316
316
|
|
|
317
|
-
/// Underline text with a character.
|
|
318
|
-
#[must_use]
|
|
319
|
-
pub fn underline(text: &str, pad_char: char) -> String {
|
|
320
|
-
let text = text.trim_end();
|
|
321
|
-
if text.is_empty() {
|
|
322
|
-
return String::new();
|
|
323
|
-
}
|
|
324
|
-
format!("{}\n{}\n\n", text, pad_char.to_string().repeat(text.len()))
|
|
325
|
-
}
|
|
326
|
-
|
|
327
|
-
/// Indent text with a string prefix.
|
|
328
|
-
#[must_use]
|
|
329
|
-
pub fn indent(text: &str, level: usize, indent_str: &str) -> String {
|
|
330
|
-
if text.is_empty() {
|
|
331
|
-
return String::new();
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
let prefix = indent_str.repeat(level);
|
|
335
|
-
text.lines()
|
|
336
|
-
.map(|line| {
|
|
337
|
-
if line.is_empty() {
|
|
338
|
-
String::new()
|
|
339
|
-
} else {
|
|
340
|
-
format!("{prefix}{line}")
|
|
341
|
-
}
|
|
342
|
-
})
|
|
343
|
-
.collect::<Vec<_>>()
|
|
344
|
-
.join("\n")
|
|
345
|
-
}
|
|
346
|
-
|
|
347
317
|
#[cfg(test)]
|
|
348
318
|
mod tests {
|
|
349
319
|
use super::*;
|
|
@@ -385,18 +355,4 @@ mod tests {
|
|
|
385
355
|
assert_eq!(chomp("text "), ("", " ", "text"));
|
|
386
356
|
assert_eq!(chomp(""), ("", "", ""));
|
|
387
357
|
}
|
|
388
|
-
|
|
389
|
-
#[test]
|
|
390
|
-
fn test_underline() {
|
|
391
|
-
assert_eq!(underline("Title", '='), "Title\n=====\n\n");
|
|
392
|
-
assert_eq!(underline("Subtitle", '-'), "Subtitle\n--------\n\n");
|
|
393
|
-
assert_eq!(underline("", '='), "");
|
|
394
|
-
}
|
|
395
|
-
|
|
396
|
-
#[test]
|
|
397
|
-
fn test_indent() {
|
|
398
|
-
assert_eq!(indent("line1\nline2", 1, "\t"), "\tline1\n\tline2");
|
|
399
|
-
assert_eq!(indent("text", 2, " "), " text");
|
|
400
|
-
assert_eq!(indent("", 1, "\t"), "");
|
|
401
|
-
}
|
|
402
358
|
}
|
|
@@ -12,6 +12,7 @@ use std::collections::BTreeMap;
|
|
|
12
12
|
/// This enum categorizes all HTML elements that the converter recognizes,
|
|
13
13
|
/// providing a coarse-grained classification for visitor dispatch.
|
|
14
14
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
|
15
|
+
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
|
|
15
16
|
pub enum NodeType {
|
|
16
17
|
/// Text node (most frequent - 100+ per document)
|
|
17
18
|
Text,
|
|
@@ -207,6 +208,7 @@ pub enum NodeType {
|
|
|
207
208
|
/// Provides comprehensive metadata about the current node being visited,
|
|
208
209
|
/// including its type, attributes, position in the DOM tree, and parent context.
|
|
209
210
|
#[derive(Debug, Clone)]
|
|
211
|
+
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
|
|
210
212
|
pub struct NodeContext {
|
|
211
213
|
/// Coarse-grained node type classification
|
|
212
214
|
pub node_type: NodeType,
|
|
@@ -235,8 +237,10 @@ pub struct NodeContext {
|
|
|
235
237
|
/// Allows visitors to control the conversion flow by either proceeding
|
|
236
238
|
/// with default behavior, providing custom output, skipping elements,
|
|
237
239
|
/// preserving HTML, or signaling errors.
|
|
238
|
-
#[derive(Debug, Clone)]
|
|
240
|
+
#[derive(Debug, Clone, Default)]
|
|
241
|
+
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
|
|
239
242
|
pub enum VisitResult {
|
|
243
|
+
#[default]
|
|
240
244
|
/// Continue with default conversion behavior
|
|
241
245
|
Continue,
|
|
242
246
|
|
|
@@ -296,7 +296,10 @@ macro_rules! try_visitor {
|
|
|
296
296
|
return Ok(String::new());
|
|
297
297
|
}
|
|
298
298
|
$crate::visitor_helpers::VisitorDispatch::PreserveHtml => {
|
|
299
|
-
//
|
|
299
|
+
// Falls through to default conversion — full HTML preservation requires
|
|
300
|
+
// the node handle and parser context which aren't available in this macro.
|
|
301
|
+
// Callers that need PreserveHtml support should match on the dispatch
|
|
302
|
+
// result directly and call serialize_tag_to_html.
|
|
300
303
|
}
|
|
301
304
|
}
|
|
302
305
|
}};
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use html_to_markdown_rs::ConversionOptions;
|
|
@@ -293,5 +293,5 @@ fn convert(
|
|
|
293
293
|
html: &str,
|
|
294
294
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
295
295
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
296
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
296
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
297
297
|
}
|
|
@@ -3,7 +3,7 @@ fn convert(
|
|
|
3
3
|
html: &str,
|
|
4
4
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
5
5
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
6
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
6
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
7
7
|
}
|
|
8
8
|
|
|
9
9
|
use html_to_markdown_rs::{ConversionOptions, OutputFormat};
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
|
|
3
|
+
use html_to_markdown_rs::ConversionOptions;
|
|
4
|
+
|
|
5
|
+
fn convert(html: &str, opts: Option<ConversionOptions>) -> html_to_markdown_rs::error::Result<String> {
|
|
6
|
+
#[cfg(feature = "visitor")]
|
|
7
|
+
let result = html_to_markdown_rs::convert(html, opts, None);
|
|
8
|
+
#[cfg(not(feature = "visitor"))]
|
|
9
|
+
let result = html_to_markdown_rs::convert(html, opts);
|
|
10
|
+
result.map(|r| r.content.unwrap_or_default())
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
#[test]
|
|
14
|
+
fn test_exclude_selectors_drops_matching_elements() {
|
|
15
|
+
let html = r#"<body>
|
|
16
|
+
<div class="cookie-banner">Accept cookies</div>
|
|
17
|
+
<article><p>Main content here.</p></article>
|
|
18
|
+
<div id="ad-container">Buy stuff</div>
|
|
19
|
+
</body>"#;
|
|
20
|
+
|
|
21
|
+
let options = ConversionOptions {
|
|
22
|
+
exclude_selectors: vec![".cookie-banner".to_string(), "#ad-container".to_string()],
|
|
23
|
+
..Default::default()
|
|
24
|
+
};
|
|
25
|
+
|
|
26
|
+
let result = convert(html, Some(options)).unwrap();
|
|
27
|
+
|
|
28
|
+
assert!(result.contains("Main content"), "Should keep main content");
|
|
29
|
+
assert!(!result.contains("cookie"), "Should drop .cookie-banner element");
|
|
30
|
+
assert!(!result.contains("Buy stuff"), "Should drop #ad-container element");
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
#[test]
|
|
34
|
+
fn test_exclude_selectors_drops_nested_content() {
|
|
35
|
+
let html = r#"<body>
|
|
36
|
+
<aside class="sidebar">
|
|
37
|
+
<h2>Related articles</h2>
|
|
38
|
+
<p>Some sidebar content</p>
|
|
39
|
+
</aside>
|
|
40
|
+
<main><p>Primary content.</p></main>
|
|
41
|
+
</body>"#;
|
|
42
|
+
|
|
43
|
+
let options = ConversionOptions {
|
|
44
|
+
exclude_selectors: vec![".sidebar".to_string()],
|
|
45
|
+
..Default::default()
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
let result = convert(html, Some(options)).unwrap();
|
|
49
|
+
|
|
50
|
+
assert!(result.contains("Primary content"), "Should keep main content");
|
|
51
|
+
assert!(
|
|
52
|
+
!result.contains("Related articles"),
|
|
53
|
+
"Should drop heading inside excluded element"
|
|
54
|
+
);
|
|
55
|
+
assert!(
|
|
56
|
+
!result.contains("sidebar content"),
|
|
57
|
+
"Should drop paragraph inside excluded element"
|
|
58
|
+
);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
#[test]
|
|
62
|
+
fn test_exclude_selectors_empty_list_is_noop() {
|
|
63
|
+
let html = r"<body><p>Hello world</p></body>";
|
|
64
|
+
|
|
65
|
+
let options = ConversionOptions {
|
|
66
|
+
exclude_selectors: vec![],
|
|
67
|
+
..Default::default()
|
|
68
|
+
};
|
|
69
|
+
|
|
70
|
+
let result = convert(html, Some(options)).unwrap();
|
|
71
|
+
assert!(
|
|
72
|
+
result.contains("Hello world"),
|
|
73
|
+
"Empty exclude_selectors should not affect output"
|
|
74
|
+
);
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
#[test]
|
|
78
|
+
fn test_exclude_selectors_invalid_selector_is_skipped() {
|
|
79
|
+
let html = r"<body><p>Visible text</p></body>";
|
|
80
|
+
|
|
81
|
+
// An empty string or garbled selector should not panic or error — just be ignored.
|
|
82
|
+
let options = ConversionOptions {
|
|
83
|
+
exclude_selectors: vec![String::new(), "p".to_string()],
|
|
84
|
+
..Default::default()
|
|
85
|
+
};
|
|
86
|
+
|
|
87
|
+
// Should not return an error; whether the paragraph is excluded depends on the
|
|
88
|
+
// selector, but it must not panic.
|
|
89
|
+
let _ = convert(html, Some(options));
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
#[test]
|
|
93
|
+
fn test_exclude_selectors_attribute_selector() {
|
|
94
|
+
let html = r#"<body>
|
|
95
|
+
<div role="complementary">Sidebar</div>
|
|
96
|
+
<p>Main text</p>
|
|
97
|
+
</body>"#;
|
|
98
|
+
|
|
99
|
+
let options = ConversionOptions {
|
|
100
|
+
exclude_selectors: vec!["[role='complementary']".to_string()],
|
|
101
|
+
..Default::default()
|
|
102
|
+
};
|
|
103
|
+
|
|
104
|
+
let result = convert(html, Some(options)).unwrap();
|
|
105
|
+
|
|
106
|
+
assert!(result.contains("Main text"), "Should keep non-excluded content");
|
|
107
|
+
assert!(
|
|
108
|
+
!result.contains("Sidebar"),
|
|
109
|
+
"Should drop element matching attribute selector"
|
|
110
|
+
);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
#[test]
|
|
114
|
+
fn test_exclude_selectors_plain_text_output() {
|
|
115
|
+
let html = r#"<body>
|
|
116
|
+
<div class="nav">Navigation links</div>
|
|
117
|
+
<p>Article body text.</p>
|
|
118
|
+
</body>"#;
|
|
119
|
+
|
|
120
|
+
let options = ConversionOptions {
|
|
121
|
+
exclude_selectors: vec![".nav".to_string()],
|
|
122
|
+
output_format: html_to_markdown_rs::OutputFormat::Plain,
|
|
123
|
+
..Default::default()
|
|
124
|
+
};
|
|
125
|
+
|
|
126
|
+
let result = convert(html, Some(options)).unwrap();
|
|
127
|
+
|
|
128
|
+
assert!(
|
|
129
|
+
result.contains("Article body text"),
|
|
130
|
+
"Should keep body text in plain output"
|
|
131
|
+
);
|
|
132
|
+
assert!(
|
|
133
|
+
!result.contains("Navigation links"),
|
|
134
|
+
"Should drop excluded element in plain output"
|
|
135
|
+
);
|
|
136
|
+
}
|
|
@@ -619,5 +619,5 @@ fn convert(
|
|
|
619
619
|
html: &str,
|
|
620
620
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
621
621
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
622
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
622
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
623
623
|
}
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use std::fs;
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use std::fs;
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
#[test]
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use html_to_markdown_rs::ConversionOptions;
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use std::fs;
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use html_to_markdown_rs::ConversionOptions;
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use std::fs;
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use std::fs;
|
|
@@ -129,5 +129,5 @@ fn convert(
|
|
|
129
129
|
html: &str,
|
|
130
130
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
131
131
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
132
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
132
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
133
133
|
}
|
|
@@ -140,5 +140,5 @@ fn convert(
|
|
|
140
140
|
html: &str,
|
|
141
141
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
142
142
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
143
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
143
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
144
144
|
}
|
|
@@ -8,7 +8,7 @@ fn test_strong_blockquote_strong_newlines() {
|
|
|
8
8
|
html: &str,
|
|
9
9
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
10
10
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
11
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
11
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
12
12
|
}
|
|
13
13
|
|
|
14
14
|
// Test case from issue #176: strong + blockquote + strong
|
|
@@ -39,7 +39,7 @@ fn test_paragraph_blockquote_paragraph_newlines() {
|
|
|
39
39
|
html: &str,
|
|
40
40
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
41
41
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
42
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
42
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
// Control test: p + blockquote + p should work correctly
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use std::fs;
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
#[test]
|