html-to-markdown 3.2.4 → 3.4.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Steepfile +6 -0
- data/ext/html_to_markdown_rb/Cargo.toml +2 -2
- data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
- data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +5 -3
- data/sig/types.rbs +769 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
- data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
- data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
- data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
- data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
- data/vendor/html-to-markdown-rs/src/text.rs +0 -44
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
- data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
- data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
- data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
- metadata +21 -43
- data/.bundle/config +0 -2
- data/.gitignore +0 -3
- data/.rubocop.yml +0 -59
- data/Gemfile +0 -18
- data/Gemfile.lock +0 -173
- data/README.md +0 -331
- data/Rakefile +0 -26
- data/exe/html-to-markdown +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
- data/html-to-markdown-rb.gemspec +0 -99
- data/lib/html_to_markdown_rs.rb +0 -3
- data/sig/html_to_markdown.rbs +0 -149
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
- data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
#[test]
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
/// Regression test for <https://github.com/kreuzberg-dev/html-to-markdown/issues/212>
|
|
@@ -11,7 +11,7 @@ fn convert(
|
|
|
11
11
|
html: &str,
|
|
12
12
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
13
13
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
14
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
14
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
15
15
|
}
|
|
16
16
|
|
|
17
17
|
/// Minimal reproducer: a <details> containing a <p> with <strong> inside.
|
|
@@ -14,7 +14,7 @@ fn extracts_json_ld_from_head_script() {
|
|
|
14
14
|
</html>
|
|
15
15
|
"#;
|
|
16
16
|
|
|
17
|
-
let result = html_to_markdown_rs::convert(html, None).expect("convert failed");
|
|
17
|
+
let result = html_to_markdown_rs::convert(html, None, None).expect("convert failed");
|
|
18
18
|
let metadata = result.metadata;
|
|
19
19
|
|
|
20
20
|
assert_eq!(metadata.structured_data.len(), 1);
|
|
@@ -35,7 +35,7 @@ fn extracts_json_ld_from_body_script_and_keeps_content() {
|
|
|
35
35
|
</html>
|
|
36
36
|
"#;
|
|
37
37
|
|
|
38
|
-
let result = html_to_markdown_rs::convert(html, None).expect("convert failed");
|
|
38
|
+
let result = html_to_markdown_rs::convert(html, None, None).expect("convert failed");
|
|
39
39
|
let metadata = result.metadata;
|
|
40
40
|
|
|
41
41
|
assert_eq!(metadata.structured_data.len(), 1);
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use html_to_markdown_rs::ConversionOptions;
|
|
@@ -3,7 +3,7 @@ fn convert(
|
|
|
3
3
|
html: &str,
|
|
4
4
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
5
5
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
6
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
6
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
7
7
|
}
|
|
8
8
|
|
|
9
9
|
use html_to_markdown_rs::{ConversionOptions, OutputFormat};
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use html_to_markdown_rs::ConversionOptions;
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
use html_to_markdown_rs::{ConversionOptions, LinkStyle};
|
|
4
4
|
|
|
5
5
|
fn convert(html: &str, options: Option<ConversionOptions>) -> String {
|
|
6
|
-
html_to_markdown_rs::convert(html, options)
|
|
6
|
+
html_to_markdown_rs::convert(html, options, None)
|
|
7
7
|
.unwrap()
|
|
8
8
|
.content
|
|
9
9
|
.unwrap_or_default()
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
|
|
3
|
+
fn convert(html: &str) -> String {
|
|
4
|
+
html_to_markdown_rs::convert(html, None, None)
|
|
5
|
+
.map(|r| r.content.unwrap_or_default())
|
|
6
|
+
.expect("conversion should succeed")
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
// --- header ---
|
|
10
|
+
|
|
11
|
+
#[test]
|
|
12
|
+
fn test_h1_inside_header() {
|
|
13
|
+
let html = "<header><h1>Title in header not exported???</h1></header>";
|
|
14
|
+
let result = convert(html);
|
|
15
|
+
assert_eq!(result, "# Title in header not exported???\n");
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
#[test]
|
|
19
|
+
fn test_paragraph_inside_header() {
|
|
20
|
+
let html = "<header><p>Intro text</p></header>";
|
|
21
|
+
let result = convert(html);
|
|
22
|
+
assert_eq!(result, "Intro text\n");
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
#[test]
|
|
26
|
+
fn test_header_with_nested_elements() {
|
|
27
|
+
let html = "<header><h1>Title</h1><p>Subtitle</p></header>";
|
|
28
|
+
let result = convert(html);
|
|
29
|
+
assert!(result.contains("# Title"), "Should contain h1: {result}");
|
|
30
|
+
assert!(result.contains("Subtitle"), "Should contain paragraph: {result}");
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// --- footer ---
|
|
34
|
+
|
|
35
|
+
#[test]
|
|
36
|
+
fn test_paragraph_inside_footer() {
|
|
37
|
+
let html = "<footer><p>Footer content</p></footer>";
|
|
38
|
+
let result = convert(html);
|
|
39
|
+
assert_eq!(result, "Footer content\n");
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// --- main ---
|
|
43
|
+
|
|
44
|
+
#[test]
|
|
45
|
+
fn test_h2_inside_main() {
|
|
46
|
+
let html = "<main><h2>Main heading</h2></main>";
|
|
47
|
+
let result = convert(html);
|
|
48
|
+
assert_eq!(result, "## Main heading\n");
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// --- article ---
|
|
52
|
+
|
|
53
|
+
#[test]
|
|
54
|
+
fn test_article_with_header_and_section() {
|
|
55
|
+
let html = "<article><header><h1>Title</h1></header><section><p>Content here</p></section></article>";
|
|
56
|
+
let result = convert(html);
|
|
57
|
+
assert!(result.contains("# Title"), "Should contain heading: {result}");
|
|
58
|
+
assert!(result.contains("Content here"), "Should contain content: {result}");
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// --- section ---
|
|
62
|
+
|
|
63
|
+
#[test]
|
|
64
|
+
fn test_heading_inside_section() {
|
|
65
|
+
let html = "<section><h2>Section Heading</h2><p>Section body</p></section>";
|
|
66
|
+
let result = convert(html);
|
|
67
|
+
assert!(result.contains("## Section Heading"), "Should contain h2: {result}");
|
|
68
|
+
assert!(result.contains("Section body"), "Should contain body: {result}");
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
// --- nav ---
|
|
72
|
+
|
|
73
|
+
#[test]
|
|
74
|
+
fn test_nav_dropped_by_default() {
|
|
75
|
+
// nav is dropped by default when remove_navigation is true (the default)
|
|
76
|
+
let html = r#"<nav><a href="/home">Home</a><a href="/about">About</a></nav>"#;
|
|
77
|
+
let result = convert(html);
|
|
78
|
+
assert!(result.is_empty(), "nav should be dropped by default: '{result}'");
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
#[test]
|
|
82
|
+
fn test_nav_preserved_when_remove_navigation_disabled() {
|
|
83
|
+
use html_to_markdown_rs::{ConversionOptions, PreprocessingOptions};
|
|
84
|
+
let opts = ConversionOptions {
|
|
85
|
+
preprocessing: PreprocessingOptions {
|
|
86
|
+
remove_navigation: false,
|
|
87
|
+
..Default::default()
|
|
88
|
+
},
|
|
89
|
+
..Default::default()
|
|
90
|
+
};
|
|
91
|
+
let html = r#"<nav><a href="/home">Home</a></nav>"#;
|
|
92
|
+
let result = html_to_markdown_rs::convert(html, Some(opts), None)
|
|
93
|
+
.map(|r| r.content.unwrap_or_default())
|
|
94
|
+
.expect("conversion should succeed");
|
|
95
|
+
assert!(
|
|
96
|
+
result.contains("Home"),
|
|
97
|
+
"nav should pass through when remove_navigation=false: '{result}'"
|
|
98
|
+
);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// --- aside ---
|
|
102
|
+
|
|
103
|
+
#[test]
|
|
104
|
+
fn test_paragraph_inside_aside() {
|
|
105
|
+
let html = "<aside><p>Side note</p></aside>";
|
|
106
|
+
let result = convert(html);
|
|
107
|
+
assert_eq!(result, "Side note\n");
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
// --- navigation-hinted header should still be dropped ---
|
|
111
|
+
|
|
112
|
+
#[test]
|
|
113
|
+
fn test_site_chrome_header_dropped() {
|
|
114
|
+
// A <header> with class="site-header" is site chrome and should be removed
|
|
115
|
+
let html = r#"<header class="site-header"><a href="/">Logo</a></header><p>Content</p>"#;
|
|
116
|
+
let result = convert(html);
|
|
117
|
+
assert!(
|
|
118
|
+
!result.contains("Logo"),
|
|
119
|
+
"site-chrome header should be dropped: '{result}'"
|
|
120
|
+
);
|
|
121
|
+
assert!(
|
|
122
|
+
result.contains("Content"),
|
|
123
|
+
"body content should be preserved: '{result}'"
|
|
124
|
+
);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
#[test]
|
|
128
|
+
fn test_header_with_role_navigation_dropped() {
|
|
129
|
+
// A <header role="navigation"> is nav chrome and should be removed
|
|
130
|
+
let html = r#"<header role="navigation"><a href="/">Home</a></header><p>Body</p>"#;
|
|
131
|
+
let result = convert(html);
|
|
132
|
+
assert!(
|
|
133
|
+
!result.contains("Home"),
|
|
134
|
+
"navigation header should be dropped: '{result}'"
|
|
135
|
+
);
|
|
136
|
+
assert!(result.contains("Body"), "body content should be preserved: '{result}'");
|
|
137
|
+
}
|
|
@@ -518,5 +518,5 @@ fn convert(
|
|
|
518
518
|
html: &str,
|
|
519
519
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
520
520
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
521
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
521
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
522
522
|
}
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use html_to_markdown_rs::ConversionOptions;
|
|
@@ -718,7 +718,7 @@ fn test_table_colspan_no_header_issue_233() {
|
|
|
718
718
|
<td>Cell 2</td>
|
|
719
719
|
</tr>
|
|
720
720
|
</table>"#;
|
|
721
|
-
let result = html_to_markdown_rs::convert(html, None)
|
|
721
|
+
let result = html_to_markdown_rs::convert(html, None, None)
|
|
722
722
|
.unwrap()
|
|
723
723
|
.content
|
|
724
724
|
.unwrap_or_default();
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use std::fs;
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
#![cfg(feature = "visitor")]
|
|
9
9
|
|
|
10
|
-
use html_to_markdown_rs::
|
|
10
|
+
use html_to_markdown_rs::convert;
|
|
11
11
|
use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
|
|
12
12
|
use std::cell::RefCell;
|
|
13
13
|
use std::rc::Rc;
|
|
@@ -147,7 +147,10 @@ fn test_issue_187_content_filter() {
|
|
|
147
147
|
"#;
|
|
148
148
|
|
|
149
149
|
let visitor = Rc::new(RefCell::new(ContentFilter::default()));
|
|
150
|
-
let result =
|
|
150
|
+
let result = convert(html, None, Some(visitor.clone()))
|
|
151
|
+
.unwrap()
|
|
152
|
+
.content
|
|
153
|
+
.unwrap_or_default();
|
|
151
154
|
|
|
152
155
|
println!("Converted Markdown:\n{result}");
|
|
153
156
|
println!("\nSkipped Elements:");
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
use std::cell::RefCell;
|
|
10
10
|
use std::rc::Rc;
|
|
11
11
|
|
|
12
|
-
use html_to_markdown_rs::
|
|
12
|
+
use html_to_markdown_rs::convert;
|
|
13
13
|
use html_to_markdown_rs::visitor::HtmlVisitor;
|
|
14
14
|
|
|
15
15
|
/// Empty visitor — does nothing, just uses default implementations.
|
|
@@ -26,7 +26,7 @@ fn make_visitor() -> Rc<RefCell<dyn HtmlVisitor>> {
|
|
|
26
26
|
fn test_cyrillic_with_tabs_between_divs_and_visitor() {
|
|
27
27
|
// Exact reproduction from the issue
|
|
28
28
|
let html = "<div><span>А</span></div>\t\t\t<div><span>По";
|
|
29
|
-
let result =
|
|
29
|
+
let result = convert(html, None, Some(make_visitor()));
|
|
30
30
|
assert!(result.is_ok(), "Should not panic: {result:?}");
|
|
31
31
|
}
|
|
32
32
|
|
|
@@ -40,7 +40,7 @@ fn test_multibyte_utf8_with_tabs_and_visitor() {
|
|
|
40
40
|
];
|
|
41
41
|
|
|
42
42
|
for html in &cases {
|
|
43
|
-
let result =
|
|
43
|
+
let result = convert(html, None, Some(make_visitor()));
|
|
44
44
|
assert!(result.is_ok(), "Should not panic for: {html}\nError: {result:?}");
|
|
45
45
|
}
|
|
46
46
|
}
|
|
@@ -50,7 +50,7 @@ fn test_cyrillic_with_varying_tab_counts_and_visitor() {
|
|
|
50
50
|
for n in 1..=5 {
|
|
51
51
|
let tabs = "\t".repeat(n);
|
|
52
52
|
let html = format!("<div><span>А</span></div>{tabs}<div><span>По");
|
|
53
|
-
let result =
|
|
53
|
+
let result = convert(&html, None, Some(make_visitor()));
|
|
54
54
|
assert!(result.is_ok(), "Should not panic with {n} tabs: {result:?}");
|
|
55
55
|
}
|
|
56
56
|
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
|
|
3
|
+
//! Regression test for issue #277: silent truncation on large HTML inputs.
|
|
4
|
+
//!
|
|
5
|
+
//! The bug was caused by `repair_with_html5ever` re-introducing `<script>` elements
|
|
6
|
+
//! that had already been stripped, and `preprocess_html` failing to find the closing
|
|
7
|
+
//! tag when script content contained unbalanced literal `<script>` strings.
|
|
8
|
+
|
|
9
|
+
fn convert(html: &str) -> String {
|
|
10
|
+
html_to_markdown_rs::convert(html, None, None)
|
|
11
|
+
.expect("conversion should not fail")
|
|
12
|
+
.content
|
|
13
|
+
.unwrap_or_default()
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/// When custom elements trigger html5ever repair, scripts must be re-stripped.
|
|
17
|
+
/// Without the fix, content after a script with unbalanced `<script>` literals
|
|
18
|
+
/// would be silently truncated.
|
|
19
|
+
#[test]
|
|
20
|
+
fn test_no_truncation_after_repair_with_scripts() {
|
|
21
|
+
// Custom element triggers repair_with_html5ever
|
|
22
|
+
// Script content has an unbalanced literal `<script>` that confuses depth tracking
|
|
23
|
+
let html = r"<html>
|
|
24
|
+
<head>
|
|
25
|
+
<script>
|
|
26
|
+
var example = '<script>';
|
|
27
|
+
console.log(example);
|
|
28
|
+
</script>
|
|
29
|
+
</head>
|
|
30
|
+
<body>
|
|
31
|
+
<custom-widget>widget</custom-widget>
|
|
32
|
+
<p>Content before</p>
|
|
33
|
+
<p>Content after scripts that must not be truncated</p>
|
|
34
|
+
<p>Final paragraph</p>
|
|
35
|
+
</body>
|
|
36
|
+
</html>";
|
|
37
|
+
|
|
38
|
+
let result = convert(html);
|
|
39
|
+
assert!(
|
|
40
|
+
result.contains("Content before"),
|
|
41
|
+
"Should contain content before script region"
|
|
42
|
+
);
|
|
43
|
+
assert!(
|
|
44
|
+
result.contains("Content after scripts that must not be truncated"),
|
|
45
|
+
"Content after scripts should NOT be silently truncated. Got:\n{result}"
|
|
46
|
+
);
|
|
47
|
+
assert!(
|
|
48
|
+
result.contains("Final paragraph"),
|
|
49
|
+
"Final content should be present. Got:\n{result}"
|
|
50
|
+
);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/// Ensure `preprocess_html` doesn't truncate the rest of the document when
|
|
54
|
+
/// `find_closing_tag` returns None (unmatched script opening).
|
|
55
|
+
#[test]
|
|
56
|
+
fn test_preprocess_unmatched_script_preserves_remaining_content() {
|
|
57
|
+
// Even without custom elements, preprocess_html's unwrap_or fallback
|
|
58
|
+
// should not consume the entire rest of the document.
|
|
59
|
+
let html = r"<html><body>
|
|
60
|
+
<p>Before</p>
|
|
61
|
+
<script>var x = '<script>'; var y = '<script>';</script>
|
|
62
|
+
<p>After first script</p>
|
|
63
|
+
<script>var z = 1;</script>
|
|
64
|
+
<p>After second script</p>
|
|
65
|
+
</body></html>";
|
|
66
|
+
|
|
67
|
+
let result = convert(html);
|
|
68
|
+
assert!(result.contains("Before"), "Content before scripts should be present");
|
|
69
|
+
assert!(
|
|
70
|
+
result.contains("After first script"),
|
|
71
|
+
"Content after first script should be present. Got:\n{result}"
|
|
72
|
+
);
|
|
73
|
+
assert!(
|
|
74
|
+
result.contains("After second script"),
|
|
75
|
+
"Content after second script should be present. Got:\n{result}"
|
|
76
|
+
);
|
|
77
|
+
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
|
|
3
|
+
//! Tests for the `max_depth` recursion-safety option.
|
|
4
|
+
|
|
5
|
+
use html_to_markdown_rs::ConversionOptions;
|
|
6
|
+
|
|
7
|
+
fn convert_with_options(html: &str, options: ConversionOptions) -> String {
|
|
8
|
+
html_to_markdown_rs::convert(html, Some(options), None)
|
|
9
|
+
.expect("conversion should not fail")
|
|
10
|
+
.content
|
|
11
|
+
.unwrap_or_default()
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
/// With the default `max_depth: None`, deeply nested content should be fully converted.
|
|
15
|
+
#[test]
|
|
16
|
+
fn test_max_depth_none_converts_deeply_nested() {
|
|
17
|
+
// Build 100 levels of nesting around a leaf text node.
|
|
18
|
+
let mut html = String::from("<p>deep</p>");
|
|
19
|
+
for _ in 0..100 {
|
|
20
|
+
html = format!("<div>{html}</div>");
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
let options = ConversionOptions {
|
|
24
|
+
extract_metadata: false,
|
|
25
|
+
max_depth: None,
|
|
26
|
+
..Default::default()
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
let result = convert_with_options(&html, options);
|
|
30
|
+
assert!(
|
|
31
|
+
result.contains("deep"),
|
|
32
|
+
"Deeply nested text should be present when max_depth is None. Got:\n{result}"
|
|
33
|
+
);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/// With `max_depth: Some(3)`, nodes at depth 3 are not visited, so
|
|
37
|
+
/// their text content is excluded from the output.
|
|
38
|
+
#[test]
|
|
39
|
+
fn test_max_depth_truncates_at_limit() {
|
|
40
|
+
// Depth counting (each handler passes depth+1 to its children):
|
|
41
|
+
// depth 0: outer <div> — visited
|
|
42
|
+
// depth 1: <p> — visited, paragraph handler passes depth+1 to children
|
|
43
|
+
// depth 2: "shallow" — visited (2 < 3), appears in output
|
|
44
|
+
// depth 1: inner <div> — visited, div handler passes depth+1 to children
|
|
45
|
+
// depth 2: <p> — visited, paragraph handler passes depth+1 to children
|
|
46
|
+
// depth 3: "deep" — skipped (3 >= 3), absent from output
|
|
47
|
+
let html = "<div><p>shallow</p><div><p>deep</p></div></div>";
|
|
48
|
+
|
|
49
|
+
let options = ConversionOptions {
|
|
50
|
+
extract_metadata: false,
|
|
51
|
+
max_depth: Some(3),
|
|
52
|
+
..Default::default()
|
|
53
|
+
};
|
|
54
|
+
|
|
55
|
+
let result = convert_with_options(html, options);
|
|
56
|
+
assert!(
|
|
57
|
+
result.contains("shallow"),
|
|
58
|
+
"Content at depth < max_depth should be present. Got:\n{result}"
|
|
59
|
+
);
|
|
60
|
+
assert!(
|
|
61
|
+
!result.contains("deep"),
|
|
62
|
+
"Content at depth >= max_depth should be absent. Got:\n{result}"
|
|
63
|
+
);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/// With `max_depth: Some(0)`, no nodes are processed and the output is empty or whitespace only.
|
|
67
|
+
#[test]
|
|
68
|
+
fn test_max_depth_zero_produces_empty() {
|
|
69
|
+
let html = "<p>hello</p>";
|
|
70
|
+
|
|
71
|
+
let options = ConversionOptions {
|
|
72
|
+
extract_metadata: false,
|
|
73
|
+
max_depth: Some(0),
|
|
74
|
+
..Default::default()
|
|
75
|
+
};
|
|
76
|
+
|
|
77
|
+
let result = convert_with_options(html, options);
|
|
78
|
+
assert!(
|
|
79
|
+
result.trim().is_empty(),
|
|
80
|
+
"max_depth: Some(0) should produce no output. Got:\n{result}"
|
|
81
|
+
);
|
|
82
|
+
}
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use std::fs;
|
|
@@ -111,7 +111,7 @@ fn test_preserve_json_ld_script() {
|
|
|
111
111
|
</body>
|
|
112
112
|
</html>"#;
|
|
113
113
|
|
|
114
|
-
let result = html_to_markdown_rs::convert(html, None).expect("Failed to convert");
|
|
114
|
+
let result = html_to_markdown_rs::convert(html, None, None).expect("Failed to convert");
|
|
115
115
|
let metadata = result.metadata;
|
|
116
116
|
let markdown = result.content.unwrap_or_default();
|
|
117
117
|
|
|
@@ -164,7 +164,7 @@ fn test_multiple_script_tags() {
|
|
|
164
164
|
</body>
|
|
165
165
|
</html>"#;
|
|
166
166
|
|
|
167
|
-
let result = html_to_markdown_rs::convert(html, None).expect("Failed to convert");
|
|
167
|
+
let result = html_to_markdown_rs::convert(html, None, None).expect("Failed to convert");
|
|
168
168
|
let metadata = result.metadata;
|
|
169
169
|
let markdown = result.content.unwrap_or_default();
|
|
170
170
|
|
|
@@ -221,7 +221,7 @@ fn test_reuters_like_structure() {
|
|
|
221
221
|
</body>
|
|
222
222
|
</html>"#;
|
|
223
223
|
|
|
224
|
-
let result = html_to_markdown_rs::convert(html, None).expect("Failed to convert");
|
|
224
|
+
let result = html_to_markdown_rs::convert(html, None, None).expect("Failed to convert");
|
|
225
225
|
let metadata = result.metadata;
|
|
226
226
|
let markdown = result.content.unwrap_or_default();
|
|
227
227
|
|
|
@@ -392,5 +392,5 @@ fn convert(
|
|
|
392
392
|
html: &str,
|
|
393
393
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
394
394
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
395
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
395
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
396
396
|
}
|
|
@@ -4,7 +4,7 @@ fn convert(
|
|
|
4
4
|
html: &str,
|
|
5
5
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
6
6
|
) -> html_to_markdown_rs::error::Result<String> {
|
|
7
|
-
html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
|
|
7
|
+
html_to_markdown_rs::convert(html, opts, None).map(|r| r.content.unwrap_or_default())
|
|
8
8
|
}
|
|
9
9
|
|
|
10
10
|
use std::fs;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#![allow(missing_docs)]
|
|
2
2
|
#![cfg(feature = "visitor")]
|
|
3
3
|
|
|
4
|
-
use html_to_markdown_rs::
|
|
4
|
+
use html_to_markdown_rs::convert;
|
|
5
5
|
use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
|
|
6
6
|
use std::cell::RefCell;
|
|
7
7
|
use std::rc::Rc;
|
|
@@ -33,7 +33,7 @@ fn test_code_block_visitor() {
|
|
|
33
33
|
inline_codes: vec![],
|
|
34
34
|
}));
|
|
35
35
|
|
|
36
|
-
let result =
|
|
36
|
+
let result = convert(html, None, Some(visitor.clone()));
|
|
37
37
|
assert!(result.is_ok());
|
|
38
38
|
|
|
39
39
|
let visitor_ref = visitor.borrow();
|
|
@@ -49,7 +49,7 @@ fn test_inline_code_visitor() {
|
|
|
49
49
|
inline_codes: vec![],
|
|
50
50
|
}));
|
|
51
51
|
|
|
52
|
-
let result =
|
|
52
|
+
let result = convert(html, None, Some(visitor.clone()));
|
|
53
53
|
assert!(result.is_ok());
|
|
54
54
|
|
|
55
55
|
let visitor_ref = visitor.borrow();
|
|
@@ -71,9 +71,9 @@ fn test_code_block_skip() {
|
|
|
71
71
|
let html = "<pre><code>skipped code</code></pre>";
|
|
72
72
|
let visitor = Rc::new(RefCell::new(SkipCodeVisitor));
|
|
73
73
|
|
|
74
|
-
let result =
|
|
74
|
+
let result = convert(html, None, Some(visitor));
|
|
75
75
|
assert!(result.is_ok());
|
|
76
|
-
let markdown = result.unwrap();
|
|
76
|
+
let markdown = result.unwrap().content.unwrap_or_default();
|
|
77
77
|
assert!(!markdown.contains("skipped code"));
|
|
78
78
|
}
|
|
79
79
|
|
|
@@ -97,7 +97,7 @@ fn test_code_block_language_detection() {
|
|
|
97
97
|
inline_codes: vec![],
|
|
98
98
|
}));
|
|
99
99
|
|
|
100
|
-
let result =
|
|
100
|
+
let result = convert(html, None, Some(visitor.clone()));
|
|
101
101
|
assert!(result.is_ok(), "Failed to convert: {html}");
|
|
102
102
|
|
|
103
103
|
let visitor_ref = visitor.borrow();
|