html-to-markdown 2.29.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +18 -41
- data/README.md +37 -50
- data/ext/html-to-markdown-rb/native/Cargo.lock +17 -705
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -4
- data/ext/html-to-markdown-rb/native/README.md +4 -13
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +2 -73
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +5 -49
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -6
- data/ext/html-to-markdown-rb/native/src/lib.rs +76 -213
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -3
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +13 -194
- data/sig/html_to_markdown.rbs +12 -373
- data/vendor/Cargo.toml +7 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +4 -10
- data/vendor/html-to-markdown-rs/README.md +127 -51
- data/vendor/html-to-markdown-rs/examples/basic.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +8 -2
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +6 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +6 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +151 -745
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -7
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +18 -5
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +16 -11
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +20 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +4 -17
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +140 -0
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +4 -18
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +2 -18
- data/vendor/html-to-markdown-rs/src/converter/context.rs +8 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -6
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +4 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +5 -10
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +4 -170
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +7 -19
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +3 -5
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +6 -12
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +1 -12
- data/vendor/html-to-markdown-rs/src/converter/main.rs +85 -56
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +4 -67
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +1 -5
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +3 -40
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +3 -13
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +37 -12
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +5 -30
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +29 -0
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +1 -36
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +1 -3
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -53
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -41
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +2 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +15 -98
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +113 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +3 -0
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +4 -10
- data/vendor/html-to-markdown-rs/src/exports.rs +1 -4
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +13 -133
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +4 -4
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +22 -22
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +3 -3
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +351 -319
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +8 -2
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -15
- data/vendor/html-to-markdown-rs/src/rcdom.rs +7 -1
- data/vendor/html-to-markdown-rs/src/text.rs +25 -14
- data/vendor/html-to-markdown-rs/src/types/document.rs +175 -0
- data/vendor/html-to-markdown-rs/src/types/mod.rs +17 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +49 -0
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +790 -0
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +442 -0
- data/vendor/html-to-markdown-rs/src/types/tables.rs +47 -0
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +28 -0
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -6
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +1 -21
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -5
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +1 -845
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +23 -6
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +8 -7
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +6 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +4 -6
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +8 -11
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +12 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +17 -28
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +8 -1
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +29 -33
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +8 -1
- metadata +9 -37
- data/bin/benchmark.rb +0 -232
- data/ext/html-to-markdown-rb/native/src/conversion/tables.rs +0 -71
- data/ext/html-to-markdown-rb/native/src/profiling.rs +0 -215
- data/ext/html-to-markdown-rb/native/src/visitor/bridge.rs +0 -252
- data/ext/html-to-markdown-rb/native/src/visitor/callbacks.rs +0 -640
- data/ext/html-to-markdown-rb/native/src/visitor/mod.rs +0 -12
- data/spec/convert_spec.rb +0 -77
- data/spec/convert_with_tables_spec.rb +0 -194
- data/spec/metadata_extraction_spec.rb +0 -437
- data/spec/visitor_issue_187_spec.rb +0 -605
- data/spec/visitor_spec.rb +0 -1149
- data/vendor/html-to-markdown-rs/src/hocr/converter/code_analysis.rs +0 -254
- data/vendor/html-to-markdown-rs/src/hocr/converter/core.rs +0 -249
- data/vendor/html-to-markdown-rs/src/hocr/converter/elements.rs +0 -382
- data/vendor/html-to-markdown-rs/src/hocr/converter/hierarchy.rs +0 -379
- data/vendor/html-to-markdown-rs/src/hocr/converter/keywords.rs +0 -55
- data/vendor/html-to-markdown-rs/src/hocr/converter/layout.rs +0 -313
- data/vendor/html-to-markdown-rs/src/hocr/converter/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/hocr/converter/output.rs +0 -78
- data/vendor/html-to-markdown-rs/src/hocr/extractor.rs +0 -232
- data/vendor/html-to-markdown-rs/src/hocr/mod.rs +0 -31
- data/vendor/html-to-markdown-rs/src/hocr/parser.rs +0 -333
- data/vendor/html-to-markdown-rs/src/hocr/spatial/coords.rs +0 -129
- data/vendor/html-to-markdown-rs/src/hocr/spatial/grouping.rs +0 -165
- data/vendor/html-to-markdown-rs/src/hocr/spatial/layout.rs +0 -335
- data/vendor/html-to-markdown-rs/src/hocr/spatial/mod.rs +0 -15
- data/vendor/html-to-markdown-rs/src/hocr/spatial/output.rs +0 -63
- data/vendor/html-to-markdown-rs/src/hocr/types.rs +0 -269
- data/vendor/html-to-markdown-rs/src/visitor/async_traits.rs +0 -249
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge.rs +0 -189
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/bridge_visitor.rs +0 -343
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/macros.rs +0 -217
- data/vendor/html-to-markdown-rs/tests/async_visitor_test.rs +0 -57
- data/vendor/html-to-markdown-rs/tests/convert_with_metadata_no_frontmatter.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/hocr_compliance_test.rs +0 -509
|
@@ -19,42 +19,7 @@ use std::borrow::Cow;
|
|
|
19
19
|
/// "Text \\[escaped\\]" → "Text \\[escaped\\]"
|
|
20
20
|
/// ```
|
|
21
21
|
pub fn escape_link_label(text: &str) -> String {
|
|
22
|
-
|
|
23
|
-
return String::new();
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
let mut result = String::with_capacity(text.len());
|
|
27
|
-
let mut backslash_count = 0usize;
|
|
28
|
-
let mut bracket_depth = 0usize;
|
|
29
|
-
|
|
30
|
-
for ch in text.chars() {
|
|
31
|
-
if ch == '\\' {
|
|
32
|
-
result.push('\\');
|
|
33
|
-
backslash_count += 1;
|
|
34
|
-
continue;
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
let is_escaped = backslash_count % 2 == 1;
|
|
38
|
-
backslash_count = 0;
|
|
39
|
-
|
|
40
|
-
match ch {
|
|
41
|
-
'[' if !is_escaped => {
|
|
42
|
-
bracket_depth = bracket_depth.saturating_add(1);
|
|
43
|
-
result.push('[');
|
|
44
|
-
}
|
|
45
|
-
']' if !is_escaped => {
|
|
46
|
-
if bracket_depth == 0 {
|
|
47
|
-
result.push('\\');
|
|
48
|
-
} else {
|
|
49
|
-
bracket_depth -= 1;
|
|
50
|
-
}
|
|
51
|
-
result.push(']');
|
|
52
|
-
}
|
|
53
|
-
_ => result.push(ch),
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
result
|
|
22
|
+
crate::converter::utility::content::escape_link_label(text)
|
|
58
23
|
}
|
|
59
24
|
|
|
60
25
|
/// Escape malformed angle brackets in markdown output.
|
|
@@ -8,7 +8,5 @@ mod normalization;
|
|
|
8
8
|
mod processing;
|
|
9
9
|
|
|
10
10
|
pub use escaping::{escape_link_label, escape_malformed_angle_brackets};
|
|
11
|
-
pub use normalization::{
|
|
12
|
-
chomp_inline, normalize_heading_text, trim_line_end_whitespace, trim_trailing_whitespace, truncate_at_char_boundary,
|
|
13
|
-
};
|
|
11
|
+
pub use normalization::{normalize_heading_text, trim_line_end_whitespace, truncate_at_char_boundary};
|
|
14
12
|
pub use processing::dedent_code_block;
|
|
@@ -5,59 +5,6 @@
|
|
|
5
5
|
|
|
6
6
|
use std::borrow::Cow;
|
|
7
7
|
|
|
8
|
-
/// Chomp whitespace from inline element content, preserving line breaks.
|
|
9
|
-
///
|
|
10
|
-
/// Returns (prefix, suffix, trimmed_text) where:
|
|
11
|
-
/// - prefix: leading whitespace (space or tab)
|
|
12
|
-
/// - suffix: trailing whitespace (including soft breaks like " \n" or "\\\n")
|
|
13
|
-
/// - trimmed_text: the trimmed content
|
|
14
|
-
///
|
|
15
|
-
/// # Examples
|
|
16
|
-
///
|
|
17
|
-
/// ```text
|
|
18
|
-
/// " text \n" → (" ", " \n", "text")
|
|
19
|
-
/// " text " → (" ", " ", "text")
|
|
20
|
-
/// "text" → ("", "", "text")
|
|
21
|
-
/// ```
|
|
22
|
-
pub fn chomp_inline(text: &str) -> (&str, &str, &str) {
|
|
23
|
-
if text.is_empty() {
|
|
24
|
-
return ("", "", "");
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
let prefix = if text.starts_with(&[' ', '\t'][..]) { " " } else { "" };
|
|
28
|
-
|
|
29
|
-
let has_trailing_linebreak = text.ends_with(" \n") || text.ends_with("\\\n");
|
|
30
|
-
|
|
31
|
-
let suffix = if has_trailing_linebreak {
|
|
32
|
-
if text.ends_with(" \n") { " \n" } else { "\\\n" }
|
|
33
|
-
} else if text.ends_with(&[' ', '\t'][..]) {
|
|
34
|
-
" "
|
|
35
|
-
} else {
|
|
36
|
-
""
|
|
37
|
-
};
|
|
38
|
-
|
|
39
|
-
let trimmed = if has_trailing_linebreak {
|
|
40
|
-
text.strip_suffix(" \n").map_or_else(
|
|
41
|
-
|| text.strip_suffix("\\\n").map_or_else(|| text.trim(), |s| s.trim()),
|
|
42
|
-
|s| s.trim(),
|
|
43
|
-
)
|
|
44
|
-
} else {
|
|
45
|
-
text.trim()
|
|
46
|
-
};
|
|
47
|
-
|
|
48
|
-
(prefix, suffix, trimmed)
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
/// Remove trailing spaces and tabs from output string.
|
|
52
|
-
///
|
|
53
|
-
/// This is used before adding block separators or newlines to ensure
|
|
54
|
-
/// clean Markdown output without spurious whitespace.
|
|
55
|
-
pub fn trim_trailing_whitespace(output: &mut String) {
|
|
56
|
-
while output.ends_with(' ') || output.ends_with('\t') {
|
|
57
|
-
output.pop();
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
|
|
61
8
|
/// Remove trailing spaces/tabs from every line while preserving newlines.
|
|
62
9
|
pub fn trim_line_end_whitespace(output: &mut String) {
|
|
63
10
|
if output.is_empty() {
|
|
@@ -50,7 +50,7 @@ pub fn process_text_node(
|
|
|
50
50
|
let had_newlines = text_ref.contains('\n');
|
|
51
51
|
let has_double_newline = text_ref.contains("\n\n") || text_ref.contains("\r\n\r\n");
|
|
52
52
|
|
|
53
|
-
if options.strip_newlines {
|
|
53
|
+
if options.strip_newlines && (text.contains('\r') || text.contains('\n')) {
|
|
54
54
|
text = Cow::Owned(text.replace(['\r', '\n'], " "));
|
|
55
55
|
}
|
|
56
56
|
|
|
@@ -153,44 +153,3 @@ pub(crate) fn has_semantic_content_ancestor(
|
|
|
153
153
|
}
|
|
154
154
|
false
|
|
155
155
|
}
|
|
156
|
-
|
|
157
|
-
/// Check if a document might be an hOCR document (has relevant attributes).
|
|
158
|
-
pub(crate) fn may_be_hocr(input: &str) -> bool {
|
|
159
|
-
const HOCR_MARKERS: [&[u8]; 3] = [b"class=\"ocr", b"class='ocr", b"ocr_page"];
|
|
160
|
-
HOCR_MARKERS
|
|
161
|
-
.iter()
|
|
162
|
-
.any(|marker| input.as_bytes().windows(marker.len()).any(|w| w == *marker))
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
/// Check if a node is an hOCR document by examining its root tag.
|
|
166
|
-
pub(crate) fn is_hocr_document(node_handle: tl::NodeHandle, parser: &tl::Parser) -> bool {
|
|
167
|
-
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
|
|
168
|
-
let tag_name = tag.name().as_utf8_str();
|
|
169
|
-
if tag_name != "html" {
|
|
170
|
-
return false;
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
// Check for hOCR class on root or first child
|
|
174
|
-
if let Some(Some(class_bytes)) = tag.attributes().get("class") {
|
|
175
|
-
let class = class_bytes.as_utf8_str();
|
|
176
|
-
if class.contains("ocr_document") || class.contains("ocr_page") {
|
|
177
|
-
return true;
|
|
178
|
-
}
|
|
179
|
-
}
|
|
180
|
-
|
|
181
|
-
// Check children
|
|
182
|
-
let children = tag.children();
|
|
183
|
-
for child_handle in children.top().iter() {
|
|
184
|
-
if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
|
|
185
|
-
if let Some(Some(class_bytes)) = child_tag.attributes().get("class") {
|
|
186
|
-
let class = class_bytes.as_utf8_str();
|
|
187
|
-
if class.contains("ocr_document") || class.contains("ocr_page") {
|
|
188
|
-
return true;
|
|
189
|
-
}
|
|
190
|
-
}
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
false
|
|
196
|
-
}
|
|
@@ -42,8 +42,9 @@ pub(crate) fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len:
|
|
|
42
42
|
/// scaled proportionally to input size (1KB = 1 slot).
|
|
43
43
|
pub(crate) fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
|
|
44
44
|
const TEXT_CACHE_CAPACITY: usize = 256;
|
|
45
|
+
// `clamp(32, TEXT_CACHE_CAPACITY)` guarantees `target >= 32 > 0`, so `new` always returns Some.
|
|
45
46
|
let target = (input_len / 1024).clamp(32, TEXT_CACHE_CAPACITY);
|
|
46
|
-
NonZeroUsize::new(target).
|
|
47
|
+
NonZeroUsize::new(target).unwrap_or(NonZeroUsize::MIN)
|
|
47
48
|
}
|
|
48
49
|
|
|
49
50
|
/// Recursively record node hierarchy into DOM context.
|
|
@@ -5,10 +5,24 @@
|
|
|
5
5
|
|
|
6
6
|
use crate::text;
|
|
7
7
|
use std::borrow::Cow;
|
|
8
|
+
#[cfg(feature = "visitor")]
|
|
9
|
+
use std::collections::BTreeMap;
|
|
8
10
|
|
|
9
11
|
// Forward declare DomContext from parent module to avoid circular imports
|
|
10
12
|
pub(crate) use crate::converter::DomContext;
|
|
11
13
|
|
|
14
|
+
/// Gather the valued attributes of an HTML tag into a `BTreeMap<String, String>`.
///
/// Attributes whose value is `None` (bare boolean attributes) are left out, so
/// the returned map only holds attributes that carry an explicit value. Names
/// and values are copied into owned `String`s.
#[cfg(feature = "visitor")]
pub(crate) fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, String> {
    let mut collected = BTreeMap::new();
    for (name, value) in tag.attributes().iter() {
        // Skip value-less attributes; everything else is stored by name.
        if let Some(value) = value.as_ref() {
            collected.insert(name.to_string(), value.to_string());
        }
    }
    collected
}
|
|
25
|
+
|
|
12
26
|
/// Chomp whitespace from inline element content, preserving line breaks.
|
|
13
27
|
///
|
|
14
28
|
/// Similar to `text::chomp` but handles line breaks from `<br>` tags specially.
|
|
@@ -131,31 +145,6 @@ pub(crate) fn normalize_link_label(label: &str) -> String {
|
|
|
131
145
|
normalized.as_ref().trim().to_string()
|
|
132
146
|
}
|
|
133
147
|
|
|
134
|
-
/// Check if an inline element is considered empty (no meaningful content).
|
|
135
|
-
pub(crate) fn is_empty_inline_element(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
|
|
136
|
-
const EMPTY_WHEN_NO_CONTENT_TAGS: &[&str] = &[
|
|
137
|
-
"abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u",
|
|
138
|
-
];
|
|
139
|
-
|
|
140
|
-
let tag_name: Option<Cow<'_, str>> = dom_ctx
|
|
141
|
-
.tag_info(node_handle.get_inner(), parser)
|
|
142
|
-
.map(|info| Cow::Borrowed(info.name.as_str()))
|
|
143
|
-
.or_else(|| {
|
|
144
|
-
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
|
|
145
|
-
Some(normalized_tag_name(tag.name().as_utf8_str()))
|
|
146
|
-
} else {
|
|
147
|
-
None
|
|
148
|
-
}
|
|
149
|
-
});
|
|
150
|
-
|
|
151
|
-
if let Some(tag_name) = tag_name {
|
|
152
|
-
if EMPTY_WHEN_NO_CONTENT_TAGS.contains(&tag_name.as_ref()) {
|
|
153
|
-
return get_text_content(node_handle, parser, dom_ctx).trim().is_empty();
|
|
154
|
-
}
|
|
155
|
-
}
|
|
156
|
-
false
|
|
157
|
-
}
|
|
158
|
-
|
|
159
148
|
/// Normalize a tag name to lowercase, preserving borrowed input when possible.
|
|
160
149
|
pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
|
|
161
150
|
if raw.as_bytes().iter().any(u8::is_ascii_uppercase) {
|
|
@@ -167,81 +156,9 @@ pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
|
|
|
167
156
|
}
|
|
168
157
|
}
|
|
169
158
|
|
|
170
|
-
/// Check if an element is inline (not block-level).
|
|
171
|
-
fn is_inline_element(tag_name: &str) -> bool {
|
|
172
|
-
matches!(
|
|
173
|
-
tag_name,
|
|
174
|
-
"a" | "abbr"
|
|
175
|
-
| "b"
|
|
176
|
-
| "bdi"
|
|
177
|
-
| "bdo"
|
|
178
|
-
| "br"
|
|
179
|
-
| "cite"
|
|
180
|
-
| "code"
|
|
181
|
-
| "data"
|
|
182
|
-
| "dfn"
|
|
183
|
-
| "em"
|
|
184
|
-
| "i"
|
|
185
|
-
| "kbd"
|
|
186
|
-
| "mark"
|
|
187
|
-
| "q"
|
|
188
|
-
| "rp"
|
|
189
|
-
| "rt"
|
|
190
|
-
| "ruby"
|
|
191
|
-
| "s"
|
|
192
|
-
| "samp"
|
|
193
|
-
| "small"
|
|
194
|
-
| "span"
|
|
195
|
-
| "strong"
|
|
196
|
-
| "sub"
|
|
197
|
-
| "sup"
|
|
198
|
-
| "time"
|
|
199
|
-
| "u"
|
|
200
|
-
| "var"
|
|
201
|
-
| "wbr"
|
|
202
|
-
| "del"
|
|
203
|
-
| "ins"
|
|
204
|
-
| "img"
|
|
205
|
-
| "map"
|
|
206
|
-
| "area"
|
|
207
|
-
| "audio"
|
|
208
|
-
| "video"
|
|
209
|
-
| "picture"
|
|
210
|
-
| "source"
|
|
211
|
-
| "track"
|
|
212
|
-
| "embed"
|
|
213
|
-
| "object"
|
|
214
|
-
| "param"
|
|
215
|
-
| "input"
|
|
216
|
-
| "label"
|
|
217
|
-
| "button"
|
|
218
|
-
| "select"
|
|
219
|
-
| "textarea"
|
|
220
|
-
| "output"
|
|
221
|
-
| "progress"
|
|
222
|
-
| "meter"
|
|
223
|
-
)
|
|
224
|
-
}
|
|
225
|
-
|
|
226
159
|
/// Check if an element is block-level (not inline).
|
|
227
160
|
pub(crate) fn is_block_level_element(tag_name: &str) -> bool {
|
|
228
|
-
is_block_level_name(tag_name, is_inline_element(tag_name))
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
/// Truncate a string to a maximum length at a valid UTF-8 character boundary.
|
|
232
|
-
///
|
|
233
|
-
/// Ensures the string is not longer than `max_len` bytes, truncating at the last
|
|
234
|
-
/// valid character boundary if necessary to preserve valid UTF-8.
|
|
235
|
-
pub(crate) fn truncate_at_char_boundary(value: &mut String, max_len: usize) {
|
|
236
|
-
if value.len() <= max_len {
|
|
237
|
-
return;
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
let mut new_len = max_len.min(value.len());
|
|
241
|
-
while new_len > 0 && !value.is_char_boundary(new_len) {
|
|
242
|
-
new_len -= 1;
|
|
243
|
-
}
|
|
244
|
-
value.truncate(new_len);
|
|
161
|
+
is_block_level_name(tag_name, crate::converter::main_helpers::is_inline_element(tag_name))
|
|
245
162
|
}
|
|
246
163
|
|
|
247
164
|
/// Returns the largest valid char boundary index at or before `index`.
|
|
@@ -176,10 +176,13 @@ pub(crate) fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) ->
|
|
|
176
176
|
const MAX_SCAN: usize = 100_000_000; // 100MB limit per tag - prevents pathological cases
|
|
177
177
|
|
|
178
178
|
while idx < len && (idx - start) < MAX_SCAN {
|
|
179
|
-
// Optimization: skip forward to next '<' quickly
|
|
179
|
+
// Optimization: skip forward to next '<' quickly using memchr
|
|
180
180
|
if bytes[idx] != b'<' {
|
|
181
|
-
idx
|
|
182
|
-
|
|
181
|
+
if let Some(pos) = memchr::memchr(b'<', &bytes[idx..]) {
|
|
182
|
+
idx += pos;
|
|
183
|
+
} else {
|
|
184
|
+
break;
|
|
185
|
+
}
|
|
183
186
|
}
|
|
184
187
|
|
|
185
188
|
// Check for </ pattern
|
|
@@ -291,7 +294,11 @@ pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
|
|
|
291
294
|
out.push_str(&input[last..idx]);
|
|
292
295
|
out.push_str(&input[idx..open_end]);
|
|
293
296
|
out.push_str("</");
|
|
294
|
-
|
|
297
|
+
// `TAGS` contains only ASCII byte literals (`b"script"`, `b"style"`),
|
|
298
|
+
// which are always valid UTF-8; `from_utf8` cannot fail here.
|
|
299
|
+
if let Ok(tag_str) = str::from_utf8(tag) {
|
|
300
|
+
out.push_str(tag_str);
|
|
301
|
+
}
|
|
295
302
|
out.push('>');
|
|
296
303
|
|
|
297
304
|
last = remove_end;
|
|
@@ -573,6 +580,108 @@ pub(crate) fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
|
|
|
573
580
|
Cow::Owned(url[paren_start..paren_end].to_string())
|
|
574
581
|
}
|
|
575
582
|
|
|
583
|
+
/// Strip elements with the `hidden` attribute from HTML.
|
|
584
|
+
///
|
|
585
|
+
/// Scans for opening tags containing the `hidden` attribute, finds their
|
|
586
|
+
/// matching closing tag, and removes the entire element (tag + content).
|
|
587
|
+
/// Self-closing tags with `hidden` are also removed.
|
|
588
|
+
pub(crate) fn strip_hidden_elements(input: &str) -> Cow<'_, str> {
|
|
589
|
+
let bytes = input.as_bytes();
|
|
590
|
+
let len = bytes.len();
|
|
591
|
+
|
|
592
|
+
if len == 0 || !bytes.contains(&b'<') {
|
|
593
|
+
return Cow::Borrowed(input);
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
let mut idx = 0;
|
|
597
|
+
let mut last = 0;
|
|
598
|
+
let mut output: Option<String> = None;
|
|
599
|
+
|
|
600
|
+
while idx < len {
|
|
601
|
+
if bytes[idx] == b'<' && idx + 1 < len && bytes[idx + 1] != b'/' && bytes[idx + 1] != b'!' {
|
|
602
|
+
// Find the end of this opening tag
|
|
603
|
+
if let Some(tag_end) = find_tag_end(bytes, idx + 1) {
|
|
604
|
+
let tag_slice = &input[idx..tag_end];
|
|
605
|
+
if tag_has_hidden_attribute(tag_slice) {
|
|
606
|
+
// Extract the tag name
|
|
607
|
+
let name_start = idx + 1;
|
|
608
|
+
let mut name_end = name_start;
|
|
609
|
+
while name_end < len
|
|
610
|
+
&& !bytes[name_end].is_ascii_whitespace()
|
|
611
|
+
&& bytes[name_end] != b'>'
|
|
612
|
+
&& bytes[name_end] != b'/'
|
|
613
|
+
{
|
|
614
|
+
name_end += 1;
|
|
615
|
+
}
|
|
616
|
+
let tag_name = &bytes[name_start..name_end];
|
|
617
|
+
|
|
618
|
+
// Check if it's a self-closing tag (e.g., <br hidden> or <br hidden/>)
|
|
619
|
+
let is_self_closing = tag_slice.ends_with("/>")
|
|
620
|
+
|| tag_name.eq_ignore_ascii_case(b"br")
|
|
621
|
+
|| tag_name.eq_ignore_ascii_case(b"hr")
|
|
622
|
+
|| tag_name.eq_ignore_ascii_case(b"img")
|
|
623
|
+
|| tag_name.eq_ignore_ascii_case(b"input");
|
|
624
|
+
|
|
625
|
+
let remove_end = if is_self_closing {
|
|
626
|
+
tag_end
|
|
627
|
+
} else {
|
|
628
|
+
// Find the closing tag
|
|
629
|
+
find_closing_tag_bytes(bytes, tag_end, tag_name).unwrap_or(tag_end)
|
|
630
|
+
};
|
|
631
|
+
|
|
632
|
+
let out = output.get_or_insert_with(|| String::with_capacity(len));
|
|
633
|
+
out.push_str(&input[last..idx]);
|
|
634
|
+
last = remove_end;
|
|
635
|
+
idx = remove_end;
|
|
636
|
+
continue;
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
idx += 1;
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
if let Some(mut out) = output {
|
|
644
|
+
if last < len {
|
|
645
|
+
out.push_str(&input[last..]);
|
|
646
|
+
}
|
|
647
|
+
Cow::Owned(out)
|
|
648
|
+
} else {
|
|
649
|
+
Cow::Borrowed(input)
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
/// Check if an opening tag string contains the `hidden` attribute.
///
/// `tag` is the text of a single opening tag, e.g. `<div class="a" hidden>`.
///
/// Handles: `hidden`, `hidden=""`, `hidden="hidden"`, `hidden="true"`
/// (case-insensitively).
/// Does NOT match attributes like `data-hidden` or `aria-hidden`, and does NOT
/// match the word `hidden` inside a quoted attribute value such as
/// `class="box hidden wide"` or `alt='a hidden gem'`. Without the quote
/// tracking those values would be mistaken for the attribute and the whole
/// element would be stripped by the caller.
fn tag_has_hidden_attribute(tag: &str) -> bool {
    const NEEDLE: &[u8] = b"hidden";

    let bytes = tag.as_bytes();

    // Skip past the tag name: it ends at the first whitespace byte or `>`.
    let name_end = bytes
        .iter()
        .position(|byte| byte.is_ascii_whitespace() || *byte == b'>')
        .unwrap_or(bytes.len());

    // `Some(q)` while inside an attribute value opened by quote byte `q`.
    let mut open_quote: Option<u8> = None;

    for (i, &byte) in bytes.iter().enumerate().skip(name_end) {
        if let Some(quote) = open_quote {
            // Everything up to the matching quote is attribute *value* text.
            if byte == quote {
                open_quote = None;
            }
            continue;
        }
        if byte == b'"' || byte == b'\'' {
            open_quote = Some(byte);
            continue;
        }

        let candidate = match bytes.get(i..i + NEEDLE.len()) {
            Some(candidate) => candidate,
            // Fewer bytes left than the needle is long: no match is possible.
            None => break,
        };
        if !candidate.eq_ignore_ascii_case(NEEDLE) {
            continue;
        }

        // An attribute name must start right after whitespace...
        let starts_attribute = i > 0 && bytes[i - 1].is_ascii_whitespace();
        // ...and end at whitespace, `>`, `=`, `/`, or the end of the string.
        let ends_attribute = bytes
            .get(i + NEEDLE.len())
            .copied()
            .map_or(true, |next| next.is_ascii_whitespace() || matches!(next, b'>' | b'=' | b'/'));

        if starts_attribute && ends_attribute {
            return true;
        }
    }

    false
}
|
|
684
|
+
|
|
576
685
|
#[cfg(test)]
|
|
577
686
|
mod tests {
|
|
578
687
|
use super::sanitize_markdown_url;
|
|
@@ -7,6 +7,7 @@ use crate::converter::utility::content::normalized_tag_name;
|
|
|
7
7
|
|
|
8
8
|
/// Serialize an element to HTML string (for SVG and Math elements).
|
|
9
9
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
10
|
+
#[allow(dead_code)] // used with visitor feature
|
|
10
11
|
pub(crate) fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
|
|
11
12
|
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
|
|
12
13
|
let tag_name = normalized_tag_name(tag.name().as_utf8_str());
|
|
@@ -46,6 +47,7 @@ pub(crate) fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parse
|
|
|
46
47
|
|
|
47
48
|
/// Serialize a node to HTML string.
|
|
48
49
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
50
|
+
#[allow(dead_code)] // used with visitor feature
|
|
49
51
|
pub(crate) fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
|
|
50
52
|
if let Some(node) = node_handle.get(parser) {
|
|
51
53
|
match node {
|
|
@@ -67,6 +69,7 @@ pub(crate) fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser
|
|
|
67
69
|
|
|
68
70
|
/// Recursively serialize a node to HTML.
|
|
69
71
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
72
|
+
#[allow(dead_code)] // used with visitor feature
|
|
70
73
|
pub(crate) fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
|
|
71
74
|
match handle.get(parser) {
|
|
72
75
|
Some(tl::Node::Tag(tag)) => {
|
|
@@ -6,6 +6,8 @@
|
|
|
6
6
|
|
|
7
7
|
use std::collections::BTreeMap;
|
|
8
8
|
|
|
9
|
+
#[cfg(feature = "visitor")]
|
|
10
|
+
use crate::converter::utility::content::collect_tag_attributes;
|
|
9
11
|
use crate::converter::utility::content::is_block_level_element;
|
|
10
12
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
11
13
|
|
|
@@ -48,11 +50,7 @@ pub fn handle_visitor_element_start(
|
|
|
48
50
|
depth: usize,
|
|
49
51
|
dom_ctx: &crate::converter::DomContext,
|
|
50
52
|
) -> VisitAction {
|
|
51
|
-
let attributes: BTreeMap<String, String> = tag
|
|
52
|
-
.attributes()
|
|
53
|
-
.iter()
|
|
54
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
55
|
-
.collect();
|
|
53
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
56
54
|
|
|
57
55
|
let node_id = node_handle.get_inner();
|
|
58
56
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -131,11 +129,7 @@ pub fn handle_visitor_element_end(
|
|
|
131
129
|
return;
|
|
132
130
|
}
|
|
133
131
|
|
|
134
|
-
let attributes: BTreeMap<String, String> = tag
|
|
135
|
-
.attributes()
|
|
136
|
-
.iter()
|
|
137
|
-
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
138
|
-
.collect();
|
|
132
|
+
let attributes: BTreeMap<String, String> = collect_tag_attributes(tag);
|
|
139
133
|
|
|
140
134
|
let node_id = node_handle.get_inner();
|
|
141
135
|
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
|
|
@@ -13,7 +13,7 @@ pub use crate::inline_images::{
|
|
|
13
13
|
|
|
14
14
|
#[cfg(feature = "metadata")]
|
|
15
15
|
pub use crate::metadata::{
|
|
16
|
-
DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata,
|
|
16
|
+
DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, HeaderMetadata, HtmlMetadata, ImageMetadata, ImageType,
|
|
17
17
|
LinkMetadata, LinkType, MetadataConfig, MetadataConfigUpdate, StructuredData, StructuredDataType, TextDirection,
|
|
18
18
|
};
|
|
19
19
|
|
|
@@ -21,6 +21,3 @@ pub use crate::options::{
|
|
|
21
21
|
CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
|
|
22
22
|
NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
|
|
23
23
|
};
|
|
24
|
-
|
|
25
|
-
#[cfg(feature = "async-visitor")]
|
|
26
|
-
pub use crate::visitor_helpers::AsyncVisitorHandle;
|
|
@@ -172,7 +172,7 @@ pub struct InlineImageWarning {
|
|
|
172
172
|
pub message: String,
|
|
173
173
|
}
|
|
174
174
|
|
|
175
|
-
/// Output
|
|
175
|
+
/// Output containing extracted inline images from `convert()` when `extract_images` is enabled.
|
|
176
176
|
#[derive(Debug, Clone)]
|
|
177
177
|
pub struct HtmlExtraction {
|
|
178
178
|
/// Converted markdown output.
|