html-to-markdown 3.2.4 → 3.4.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Steepfile +6 -0
- data/ext/html_to_markdown_rb/Cargo.toml +2 -2
- data/ext/html_to_markdown_rb/native/Cargo.toml +28 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown/version.rb +10 -0
- data/ext/html_to_markdown_rb/src/html-to-markdown.rb +13 -0
- data/ext/html_to_markdown_rb/src/lib.rs +2088 -268
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +5 -3
- data/sig/types.rbs +769 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/examples/basic.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/table.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +1 -1
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +1 -1
- data/vendor/html-to-markdown-rs/src/convert_api.rs +15 -25
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/container.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +6 -7
- data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -108
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +2 -4
- data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +10 -0
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +14 -14
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -3
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +2 -2
- data/vendor/html-to-markdown-rs/src/converter/main.rs +57 -31
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +8 -8
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +5 -5
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +6 -17
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +64 -11
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +80 -22
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +5 -9
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +3 -3
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +10 -10
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +13 -13
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +4 -4
- data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +6 -14
- data/vendor/html-to-markdown-rs/src/inline_images.rs +6 -0
- data/vendor/html-to-markdown-rs/src/lib.rs +17 -18
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +31 -0
- data/vendor/html-to-markdown-rs/src/prelude.rs +1 -12
- data/vendor/html-to-markdown-rs/src/text.rs +0 -44
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +2 -0
- data/vendor/html-to-markdown-rs/src/visitor/types.rs +5 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +4 -1
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +136 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +137 -0
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +2 -2
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +5 -2
- data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +77 -0
- data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +82 -0
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +1 -1
- data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +6 -6
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +103 -35
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +1 -1
- metadata +21 -43
- data/.bundle/config +0 -2
- data/.gitignore +0 -3
- data/.rubocop.yml +0 -59
- data/Gemfile +0 -18
- data/Gemfile.lock +0 -173
- data/README.md +0 -331
- data/Rakefile +0 -26
- data/exe/html-to-markdown +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +0 -6
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +0 -9
- data/html-to-markdown-rb.gemspec +0 -99
- data/lib/html_to_markdown_rs.rb +0 -3
- data/sig/html_to_markdown.rbs +0 -149
- data/vendor/html-to-markdown-rs/src/converter/text/escaping.rs +0 -94
- data/vendor/html-to-markdown-rs/src/converter/text/normalization.rs +0 -86
- data/vendor/html-to-markdown-rs/src/safety.rs +0 -70
|
@@ -4,8 +4,10 @@
|
|
|
4
4
|
//! visible text content with structural whitespace, bypassing the full
|
|
5
5
|
//! Markdown/Djot conversion pipeline.
|
|
6
6
|
|
|
7
|
+
use std::collections::HashSet;
|
|
7
8
|
use std::fmt::Write;
|
|
8
9
|
|
|
10
|
+
use crate::converter::preprocessing_helpers::should_drop_for_preprocessing;
|
|
9
11
|
use crate::options::ConversionOptions;
|
|
10
12
|
use crate::text;
|
|
11
13
|
|
|
@@ -61,12 +63,36 @@ const BLOCK_TAGS: &[&str] = &[
|
|
|
61
63
|
/// - `<script>`, `<style>`, `<head>`, `<template>`, `<noscript>` are skipped
|
|
62
64
|
/// - Tables: cells separated by tab, rows by newline
|
|
63
65
|
/// - Inline elements are recursed without markers
|
|
66
|
+
/// - Nodes matching `excluded_node_ids` (from `exclude_selectors`) are dropped entirely
|
|
64
67
|
pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
|
|
65
68
|
let mut buf = String::with_capacity(1024);
|
|
66
69
|
let mut list_ctx = ListContext::None;
|
|
67
70
|
|
|
71
|
+
// Pre-compute excluded node IDs from exclude_selectors.
|
|
72
|
+
let excluded_node_ids: HashSet<u32> = if options.exclude_selectors.is_empty() {
|
|
73
|
+
HashSet::new()
|
|
74
|
+
} else {
|
|
75
|
+
let mut ids = HashSet::new();
|
|
76
|
+
for selector in &options.exclude_selectors {
|
|
77
|
+
if let Some(iter) = dom.query_selector(selector) {
|
|
78
|
+
for handle in iter {
|
|
79
|
+
ids.insert(handle.get_inner());
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
ids
|
|
84
|
+
};
|
|
85
|
+
|
|
68
86
|
for child_handle in dom.children() {
|
|
69
|
-
walk_plain(child_handle, parser, &mut buf, options, false, &mut list_ctx);
|
|
87
|
+
walk_plain(
|
|
88
|
+
child_handle,
|
|
89
|
+
parser,
|
|
90
|
+
&mut buf,
|
|
91
|
+
options,
|
|
92
|
+
false,
|
|
93
|
+
&mut list_ctx,
|
|
94
|
+
&excluded_node_ids,
|
|
95
|
+
);
|
|
70
96
|
}
|
|
71
97
|
|
|
72
98
|
post_process(&mut buf);
|
|
@@ -81,6 +107,7 @@ fn walk_plain(
|
|
|
81
107
|
options: &ConversionOptions,
|
|
82
108
|
in_pre: bool,
|
|
83
109
|
list_ctx: &mut ListContext,
|
|
110
|
+
excluded_node_ids: &HashSet<u32>,
|
|
84
111
|
) {
|
|
85
112
|
let Some(node) = node_handle.get(parser) else {
|
|
86
113
|
return;
|
|
@@ -104,6 +131,11 @@ fn walk_plain(
|
|
|
104
131
|
}
|
|
105
132
|
}
|
|
106
133
|
tl::Node::Tag(tag) => {
|
|
134
|
+
// Drop elements matching exclude_selectors, including all their descendants.
|
|
135
|
+
if !excluded_node_ids.is_empty() && excluded_node_ids.contains(&node_handle.get_inner()) {
|
|
136
|
+
return;
|
|
137
|
+
}
|
|
138
|
+
|
|
107
139
|
let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
|
|
108
140
|
let tag_str = tag_name.as_str();
|
|
109
141
|
|
|
@@ -112,6 +144,12 @@ fn walk_plain(
|
|
|
112
144
|
return;
|
|
113
145
|
}
|
|
114
146
|
|
|
147
|
+
// Apply preprocessing: drop nav/footer/aside/noise elements
|
|
148
|
+
// (shared logic with the markdown path).
|
|
149
|
+
if should_drop_for_preprocessing(tag_str, tag, options) {
|
|
150
|
+
return;
|
|
151
|
+
}
|
|
152
|
+
|
|
115
153
|
match tag_str {
|
|
116
154
|
"br" => {
|
|
117
155
|
buf.push('\n');
|
|
@@ -121,7 +159,7 @@ fn walk_plain(
|
|
|
121
159
|
}
|
|
122
160
|
"pre" => {
|
|
123
161
|
ensure_blank_line(buf);
|
|
124
|
-
walk_children(tag, parser, buf, options, true, list_ctx);
|
|
162
|
+
walk_children(tag, parser, buf, options, true, list_ctx, excluded_node_ids);
|
|
125
163
|
ensure_blank_line(buf);
|
|
126
164
|
}
|
|
127
165
|
"img" => {
|
|
@@ -136,13 +174,13 @@ fn walk_plain(
|
|
|
136
174
|
}
|
|
137
175
|
"table" => {
|
|
138
176
|
ensure_blank_line(buf);
|
|
139
|
-
walk_table(tag, parser, buf, options);
|
|
177
|
+
walk_table(tag, parser, buf, options, excluded_node_ids);
|
|
140
178
|
ensure_blank_line(buf);
|
|
141
179
|
}
|
|
142
180
|
"ul" => {
|
|
143
181
|
ensure_newline(buf);
|
|
144
182
|
let mut child_ctx = ListContext::Unordered;
|
|
145
|
-
walk_children(tag, parser, buf, options, false, &mut child_ctx);
|
|
183
|
+
walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
|
|
146
184
|
ensure_newline(buf);
|
|
147
185
|
}
|
|
148
186
|
"ol" => {
|
|
@@ -154,7 +192,7 @@ fn walk_plain(
|
|
|
154
192
|
.unwrap_or(1);
|
|
155
193
|
ensure_newline(buf);
|
|
156
194
|
let mut child_ctx = ListContext::Ordered { next_index: start };
|
|
157
|
-
walk_children(tag, parser, buf, options, false, &mut child_ctx);
|
|
195
|
+
walk_children(tag, parser, buf, options, false, &mut child_ctx, excluded_node_ids);
|
|
158
196
|
ensure_newline(buf);
|
|
159
197
|
}
|
|
160
198
|
"li" => {
|
|
@@ -172,17 +210,17 @@ fn walk_plain(
|
|
|
172
210
|
buf.push_str("- ");
|
|
173
211
|
}
|
|
174
212
|
}
|
|
175
|
-
walk_children(tag, parser, buf, options, false, list_ctx);
|
|
213
|
+
walk_children(tag, parser, buf, options, false, list_ctx, excluded_node_ids);
|
|
176
214
|
ensure_newline(buf);
|
|
177
215
|
}
|
|
178
216
|
_ if BLOCK_TAGS.contains(&tag_str) => {
|
|
179
217
|
ensure_blank_line(buf);
|
|
180
|
-
walk_children(tag, parser, buf, options, in_pre, list_ctx);
|
|
218
|
+
walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
|
|
181
219
|
ensure_blank_line(buf);
|
|
182
220
|
}
|
|
183
221
|
_ => {
|
|
184
222
|
// Inline elements and structural containers (html, body, etc.)
|
|
185
|
-
walk_children(tag, parser, buf, options, in_pre, list_ctx);
|
|
223
|
+
walk_children(tag, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
|
|
186
224
|
}
|
|
187
225
|
}
|
|
188
226
|
}
|
|
@@ -198,16 +236,23 @@ fn walk_children(
|
|
|
198
236
|
options: &ConversionOptions,
|
|
199
237
|
in_pre: bool,
|
|
200
238
|
list_ctx: &mut ListContext,
|
|
239
|
+
excluded_node_ids: &HashSet<u32>,
|
|
201
240
|
) {
|
|
202
241
|
let children = tag.children();
|
|
203
242
|
let top = children.top();
|
|
204
243
|
for child in top.iter() {
|
|
205
|
-
walk_plain(child, parser, buf, options, in_pre, list_ctx);
|
|
244
|
+
walk_plain(child, parser, buf, options, in_pre, list_ctx, excluded_node_ids);
|
|
206
245
|
}
|
|
207
246
|
}
|
|
208
247
|
|
|
209
248
|
/// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
|
|
210
|
-
fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions) {
|
|
249
|
+
fn walk_table(
|
|
250
|
+
table_tag: &tl::HTMLTag,
|
|
251
|
+
parser: &tl::Parser,
|
|
252
|
+
buf: &mut String,
|
|
253
|
+
options: &ConversionOptions,
|
|
254
|
+
excluded_node_ids: &HashSet<u32>,
|
|
255
|
+
) {
|
|
211
256
|
// Collect all <tr> node handles by recursing into the table
|
|
212
257
|
let mut row_handles = Vec::new();
|
|
213
258
|
collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
|
|
@@ -240,7 +285,15 @@ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, op
|
|
|
240
285
|
let mut cell_buf = String::new();
|
|
241
286
|
if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
|
|
242
287
|
let mut cell_list_ctx = ListContext::None;
|
|
243
|
-
walk_children(cell_tag, parser, &mut cell_buf, options, false, &mut cell_list_ctx);
|
|
288
|
+
walk_children(
|
|
289
|
+
cell_tag,
|
|
290
|
+
parser,
|
|
291
|
+
&mut cell_buf,
|
|
292
|
+
options,
|
|
293
|
+
false,
|
|
294
|
+
&mut cell_list_ctx,
|
|
295
|
+
excluded_node_ids,
|
|
296
|
+
);
|
|
244
297
|
}
|
|
245
298
|
buf.push_str(cell_buf.trim());
|
|
246
299
|
}
|
|
@@ -5,12 +5,12 @@
|
|
|
5
5
|
|
|
6
6
|
use crate::converter::dom_context::DomContext;
|
|
7
7
|
use crate::converter::main_helpers::is_inline_element;
|
|
8
|
-
use crate::converter::utility::attributes::element_has_navigation_hint;
|
|
8
|
+
use crate::converter::utility::attributes::{attribute_matches_any, element_has_navigation_hint};
|
|
9
9
|
use crate::converter::utility::content::normalized_tag_name;
|
|
10
10
|
use crate::options::ConversionOptions;
|
|
11
11
|
|
|
12
12
|
/// Check if an inline ancestor element is allowed to contain block-level elements.
|
|
13
|
-
pub(crate) fn inline_ancestor_allows_block(tag_name: &str) -> bool {
|
|
13
|
+
pub fn inline_ancestor_allows_block(tag_name: &str) -> bool {
|
|
14
14
|
matches!(tag_name, "a" | "ins" | "del")
|
|
15
15
|
}
|
|
16
16
|
|
|
@@ -18,7 +18,7 @@ pub(crate) fn inline_ancestor_allows_block(tag_name: &str) -> bool {
|
|
|
18
18
|
///
|
|
19
19
|
/// Excludes elements inside `<pre>` or `<code>` blocks, as they have special
|
|
20
20
|
/// whitespace preservation rules and should not be repaired.
|
|
21
|
-
pub(crate) fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
|
|
21
|
+
pub fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
|
|
22
22
|
for handle in dom_ctx.node_map.iter().flatten() {
|
|
23
23
|
if let Some(tl::Node::Tag(_tag)) = handle.get(parser) {
|
|
24
24
|
let is_block = dom_ctx
|
|
@@ -68,43 +68,101 @@ pub(crate) fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser
|
|
|
68
68
|
}
|
|
69
69
|
|
|
70
70
|
/// Determine if a node should be dropped during preprocessing.
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
)
|
|
79
|
-
|
|
71
|
+
///
|
|
72
|
+
/// Behavior depends on the [`PreprocessingPreset`]:
|
|
73
|
+
///
|
|
74
|
+
/// - **Minimal**: Only scripts/styles are stripped (handled elsewhere). This function
|
|
75
|
+
/// drops nothing — all structural elements are preserved.
|
|
76
|
+
/// - **Standard** (default): Drops `<nav>` unconditionally. Drops `<header>`, `<footer>`,
|
|
77
|
+
/// and `<aside>` only when they have navigation hints (class/role/aria attributes
|
|
78
|
+
/// indicating site chrome). Drops `<form>` when `remove_forms` is enabled.
|
|
79
|
+
/// - **Aggressive**: All of Standard, plus: drops `<footer>`, `<aside>`, `<noscript>`
|
|
80
|
+
/// unconditionally. Drops ANY element with navigation hints in class/id/role
|
|
81
|
+
/// (e.g. `<div class="sidebar">`). Drops elements with noise-related classes/roles.
|
|
82
|
+
pub fn should_drop_for_preprocessing(tag_name: &str, tag: &tl::HTMLTag, options: &ConversionOptions) -> bool {
|
|
83
|
+
use crate::options::PreprocessingPreset;
|
|
84
|
+
|
|
80
85
|
if !options.preprocessing.enabled {
|
|
81
86
|
return false;
|
|
82
87
|
}
|
|
83
88
|
|
|
89
|
+
let preset = options.preprocessing.preset;
|
|
90
|
+
|
|
91
|
+
// Minimal preset: drop nothing here (scripts/styles handled in earlier pipeline stage).
|
|
92
|
+
if preset == PreprocessingPreset::Minimal {
|
|
93
|
+
return false;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// Form removal — applies to both Standard and Aggressive when enabled.
|
|
97
|
+
if options.preprocessing.remove_forms && tag_name == "form" {
|
|
98
|
+
return true;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
let is_aggressive = preset == PreprocessingPreset::Aggressive;
|
|
102
|
+
|
|
103
|
+
// Aggressive: drop <noscript> — its content is fallback for no-JS browsers.
|
|
104
|
+
if is_aggressive && tag_name == "noscript" {
|
|
105
|
+
return true;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// Navigation removal — only when the flag is enabled.
|
|
84
109
|
if !options.preprocessing.remove_navigation {
|
|
85
110
|
return false;
|
|
86
111
|
}
|
|
87
112
|
|
|
88
113
|
let has_nav_hint = element_has_navigation_hint(tag);
|
|
89
114
|
|
|
115
|
+
// <nav> is always navigation — drop in both Standard and Aggressive.
|
|
90
116
|
if tag_name == "nav" {
|
|
91
117
|
return true;
|
|
92
118
|
}
|
|
93
119
|
|
|
94
120
|
if tag_name == "header" {
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
121
|
+
// Drop <header> only with navigation hints (e.g. class="site-header",
|
|
122
|
+
// role="navigation"). A plain <header> often wraps article titles like
|
|
123
|
+
// <header><h1>Title</h1></header> — dropping it loses content.
|
|
124
|
+
return has_nav_hint;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
if tag_name == "footer" || tag_name == "aside" {
|
|
128
|
+
// Standard: drop only with navigation hints.
|
|
129
|
+
// Aggressive: drop unconditionally.
|
|
130
|
+
return is_aggressive || has_nav_hint;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// Aggressive: drop ANY element that has navigation hints in class/id/role.
|
|
134
|
+
// This catches <div class="sidebar">, <div class="menu">, <section class="navigation">,
|
|
135
|
+
// and similar non-semantic navigation containers.
|
|
136
|
+
if is_aggressive && has_nav_hint {
|
|
137
|
+
return true;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// Aggressive: drop elements with noise-related roles.
|
|
141
|
+
if is_aggressive {
|
|
142
|
+
if element_has_noise_hint(tag) {
|
|
105
143
|
return true;
|
|
106
144
|
}
|
|
107
145
|
}
|
|
108
146
|
|
|
109
147
|
false
|
|
110
148
|
}
|
|
149
|
+
|
|
150
|
+
/// Check if an element has noise-related hints (ads, cookie banners, social sharing).
|
|
151
|
+
fn element_has_noise_hint(tag: &tl::HTMLTag) -> bool {
|
|
152
|
+
const NOISE_KEYWORDS: &[&str] = &[
|
|
153
|
+
"cookie",
|
|
154
|
+
"consent",
|
|
155
|
+
"gdpr",
|
|
156
|
+
"banner",
|
|
157
|
+
"advertisement",
|
|
158
|
+
"ad-container",
|
|
159
|
+
"advert",
|
|
160
|
+
"social-share",
|
|
161
|
+
"share-buttons",
|
|
162
|
+
"popup",
|
|
163
|
+
"modal-overlay",
|
|
164
|
+
"newsletter-signup",
|
|
165
|
+
];
|
|
166
|
+
|
|
167
|
+
attribute_matches_any(tag, "class", NOISE_KEYWORDS) || attribute_matches_any(tag, "id", NOISE_KEYWORDS)
|
|
168
|
+
}
|
|
@@ -170,7 +170,7 @@ mod tests {
|
|
|
170
170
|
#[test]
|
|
171
171
|
fn figure_caption_separated_from_image() {
|
|
172
172
|
let html = r#"<figure><img src="photo.jpg" alt="Photo"><figcaption>A nice photo</figcaption></figure>"#;
|
|
173
|
-
let result = crate::convert(html, None).unwrap();
|
|
173
|
+
let result = crate::convert(html, None, None).unwrap();
|
|
174
174
|
let content = result.content.unwrap_or_default();
|
|
175
175
|
assert!(
|
|
176
176
|
content.contains(""),
|
|
@@ -30,7 +30,7 @@ pub mod sectioning;
|
|
|
30
30
|
pub mod summary;
|
|
31
31
|
|
|
32
32
|
// Re-export types from parent module for submodule access
|
|
33
|
-
pub
|
|
33
|
+
pub use super::walk_node;
|
|
34
34
|
pub use super::{Context, DomContext};
|
|
35
35
|
|
|
36
36
|
// Re-export handler functions for direct use
|
|
@@ -3,10 +3,6 @@
|
|
|
3
3
|
//! This module provides utilities for normalizing, escaping, and processing text content
|
|
4
4
|
//! extracted from HTML documents during the conversion to Markdown format.
|
|
5
5
|
|
|
6
|
-
mod escaping;
|
|
7
|
-
mod normalization;
|
|
8
6
|
mod processing;
|
|
9
7
|
|
|
10
|
-
pub use escaping::{escape_link_label, escape_malformed_angle_brackets};
|
|
11
|
-
pub use normalization::{normalize_heading_text, trim_line_end_whitespace, truncate_at_char_boundary};
|
|
12
8
|
pub use processing::dedent_code_block;
|
|
@@ -6,7 +6,7 @@ use crate::converter::DomContext;
|
|
|
6
6
|
use crate::converter::utility::content::normalized_tag_name;
|
|
7
7
|
|
|
8
8
|
/// Check if a tag has main content semantics based on role or class.
|
|
9
|
-
pub(crate) fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
|
|
9
|
+
pub fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
|
|
10
10
|
if let Some(Some(role)) = tag.attributes().get("role") {
|
|
11
11
|
let lowered = role.as_utf8_str().to_ascii_lowercase();
|
|
12
12
|
if matches!(lowered.as_str(), "main" | "article" | "document" | "region") {
|
|
@@ -38,7 +38,7 @@ pub(crate) fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
|
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
/// Check if an element has navigation-related hints in its attributes.
|
|
41
|
-
pub(crate) fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
|
|
41
|
+
pub fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
|
|
42
42
|
if attribute_matches_any(tag, "role", &["navigation", "menubar", "tablist", "toolbar"]) {
|
|
43
43
|
return true;
|
|
44
44
|
}
|
|
@@ -88,7 +88,7 @@ pub(crate) fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
|
|
|
88
88
|
}
|
|
89
89
|
|
|
90
90
|
/// Check if an attribute value matches any of the given keywords (space or custom-separator aware).
|
|
91
|
-
pub(crate) fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
|
|
91
|
+
pub fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
|
|
92
92
|
let Some(attr_value) = tag.attributes().get(attr) else {
|
|
93
93
|
return false;
|
|
94
94
|
};
|
|
@@ -113,7 +113,7 @@ pub(crate) fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&
|
|
|
113
113
|
|
|
114
114
|
/// Check if an attribute contains any of the given keywords (substring match).
|
|
115
115
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
116
|
-
pub(crate) fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
|
|
116
|
+
pub fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
|
|
117
117
|
let Some(attr_value) = tag.attributes().get(attr) else {
|
|
118
118
|
return false;
|
|
119
119
|
};
|
|
@@ -126,11 +126,7 @@ pub(crate) fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[
|
|
|
126
126
|
|
|
127
127
|
/// Check if a node has a semantic content ancestor (main, article, section).
|
|
128
128
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
129
|
-
pub(crate) fn has_semantic_content_ancestor(
|
|
130
|
-
node_handle: &tl::NodeHandle,
|
|
131
|
-
parser: &tl::Parser,
|
|
132
|
-
dom_ctx: &DomContext,
|
|
133
|
-
) -> bool {
|
|
129
|
+
pub fn has_semantic_content_ancestor(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
|
|
134
130
|
let mut current_id = node_handle.get_inner();
|
|
135
131
|
while let Some(parent_id) = dom_ctx.parent_of(current_id) {
|
|
136
132
|
if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {
|
|
@@ -10,7 +10,7 @@ use std::num::NonZeroUsize;
|
|
|
10
10
|
///
|
|
11
11
|
/// Pre-computes parent-child relationships, sibling indices, and caches
|
|
12
12
|
/// tag information for efficient DOM navigation during conversion.
|
|
13
|
-
pub(crate) fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len: usize) -> DomContext {
|
|
13
|
+
pub fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len: usize) -> DomContext {
|
|
14
14
|
let cache_capacity = text_cache_capacity_for_input(input_len);
|
|
15
15
|
let mut ctx = DomContext {
|
|
16
16
|
parent_map: Vec::new(),
|
|
@@ -40,7 +40,7 @@ pub(crate) fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser, input_len:
|
|
|
40
40
|
///
|
|
41
41
|
/// Returns a cache capacity between 32 and TEXT_CACHE_CAPACITY,
|
|
42
42
|
/// scaled proportionally to input size (1KB = 1 slot).
|
|
43
|
-
pub(crate) fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
|
|
43
|
+
pub fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
|
|
44
44
|
const TEXT_CACHE_CAPACITY: usize = 256;
|
|
45
45
|
// `clamp(32, TEXT_CACHE_CAPACITY)` guarantees `target >= 32 > 0`, so `new` always returns Some.
|
|
46
46
|
let target = (input_len / 1024).clamp(32, TEXT_CACHE_CAPACITY);
|
|
@@ -50,7 +50,7 @@ pub(crate) fn text_cache_capacity_for_input(input_len: usize) -> NonZeroUsize {
|
|
|
50
50
|
/// Recursively record node hierarchy into DOM context.
|
|
51
51
|
///
|
|
52
52
|
/// Builds the complete parent-child relationship map for efficient tree traversal.
|
|
53
|
-
pub(crate) fn record_node_hierarchy(
|
|
53
|
+
pub fn record_node_hierarchy(
|
|
54
54
|
node_handle: tl::NodeHandle,
|
|
55
55
|
parent: Option<u32>,
|
|
56
56
|
parser: &tl::Parser,
|
|
@@ -9,14 +9,14 @@ use std::borrow::Cow;
|
|
|
9
9
|
use std::collections::BTreeMap;
|
|
10
10
|
|
|
11
11
|
// Forward declare DomContext from parent module to avoid circular imports
|
|
12
|
-
pub
|
|
12
|
+
pub use crate::converter::DomContext;
|
|
13
13
|
|
|
14
14
|
/// Collect all attributes from an HTML tag as a `BTreeMap<String, String>`.
|
|
15
15
|
///
|
|
16
16
|
/// Boolean attributes (those with `None` as the value) are skipped; only
|
|
17
17
|
/// attributes that carry an explicit value are included.
|
|
18
18
|
#[cfg(feature = "visitor")]
|
|
19
|
-
pub(crate) fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, String> {
|
|
19
|
+
pub fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, String> {
|
|
20
20
|
tag.attributes()
|
|
21
21
|
.iter()
|
|
22
22
|
.filter_map(|(k, v)| v.as_ref().map(|val| (k.to_string(), val.to_string())))
|
|
@@ -28,7 +28,7 @@ pub(crate) fn collect_tag_attributes(tag: &tl::HTMLTag) -> BTreeMap<String, Stri
|
|
|
28
28
|
/// Similar to `text::chomp` but handles line breaks from `<br>` tags specially.
|
|
29
29
|
/// Line breaks are extracted as suffix to be placed outside formatting.
|
|
30
30
|
/// Returns (prefix, suffix, `trimmed_text`).
|
|
31
|
-
pub(crate) fn chomp_inline(text: &str) -> (&str, &str, &str) {
|
|
31
|
+
pub fn chomp_inline(text: &str) -> (&str, &str, &str) {
|
|
32
32
|
if text.is_empty() {
|
|
33
33
|
return ("", "", "");
|
|
34
34
|
}
|
|
@@ -59,13 +59,13 @@ pub(crate) fn chomp_inline(text: &str) -> (&str, &str, &str) {
|
|
|
59
59
|
|
|
60
60
|
/// Get the text content of a node and its children.
|
|
61
61
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
62
|
-
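pub(crate) fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> String {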
|
|
62
|
+
pub fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> String {
|
|
63
63
|
dom_ctx.text_content(*node_handle, parser)
|
|
64
64
|
}
|
|
65
65
|
|
|
66
66
|
/// Collect inline text for link labels, skipping block-level descendants.
|
|
67
67
|
#[allow(clippy::match_wildcard_for_single_variants)]
|
|
68
|
-
pub(crate) fn collect_link_label_text(
|
|
68
|
+
pub fn collect_link_label_text(
|
|
69
69
|
children: &[tl::NodeHandle],
|
|
70
70
|
parser: &tl::Parser,
|
|
71
71
|
dom_ctx: &DomContext,
|
|
@@ -118,7 +118,7 @@ pub(crate) fn collect_link_label_text(
|
|
|
118
118
|
|
|
119
119
|
/// Normalize a link label by collapsing newlines and normalizing whitespace.
|
|
120
120
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
121
|
-
pub(crate) fn normalize_link_label(label: &str) -> String {
|
|
121
|
+
pub fn normalize_link_label(label: &str) -> String {
|
|
122
122
|
let mut needs_collapse = false;
|
|
123
123
|
for ch in label.chars() {
|
|
124
124
|
if ch == '\n' || ch == '\r' {
|
|
@@ -146,7 +146,7 @@ pub(crate) fn normalize_link_label(label: &str) -> String {
|
|
|
146
146
|
}
|
|
147
147
|
|
|
148
148
|
/// Normalize a tag name to lowercase, preserving borrowed input when possible.
|
|
149
|
-
pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
|
|
149
|
+
pub fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
|
|
150
150
|
if raw.as_bytes().iter().any(u8::is_ascii_uppercase) {
|
|
151
151
|
let mut owned = raw.into_owned();
|
|
152
152
|
owned.make_ascii_lowercase();
|
|
@@ -157,7 +157,7 @@ pub(crate) fn normalized_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
|
|
|
157
157
|
}
|
|
158
158
|
|
|
159
159
|
/// Check if an element is block-level (not inline).
|
|
160
|
-
pub(crate) fn is_block_level_element(tag_name: &str) -> bool {
|
|
160
|
+
pub fn is_block_level_element(tag_name: &str) -> bool {
|
|
161
161
|
is_block_level_name(tag_name, crate::converter::main_helpers::is_inline_element(tag_name))
|
|
162
162
|
}
|
|
163
163
|
|
|
@@ -191,7 +191,7 @@ pub fn floor_char_boundary(s: &str, index: usize) -> usize {
|
|
|
191
191
|
/// Input: "[outer [inner]]"
|
|
192
192
|
/// Output: "[outer [inner]]"
|
|
193
193
|
/// ```
|
|
194
|
-
pub(crate) fn escape_link_label(text: &str) -> String {
|
|
194
|
+
pub fn escape_link_label(text: &str) -> String {
|
|
195
195
|
if text.is_empty() {
|
|
196
196
|
return String::new();
|
|
197
197
|
}
|
|
@@ -231,7 +231,7 @@ pub(crate) fn escape_link_label(text: &str) -> String {
|
|
|
231
231
|
}
|
|
232
232
|
|
|
233
233
|
/// Helper for block-level element detection.
|
|
234
|
-
pub(crate) fn is_block_level_name(tag_name: &str, is_inline: bool) -> bool {
|
|
234
|
+
pub fn is_block_level_name(tag_name: &str, is_inline: bool) -> bool {
|
|
235
235
|
!is_inline
|
|
236
236
|
&& matches!(
|
|
237
237
|
tag_name,
|
|
@@ -7,7 +7,7 @@ use std::borrow::Cow;
|
|
|
7
7
|
use std::str;
|
|
8
8
|
|
|
9
9
|
/// Strip script and style tags and their content from HTML.
|
|
10
|
-
pub(crate) fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
|
|
10
|
+
pub fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
|
|
11
11
|
let bytes = input.as_bytes();
|
|
12
12
|
let len = bytes.len();
|
|
13
13
|
|
|
@@ -163,7 +163,7 @@ pub(crate) fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
|
|
|
163
163
|
/// Returns the position AFTER the closing tag (including the '>').
|
|
164
164
|
/// This is highly optimized for performance and uses a fast-path scan.
|
|
165
165
|
#[inline]
|
|
166
|
-
pub
|
|
166
|
+
pub fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) -> Option<usize> {
|
|
167
167
|
let len = bytes.len();
|
|
168
168
|
let tag_len = tag.len();
|
|
169
169
|
|
|
@@ -212,7 +212,7 @@ pub(crate) fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) ->
|
|
|
212
212
|
|
|
213
213
|
/// Compare bytes ignoring ASCII case.
|
|
214
214
|
#[inline]
|
|
215
|
-
pub
|
|
215
|
+
pub fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
|
|
216
216
|
if a.len() != b.len() {
|
|
217
217
|
return false;
|
|
218
218
|
}
|
|
@@ -220,7 +220,7 @@ pub(crate) fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
|
|
|
220
220
|
}
|
|
221
221
|
|
|
222
222
|
/// Preprocess HTML to normalize tags and fix common issues.
|
|
223
|
-
pub
|
|
223
|
+
pub fn preprocess_html(input: &str) -> Cow<'_, str> {
|
|
224
224
|
const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
|
|
225
225
|
const TAGS: [&[u8]; 2] = [b"script", b"style"];
|
|
226
226
|
const SVG: &[u8] = b"svg";
|
|
@@ -289,7 +289,7 @@ pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
|
|
|
289
289
|
if tag == b"script" && is_json_ld_script_open_tag(&input[idx..open_end]) {
|
|
290
290
|
continue;
|
|
291
291
|
}
|
|
292
|
-
let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(
|
|
292
|
+
let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(open_end);
|
|
293
293
|
let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
|
|
294
294
|
out.push_str(&input[last..idx]);
|
|
295
295
|
out.push_str(&input[idx..open_end]);
|
|
@@ -379,7 +379,7 @@ pub(crate) fn preprocess_html(input: &str) -> Cow<'_, str> {
|
|
|
379
379
|
}
|
|
380
380
|
|
|
381
381
|
/// Check if a script tag is a JSON-LD script.
|
|
382
|
-
pub
|
|
382
|
+
pub fn is_json_ld_script_open_tag(tag: &str) -> bool {
|
|
383
383
|
let bytes = tag.as_bytes();
|
|
384
384
|
let mut idx = 0;
|
|
385
385
|
while idx + 4 <= bytes.len() {
|
|
@@ -443,7 +443,7 @@ pub(crate) fn is_json_ld_script_open_tag(tag: &str) -> bool {
|
|
|
443
443
|
|
|
444
444
|
/// Case-insensitive byte comparison for ASCII.
|
|
445
445
|
#[inline]
|
|
446
|
-
pub
|
|
446
|
+
pub fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
|
|
447
447
|
if haystack.len() < needle.len() {
|
|
448
448
|
return false;
|
|
449
449
|
}
|
|
@@ -454,7 +454,7 @@ pub(crate) fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool
|
|
|
454
454
|
}
|
|
455
455
|
|
|
456
456
|
/// Check if bytes match a tag start pattern.
|
|
457
|
-
pub
|
|
457
|
+
pub fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
|
|
458
458
|
if start >= bytes.len() {
|
|
459
459
|
return false;
|
|
460
460
|
}
|
|
@@ -477,7 +477,7 @@ pub(crate) fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> b
|
|
|
477
477
|
}
|
|
478
478
|
|
|
479
479
|
/// Find the end of an HTML tag (the position of '>').
|
|
480
|
-
pub
|
|
480
|
+
pub fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
|
|
481
481
|
let len = bytes.len();
|
|
482
482
|
let mut in_quote: Option<u8> = None;
|
|
483
483
|
|
|
@@ -502,7 +502,7 @@ pub(crate) fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
|
|
|
502
502
|
}
|
|
503
503
|
|
|
504
504
|
/// Find the closing tag for a given tag name.
|
|
505
|
-
pub
|
|
505
|
+
pub fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
|
|
506
506
|
let len = bytes.len();
|
|
507
507
|
let mut depth = 1usize;
|
|
508
508
|
|
|
@@ -533,7 +533,7 @@ pub(crate) fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Opti
|
|
|
533
533
|
}
|
|
534
534
|
|
|
535
535
|
/// Check if bytes match an end tag pattern.
|
|
536
|
-
pub
|
|
536
|
+
pub fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
|
|
537
537
|
if start >= bytes.len() || bytes[start] != b'/' {
|
|
538
538
|
return false;
|
|
539
539
|
}
|
|
@@ -553,7 +553,7 @@ pub(crate) fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> b
|
|
|
553
553
|
///
|
|
554
554
|
/// # Returns
|
|
555
555
|
/// * `Cow<str>` - Either the borrowed original URL or an owned sanitized version
|
|
556
|
-
pub
|
|
556
|
+
pub fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
|
|
557
557
|
// Pattern: ...[text](actual_url) or similar markdown-like syntax
|
|
558
558
|
// This handles malformed HTML where markdown syntax wasn't properly converted
|
|
559
559
|
// and prevents downstream URL parsing errors (e.g., bracketed "IPv6" hosts).
|
|
@@ -585,7 +585,7 @@ pub(crate) fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
|
|
|
585
585
|
/// Scans for opening tags containing the `hidden` attribute, finds their
|
|
586
586
|
/// matching closing tag, and removes the entire element (tag + content).
|
|
587
587
|
/// Self-closing tags with `hidden` are also removed.
|
|
588
|
-
pub
|
|
588
|
+
pub fn strip_hidden_elements(input: &str) -> Cow<'_, str> {
|
|
589
589
|
let bytes = input.as_bytes();
|
|
590
590
|
let len = bytes.len();
|
|
591
591
|
|
|
@@ -8,7 +8,7 @@ use crate::converter::utility::content::normalized_tag_name;
|
|
|
8
8
|
/// Serialize an element to HTML string (for SVG and Math elements).
|
|
9
9
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
10
10
|
#[allow(dead_code)] // used with visitor feature
|
|
11
|
-
pub
|
|
11
|
+
pub fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
|
|
12
12
|
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
|
|
13
13
|
let tag_name = normalized_tag_name(tag.name().as_utf8_str());
|
|
14
14
|
let mut html = String::with_capacity(256);
|
|
@@ -48,7 +48,7 @@ pub(crate) fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parse
|
|
|
48
48
|
/// Serialize a node to HTML string.
|
|
49
49
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
50
50
|
#[allow(dead_code)] // used with visitor feature
|
|
51
|
-
pub
|
|
51
|
+
pub fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
|
|
52
52
|
if let Some(node) = node_handle.get(parser) {
|
|
53
53
|
match node {
|
|
54
54
|
tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
|
|
@@ -61,7 +61,7 @@ pub(crate) fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser)
|
|
|
61
61
|
}
|
|
62
62
|
|
|
63
63
|
/// Serialize a tag to HTML, wrapping serialize_node_to_html.
|
|
64
|
-
pub
|
|
64
|
+
pub fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
|
|
65
65
|
let mut html = String::new();
|
|
66
66
|
serialize_node_to_html(handle, parser, &mut html);
|
|
67
67
|
html
|
|
@@ -70,7 +70,7 @@ pub(crate) fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser
|
|
|
70
70
|
/// Recursively serialize a node to HTML.
|
|
71
71
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
72
72
|
#[allow(dead_code)] // used with visitor feature
|
|
73
|
-
pub
|
|
73
|
+
pub fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
|
|
74
74
|
match handle.get(parser) {
|
|
75
75
|
Some(tl::Node::Tag(tag)) => {
|
|
76
76
|
let tag_name = normalized_tag_name(tag.name().as_utf8_str());
|