html-to-markdown 3.4.0 → 3.6.0.pre.rc.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +21 -0
- data/README.md +347 -0
- data/Steepfile +10 -2
- data/ext/html_to_markdown_rb/Cargo.toml +3 -2
- data/ext/html_to_markdown_rb/extconf.rb +5 -5
- data/ext/html_to_markdown_rb/native/Cargo.lock +962 -0
- data/ext/html_to_markdown_rb/native/Cargo.toml +6 -11
- data/ext/html_to_markdown_rb/native/extconf.rb +14 -0
- data/ext/html_to_markdown_rb/src/lib.rs +1715 -646
- data/lib/html_to_markdown/native.rb +913 -37
- data/lib/html_to_markdown/version.rb +3 -3
- data/lib/html_to_markdown.rb +9 -4
- data/lib/html_to_markdown_rb.so +0 -0
- data/sig/types.rbs +59 -292
- metadata +32 -179
- data/ext/html_to_markdown_rb/Makefile +0 -592
- data/lib/bin/html-to-markdown +0 -0
- data/vendor/Cargo.toml +0 -33
- data/vendor/html-to-markdown-rs/Cargo.toml +0 -54
- data/vendor/html-to-markdown-rs/README.md +0 -278
- data/vendor/html-to-markdown-rs/examples/basic.rs +0 -24
- data/vendor/html-to-markdown-rs/examples/table.rs +0 -25
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +0 -12
- data/vendor/html-to-markdown-rs/examples/test_escape.rs +0 -58
- data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +0 -113
- data/vendor/html-to-markdown-rs/examples/test_lists.rs +0 -39
- data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +0 -89
- data/vendor/html-to-markdown-rs/examples/test_tables.rs +0 -100
- data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +0 -61
- data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +0 -34
- data/vendor/html-to-markdown-rs/src/convert_api.rs +0 -349
- data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +0 -178
- data/vendor/html-to-markdown-rs/src/converter/block/container.rs +0 -114
- data/vendor/html-to-markdown-rs/src/converter/block/div.rs +0 -149
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +0 -428
- data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +0 -103
- data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +0 -89
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -10
- data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +0 -140
- data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +0 -298
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +0 -453
- data/vendor/html-to-markdown-rs/src/converter/block/table/caption.rs +0 -44
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +0 -276
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +0 -336
- data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +0 -58
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +0 -266
- data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +0 -146
- data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +0 -34
- data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +0 -138
- data/vendor/html-to-markdown-rs/src/converter/context.rs +0 -208
- data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +0 -337
- data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +0 -770
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +0 -82
- data/vendor/html-to-markdown-rs/src/converter/format/djot.rs +0 -64
- data/vendor/html-to-markdown-rs/src/converter/format/markdown.rs +0 -59
- data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -43
- data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +0 -173
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +0 -434
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +0 -234
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +0 -282
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +0 -316
- data/vendor/html-to-markdown-rs/src/converter/handlers/mod.rs +0 -26
- data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +0 -306
- data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +0 -345
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +0 -428
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -237
- data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +0 -337
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +0 -566
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +0 -86
- data/vendor/html-to-markdown-rs/src/converter/inline/semantic/typography.rs +0 -558
- data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +0 -232
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +0 -332
- data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -70
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +0 -201
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +0 -195
- data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +0 -314
- data/vendor/html-to-markdown-rs/src/converter/main.rs +0 -710
- data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +0 -452
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +0 -393
- data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +0 -4
- data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -183
- data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +0 -87
- data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +0 -280
- data/vendor/html-to-markdown-rs/src/converter/metadata.rs +0 -220
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -156
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +0 -516
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +0 -201
- data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +0 -69
- data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +0 -269
- data/vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs +0 -266
- data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +0 -391
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +0 -112
- data/vendor/html-to-markdown-rs/src/converter/semantic/sectioning.rs +0 -85
- data/vendor/html-to-markdown-rs/src/converter/semantic/summary.rs +0 -324
- data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -8
- data/vendor/html-to-markdown-rs/src/converter/text/processing.rs +0 -56
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +0 -269
- data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -151
- data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +0 -74
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +0 -271
- data/vendor/html-to-markdown-rs/src/converter/utility/mod.rs +0 -17
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +0 -1002
- data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +0 -126
- data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +0 -97
- data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +0 -189
- data/vendor/html-to-markdown-rs/src/error.rs +0 -43
- data/vendor/html-to-markdown-rs/src/exports.rs +0 -24
- data/vendor/html-to-markdown-rs/src/inline_images.rs +0 -336
- data/vendor/html-to-markdown-rs/src/lib.rs +0 -139
- data/vendor/html-to-markdown-rs/src/metadata/collector.rs +0 -457
- data/vendor/html-to-markdown-rs/src/metadata/config.rs +0 -394
- data/vendor/html-to-markdown-rs/src/metadata/extraction.rs +0 -398
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +0 -288
- data/vendor/html-to-markdown-rs/src/metadata/types.rs +0 -477
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +0 -559
- data/vendor/html-to-markdown-rs/src/options/inline_image.rs +0 -111
- data/vendor/html-to-markdown-rs/src/options/mod.rs +0 -20
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +0 -201
- data/vendor/html-to-markdown-rs/src/options/validation.rs +0 -416
- data/vendor/html-to-markdown-rs/src/prelude.rs +0 -1
- data/vendor/html-to-markdown-rs/src/rcdom.rs +0 -487
- data/vendor/html-to-markdown-rs/src/text.rs +0 -358
- data/vendor/html-to-markdown-rs/src/types/document.rs +0 -191
- data/vendor/html-to-markdown-rs/src/types/mod.rs +0 -17
- data/vendor/html-to-markdown-rs/src/types/result.rs +0 -54
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +0 -791
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +0 -483
- data/vendor/html-to-markdown-rs/src/types/tables.rs +0 -52
- data/vendor/html-to-markdown-rs/src/types/warnings.rs +0 -33
- data/vendor/html-to-markdown-rs/src/validation.rs +0 -158
- data/vendor/html-to-markdown-rs/src/visitor/default_impl.rs +0 -63
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -41
- data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -370
- data/vendor/html-to-markdown-rs/src/visitor/types.rs +0 -319
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +0 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/content.rs +0 -126
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -27
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +0 -110
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +0 -250
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +0 -597
- data/vendor/html-to-markdown-rs/src/wrapper/sync.rs +0 -413
- data/vendor/html-to-markdown-rs/src/wrapper/utils.rs +0 -290
- data/vendor/html-to-markdown-rs/src/wrapper.rs +0 -9
- data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +0 -87
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +0 -297
- data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +0 -153
- data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +0 -132
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +0 -631
- data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +0 -49
- data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +0 -58
- data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +0 -17
- data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +0 -41
- data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +0 -40
- data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +0 -26
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +0 -185
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +0 -100
- data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +0 -133
- data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +0 -144
- data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +0 -62
- data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +0 -128
- data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +0 -20
- data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +0 -62
- data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +0 -68
- data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +0 -87
- data/vendor/html-to-markdown-rs/tests/issue_336_regressions.rs +0 -74
- data/vendor/html-to-markdown-rs/tests/issue_339_regressions.rs +0 -92
- data/vendor/html-to-markdown-rs/tests/issue_347_regressions.rs +0 -154
- data/vendor/html-to-markdown-rs/tests/issue_348_visitor_plain.rs +0 -93
- data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +0 -44
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +0 -199
- data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +0 -273
- data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +0 -61
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +0 -169
- data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +0 -137
- data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +0 -522
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +0 -743
- data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +0 -41
- data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +0 -204
- data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +0 -68
- data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +0 -77
- data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +0 -82
- data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +0 -45
- data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +0 -396
- data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +0 -34
- data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +0 -121
- data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +0 -1190
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +0 -372
|
@@ -1,1002 +0,0 @@
|
|
|
1
|
-
//! HTML preprocessing and normalization.
|
|
2
|
-
//!
|
|
3
|
-
//! Functions for preprocessing HTML before conversion, including script/style stripping,
|
|
4
|
-
//! tag repair, and malformed HTML handling.
|
|
5
|
-
|
|
6
|
-
use std::borrow::Cow;
|
|
7
|
-
use std::str;
|
|
8
|
-
|
|
9
|
-
/// Strip script and style tags and their content from HTML.
|
|
10
|
-
pub fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
|
|
11
|
-
let bytes = input.as_bytes();
|
|
12
|
-
let len = bytes.len();
|
|
13
|
-
|
|
14
|
-
if len == 0 {
|
|
15
|
-
return Cow::Borrowed(input);
|
|
16
|
-
}
|
|
17
|
-
|
|
18
|
-
let mut idx = 0;
|
|
19
|
-
let mut last = 0;
|
|
20
|
-
let mut output: Option<String> = None;
|
|
21
|
-
let mut svg_depth = 0usize;
|
|
22
|
-
|
|
23
|
-
// Fast-path: check if there are any < characters at all
|
|
24
|
-
if !bytes.contains(&b'<') {
|
|
25
|
-
return Cow::Borrowed(input);
|
|
26
|
-
}
|
|
27
|
-
|
|
28
|
-
while idx < len {
|
|
29
|
-
if bytes[idx] == b'<' && idx + 1 < len {
|
|
30
|
-
if matches_tag_start(bytes, idx + 1, b"svg") {
|
|
31
|
-
if let Some(open_end) = find_tag_end(bytes, idx + 1 + b"svg".len()) {
|
|
32
|
-
svg_depth += 1;
|
|
33
|
-
idx = open_end;
|
|
34
|
-
continue;
|
|
35
|
-
}
|
|
36
|
-
} else if matches_end_tag_start(bytes, idx + 1, b"svg") {
|
|
37
|
-
if let Some(close_end) = find_tag_end(bytes, idx + 2 + b"svg".len()) {
|
|
38
|
-
if svg_depth > 0 {
|
|
39
|
-
svg_depth = svg_depth.saturating_sub(1);
|
|
40
|
-
}
|
|
41
|
-
idx = close_end;
|
|
42
|
-
continue;
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
if svg_depth > 0 {
|
|
47
|
-
idx += 1;
|
|
48
|
-
continue;
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
// Check for </script or </style (closing tags first for safety)
|
|
52
|
-
if bytes[idx + 1] == b'/' && idx + 2 < len {
|
|
53
|
-
// Match </script>
|
|
54
|
-
if idx + 9 <= len && eq_ascii_insensitive(&bytes[idx..idx + 9], b"</script>") {
|
|
55
|
-
idx += 9;
|
|
56
|
-
continue;
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
// Match </style>
|
|
60
|
-
if idx + 8 <= len && eq_ascii_insensitive(&bytes[idx..idx + 8], b"</style>") {
|
|
61
|
-
idx += 8;
|
|
62
|
-
continue;
|
|
63
|
-
}
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
// Check for <script or <style (opening tags)
|
|
67
|
-
// Match <script (case insensitive)
|
|
68
|
-
if idx + 7 < len && eq_ascii_insensitive(&bytes[idx..idx + 7], b"<script") {
|
|
69
|
-
// Check if this is actually "<script" followed by whitespace, >, or attribute
|
|
70
|
-
let after_tag = bytes[idx + 7];
|
|
71
|
-
if after_tag == b'>'
|
|
72
|
-
|| after_tag == b' '
|
|
73
|
-
|| after_tag == b'\t'
|
|
74
|
-
|| after_tag == b'\n'
|
|
75
|
-
|| after_tag == b'\r'
|
|
76
|
-
{
|
|
77
|
-
// Find the opening tag end
|
|
78
|
-
let mut tag_end = idx + 7;
|
|
79
|
-
while tag_end < len && bytes[tag_end] != b'>' {
|
|
80
|
-
tag_end += 1;
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
if tag_end < len {
|
|
84
|
-
tag_end += 1; // Include the '>'
|
|
85
|
-
|
|
86
|
-
// Check if this is a JSON-LD script tag
|
|
87
|
-
let tag_content = &input[idx..tag_end];
|
|
88
|
-
if !is_json_ld_script_open_tag(tag_content) {
|
|
89
|
-
// Find the closing </script> tag
|
|
90
|
-
let close_tag = find_closing_tag_bytes(bytes, tag_end, b"script");
|
|
91
|
-
if let Some(close_idx) = close_tag {
|
|
92
|
-
let out = output.get_or_insert_with(|| String::with_capacity(len));
|
|
93
|
-
out.push_str(&input[last..idx]);
|
|
94
|
-
if idx > 0
|
|
95
|
-
&& close_idx < len
|
|
96
|
-
&& !bytes[idx - 1].is_ascii_whitespace()
|
|
97
|
-
&& !bytes[close_idx].is_ascii_whitespace()
|
|
98
|
-
{
|
|
99
|
-
out.push(' ');
|
|
100
|
-
}
|
|
101
|
-
last = close_idx;
|
|
102
|
-
idx = close_idx;
|
|
103
|
-
continue;
|
|
104
|
-
}
|
|
105
|
-
}
|
|
106
|
-
}
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
// Match <style (case insensitive)
|
|
110
|
-
else if idx + 6 < len && eq_ascii_insensitive(&bytes[idx..idx + 6], b"<style") {
|
|
111
|
-
// Check if this is actually "<style" followed by whitespace, >, or attribute
|
|
112
|
-
let after_tag = bytes[idx + 6];
|
|
113
|
-
if after_tag == b'>'
|
|
114
|
-
|| after_tag == b' '
|
|
115
|
-
|| after_tag == b'\t'
|
|
116
|
-
|| after_tag == b'\n'
|
|
117
|
-
|| after_tag == b'\r'
|
|
118
|
-
{
|
|
119
|
-
// Find the opening tag end
|
|
120
|
-
let mut tag_end = idx + 6;
|
|
121
|
-
while tag_end < len && bytes[tag_end] != b'>' {
|
|
122
|
-
tag_end += 1;
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
if tag_end < len {
|
|
126
|
-
tag_end += 1; // Include the '>'
|
|
127
|
-
|
|
128
|
-
// Find the closing </style> tag
|
|
129
|
-
let close_tag = find_closing_tag_bytes(bytes, tag_end, b"style");
|
|
130
|
-
if let Some(close_idx) = close_tag {
|
|
131
|
-
let out = output.get_or_insert_with(|| String::with_capacity(len));
|
|
132
|
-
out.push_str(&input[last..idx]);
|
|
133
|
-
if idx > 0
|
|
134
|
-
&& close_idx < len
|
|
135
|
-
&& !bytes[idx - 1].is_ascii_whitespace()
|
|
136
|
-
&& !bytes[close_idx].is_ascii_whitespace()
|
|
137
|
-
{
|
|
138
|
-
out.push(' ');
|
|
139
|
-
}
|
|
140
|
-
last = close_idx;
|
|
141
|
-
idx = close_idx;
|
|
142
|
-
continue;
|
|
143
|
-
}
|
|
144
|
-
}
|
|
145
|
-
}
|
|
146
|
-
}
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
idx += 1;
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
if let Some(mut out) = output {
|
|
153
|
-
if last < len {
|
|
154
|
-
out.push_str(&input[last..]);
|
|
155
|
-
}
|
|
156
|
-
Cow::Owned(out)
|
|
157
|
-
} else {
|
|
158
|
-
Cow::Borrowed(input)
|
|
159
|
-
}
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
/// Find the position of a closing tag in bytes.
|
|
163
|
-
/// Returns the position AFTER the closing tag (including the '>').
|
|
164
|
-
/// This is highly optimized for performance and uses a fast-path scan.
|
|
165
|
-
#[inline]
|
|
166
|
-
pub fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) -> Option<usize> {
|
|
167
|
-
let len = bytes.len();
|
|
168
|
-
let tag_len = tag.len();
|
|
169
|
-
|
|
170
|
-
// Fast path: look for the closing tag pattern byte-by-byte
|
|
171
|
-
// We use a simple byte scan to find '</' then validate the tag name
|
|
172
|
-
let mut idx = start;
|
|
173
|
-
|
|
174
|
-
// Limit search to prevent stack overflow on large files
|
|
175
|
-
// Look for closing tag within reasonable bounds
|
|
176
|
-
const MAX_SCAN: usize = 100_000_000; // 100MB limit per tag - prevents pathological cases
|
|
177
|
-
|
|
178
|
-
while idx < len && (idx - start) < MAX_SCAN {
|
|
179
|
-
// Optimization: skip forward to next '<' quickly using memchr
|
|
180
|
-
if bytes[idx] != b'<' {
|
|
181
|
-
if let Some(pos) = memchr::memchr(b'<', &bytes[idx..]) {
|
|
182
|
-
idx += pos;
|
|
183
|
-
} else {
|
|
184
|
-
break;
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
// Check for </ pattern
|
|
189
|
-
if idx + 2 < len && bytes[idx + 1] == b'/' {
|
|
190
|
-
// Check if tag name matches
|
|
191
|
-
if idx + 2 + tag_len <= len && eq_ascii_insensitive(&bytes[idx + 2..idx + 2 + tag_len], tag) {
|
|
192
|
-
// Ensure it's followed by > or whitespace
|
|
193
|
-
let after_tag = idx + 2 + tag_len;
|
|
194
|
-
if after_tag < len && (bytes[after_tag] == b'>' || bytes[after_tag].is_ascii_whitespace()) {
|
|
195
|
-
// Find the >
|
|
196
|
-
let mut close_idx = after_tag;
|
|
197
|
-
while close_idx < len && bytes[close_idx] != b'>' {
|
|
198
|
-
close_idx += 1;
|
|
199
|
-
}
|
|
200
|
-
if close_idx < len {
|
|
201
|
-
return Some(close_idx + 1); // Include the '>'
|
|
202
|
-
}
|
|
203
|
-
}
|
|
204
|
-
}
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
idx += 1;
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
None
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
/// Compare two byte slices for equality, ignoring ASCII case.
///
/// Returns `true` only when both slices have the same length and every pair of
/// bytes matches after ASCII case folding. Non-ASCII bytes must match exactly.
#[inline]
pub fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
    // The standard library slice method already performs the length check and
    // the per-byte case-insensitive comparison.
    a.eq_ignore_ascii_case(b)
}
|
|
221
|
-
|
|
222
|
-
/// Rewrite HTML comment terminators that carry surplus dashes.
///
/// A comment closed by three or more dashes followed by `>` (for example
/// `<!-- foo --->` or `<!-- foo ---->`) is rewritten so that it ends with a
/// plain `-->`. According to the original notes for this pass, the `tl`
/// (`astral-tl`) parser otherwise produces an empty comment node and drops
/// everything after it.
///
/// Comments that already end in exactly `-->` are left alone, as is any text
/// outside a comment. An unterminated comment is left as-is. When nothing is
/// rewritten the input is returned borrowed.
///
/// # Algorithm
///
/// For every `<!--`, the body is walked while counting the current run of
/// consecutive dashes:
///
/// - `-` extends the run;
/// - `>` after a run of two or more dashes terminates the comment;
/// - any other byte (including `>` after fewer than two dashes) resets the run.
///
/// If the terminating run is longer than two, the whole run plus `>` is
/// replaced by `-->`.
pub fn normalize_bogus_comment_endings(input: &str) -> Cow<'_, str> {
    const OPEN: &[u8] = b"<!--";
    const CLOSE: &str = "-->";

    let raw = input.as_bytes();
    let total = raw.len();

    // Fast path: inputs shorter than seven bytes or without any `<!--` are
    // returned untouched.
    let may_contain_comment = total >= 7 && raw.windows(OPEN.len()).any(|window| window == OPEN);
    if !may_contain_comment {
        return Cow::Borrowed(input);
    }

    let mut rewritten: Option<String> = None;
    let mut copied_to = 0usize; // input before this offset is already in `rewritten`
    let mut pos = 0usize;

    while pos + OPEN.len() <= total {
        if !raw[pos..].starts_with(OPEN) {
            pos += 1;
            continue;
        }
        pos += OPEN.len();

        // Length of the dash run that ends immediately before `pos`.
        let mut dash_run = 0usize;

        while let Some(&byte) = raw.get(pos) {
            pos += 1; // every branch consumes exactly this one byte
            match byte {
                b'-' => dash_run += 1,
                b'>' if dash_run >= 2 => {
                    if dash_run > 2 {
                        // `pos - 1` is the `>`; the dash run sits directly before it.
                        let run_start = pos - 1 - dash_run;
                        let out = rewritten.get_or_insert_with(|| String::with_capacity(total));
                        out.push_str(&input[copied_to..run_start]);
                        out.push_str(CLOSE);
                        copied_to = pos;
                    }
                    break;
                }
                _ => dash_run = 0,
            }
        }
        // Falling out of the inner loop without a `break` means the comment
        // was never closed; the remainder is left unchanged.
    }

    match rewritten {
        Some(mut out) => {
            if copied_to < total {
                out.push_str(&input[copied_to..]);
            }
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
|
|
325
|
-
|
|
326
|
-
/// Normalize closing tags whose `>` appears on a subsequent line.
///
/// Some HTML formatters (JSX-style) write closing tags as:
///
/// ```html
/// </a
/// >
/// ```
///
/// According to the original notes for this pass, the `tl` parser does not
/// handle end-tags with a line break before the closing `>`, which leaves the
/// element unclosed. This pass collapses such patterns to a single-line closing
/// tag (`</a>`) before the document reaches the parser.
///
/// A closing tag is rewritten only when all of the following hold:
///
/// - `</` is followed by a non-empty name made of ASCII letters, digits or `-`;
/// - the name is followed by ASCII whitespace containing at least one `\n` or
///   `\r`;
/// - that whitespace is immediately followed by `>`.
///
/// Everything else, including closing tags padded with spaces or tabs only, is
/// left untouched. When nothing is rewritten the input is returned borrowed.
pub fn normalize_split_closing_tags(input: &str) -> Cow<'_, str> {
    let bytes = input.as_bytes();
    let len = bytes.len();

    // Fast path: a candidate needs a line break. Both `\n` and `\r` count,
    // matching the line-break test used in the scan below, so CR-only input is
    // not skipped.
    if len < 4 || !bytes.iter().any(|&b| b == b'\n' || b == b'\r') {
        return Cow::Borrowed(input);
    }

    let mut idx = 0;
    let mut last = 0; // input before this offset is already in `output`
    let mut output: Option<String> = None;

    while idx + 2 < len {
        if !bytes[idx..].starts_with(b"</") {
            idx += 1;
            continue;
        }

        // Tag name: ASCII letters, digits and hyphens (custom elements use hyphens).
        let name_start = idx + 2;
        let name_len = bytes[name_start..]
            .iter()
            .take_while(|b| b.is_ascii_alphanumeric() || **b == b'-')
            .count();
        if name_len == 0 {
            // No tag name, so this is not a closing tag we care about.
            idx += 1;
            continue;
        }
        let name_end = name_start + name_len;

        // Whitespace between the name and whatever follows it.
        let ws_len = bytes[name_end..]
            .iter()
            .take_while(|b| b.is_ascii_whitespace())
            .count();
        let ws_end = name_end + ws_len;
        let has_line_break = bytes[name_end..ws_end]
            .iter()
            .any(|&b| b == b'\n' || b == b'\r');

        if !has_line_break || bytes.get(ws_end) != Some(&b'>') {
            // Either no line break in the gap, or the gap is not followed by `>`.
            idx += 1;
            continue;
        }

        // We have `</name [whitespace-with-line-break]>`; emit `</name>`.
        let out = output.get_or_insert_with(|| String::with_capacity(len));
        out.push_str(&input[last..idx]);
        out.push_str("</");
        out.push_str(&input[name_start..name_end]);
        out.push('>');

        idx = ws_end + 1; // continue after the `>`
        last = idx;
    }

    match output {
        Some(mut out) => {
            if last < len {
                out.push_str(&input[last..]);
            }
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
|
|
415
|
-
|
|
416
|
-
/// Preprocess HTML to normalize tags and fix common issues.
|
|
417
|
-
pub fn preprocess_html(input: &str) -> Cow<'_, str> {
|
|
418
|
-
const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
|
|
419
|
-
const TAGS: [&[u8]; 2] = [b"script", b"style"];
|
|
420
|
-
const SVG: &[u8] = b"svg";
|
|
421
|
-
const DOCTYPE: &[u8] = b"doctype";
|
|
422
|
-
const EMPTY_COMMENT: &[u8] = b"<!---->";
|
|
423
|
-
|
|
424
|
-
let bytes = input.as_bytes();
|
|
425
|
-
let len = bytes.len();
|
|
426
|
-
if len == 0 {
|
|
427
|
-
return Cow::Borrowed(input);
|
|
428
|
-
}
|
|
429
|
-
|
|
430
|
-
let mut idx = 0;
|
|
431
|
-
let mut last = 0;
|
|
432
|
-
let mut output: Option<String> = None;
|
|
433
|
-
let mut svg_depth = 0usize;
|
|
434
|
-
|
|
435
|
-
while idx < len {
|
|
436
|
-
if bytes[idx] == b'<' {
|
|
437
|
-
if bytes[idx..].starts_with(EMPTY_COMMENT) {
|
|
438
|
-
let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
|
|
439
|
-
out.push_str(&input[last..idx]);
|
|
440
|
-
out.push_str("<!-- -->");
|
|
441
|
-
idx += EMPTY_COMMENT.len();
|
|
442
|
-
last = idx;
|
|
443
|
-
continue;
|
|
444
|
-
}
|
|
445
|
-
|
|
446
|
-
let mut replaced = false;
|
|
447
|
-
for (pattern, replacement) in &SELF_CLOSING {
|
|
448
|
-
if bytes[idx..].starts_with(pattern) {
|
|
449
|
-
let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
|
|
450
|
-
out.push_str(&input[last..idx]);
|
|
451
|
-
out.push_str(replacement);
|
|
452
|
-
idx += pattern.len();
|
|
453
|
-
last = idx;
|
|
454
|
-
replaced = true;
|
|
455
|
-
break;
|
|
456
|
-
}
|
|
457
|
-
}
|
|
458
|
-
if replaced {
|
|
459
|
-
continue;
|
|
460
|
-
}
|
|
461
|
-
|
|
462
|
-
if matches_tag_start(bytes, idx + 1, SVG) {
|
|
463
|
-
if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
|
|
464
|
-
svg_depth += 1;
|
|
465
|
-
idx = open_end;
|
|
466
|
-
continue;
|
|
467
|
-
}
|
|
468
|
-
} else if matches_end_tag_start(bytes, idx + 1, SVG) {
|
|
469
|
-
if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
|
|
470
|
-
if svg_depth > 0 {
|
|
471
|
-
svg_depth = svg_depth.saturating_sub(1);
|
|
472
|
-
}
|
|
473
|
-
idx = close_end;
|
|
474
|
-
continue;
|
|
475
|
-
}
|
|
476
|
-
}
|
|
477
|
-
|
|
478
|
-
if svg_depth == 0 {
|
|
479
|
-
let mut handled = false;
|
|
480
|
-
for tag in TAGS {
|
|
481
|
-
if matches_tag_start(bytes, idx + 1, tag) {
|
|
482
|
-
if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
|
|
483
|
-
if tag == b"script" && is_json_ld_script_open_tag(&input[idx..open_end]) {
|
|
484
|
-
continue;
|
|
485
|
-
}
|
|
486
|
-
let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(open_end);
|
|
487
|
-
let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
|
|
488
|
-
out.push_str(&input[last..idx]);
|
|
489
|
-
out.push_str(&input[idx..open_end]);
|
|
490
|
-
out.push_str("</");
|
|
491
|
-
// `TAGS` contains only ASCII byte literals (`b"script"`, `b"style"`),
|
|
492
|
-
// which are always valid UTF-8; `from_utf8` cannot fail here.
|
|
493
|
-
if let Ok(tag_str) = str::from_utf8(tag) {
|
|
494
|
-
out.push_str(tag_str);
|
|
495
|
-
}
|
|
496
|
-
out.push('>');
|
|
497
|
-
|
|
498
|
-
last = remove_end;
|
|
499
|
-
idx = remove_end;
|
|
500
|
-
handled = true;
|
|
501
|
-
}
|
|
502
|
-
}
|
|
503
|
-
|
|
504
|
-
if handled {
|
|
505
|
-
break;
|
|
506
|
-
}
|
|
507
|
-
}
|
|
508
|
-
|
|
509
|
-
if handled {
|
|
510
|
-
continue;
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
if idx + 2 < len && bytes[idx + 1] == b'!' {
|
|
514
|
-
let mut cursor = idx + 2;
|
|
515
|
-
while cursor < len && bytes[cursor].is_ascii_whitespace() {
|
|
516
|
-
cursor += 1;
|
|
517
|
-
}
|
|
518
|
-
|
|
519
|
-
if cursor + DOCTYPE.len() <= len
|
|
520
|
-
&& bytes[cursor..cursor + DOCTYPE.len()].eq_ignore_ascii_case(DOCTYPE)
|
|
521
|
-
{
|
|
522
|
-
if let Some(end) = find_tag_end(bytes, cursor + DOCTYPE.len()) {
|
|
523
|
-
let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
|
|
524
|
-
out.push_str(&input[last..idx]);
|
|
525
|
-
last = end;
|
|
526
|
-
idx = end;
|
|
527
|
-
continue;
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
}
|
|
531
|
-
}
|
|
532
|
-
|
|
533
|
-
let is_valid_tag = if idx + 1 < len {
|
|
534
|
-
match bytes[idx + 1] {
|
|
535
|
-
b'!' => {
|
|
536
|
-
idx + 2 < len
|
|
537
|
-
&& (bytes[idx + 2] == b'-'
|
|
538
|
-
|| bytes[idx + 2].is_ascii_alphabetic()
|
|
539
|
-
|| bytes[idx + 2].is_ascii_uppercase())
|
|
540
|
-
}
|
|
541
|
-
b'/' => {
|
|
542
|
-
idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
|
|
543
|
-
}
|
|
544
|
-
b'?' => true,
|
|
545
|
-
c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => true,
|
|
546
|
-
_ => false,
|
|
547
|
-
}
|
|
548
|
-
} else {
|
|
549
|
-
false
|
|
550
|
-
};
|
|
551
|
-
|
|
552
|
-
if !is_valid_tag {
|
|
553
|
-
let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
|
|
554
|
-
out.push_str(&input[last..idx]);
|
|
555
|
-
out.push_str("<");
|
|
556
|
-
idx += 1;
|
|
557
|
-
last = idx;
|
|
558
|
-
continue;
|
|
559
|
-
}
|
|
560
|
-
}
|
|
561
|
-
|
|
562
|
-
idx += 1;
|
|
563
|
-
}
|
|
564
|
-
|
|
565
|
-
if let Some(mut out) = output {
|
|
566
|
-
if last < len {
|
|
567
|
-
out.push_str(&input[last..]);
|
|
568
|
-
}
|
|
569
|
-
Cow::Owned(out)
|
|
570
|
-
} else {
|
|
571
|
-
Cow::Borrowed(input)
|
|
572
|
-
}
|
|
573
|
-
}
|
|
574
|
-
|
|
575
|
-
/// Check whether a `<script ...>` opening tag declares JSON-LD content.
///
/// The tag text is scanned for the first `type` attribute that is properly
/// delimited: preceded by the start of the string, ASCII whitespace, `<` or
/// `/`, and followed (after optional whitespace) by `=`. The attribute value
/// may be double-quoted, single-quoted or unquoted. Everything after the first
/// `;` (media type parameters such as `charset`) is ignored and the remainder
/// is trimmed.
///
/// # Arguments
/// * `tag` - Text of the opening tag, e.g. `<script type="application/ld+json">`
///
/// # Returns
/// * `true` only when the media type is exactly `application/ld+json`
///   (compared ASCII case-insensitively). Values that merely start with that
///   string, such as `application/ld+jsonp`, are rejected.
/// * `false` when no `type` attribute is found, or when the first one found
///   carries any other value.
pub fn is_json_ld_script_open_tag(tag: &str) -> bool {
    const ATTR: &[u8] = b"type";
    const JSON_LD: &str = "application/ld+json";

    let bytes = tag.as_bytes();
    let mut idx = 0;
    while idx + ATTR.len() <= bytes.len() {
        if !bytes[idx..idx + ATTR.len()].eq_ignore_ascii_case(ATTR) {
            idx += 1;
            continue;
        }

        // The name must start at an attribute boundary so that e.g.
        // `data-type` or `mimetype` are not mistaken for `type`.
        let before_ok = idx == 0
            || bytes[idx - 1].is_ascii_whitespace()
            || matches!(bytes[idx - 1], b'<' | b'/');

        // Optional whitespace, then a mandatory '='.
        let mut i = idx + ATTR.len();
        while bytes.get(i).is_some_and(u8::is_ascii_whitespace) {
            i += 1;
        }
        if !before_ok || bytes.get(i) != Some(&b'=') {
            // "type" cannot start again inside itself, so skipping the whole
            // word never jumps over another candidate.
            idx += ATTR.len();
            continue;
        }

        // Optional whitespace between '=' and the value.
        i += 1;
        while bytes.get(i).is_some_and(u8::is_ascii_whitespace) {
            i += 1;
        }
        let Some(&first) = bytes.get(i) else {
            return false;
        };

        // All cut points below sit on ASCII bytes (or the end of the string),
        // so slicing `tag` cannot split a multi-byte character.
        let value = match first {
            quote @ (b'"' | b'\'') => {
                let start = i + 1;
                let end = bytes[start..]
                    .iter()
                    .position(|&b| b == quote)
                    .map_or(bytes.len(), |offset| start + offset);
                &tag[start..end]
            }
            _ => {
                let end = bytes[i..]
                    .iter()
                    .position(|b| b.is_ascii_whitespace() || *b == b'>')
                    .map_or(bytes.len(), |offset| i + offset);
                &tag[i..end]
            }
        };

        let media_type = value.split(';').next().unwrap_or(value).trim();
        // Exact comparison: a prefix match would wrongly accept
        // `application/ld+jsonp` and similar values.
        return media_type.eq_ignore_ascii_case(JSON_LD);
    }
    false
}
|
|
637
|
-
|
|
638
|
-
/// ASCII case-insensitive prefix comparison.
///
/// Returns `true` when the first `needle.len()` bytes of `haystack` equal
/// `needle`, ignoring ASCII case. Bytes of `haystack` beyond that prefix are
/// not inspected, so a longer `haystack` still matches. Returns `false` when
/// `haystack` is shorter than `needle`.
#[inline]
pub fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
    haystack
        .get(..needle.len())
        .is_some_and(|prefix| prefix.eq_ignore_ascii_case(needle))
}
|
|
649
|
-
|
|
650
|
-
/// Check whether the tag name `tag` begins at `bytes[start]`.
///
/// The name is compared ASCII case-insensitively and must be followed by a
/// name terminator: `>`, `/`, space, tab, line feed, carriage return, or the
/// end of `bytes`. This keeps `script` from matching `<scripts>`.
///
/// Returns `false` when `start` is out of bounds or when fewer than
/// `tag.len()` bytes remain.
pub fn matches_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    if start >= bytes.len() {
        return false;
    }

    let name_end = start + tag.len();
    let Some(candidate) = bytes.get(start..name_end) else {
        return false;
    };

    candidate.eq_ignore_ascii_case(tag)
        && matches!(
            bytes.get(name_end),
            None | Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r')
        )
}
|
|
672
|
-
|
|
673
|
-
/// Find the end of an HTML tag, starting the search at `idx`.
///
/// Walks forward from `idx` looking for the first `>` that is not inside a
/// quoted section. A `"` or `'` opens a quoted section that lasts until the
/// same quote character appears again.
///
/// # Returns
/// * `Some(n)` where `n` is the index immediately *after* the closing `>`
/// * `None` if no unquoted `>` exists at or after `idx`
pub fn find_tag_end(bytes: &[u8], idx: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;

    for (pos, &byte) in bytes.iter().enumerate().skip(idx) {
        match (byte, open_quote) {
            (b'>', None) => return Some(pos + 1),
            (b'"' | b'\'', None) => open_quote = Some(byte),
            (b'"' | b'\'', Some(quote)) if quote == byte => open_quote = None,
            _ => {}
        }
    }

    None
}
|
|
697
|
-
|
|
698
|
-
/// Find the closing tag for a given tag name.
|
|
699
|
-
pub fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
|
|
700
|
-
let len = bytes.len();
|
|
701
|
-
let mut depth = 1usize;
|
|
702
|
-
|
|
703
|
-
while idx < len {
|
|
704
|
-
if bytes[idx] == b'<' {
|
|
705
|
-
if matches_tag_start(bytes, idx + 1, tag) {
|
|
706
|
-
if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
|
|
707
|
-
depth += 1;
|
|
708
|
-
idx = next;
|
|
709
|
-
continue;
|
|
710
|
-
}
|
|
711
|
-
} else if matches_end_tag_start(bytes, idx + 1, tag) {
|
|
712
|
-
if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
|
|
713
|
-
depth -= 1;
|
|
714
|
-
if depth == 0 {
|
|
715
|
-
return Some(close);
|
|
716
|
-
}
|
|
717
|
-
idx = close;
|
|
718
|
-
continue;
|
|
719
|
-
}
|
|
720
|
-
}
|
|
721
|
-
}
|
|
722
|
-
|
|
723
|
-
idx += 1;
|
|
724
|
-
}
|
|
725
|
-
|
|
726
|
-
None
|
|
727
|
-
}
|
|
728
|
-
|
|
729
|
-
/// Check if bytes match an end tag pattern.
|
|
730
|
-
pub fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
|
|
731
|
-
if start >= bytes.len() || bytes[start] != b'/' {
|
|
732
|
-
return false;
|
|
733
|
-
}
|
|
734
|
-
matches_tag_start(bytes, start + 1, tag)
|
|
735
|
-
}
|
|
736
|
-
|
|
737
|
-
/// Recover the real URL from an attribute value that contains markdown link
/// syntax.
///
/// Input such as `//[domain.com/path](http://domain.com/path)` is reduced to
/// the text between the parentheses (`http://domain.com/path`). Extracting the
/// target up front avoids downstream URL parsing errors, for example the
/// bracketed text being read as an IPv6 host.
///
/// # Arguments
/// * `url` - The URL string to sanitize
///
/// # Returns
/// * `Cow::Owned` holding the text between `](` and the next `)` when the
///   input contains `[` somewhere before the first `](` and that text is
///   non-empty
/// * `Cow::Borrowed(url)` unchanged in every other case
pub fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
    let extracted = url.split_once("](").and_then(|(label, rest)| {
        // A "](" without an opening '[' in front of it is not a markdown link.
        if !label.contains('[') {
            return None;
        }
        let target = &rest[..rest.find(')')?];
        (!target.is_empty()).then(|| target.to_string())
    });

    match extracted {
        Some(target) => Cow::Owned(target),
        None => Cow::Borrowed(url),
    }
}
|
|
776
|
-
|
|
777
|
-
/// Strip elements with the `hidden` attribute from HTML.
|
|
778
|
-
///
|
|
779
|
-
/// Scans for opening tags containing the `hidden` attribute, finds their
|
|
780
|
-
/// matching closing tag, and removes the entire element (tag + content).
|
|
781
|
-
/// Self-closing tags with `hidden` are also removed.
|
|
782
|
-
pub fn strip_hidden_elements(input: &str) -> Cow<'_, str> {
|
|
783
|
-
let bytes = input.as_bytes();
|
|
784
|
-
let len = bytes.len();
|
|
785
|
-
|
|
786
|
-
if len == 0 || !bytes.contains(&b'<') {
|
|
787
|
-
return Cow::Borrowed(input);
|
|
788
|
-
}
|
|
789
|
-
|
|
790
|
-
let mut idx = 0;
|
|
791
|
-
let mut last = 0;
|
|
792
|
-
let mut output: Option<String> = None;
|
|
793
|
-
|
|
794
|
-
while idx < len {
|
|
795
|
-
if bytes[idx] == b'<' && idx + 1 < len && bytes[idx + 1] != b'/' && bytes[idx + 1] != b'!' {
|
|
796
|
-
// Find the end of this opening tag
|
|
797
|
-
if let Some(tag_end) = find_tag_end(bytes, idx + 1) {
|
|
798
|
-
let tag_slice = &input[idx..tag_end];
|
|
799
|
-
if tag_has_hidden_attribute(tag_slice) {
|
|
800
|
-
// Extract the tag name
|
|
801
|
-
let name_start = idx + 1;
|
|
802
|
-
let mut name_end = name_start;
|
|
803
|
-
while name_end < len
|
|
804
|
-
&& !bytes[name_end].is_ascii_whitespace()
|
|
805
|
-
&& bytes[name_end] != b'>'
|
|
806
|
-
&& bytes[name_end] != b'/'
|
|
807
|
-
{
|
|
808
|
-
name_end += 1;
|
|
809
|
-
}
|
|
810
|
-
let tag_name = &bytes[name_start..name_end];
|
|
811
|
-
|
|
812
|
-
// Check if it's a self-closing tag (e.g., <br hidden> or <br hidden/>)
|
|
813
|
-
let is_self_closing = tag_slice.ends_with("/>")
|
|
814
|
-
|| tag_name.eq_ignore_ascii_case(b"br")
|
|
815
|
-
|| tag_name.eq_ignore_ascii_case(b"hr")
|
|
816
|
-
|| tag_name.eq_ignore_ascii_case(b"img")
|
|
817
|
-
|| tag_name.eq_ignore_ascii_case(b"input");
|
|
818
|
-
|
|
819
|
-
let remove_end = if is_self_closing {
|
|
820
|
-
tag_end
|
|
821
|
-
} else {
|
|
822
|
-
// Find the closing tag
|
|
823
|
-
find_closing_tag_bytes(bytes, tag_end, tag_name).unwrap_or(tag_end)
|
|
824
|
-
};
|
|
825
|
-
|
|
826
|
-
let out = output.get_or_insert_with(|| String::with_capacity(len));
|
|
827
|
-
out.push_str(&input[last..idx]);
|
|
828
|
-
last = remove_end;
|
|
829
|
-
idx = remove_end;
|
|
830
|
-
continue;
|
|
831
|
-
}
|
|
832
|
-
}
|
|
833
|
-
}
|
|
834
|
-
idx += 1;
|
|
835
|
-
}
|
|
836
|
-
|
|
837
|
-
if let Some(mut out) = output {
|
|
838
|
-
if last < len {
|
|
839
|
-
out.push_str(&input[last..]);
|
|
840
|
-
}
|
|
841
|
-
Cow::Owned(out)
|
|
842
|
-
} else {
|
|
843
|
-
Cow::Borrowed(input)
|
|
844
|
-
}
|
|
845
|
-
}
|
|
846
|
-
|
|
847
|
-
/// Check if an opening tag string contains the `hidden` attribute.
///
/// Matches `hidden`, `hidden=""`, `hidden="hidden"`, `hidden="true"` (ASCII
/// case-insensitively). The name must be preceded by ASCII whitespace and
/// followed by whitespace, `>`, `=`, `/` or the end of the string, so
/// attributes such as `data-hidden` or `aria-hidden` do not match.
///
/// Quoted attribute values are skipped entirely. Without that, the word
/// `hidden` inside a value (for example `class="visually hidden block"` or
/// `alt='a hidden gem'`) would be mistaken for the attribute. Quotes are
/// interpreted the same way as in `find_tag_end`: a `"` or `'` opens a quoted
/// section that lasts until the same character appears again.
///
/// # Arguments
/// * `tag` - Text of an opening tag, from `<` through `>`
fn tag_has_hidden_attribute(tag: &str) -> bool {
    const NEEDLE: &[u8] = b"hidden";

    let bytes = tag.as_bytes();
    let len = bytes.len();

    // Skip past the tag name; attributes can only start after it.
    let mut i = 0;
    while i < len && !bytes[i].is_ascii_whitespace() && bytes[i] != b'>' {
        i += 1;
    }

    let mut in_quote: Option<u8> = None;
    while i < len {
        let byte = bytes[i];

        // Inside a quoted value: only look for the matching closing quote.
        if let Some(quote) = in_quote {
            if byte == quote {
                in_quote = None;
            }
            i += 1;
            continue;
        }
        if byte == b'"' || byte == b'\'' {
            in_quote = Some(byte);
            i += 1;
            continue;
        }

        if bytes
            .get(i..i + NEEDLE.len())
            .is_some_and(|candidate| candidate.eq_ignore_ascii_case(NEEDLE))
        {
            // Attribute boundary before the name.
            let before_ok = i == 0 || bytes[i - 1].is_ascii_whitespace();
            // Whitespace, '>', '=', '/' or end of input after the name.
            let after_ok = matches!(
                bytes.get(i + NEEDLE.len()),
                None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'=' | b'/')
            );
            if before_ok && after_ok {
                return true;
            }
        }
        i += 1;
    }
    false
}
|
|
878
|
-
|
|
879
|
-
#[cfg(test)]
mod tests {
    use super::{normalize_bogus_comment_endings, normalize_split_closing_tags, sanitize_markdown_url};

    // ── normalize_bogus_comment_endings ───────────────────────────────────────

    /// A correctly terminated comment must come back with identical text.
    #[test]
    fn normalize_bogus_comment_endings_leaves_well_formed_comment_unchanged() {
        let html = "<p>A</p><!-- foo --><p>B</p>";
        assert_eq!(normalize_bogus_comment_endings(html).as_ref(), html);
    }

    /// `--->` is rewritten to the canonical `-->`.
    #[test]
    fn normalize_bogus_comment_endings_rewrites_triple_dash_close() {
        let normalized = normalize_bogus_comment_endings("<!-- foo --->");
        assert_eq!(normalized.as_ref(), "<!-- foo -->");
    }

    /// `---->` is rewritten to the canonical `-->`.
    #[test]
    fn normalize_bogus_comment_endings_rewrites_four_dash_close() {
        let normalized = normalize_bogus_comment_endings("<!-- foo ---->");
        assert_eq!(normalized.as_ref(), "<!-- foo -->");
    }

    /// Markup following a repaired comment is carried over untouched.
    #[test]
    fn normalize_bogus_comment_endings_preserves_content_after_comment() {
        let normalized = normalize_bogus_comment_endings("<h1>One</h1><!-- /// ---><p>Two</p>");
        assert_eq!(normalized.as_ref(), "<h1>One</h1><!-- /// --><p>Two</p>");
    }

    /// Every malformed comment in the document is repaired, not just the first.
    #[test]
    fn normalize_bogus_comment_endings_handles_multiple_bogus_comments() {
        let normalized =
            normalize_bogus_comment_endings("<p>A</p><!-- x ---><p>B</p><!-- y ----><p>C</p>");
        assert_eq!(normalized.as_ref(), "<p>A</p><!-- x --><p>B</p><!-- y --><p>C</p>");
    }

    /// Input without any comment keeps its text.
    #[test]
    fn normalize_bogus_comment_endings_handles_no_comments() {
        let html = "<p>Just a paragraph</p>";
        assert_eq!(normalize_bogus_comment_endings(html).as_ref(), html);
    }

    /// The empty string maps to the empty string.
    #[test]
    fn normalize_bogus_comment_endings_empty_input() {
        assert_eq!(normalize_bogus_comment_endings("").as_ref(), "");
    }

    // ── normalize_split_closing_tags ──────────────────────────────────────────

    /// A newline between the tag name and `>` of a closing tag is removed.
    #[test]
    fn normalize_split_closing_tags_collapses_newline_before_close_bracket() {
        let normalized = normalize_split_closing_tags("<a href=\"#x\">text</a\n>");
        assert_eq!(normalized.as_ref(), "<a href=\"#x\">text</a>");
    }

    /// Indentation after the newline is removed together with it.
    #[test]
    fn normalize_split_closing_tags_collapses_indented_newline_before_close_bracket() {
        let normalized = normalize_split_closing_tags("<a href=\"#x\">text</a\n  >");
        assert_eq!(normalized.as_ref(), "<a href=\"#x\">text</a>");
    }

    /// Closing tags that are already on one line keep their text.
    #[test]
    fn normalize_split_closing_tags_leaves_well_formed_closing_tags_unchanged() {
        let html = "<a href=\"#x\">text</a>";
        assert_eq!(normalize_split_closing_tags(html).as_ref(), html);
    }

    /// Several split closing tags in one document are all repaired.
    #[test]
    fn normalize_split_closing_tags_handles_multiple_split_closing_tags() {
        let normalized =
            normalize_split_closing_tags("<li><a href=\"#a\">A</a\n  >\n<a href=\"#b\">B</a\n>");
        assert_eq!(normalized.as_ref(), "<li><a href=\"#a\">A</a>\n<a href=\"#b\">B</a>");
    }

    /// Only newlines trigger the normalisation; a plain space before `>` is
    /// valid HTML that the parser already handles, so it must be left alone to
    /// avoid over-normalising.
    #[test]
    fn normalize_split_closing_tags_does_not_collapse_inline_whitespace() {
        let html = "<a href=\"#x\">text</a >";
        assert_eq!(normalize_split_closing_tags(html).as_ref(), html);
    }

    /// The empty string maps to the empty string.
    #[test]
    fn normalize_split_closing_tags_empty_input() {
        assert_eq!(normalize_split_closing_tags("").as_ref(), "");
    }

    // ── sanitize_markdown_url ─────────────────────────────────────────────────

    /// A scheme-relative prefix followed by markdown link syntax yields the
    /// URL inside the parentheses.
    #[test]
    fn sanitize_markdown_url_extracts_scheme_relative_markdown_like_url() {
        let raw = "//[p1.zemanta.com/v2/p/ns/45625/PAGE\\_VIEW/](http://p1.zemanta.com/v2/p/ns/45625/PAGE_VIEW/)";
        assert_eq!(
            sanitize_markdown_url(raw),
            "http://p1.zemanta.com/v2/p/ns/45625/PAGE_VIEW/"
        );
    }

    /// A plain `[label](url)` yields the URL.
    #[test]
    fn sanitize_markdown_url_extracts_standard_markdown_like_url() {
        assert_eq!(
            sanitize_markdown_url("[label](https://example.com/path?q=1)"),
            "https://example.com/path?q=1"
        );
    }

    /// URLs without markdown syntax are returned as they are.
    #[test]
    fn sanitize_markdown_url_leaves_normal_urls_unchanged() {
        let raw = "https://example.com/normal";
        assert_eq!(sanitize_markdown_url(raw), raw);
    }
}
|