html-to-markdown 3.4.0 → 3.6.0.pre.rc.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. checksums.yaml +4 -4
  2. data/LICENSE +21 -0
  3. data/README.md +347 -0
  4. data/Steepfile +10 -2
  5. data/ext/html_to_markdown_rb/Cargo.toml +3 -2
  6. data/ext/html_to_markdown_rb/extconf.rb +5 -5
  7. data/ext/html_to_markdown_rb/native/Cargo.lock +962 -0
  8. data/ext/html_to_markdown_rb/native/Cargo.toml +6 -11
  9. data/ext/html_to_markdown_rb/native/extconf.rb +14 -0
  10. data/ext/html_to_markdown_rb/src/lib.rs +1715 -646
  11. data/lib/html_to_markdown/native.rb +913 -37
  12. data/lib/html_to_markdown/version.rb +3 -3
  13. data/lib/html_to_markdown.rb +9 -4
  14. data/lib/html_to_markdown_rb.so +0 -0
  15. data/sig/types.rbs +59 -292
  16. metadata +32 -179
  17. data/ext/html_to_markdown_rb/Makefile +0 -592
  18. data/lib/bin/html-to-markdown +0 -0
  19. data/vendor/Cargo.toml +0 -33
  20. data/vendor/html-to-markdown-rs/Cargo.toml +0 -54
  21. data/vendor/html-to-markdown-rs/README.md +0 -278
  22. data/vendor/html-to-markdown-rs/examples/basic.rs +0 -24
  23. data/vendor/html-to-markdown-rs/examples/table.rs +0 -25
  24. data/vendor/html-to-markdown-rs/examples/test_deser.rs +0 -12
  25. data/vendor/html-to-markdown-rs/examples/test_escape.rs +0 -58
  26. data/vendor/html-to-markdown-rs/examples/test_inline_formatting.rs +0 -113
  27. data/vendor/html-to-markdown-rs/examples/test_lists.rs +0 -39
  28. data/vendor/html-to-markdown-rs/examples/test_semantic_tags.rs +0 -89
  29. data/vendor/html-to-markdown-rs/examples/test_tables.rs +0 -100
  30. data/vendor/html-to-markdown-rs/examples/test_task_lists.rs +0 -61
  31. data/vendor/html-to-markdown-rs/examples/test_whitespace.rs +0 -34
  32. data/vendor/html-to-markdown-rs/src/convert_api.rs +0 -349
  33. data/vendor/html-to-markdown-rs/src/converter/block/blockquote.rs +0 -178
  34. data/vendor/html-to-markdown-rs/src/converter/block/container.rs +0 -114
  35. data/vendor/html-to-markdown-rs/src/converter/block/div.rs +0 -149
  36. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +0 -428
  37. data/vendor/html-to-markdown-rs/src/converter/block/horizontal_rule.rs +0 -103
  38. data/vendor/html-to-markdown-rs/src/converter/block/line_break.rs +0 -89
  39. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +0 -10
  40. data/vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +0 -140
  41. data/vendor/html-to-markdown-rs/src/converter/block/preformatted.rs +0 -298
  42. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +0 -453
  43. data/vendor/html-to-markdown-rs/src/converter/block/table/caption.rs +0 -44
  44. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +0 -276
  45. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +0 -336
  46. data/vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +0 -58
  47. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +0 -266
  48. data/vendor/html-to-markdown-rs/src/converter/block/table/scanner.rs +0 -146
  49. data/vendor/html-to-markdown-rs/src/converter/block/table/utils.rs +0 -34
  50. data/vendor/html-to-markdown-rs/src/converter/block/unknown.rs +0 -138
  51. data/vendor/html-to-markdown-rs/src/converter/context.rs +0 -208
  52. data/vendor/html-to-markdown-rs/src/converter/dom_context.rs +0 -337
  53. data/vendor/html-to-markdown-rs/src/converter/form/elements.rs +0 -770
  54. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +0 -82
  55. data/vendor/html-to-markdown-rs/src/converter/format/djot.rs +0 -64
  56. data/vendor/html-to-markdown-rs/src/converter/format/markdown.rs +0 -59
  57. data/vendor/html-to-markdown-rs/src/converter/format/mod.rs +0 -43
  58. data/vendor/html-to-markdown-rs/src/converter/handlers/blockquote.rs +0 -173
  59. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +0 -434
  60. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +0 -234
  61. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +0 -282
  62. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +0 -316
  63. data/vendor/html-to-markdown-rs/src/converter/handlers/mod.rs +0 -26
  64. data/vendor/html-to-markdown-rs/src/converter/inline/code.rs +0 -306
  65. data/vendor/html-to-markdown-rs/src/converter/inline/emphasis.rs +0 -345
  66. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +0 -428
  67. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +0 -237
  68. data/vendor/html-to-markdown-rs/src/converter/inline/ruby.rs +0 -337
  69. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +0 -566
  70. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/mod.rs +0 -86
  71. data/vendor/html-to-markdown-rs/src/converter/inline/semantic/typography.rs +0 -558
  72. data/vendor/html-to-markdown-rs/src/converter/list/definition.rs +0 -232
  73. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +0 -332
  74. data/vendor/html-to-markdown-rs/src/converter/list/mod.rs +0 -70
  75. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +0 -201
  76. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +0 -195
  77. data/vendor/html-to-markdown-rs/src/converter/list/utils.rs +0 -314
  78. data/vendor/html-to-markdown-rs/src/converter/main.rs +0 -710
  79. data/vendor/html-to-markdown-rs/src/converter/main_helpers.rs +0 -452
  80. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +0 -393
  81. data/vendor/html-to-markdown-rs/src/converter/media/graphic.rs +0 -4
  82. data/vendor/html-to-markdown-rs/src/converter/media/image.rs +0 -183
  83. data/vendor/html-to-markdown-rs/src/converter/media/mod.rs +0 -87
  84. data/vendor/html-to-markdown-rs/src/converter/media/svg.rs +0 -280
  85. data/vendor/html-to-markdown-rs/src/converter/metadata.rs +0 -220
  86. data/vendor/html-to-markdown-rs/src/converter/mod.rs +0 -156
  87. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +0 -516
  88. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +0 -201
  89. data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +0 -69
  90. data/vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs +0 -269
  91. data/vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs +0 -266
  92. data/vendor/html-to-markdown-rs/src/converter/semantic/figure.rs +0 -391
  93. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +0 -112
  94. data/vendor/html-to-markdown-rs/src/converter/semantic/sectioning.rs +0 -85
  95. data/vendor/html-to-markdown-rs/src/converter/semantic/summary.rs +0 -324
  96. data/vendor/html-to-markdown-rs/src/converter/text/mod.rs +0 -8
  97. data/vendor/html-to-markdown-rs/src/converter/text/processing.rs +0 -56
  98. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +0 -269
  99. data/vendor/html-to-markdown-rs/src/converter/utility/attributes.rs +0 -151
  100. data/vendor/html-to-markdown-rs/src/converter/utility/caching.rs +0 -74
  101. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +0 -271
  102. data/vendor/html-to-markdown-rs/src/converter/utility/mod.rs +0 -17
  103. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +0 -1002
  104. data/vendor/html-to-markdown-rs/src/converter/utility/serialization.rs +0 -126
  105. data/vendor/html-to-markdown-rs/src/converter/utility/siblings.rs +0 -97
  106. data/vendor/html-to-markdown-rs/src/converter/visitor_hooks.rs +0 -189
  107. data/vendor/html-to-markdown-rs/src/error.rs +0 -43
  108. data/vendor/html-to-markdown-rs/src/exports.rs +0 -24
  109. data/vendor/html-to-markdown-rs/src/inline_images.rs +0 -336
  110. data/vendor/html-to-markdown-rs/src/lib.rs +0 -139
  111. data/vendor/html-to-markdown-rs/src/metadata/collector.rs +0 -457
  112. data/vendor/html-to-markdown-rs/src/metadata/config.rs +0 -394
  113. data/vendor/html-to-markdown-rs/src/metadata/extraction.rs +0 -398
  114. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +0 -288
  115. data/vendor/html-to-markdown-rs/src/metadata/types.rs +0 -477
  116. data/vendor/html-to-markdown-rs/src/options/conversion.rs +0 -559
  117. data/vendor/html-to-markdown-rs/src/options/inline_image.rs +0 -111
  118. data/vendor/html-to-markdown-rs/src/options/mod.rs +0 -20
  119. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +0 -201
  120. data/vendor/html-to-markdown-rs/src/options/validation.rs +0 -416
  121. data/vendor/html-to-markdown-rs/src/prelude.rs +0 -1
  122. data/vendor/html-to-markdown-rs/src/rcdom.rs +0 -487
  123. data/vendor/html-to-markdown-rs/src/text.rs +0 -358
  124. data/vendor/html-to-markdown-rs/src/types/document.rs +0 -191
  125. data/vendor/html-to-markdown-rs/src/types/mod.rs +0 -17
  126. data/vendor/html-to-markdown-rs/src/types/result.rs +0 -54
  127. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +0 -791
  128. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +0 -483
  129. data/vendor/html-to-markdown-rs/src/types/tables.rs +0 -52
  130. data/vendor/html-to-markdown-rs/src/types/warnings.rs +0 -33
  131. data/vendor/html-to-markdown-rs/src/validation.rs +0 -158
  132. data/vendor/html-to-markdown-rs/src/visitor/default_impl.rs +0 -63
  133. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +0 -41
  134. data/vendor/html-to-markdown-rs/src/visitor/traits.rs +0 -370
  135. data/vendor/html-to-markdown-rs/src/visitor/types.rs +0 -319
  136. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/callbacks/mod.rs +0 -1
  137. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/content.rs +0 -126
  138. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/mod.rs +0 -27
  139. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +0 -110
  140. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +0 -250
  141. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +0 -597
  142. data/vendor/html-to-markdown-rs/src/wrapper/sync.rs +0 -413
  143. data/vendor/html-to-markdown-rs/src/wrapper/utils.rs +0 -290
  144. data/vendor/html-to-markdown-rs/src/wrapper.rs +0 -9
  145. data/vendor/html-to-markdown-rs/tests/br_in_inline_test.rs +0 -87
  146. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +0 -297
  147. data/vendor/html-to-markdown-rs/tests/djot_output_test.rs +0 -153
  148. data/vendor/html-to-markdown-rs/tests/exclude_selectors_test.rs +0 -132
  149. data/vendor/html-to-markdown-rs/tests/integration_test.rs +0 -631
  150. data/vendor/html-to-markdown-rs/tests/issue_121_regressions.rs +0 -49
  151. data/vendor/html-to-markdown-rs/tests/issue_127_regressions.rs +0 -58
  152. data/vendor/html-to-markdown-rs/tests/issue_128_regressions.rs +0 -17
  153. data/vendor/html-to-markdown-rs/tests/issue_131_regressions.rs +0 -41
  154. data/vendor/html-to-markdown-rs/tests/issue_134_regressions.rs +0 -40
  155. data/vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +0 -26
  156. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +0 -185
  157. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +0 -100
  158. data/vendor/html-to-markdown-rs/tests/issue_145_regressions.rs +0 -133
  159. data/vendor/html-to-markdown-rs/tests/issue_146_regressions.rs +0 -144
  160. data/vendor/html-to-markdown-rs/tests/issue_176_regressions.rs +0 -62
  161. data/vendor/html-to-markdown-rs/tests/issue_190_regressions.rs +0 -128
  162. data/vendor/html-to-markdown-rs/tests/issue_199_regressions.rs +0 -20
  163. data/vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +0 -62
  164. data/vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +0 -68
  165. data/vendor/html-to-markdown-rs/tests/issue_216_217_regressions.rs +0 -87
  166. data/vendor/html-to-markdown-rs/tests/issue_336_regressions.rs +0 -74
  167. data/vendor/html-to-markdown-rs/tests/issue_339_regressions.rs +0 -92
  168. data/vendor/html-to-markdown-rs/tests/issue_347_regressions.rs +0 -154
  169. data/vendor/html-to-markdown-rs/tests/issue_348_visitor_plain.rs +0 -93
  170. data/vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs +0 -44
  171. data/vendor/html-to-markdown-rs/tests/lists_test.rs +0 -199
  172. data/vendor/html-to-markdown-rs/tests/plain_output_test.rs +0 -273
  173. data/vendor/html-to-markdown-rs/tests/preprocessing_tests.rs +0 -61
  174. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +0 -169
  175. data/vendor/html-to-markdown-rs/tests/sectioning_elements_test.rs +0 -137
  176. data/vendor/html-to-markdown-rs/tests/skip_images_test.rs +0 -522
  177. data/vendor/html-to-markdown-rs/tests/tables_test.rs +0 -743
  178. data/vendor/html-to-markdown-rs/tests/test_custom_elements.rs +0 -41
  179. data/vendor/html-to-markdown-rs/tests/test_issue_187.rs +0 -204
  180. data/vendor/html-to-markdown-rs/tests/test_issue_218.rs +0 -68
  181. data/vendor/html-to-markdown-rs/tests/test_issue_277.rs +0 -77
  182. data/vendor/html-to-markdown-rs/tests/test_max_depth.rs +0 -82
  183. data/vendor/html-to-markdown-rs/tests/test_nested_simple.rs +0 -45
  184. data/vendor/html-to-markdown-rs/tests/test_script_style_stripping.rs +0 -396
  185. data/vendor/html-to-markdown-rs/tests/test_spa_bisect.rs +0 -34
  186. data/vendor/html-to-markdown-rs/tests/visitor_code_integration_test.rs +0 -121
  187. data/vendor/html-to-markdown-rs/tests/visitor_integration_test.rs +0 -1190
  188. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +0 -372
@@ -1,1002 +0,0 @@
1
- //! HTML preprocessing and normalization.
2
- //!
3
- //! Functions for preprocessing HTML before conversion, including script/style stripping,
4
- //! tag repair, and malformed HTML handling.
5
-
6
- use std::borrow::Cow;
7
- use std::str;
8
-
9
- /// Strip script and style tags and their content from HTML.
10
- pub fn strip_script_and_style_tags(input: &str) -> Cow<'_, str> {
11
- let bytes = input.as_bytes();
12
- let len = bytes.len();
13
-
14
- if len == 0 {
15
- return Cow::Borrowed(input);
16
- }
17
-
18
- let mut idx = 0;
19
- let mut last = 0;
20
- let mut output: Option<String> = None;
21
- let mut svg_depth = 0usize;
22
-
23
- // Fast-path: check if there are any < characters at all
24
- if !bytes.contains(&b'<') {
25
- return Cow::Borrowed(input);
26
- }
27
-
28
- while idx < len {
29
- if bytes[idx] == b'<' && idx + 1 < len {
30
- if matches_tag_start(bytes, idx + 1, b"svg") {
31
- if let Some(open_end) = find_tag_end(bytes, idx + 1 + b"svg".len()) {
32
- svg_depth += 1;
33
- idx = open_end;
34
- continue;
35
- }
36
- } else if matches_end_tag_start(bytes, idx + 1, b"svg") {
37
- if let Some(close_end) = find_tag_end(bytes, idx + 2 + b"svg".len()) {
38
- if svg_depth > 0 {
39
- svg_depth = svg_depth.saturating_sub(1);
40
- }
41
- idx = close_end;
42
- continue;
43
- }
44
- }
45
-
46
- if svg_depth > 0 {
47
- idx += 1;
48
- continue;
49
- }
50
-
51
- // Check for </script or </style (closing tags first for safety)
52
- if bytes[idx + 1] == b'/' && idx + 2 < len {
53
- // Match </script>
54
- if idx + 9 <= len && eq_ascii_insensitive(&bytes[idx..idx + 9], b"</script>") {
55
- idx += 9;
56
- continue;
57
- }
58
-
59
- // Match </style>
60
- if idx + 8 <= len && eq_ascii_insensitive(&bytes[idx..idx + 8], b"</style>") {
61
- idx += 8;
62
- continue;
63
- }
64
- }
65
-
66
- // Check for <script or <style (opening tags)
67
- // Match <script (case insensitive)
68
- if idx + 7 < len && eq_ascii_insensitive(&bytes[idx..idx + 7], b"<script") {
69
- // Check if this is actually "<script" followed by whitespace, >, or attribute
70
- let after_tag = bytes[idx + 7];
71
- if after_tag == b'>'
72
- || after_tag == b' '
73
- || after_tag == b'\t'
74
- || after_tag == b'\n'
75
- || after_tag == b'\r'
76
- {
77
- // Find the opening tag end
78
- let mut tag_end = idx + 7;
79
- while tag_end < len && bytes[tag_end] != b'>' {
80
- tag_end += 1;
81
- }
82
-
83
- if tag_end < len {
84
- tag_end += 1; // Include the '>'
85
-
86
- // Check if this is a JSON-LD script tag
87
- let tag_content = &input[idx..tag_end];
88
- if !is_json_ld_script_open_tag(tag_content) {
89
- // Find the closing </script> tag
90
- let close_tag = find_closing_tag_bytes(bytes, tag_end, b"script");
91
- if let Some(close_idx) = close_tag {
92
- let out = output.get_or_insert_with(|| String::with_capacity(len));
93
- out.push_str(&input[last..idx]);
94
- if idx > 0
95
- && close_idx < len
96
- && !bytes[idx - 1].is_ascii_whitespace()
97
- && !bytes[close_idx].is_ascii_whitespace()
98
- {
99
- out.push(' ');
100
- }
101
- last = close_idx;
102
- idx = close_idx;
103
- continue;
104
- }
105
- }
106
- }
107
- }
108
- }
109
- // Match <style (case insensitive)
110
- else if idx + 6 < len && eq_ascii_insensitive(&bytes[idx..idx + 6], b"<style") {
111
- // Check if this is actually "<style" followed by whitespace, >, or attribute
112
- let after_tag = bytes[idx + 6];
113
- if after_tag == b'>'
114
- || after_tag == b' '
115
- || after_tag == b'\t'
116
- || after_tag == b'\n'
117
- || after_tag == b'\r'
118
- {
119
- // Find the opening tag end
120
- let mut tag_end = idx + 6;
121
- while tag_end < len && bytes[tag_end] != b'>' {
122
- tag_end += 1;
123
- }
124
-
125
- if tag_end < len {
126
- tag_end += 1; // Include the '>'
127
-
128
- // Find the closing </style> tag
129
- let close_tag = find_closing_tag_bytes(bytes, tag_end, b"style");
130
- if let Some(close_idx) = close_tag {
131
- let out = output.get_or_insert_with(|| String::with_capacity(len));
132
- out.push_str(&input[last..idx]);
133
- if idx > 0
134
- && close_idx < len
135
- && !bytes[idx - 1].is_ascii_whitespace()
136
- && !bytes[close_idx].is_ascii_whitespace()
137
- {
138
- out.push(' ');
139
- }
140
- last = close_idx;
141
- idx = close_idx;
142
- continue;
143
- }
144
- }
145
- }
146
- }
147
- }
148
-
149
- idx += 1;
150
- }
151
-
152
- if let Some(mut out) = output {
153
- if last < len {
154
- out.push_str(&input[last..]);
155
- }
156
- Cow::Owned(out)
157
- } else {
158
- Cow::Borrowed(input)
159
- }
160
- }
161
-
162
- /// Find the position of a closing tag in bytes.
163
- /// Returns the position AFTER the closing tag (including the '>').
164
- /// This is highly optimized for performance and uses a fast-path scan.
165
- #[inline]
166
- pub fn find_closing_tag_bytes(bytes: &[u8], start: usize, tag: &[u8]) -> Option<usize> {
167
- let len = bytes.len();
168
- let tag_len = tag.len();
169
-
170
- // Fast path: look for the closing tag pattern byte-by-byte
171
- // We use a simple byte scan to find '</' then validate the tag name
172
- let mut idx = start;
173
-
174
- // Limit search to prevent stack overflow on large files
175
- // Look for closing tag within reasonable bounds
176
- const MAX_SCAN: usize = 100_000_000; // 100MB limit per tag - prevents pathological cases
177
-
178
- while idx < len && (idx - start) < MAX_SCAN {
179
- // Optimization: skip forward to next '<' quickly using memchr
180
- if bytes[idx] != b'<' {
181
- if let Some(pos) = memchr::memchr(b'<', &bytes[idx..]) {
182
- idx += pos;
183
- } else {
184
- break;
185
- }
186
- }
187
-
188
- // Check for </ pattern
189
- if idx + 2 < len && bytes[idx + 1] == b'/' {
190
- // Check if tag name matches
191
- if idx + 2 + tag_len <= len && eq_ascii_insensitive(&bytes[idx + 2..idx + 2 + tag_len], tag) {
192
- // Ensure it's followed by > or whitespace
193
- let after_tag = idx + 2 + tag_len;
194
- if after_tag < len && (bytes[after_tag] == b'>' || bytes[after_tag].is_ascii_whitespace()) {
195
- // Find the >
196
- let mut close_idx = after_tag;
197
- while close_idx < len && bytes[close_idx] != b'>' {
198
- close_idx += 1;
199
- }
200
- if close_idx < len {
201
- return Some(close_idx + 1); // Include the '>'
202
- }
203
- }
204
- }
205
- }
206
-
207
- idx += 1;
208
- }
209
-
210
- None
211
- }
212
-
213
/// Compare two byte slices for equality, ignoring ASCII case.
///
/// Returns `true` only when both slices have the same length and every pair of bytes is
/// equal after ASCII case folding. Non-ASCII bytes must match exactly.
#[inline]
pub fn eq_ascii_insensitive(a: &[u8], b: &[u8]) -> bool {
    // The standard library already performs the length check and per-byte comparison.
    a.eq_ignore_ascii_case(b)
}
221
-
222
/// Normalize HTML comment endings that carry extra dashes before the `>`.
///
/// A comment closed with more than two dashes (e.g. `<!-- foo --->` or `<!-- foo ---->`) is
/// rewritten so that it ends in exactly `-->`. Well-formed `-->` closings, unclosed comments
/// and everything outside comments are left unchanged.
///
/// # Algorithm
///
/// For each `<!--` the comment body is walked while counting the current run of consecutive
/// dashes:
///
/// - `-` extends the run;
/// - `>` after a run of at least two dashes ends the comment (a run longer than two is
///   replaced by `-->`);
/// - any other byte, or a `>` after a shorter run, resets the run.
///
/// Returns `Cow::Borrowed` when no rewrite was necessary.
pub fn normalize_bogus_comment_endings(input: &str) -> Cow<'_, str> {
    const OPEN: &[u8] = b"<!--";
    const BOGUS_TAIL: &[u8] = b"--->";

    let bytes = input.as_bytes();
    let len = bytes.len();

    // Fast path: a rewrite needs a comment opening *and* a `--->` tail. The dashes of the
    // opening are consumed before counting starts, so the two cannot overlap and the
    // shortest rewritable input is `OPEN` followed by `BOGUS_TAIL`.
    if len < OPEN.len() + BOGUS_TAIL.len()
        || !bytes.windows(OPEN.len()).any(|w| w == OPEN)
        || !bytes.windows(BOGUS_TAIL.len()).any(|w| w == BOGUS_TAIL)
    {
        return Cow::Borrowed(input);
    }

    let mut idx = 0;
    let mut last = 0; // start of the not-yet-copied part of `input`
    let mut output: Option<String> = None;

    while idx + 3 < len {
        if !bytes[idx..].starts_with(OPEN) {
            idx += 1;
            continue;
        }
        idx += OPEN.len();

        // Length of the dash run that ends immediately before `idx`.
        let mut dash_run = 0usize;

        while idx < len {
            match bytes[idx] {
                b'-' => dash_run += 1,
                b'>' if dash_run >= 2 => {
                    if dash_run > 2 {
                        // Copy up to the first dash of the run, then emit a clean closing.
                        let out = output.get_or_insert_with(|| String::with_capacity(len));
                        out.push_str(&input[last..idx - dash_run]);
                        out.push_str("-->");
                        last = idx + 1;
                    }
                    idx += 1; // consume `>`
                    break;
                }
                _ => dash_run = 0,
            }
            idx += 1;
        }
        // Reaching end-of-input here means the comment is unclosed; it is left as-is.
    }

    match output {
        Some(mut out) => {
            if last < len {
                out.push_str(&input[last..]);
            }
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
325
-
326
/// Normalize closing tags whose `>` appears on a subsequent line.
///
/// Some HTML formatters (JSX-style) write closing tags as:
///
/// ```html
/// </a
/// >
/// ```
///
/// This pass collapses such a tag to its single-line form (`</a>`). A closing tag is
/// rewritten only when the gap between the tag name and `>` consists solely of ASCII
/// whitespace and contains a line break (`\n` or `\r`). Everything else in the document is
/// left untouched.
///
/// Returns `Cow::Borrowed` when no rewrite was necessary.
pub fn normalize_split_closing_tags(input: &str) -> Cow<'_, str> {
    let is_line_break = |b: u8| b == b'\n' || b == b'\r';

    let bytes = input.as_bytes();
    let len = bytes.len();

    // Fast path: without any line break there are no candidates. Both `\n` and `\r` are
    // checked because the scan below accepts either one as the line break.
    if len < 4 || !bytes.iter().any(|&b| is_line_break(b)) {
        return Cow::Borrowed(input);
    }

    let mut idx = 0;
    let mut last = 0; // start of the not-yet-copied part of `input`
    let mut output: Option<String> = None;

    while idx + 2 < len {
        if bytes[idx] != b'<' || bytes[idx + 1] != b'/' {
            idx += 1;
            continue;
        }

        // Tag name: ASCII letters, digits and hyphens (custom elements use hyphens).
        let name_start = idx + 2;
        let name_len = bytes[name_start..]
            .iter()
            .take_while(|b| b.is_ascii_alphanumeric() || **b == b'-')
            .count();
        if name_len == 0 {
            idx += 1;
            continue;
        }
        let name_end = name_start + name_len;

        // Whitespace between the name and the (expected) `>`.
        let ws_len = bytes[name_end..].iter().take_while(|b| b.is_ascii_whitespace()).count();
        let gt_pos = name_end + ws_len;
        let has_line_break = bytes[name_end..gt_pos].iter().any(|&b| is_line_break(b));

        if !has_line_break || bytes.get(gt_pos) != Some(&b'>') {
            idx += 1;
            continue;
        }

        // `</name<whitespace-with-line-break>>` becomes `</name>`.
        let out = output.get_or_insert_with(|| String::with_capacity(len));
        out.push_str(&input[last..name_end]); // pending text plus `</name`
        out.push('>');

        idx = gt_pos + 1; // resume just past the original `>`
        last = idx;
    }

    match output {
        Some(mut out) => {
            if last < len {
                out.push_str(&input[last..]);
            }
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
415
-
416
- /// Preprocess HTML to normalize tags and fix common issues.
417
- pub fn preprocess_html(input: &str) -> Cow<'_, str> {
418
- const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
419
- const TAGS: [&[u8]; 2] = [b"script", b"style"];
420
- const SVG: &[u8] = b"svg";
421
- const DOCTYPE: &[u8] = b"doctype";
422
- const EMPTY_COMMENT: &[u8] = b"<!---->";
423
-
424
- let bytes = input.as_bytes();
425
- let len = bytes.len();
426
- if len == 0 {
427
- return Cow::Borrowed(input);
428
- }
429
-
430
- let mut idx = 0;
431
- let mut last = 0;
432
- let mut output: Option<String> = None;
433
- let mut svg_depth = 0usize;
434
-
435
- while idx < len {
436
- if bytes[idx] == b'<' {
437
- if bytes[idx..].starts_with(EMPTY_COMMENT) {
438
- let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
439
- out.push_str(&input[last..idx]);
440
- out.push_str("<!-- -->");
441
- idx += EMPTY_COMMENT.len();
442
- last = idx;
443
- continue;
444
- }
445
-
446
- let mut replaced = false;
447
- for (pattern, replacement) in &SELF_CLOSING {
448
- if bytes[idx..].starts_with(pattern) {
449
- let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
450
- out.push_str(&input[last..idx]);
451
- out.push_str(replacement);
452
- idx += pattern.len();
453
- last = idx;
454
- replaced = true;
455
- break;
456
- }
457
- }
458
- if replaced {
459
- continue;
460
- }
461
-
462
- if matches_tag_start(bytes, idx + 1, SVG) {
463
- if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
464
- svg_depth += 1;
465
- idx = open_end;
466
- continue;
467
- }
468
- } else if matches_end_tag_start(bytes, idx + 1, SVG) {
469
- if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
470
- if svg_depth > 0 {
471
- svg_depth = svg_depth.saturating_sub(1);
472
- }
473
- idx = close_end;
474
- continue;
475
- }
476
- }
477
-
478
- if svg_depth == 0 {
479
- let mut handled = false;
480
- for tag in TAGS {
481
- if matches_tag_start(bytes, idx + 1, tag) {
482
- if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
483
- if tag == b"script" && is_json_ld_script_open_tag(&input[idx..open_end]) {
484
- continue;
485
- }
486
- let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(open_end);
487
- let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
488
- out.push_str(&input[last..idx]);
489
- out.push_str(&input[idx..open_end]);
490
- out.push_str("</");
491
- // `TAGS` contains only ASCII byte literals (`b"script"`, `b"style"`),
492
- // which are always valid UTF-8; `from_utf8` cannot fail here.
493
- if let Ok(tag_str) = str::from_utf8(tag) {
494
- out.push_str(tag_str);
495
- }
496
- out.push('>');
497
-
498
- last = remove_end;
499
- idx = remove_end;
500
- handled = true;
501
- }
502
- }
503
-
504
- if handled {
505
- break;
506
- }
507
- }
508
-
509
- if handled {
510
- continue;
511
- }
512
-
513
- if idx + 2 < len && bytes[idx + 1] == b'!' {
514
- let mut cursor = idx + 2;
515
- while cursor < len && bytes[cursor].is_ascii_whitespace() {
516
- cursor += 1;
517
- }
518
-
519
- if cursor + DOCTYPE.len() <= len
520
- && bytes[cursor..cursor + DOCTYPE.len()].eq_ignore_ascii_case(DOCTYPE)
521
- {
522
- if let Some(end) = find_tag_end(bytes, cursor + DOCTYPE.len()) {
523
- let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
524
- out.push_str(&input[last..idx]);
525
- last = end;
526
- idx = end;
527
- continue;
528
- }
529
- }
530
- }
531
- }
532
-
533
- let is_valid_tag = if idx + 1 < len {
534
- match bytes[idx + 1] {
535
- b'!' => {
536
- idx + 2 < len
537
- && (bytes[idx + 2] == b'-'
538
- || bytes[idx + 2].is_ascii_alphabetic()
539
- || bytes[idx + 2].is_ascii_uppercase())
540
- }
541
- b'/' => {
542
- idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
543
- }
544
- b'?' => true,
545
- c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => true,
546
- _ => false,
547
- }
548
- } else {
549
- false
550
- };
551
-
552
- if !is_valid_tag {
553
- let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
554
- out.push_str(&input[last..idx]);
555
- out.push_str("&lt;");
556
- idx += 1;
557
- last = idx;
558
- continue;
559
- }
560
- }
561
-
562
- idx += 1;
563
- }
564
-
565
- if let Some(mut out) = output {
566
- if last < len {
567
- out.push_str(&input[last..]);
568
- }
569
- Cow::Owned(out)
570
- } else {
571
- Cow::Borrowed(input)
572
- }
573
- }
574
-
575
/// Check whether a `<script ...>` opening tag declares JSON-LD content.
///
/// The tag is scanned for the first `type` attribute, i.e. the ASCII
/// case-insensitive word `type` that is
///
/// * preceded by the start of the string, ASCII whitespace, `<` or `/`, and
/// * followed by optional ASCII whitespace and then `=`.
///
/// The attribute value may be double-quoted, single-quoted or unquoted. Any
/// parameters after the first `;` (for example `; charset=utf-8`) are ignored
/// and the remaining media type is trimmed before being compared.
///
/// # Arguments
/// * `tag` - The text of the opening tag, e.g. `<script type="application/ld+json">`
///
/// # Returns
/// `true` only when the media type is *exactly* `application/ld+json`
/// (ASCII case-insensitive). Values that merely start with that string, such
/// as `application/ld+jsonp`, are rejected.
pub fn is_json_ld_script_open_tag(tag: &str) -> bool {
    const ATTR: &[u8] = b"type";
    const JSON_LD: &str = "application/ld+json";

    let bytes = tag.as_bytes();
    let mut idx = 0;
    while idx + ATTR.len() <= bytes.len() {
        if !bytes[idx..idx + ATTR.len()].eq_ignore_ascii_case(ATTR) {
            idx += 1;
            continue;
        }

        let name_end = idx + ATTR.len();

        // Reject suffix matches such as `data-type` or `mimetype`.
        let before_ok =
            idx == 0 || matches!(bytes[idx - 1], b'<' | b'/') || bytes[idx - 1].is_ascii_whitespace();

        // The attribute name must be followed by optional whitespace and `=`.
        let mut i = name_end;
        while bytes.get(i).is_some_and(u8::is_ascii_whitespace) {
            i += 1;
        }
        if !before_ok || bytes.get(i) != Some(&b'=') {
            // "type" cannot overlap itself, so skipping the whole word is safe.
            idx = name_end;
            continue;
        }

        // Step over `=` and any whitespace in front of the value.
        i += 1;
        while bytes.get(i).is_some_and(u8::is_ascii_whitespace) {
            i += 1;
        }
        let Some(&first) = bytes.get(i) else {
            return false;
        };

        // Every index used for slicing below sits right after an ASCII byte,
        // so it is always a valid UTF-8 boundary of `tag`.
        let value = match first {
            b'"' | b'\'' => {
                let rest = &tag[i + 1..];
                // An unterminated quote extends the value to the end of the tag.
                let end = rest.find(char::from(first)).unwrap_or(rest.len());
                &rest[..end]
            }
            _ => {
                let rest = &tag[i..];
                let end = rest
                    .find(|c: char| c.is_ascii_whitespace() || c == '>')
                    .unwrap_or(rest.len());
                &rest[..end]
            }
        };

        let media_type = value.split(';').next().unwrap_or(value).trim();
        // Exact comparison: a prefix test would also accept unrelated media
        // types that merely begin with `application/ld+json`.
        return media_type.eq_ignore_ascii_case(JSON_LD);
    }
    false
}
637
-
638
/// ASCII case-insensitive *prefix* test on byte slices.
///
/// Returns `true` when `haystack` is at least as long as `needle` and its
/// first `needle.len()` bytes equal `needle`, ignoring ASCII case. Bytes of
/// `haystack` beyond that prefix are not inspected, so an empty `needle`
/// matches every haystack.
#[inline]
pub fn eq_ascii_case_insensitive(haystack: &[u8], needle: &[u8]) -> bool {
    haystack
        .get(..needle.len())
        .is_some_and(|prefix| prefix.eq_ignore_ascii_case(needle))
}
649
-
650
/// Check whether the tag name `tag` begins at `bytes[start]`.
///
/// The name is compared ASCII case-insensitively and must be terminated by
/// `>`, `/`, space, tab, `\n`, `\r`, or the end of `bytes`; this keeps
/// `script` from matching `<scripts>`.
///
/// # Arguments
/// * `bytes` - The document being scanned
/// * `start` - Index of the first byte of the candidate name (just after `<`)
/// * `tag` - The tag name to look for, without angle brackets
///
/// # Returns
/// `false` when `start` is at or past the end of `bytes`, when fewer than
/// `tag.len()` bytes remain, or when the name or its terminator do not match.
pub fn matches_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    let Some(rest) = bytes.get(start..) else {
        return false;
    };
    if rest.is_empty() || rest.len() < tag.len() {
        return false;
    }

    let (candidate, tail) = rest.split_at(tag.len());
    candidate.eq_ignore_ascii_case(tag)
        && matches!(
            tail.first(),
            None | Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r')
        )
}
672
-
673
- /// Find the end of an HTML tag (the position of '>').
674
- pub fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
675
- let len = bytes.len();
676
- let mut in_quote: Option<u8> = None;
677
-
678
- while idx < len {
679
- match bytes[idx] {
680
- b'"' | b'\'' => {
681
- if let Some(current) = in_quote {
682
- if current == bytes[idx] {
683
- in_quote = None;
684
- }
685
- } else {
686
- in_quote = Some(bytes[idx]);
687
- }
688
- }
689
- b'>' if in_quote.is_none() => return Some(idx + 1),
690
- _ => {}
691
- }
692
- idx += 1;
693
- }
694
-
695
- None
696
- }
697
-
698
- /// Find the closing tag for a given tag name.
699
- pub fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
700
- let len = bytes.len();
701
- let mut depth = 1usize;
702
-
703
- while idx < len {
704
- if bytes[idx] == b'<' {
705
- if matches_tag_start(bytes, idx + 1, tag) {
706
- if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
707
- depth += 1;
708
- idx = next;
709
- continue;
710
- }
711
- } else if matches_end_tag_start(bytes, idx + 1, tag) {
712
- if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
713
- depth -= 1;
714
- if depth == 0 {
715
- return Some(close);
716
- }
717
- idx = close;
718
- continue;
719
- }
720
- }
721
- }
722
-
723
- idx += 1;
724
- }
725
-
726
- None
727
- }
728
-
729
- /// Check if bytes match an end tag pattern.
730
- pub fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
731
- if start >= bytes.len() || bytes[start] != b'/' {
732
- return false;
733
- }
734
- matches_tag_start(bytes, start + 1, tag)
735
- }
736
-
737
/// Recover the real URL from a markdown-style link that leaked into an
/// HTML attribute.
///
/// Input such as `//[domain.com/path](http://domain.com/path)` is reduced to
/// the text between `](` and the next `)`. Doing this during preprocessing
/// avoids downstream URL parsing errors (for example the bracketed part being
/// mistaken for an IPv6 host).
///
/// # Arguments
/// * `url` - The attribute value to inspect
///
/// # Returns
/// * `Cow::Owned` holding the extracted URL when `url` contains a `[`
///   somewhere before the first `](` and a non-empty target before the next `)`
/// * `Cow::Borrowed(url)` unchanged in every other case
pub fn sanitize_markdown_url(url: &str) -> Cow<'_, str> {
    // Fast path: without "](" there is no markdown link syntax to undo.
    let Some((before_target, after_marker)) = url.split_once("](") else {
        return Cow::Borrowed(url);
    };

    // The "](..." sequence only counts when a '[' opened the label earlier.
    if !before_target.contains('[') {
        return Cow::Borrowed(url);
    }

    match after_marker.split_once(')') {
        Some((target, _)) if !target.is_empty() => Cow::Owned(target.to_owned()),
        // Missing ')' or an empty "()" target: leave the input alone.
        _ => Cow::Borrowed(url),
    }
}
776
-
777
- /// Strip elements with the `hidden` attribute from HTML.
778
- ///
779
- /// Scans for opening tags containing the `hidden` attribute, finds their
780
- /// matching closing tag, and removes the entire element (tag + content).
781
- /// Self-closing tags with `hidden` are also removed.
782
- pub fn strip_hidden_elements(input: &str) -> Cow<'_, str> {
783
- let bytes = input.as_bytes();
784
- let len = bytes.len();
785
-
786
- if len == 0 || !bytes.contains(&b'<') {
787
- return Cow::Borrowed(input);
788
- }
789
-
790
- let mut idx = 0;
791
- let mut last = 0;
792
- let mut output: Option<String> = None;
793
-
794
- while idx < len {
795
- if bytes[idx] == b'<' && idx + 1 < len && bytes[idx + 1] != b'/' && bytes[idx + 1] != b'!' {
796
- // Find the end of this opening tag
797
- if let Some(tag_end) = find_tag_end(bytes, idx + 1) {
798
- let tag_slice = &input[idx..tag_end];
799
- if tag_has_hidden_attribute(tag_slice) {
800
- // Extract the tag name
801
- let name_start = idx + 1;
802
- let mut name_end = name_start;
803
- while name_end < len
804
- && !bytes[name_end].is_ascii_whitespace()
805
- && bytes[name_end] != b'>'
806
- && bytes[name_end] != b'/'
807
- {
808
- name_end += 1;
809
- }
810
- let tag_name = &bytes[name_start..name_end];
811
-
812
- // Check if it's a self-closing tag (e.g., <br hidden> or <br hidden/>)
813
- let is_self_closing = tag_slice.ends_with("/>")
814
- || tag_name.eq_ignore_ascii_case(b"br")
815
- || tag_name.eq_ignore_ascii_case(b"hr")
816
- || tag_name.eq_ignore_ascii_case(b"img")
817
- || tag_name.eq_ignore_ascii_case(b"input");
818
-
819
- let remove_end = if is_self_closing {
820
- tag_end
821
- } else {
822
- // Find the closing tag
823
- find_closing_tag_bytes(bytes, tag_end, tag_name).unwrap_or(tag_end)
824
- };
825
-
826
- let out = output.get_or_insert_with(|| String::with_capacity(len));
827
- out.push_str(&input[last..idx]);
828
- last = remove_end;
829
- idx = remove_end;
830
- continue;
831
- }
832
- }
833
- }
834
- idx += 1;
835
- }
836
-
837
- if let Some(mut out) = output {
838
- if last < len {
839
- out.push_str(&input[last..]);
840
- }
841
- Cow::Owned(out)
842
- } else {
843
- Cow::Borrowed(input)
844
- }
845
- }
846
-
847
/// Check if an opening tag string contains the `hidden` attribute.
///
/// Handles: `hidden`, `hidden=""`, `hidden="hidden"`, `hidden="true"`
/// (the attribute name is matched ASCII case-insensitively).
/// Does NOT match attributes like `data-hidden` or `aria-hidden`, and does
/// NOT match the word `hidden` inside a quoted attribute value such as
/// `class="a hidden b"` or `title='now hidden here'`.
///
/// # Arguments
/// * `tag` - The full opening tag, from `<` through `>`
///
/// # Returns
/// `true` when an unquoted `hidden` token is preceded by ASCII whitespace and
/// followed by whitespace, `>`, `=`, `/` or the end of the string.
fn tag_has_hidden_attribute(tag: &str) -> bool {
    const NEEDLE: &[u8] = b"hidden";

    let bytes = tag.as_bytes();
    let len = bytes.len();

    // Skip past the tag name: attributes can only start after the first
    // whitespace, and a tag without whitespace has no attributes at all.
    let mut i = bytes
        .iter()
        .position(|b| b.is_ascii_whitespace() || *b == b'>')
        .unwrap_or(len);

    // Quote character of the attribute value currently being skipped, if any.
    let mut open_quote: Option<u8> = None;

    while i < len {
        let byte = bytes[i];
        match open_quote {
            // Inside a quoted value: only the matching quote is significant.
            Some(quote) => {
                if byte == quote {
                    open_quote = None;
                }
            }
            None if byte == b'"' || byte == b'\'' => open_quote = Some(byte),
            None => {
                let is_needle = bytes
                    .get(i..i + NEEDLE.len())
                    .is_some_and(|candidate| candidate.eq_ignore_ascii_case(NEEDLE));
                if is_needle {
                    // Whitespace before the name rules out `data-hidden` and friends.
                    let before_ok = i == 0 || bytes[i - 1].is_ascii_whitespace();
                    // The name must end here: whitespace, '>', '=', '/' or end of input.
                    let after_ok = matches!(
                        bytes.get(i + NEEDLE.len()),
                        None | Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'=' | b'/')
                    );
                    if before_ok && after_ok {
                        return true;
                    }
                }
            }
        }
        i += 1;
    }
    false
}
878
-
879
#[cfg(test)]
mod tests {
    use super::{normalize_bogus_comment_endings, normalize_split_closing_tags, sanitize_markdown_url};

    // ── normalize_bogus_comment_endings ───────────────────────────────────────

    #[test]
    fn normalize_bogus_comment_endings_leaves_well_formed_comment_unchanged() {
        // A correctly terminated comment must come back with identical text.
        let html = "<p>A</p><!-- foo --><p>B</p>";
        assert_eq!(normalize_bogus_comment_endings(html).as_ref(), html);
    }

    #[test]
    fn normalize_bogus_comment_endings_rewrites_triple_dash_close() {
        let normalized = normalize_bogus_comment_endings("<!-- foo --->");
        assert_eq!(normalized.as_ref(), "<!-- foo -->");
    }

    #[test]
    fn normalize_bogus_comment_endings_rewrites_four_dash_close() {
        let normalized = normalize_bogus_comment_endings("<!-- foo ---->");
        assert_eq!(normalized.as_ref(), "<!-- foo -->");
    }

    #[test]
    fn normalize_bogus_comment_endings_preserves_content_after_comment() {
        let normalized = normalize_bogus_comment_endings("<h1>One</h1><!-- /// ---><p>Two</p>");
        assert_eq!(normalized.as_ref(), "<h1>One</h1><!-- /// --><p>Two</p>");
    }

    #[test]
    fn normalize_bogus_comment_endings_handles_multiple_bogus_comments() {
        let normalized =
            normalize_bogus_comment_endings("<p>A</p><!-- x ---><p>B</p><!-- y ----><p>C</p>");
        assert_eq!(normalized.as_ref(), "<p>A</p><!-- x --><p>B</p><!-- y --><p>C</p>");
    }

    #[test]
    fn normalize_bogus_comment_endings_handles_no_comments() {
        let html = "<p>Just a paragraph</p>";
        assert_eq!(normalize_bogus_comment_endings(html).as_ref(), html);
    }

    #[test]
    fn normalize_bogus_comment_endings_empty_input() {
        assert_eq!(normalize_bogus_comment_endings("").as_ref(), "");
    }

    // ── normalize_split_closing_tags ──────────────────────────────────────────

    #[test]
    fn normalize_split_closing_tags_collapses_newline_before_close_bracket() {
        let normalized = normalize_split_closing_tags("<a href=\"#x\">text</a\n>");
        assert_eq!(normalized.as_ref(), "<a href=\"#x\">text</a>");
    }

    #[test]
    fn normalize_split_closing_tags_collapses_indented_newline_before_close_bracket() {
        let normalized = normalize_split_closing_tags("<a href=\"#x\">text</a\n >");
        assert_eq!(normalized.as_ref(), "<a href=\"#x\">text</a>");
    }

    #[test]
    fn normalize_split_closing_tags_leaves_well_formed_closing_tags_unchanged() {
        let html = "<a href=\"#x\">text</a>";
        assert_eq!(normalize_split_closing_tags(html).as_ref(), html);
    }

    #[test]
    fn normalize_split_closing_tags_handles_multiple_split_closing_tags() {
        let normalized =
            normalize_split_closing_tags("<li><a href=\"#a\">A</a\n >\n<a href=\"#b\">B</a\n>");
        assert_eq!(normalized.as_ref(), "<li><a href=\"#a\">A</a>\n<a href=\"#b\">B</a>");
    }

    #[test]
    fn normalize_split_closing_tags_does_not_collapse_inline_whitespace() {
        // Only a newline triggers the normalisation. A plain space before `>`
        // is valid HTML that the parser (tl) already handles, so it must be
        // left untouched to avoid over-normalising.
        let html = "<a href=\"#x\">text</a >";
        assert_eq!(normalize_split_closing_tags(html).as_ref(), html);
    }

    #[test]
    fn normalize_split_closing_tags_empty_input() {
        assert_eq!(normalize_split_closing_tags("").as_ref(), "");
    }

    // ── sanitize_markdown_url ─────────────────────────────────────────────────

    #[test]
    fn sanitize_markdown_url_extracts_scheme_relative_markdown_like_url() {
        let malformed =
            "//[p1.zemanta.com/v2/p/ns/45625/PAGE\\_VIEW/](http://p1.zemanta.com/v2/p/ns/45625/PAGE_VIEW/)";
        assert_eq!(
            sanitize_markdown_url(malformed),
            "http://p1.zemanta.com/v2/p/ns/45625/PAGE_VIEW/"
        );
    }

    #[test]
    fn sanitize_markdown_url_extracts_standard_markdown_like_url() {
        assert_eq!(
            sanitize_markdown_url("[label](https://example.com/path?q=1)"),
            "https://example.com/path?q=1"
        );
    }

    #[test]
    fn sanitize_markdown_url_leaves_normal_urls_unchanged() {
        let plain = "https://example.com/normal";
        assert_eq!(sanitize_markdown_url(plain), plain);
    }
}