html-to-markdown 3.4.0.pre.rc.45 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/html_to_markdown_rb/Cargo.toml +1 -1
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/native.rb +3 -1
- data/lib/html_to_markdown/version.rb +2 -2
- data/vendor/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +47 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +87 -12
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +87 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +4 -4
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +19 -1
- data/vendor/html-to-markdown-rs/src/converter/main.rs +9 -1
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +158 -40
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +64 -31
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +137 -1
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +13 -5
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +4 -2
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +53 -0
- data/vendor/html-to-markdown-rs/tests/issue_336_regressions.rs +74 -0
- data/vendor/html-to-markdown-rs/tests/issue_347_regressions.rs +154 -0
- data/vendor/html-to-markdown-rs/tests/issue_348_visitor_plain.rs +93 -0
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +39 -23
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +74 -47
- metadata +4 -1
|
@@ -11,6 +11,13 @@ use crate::converter::preprocessing_helpers::should_drop_for_preprocessing;
|
|
|
11
11
|
use crate::options::ConversionOptions;
|
|
12
12
|
use crate::text;
|
|
13
13
|
|
|
14
|
+
#[cfg(feature = "visitor")]
|
|
15
|
+
use crate::converter::utility::content::{collect_tag_attributes, is_block_level_element};
|
|
16
|
+
#[cfg(feature = "visitor")]
|
|
17
|
+
use crate::visitor::{NodeContext, NodeType, VisitResult, VisitorHandle};
|
|
18
|
+
#[cfg(feature = "visitor")]
|
|
19
|
+
use std::collections::BTreeMap;
|
|
20
|
+
|
|
14
21
|
/// Tracks list context for proper marker emission on `<li>` elements.
|
|
15
22
|
#[derive(Clone, Debug)]
|
|
16
23
|
enum ListContext {
|
|
@@ -53,6 +60,30 @@ const BLOCK_TAGS: &[&str] = &[
|
|
|
53
60
|
"search",
|
|
54
61
|
];
|
|
55
62
|
|
|
63
|
+
/// Shared walker state threaded through all recursive calls.
|
|
64
|
+
///
|
|
65
|
+
/// Holds the options, the excluded node ids, the visitor (feature-gated), and the current DOM depth.
|
|
66
|
+
/// Using a struct avoids feature-gated function parameters at call sites.
|
|
67
|
+
struct WalkState<'a> {
|
|
68
|
+
options: &'a ConversionOptions,
|
|
69
|
+
excluded_node_ids: &'a HashSet<u32>,
|
|
70
|
+
depth: usize,
|
|
71
|
+
#[cfg(feature = "visitor")]
|
|
72
|
+
visitor: Option<&'a VisitorHandle>,
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
impl WalkState<'_> {
|
|
76
|
+
fn descend(&self) -> Self {
|
|
77
|
+
WalkState {
|
|
78
|
+
options: self.options,
|
|
79
|
+
excluded_node_ids: self.excluded_node_ids,
|
|
80
|
+
depth: self.depth + 1,
|
|
81
|
+
#[cfg(feature = "visitor")]
|
|
82
|
+
visitor: self.visitor,
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
56
87
|
/// Extract plain text from a parsed DOM tree.
|
|
57
88
|
///
|
|
58
89
|
/// Walks the tree collecting visible text with structural whitespace:
|
|
@@ -64,6 +95,8 @@ const BLOCK_TAGS: &[&str] = &[
|
|
|
64
95
|
/// - Tables: cells separated by tab, rows by newline
|
|
65
96
|
/// - Inline elements are recursed without markers
|
|
66
97
|
/// - Nodes matching `excluded_node_ids` (from `exclude_selectors`) are dropped entirely
|
|
98
|
+
/// - When a visitor is configured, `visit_element_start`, `visit_element_end`, and
|
|
99
|
+
/// `visit_text` callbacks are fired and their results are honoured.
|
|
67
100
|
pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
|
|
68
101
|
let mut buf = String::with_capacity(1024);
|
|
69
102
|
let mut list_ctx = ListContext::None;
|
|
@@ -83,16 +116,16 @@ pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &Convers
|
|
|
83
116
|
ids
|
|
84
117
|
};
|
|
85
118
|
|
|
119
|
+
let state = WalkState {
|
|
120
|
+
options,
|
|
121
|
+
excluded_node_ids: &excluded_node_ids,
|
|
122
|
+
depth: 0,
|
|
123
|
+
#[cfg(feature = "visitor")]
|
|
124
|
+
visitor: options.visitor.as_ref(),
|
|
125
|
+
};
|
|
126
|
+
|
|
86
127
|
for child_handle in dom.children() {
|
|
87
|
-
walk_plain(
|
|
88
|
-
child_handle,
|
|
89
|
-
parser,
|
|
90
|
-
&mut buf,
|
|
91
|
-
options,
|
|
92
|
-
false,
|
|
93
|
-
&mut list_ctx,
|
|
94
|
-
&excluded_node_ids,
|
|
95
|
-
);
|
|
128
|
+
walk_plain(child_handle, parser, &mut buf, false, &mut list_ctx, &state);
|
|
96
129
|
}
|
|
97
130
|
|
|
98
131
|
post_process(&mut buf);
|
|
@@ -104,10 +137,9 @@ fn walk_plain(
|
|
|
104
137
|
node_handle: &tl::NodeHandle,
|
|
105
138
|
parser: &tl::Parser,
|
|
106
139
|
buf: &mut String,
|
|
107
|
-
options: &ConversionOptions,
|
|
108
140
|
in_pre: bool,
|
|
109
141
|
list_ctx: &mut ListContext,
|
|
110
|
-
|
|
142
|
+
state: &WalkState<'_>,
|
|
111
143
|
) {
|
|
112
144
|
let Some(node) = node_handle.get(parser) else {
|
|
113
145
|
return;
|
|
@@ -117,6 +149,30 @@ fn walk_plain(
|
|
|
117
149
|
tl::Node::Raw(bytes) => {
|
|
118
150
|
let raw = bytes.as_utf8_str();
|
|
119
151
|
let decoded = text::decode_html_entities_cow(raw.as_ref());
|
|
152
|
+
|
|
153
|
+
#[cfg(feature = "visitor")]
|
|
154
|
+
if let Some(visitor_handle) = state.visitor {
|
|
155
|
+
let text_str: &str = &decoded;
|
|
156
|
+
let node_ctx = NodeContext {
|
|
157
|
+
node_type: NodeType::Text,
|
|
158
|
+
tag_name: String::new(),
|
|
159
|
+
attributes: BTreeMap::new(),
|
|
160
|
+
depth: state.depth,
|
|
161
|
+
index_in_parent: 0,
|
|
162
|
+
parent_tag: None,
|
|
163
|
+
is_inline: true,
|
|
164
|
+
};
|
|
165
|
+
let result = visitor_handle.borrow_mut().visit_text(&node_ctx, text_str);
|
|
166
|
+
match result {
|
|
167
|
+
VisitResult::Skip => return,
|
|
168
|
+
VisitResult::Custom(custom) => {
|
|
169
|
+
buf.push_str(&custom);
|
|
170
|
+
return;
|
|
171
|
+
}
|
|
172
|
+
_ => {}
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
|
|
120
176
|
if in_pre {
|
|
121
177
|
buf.push_str(&decoded);
|
|
122
178
|
} else {
|
|
@@ -132,7 +188,7 @@ fn walk_plain(
|
|
|
132
188
|
}
|
|
133
189
|
tl::Node::Tag(tag) => {
|
|
134
190
|
// Drop elements matching exclude_selectors, including all their descendants.
|
|
135
|
-
if !excluded_node_ids.is_empty() && excluded_node_ids.contains(&node_handle.get_inner()) {
|
|
191
|
+
if !state.excluded_node_ids.is_empty() && state.excluded_node_ids.contains(&node_handle.get_inner()) {
|
|
136
192
|
return;
|
|
137
193
|
}
|
|
138
194
|
|
|
@@ -146,10 +202,55 @@ fn walk_plain(
|
|
|
146
202
|
|
|
147
203
|
// Apply preprocessing: drop nav/footer/aside/noise elements
|
|
148
204
|
// (shared logic with the markdown path).
|
|
149
|
-
if should_drop_for_preprocessing(tag_str, tag, options) {
|
|
205
|
+
if should_drop_for_preprocessing(tag_str, tag, state.options) {
|
|
150
206
|
return;
|
|
151
207
|
}
|
|
152
208
|
|
|
209
|
+
// --- visitor: element start ---
|
|
210
|
+
#[cfg(feature = "visitor")]
|
|
211
|
+
if let Some(visitor_handle) = state.visitor {
|
|
212
|
+
let attributes = collect_tag_attributes(tag);
|
|
213
|
+
let node_ctx = NodeContext {
|
|
214
|
+
node_type: NodeType::Element,
|
|
215
|
+
tag_name: tag_str.to_string(),
|
|
216
|
+
attributes,
|
|
217
|
+
depth: state.depth,
|
|
218
|
+
index_in_parent: 0,
|
|
219
|
+
parent_tag: None,
|
|
220
|
+
is_inline: !is_block_level_element(tag_str),
|
|
221
|
+
};
|
|
222
|
+
let result = visitor_handle.borrow_mut().visit_element_start(&node_ctx);
|
|
223
|
+
match result {
|
|
224
|
+
VisitResult::Skip => return,
|
|
225
|
+
VisitResult::Custom(custom) => {
|
|
226
|
+
buf.push_str(&custom);
|
|
227
|
+
// Still call visit_element_end with the custom content as context.
|
|
228
|
+
let end_result = visitor_handle.borrow_mut().visit_element_end(&node_ctx, &custom);
|
|
229
|
+
match end_result {
|
|
230
|
+
VisitResult::Custom(replacement) => {
|
|
231
|
+
let trim_len = buf.len() - custom.len();
|
|
232
|
+
buf.truncate(trim_len);
|
|
233
|
+
buf.push_str(&replacement);
|
|
234
|
+
}
|
|
235
|
+
VisitResult::Skip => {
|
|
236
|
+
let trim_len = buf.len() - custom.len();
|
|
237
|
+
buf.truncate(trim_len);
|
|
238
|
+
}
|
|
239
|
+
_ => {}
|
|
240
|
+
}
|
|
241
|
+
return;
|
|
242
|
+
}
|
|
243
|
+
_ => {}
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// Record the buf position before this element's content so visit_element_end
|
|
248
|
+
// can truncate back to it for Custom/Skip results.
|
|
249
|
+
#[cfg(feature = "visitor")]
|
|
250
|
+
let element_output_start = buf.len();
|
|
251
|
+
|
|
252
|
+
let child_state = state.descend();
|
|
253
|
+
|
|
153
254
|
match tag_str {
|
|
154
255
|
"br" => {
|
|
155
256
|
buf.push('\n');
|
|
@@ -159,11 +260,11 @@ fn walk_plain(
|
|
|
159
260
|
}
|
|
160
261
|
"pre" => {
|
|
161
262
|
ensure_blank_line(buf);
|
|
162
|
-
walk_children(tag, parser, buf,
|
|
263
|
+
walk_children(tag, parser, buf, true, list_ctx, &child_state);
|
|
163
264
|
ensure_blank_line(buf);
|
|
164
265
|
}
|
|
165
266
|
"img" => {
|
|
166
|
-
if !options.skip_images {
|
|
267
|
+
if !state.options.skip_images {
|
|
167
268
|
if let Some(Some(alt)) = tag.attributes().get("alt") {
|
|
168
269
|
let alt_text = alt.as_utf8_str();
|
|
169
270
|
if !alt_text.is_empty() {
|
|
@@ -174,13 +275,13 @@ fn walk_plain(
|
|
|
174
275
|
}
|
|
175
276
|
"table" => {
|
|
176
277
|
ensure_blank_line(buf);
|
|
177
|
-
walk_table(tag, parser, buf,
|
|
278
|
+
walk_table(tag, parser, buf, &child_state);
|
|
178
279
|
ensure_blank_line(buf);
|
|
179
280
|
}
|
|
180
281
|
"ul" => {
|
|
181
282
|
ensure_newline(buf);
|
|
182
283
|
let mut child_ctx = ListContext::Unordered;
|
|
183
|
-
walk_children(tag, parser, buf,
|
|
284
|
+
walk_children(tag, parser, buf, false, &mut child_ctx, &child_state);
|
|
184
285
|
ensure_newline(buf);
|
|
185
286
|
}
|
|
186
287
|
"ol" => {
|
|
@@ -192,7 +293,7 @@ fn walk_plain(
|
|
|
192
293
|
.unwrap_or(1);
|
|
193
294
|
ensure_newline(buf);
|
|
194
295
|
let mut child_ctx = ListContext::Ordered { next_index: start };
|
|
195
|
-
walk_children(tag, parser, buf,
|
|
296
|
+
walk_children(tag, parser, buf, false, &mut child_ctx, &child_state);
|
|
196
297
|
ensure_newline(buf);
|
|
197
298
|
}
|
|
198
299
|
"li" => {
|
|
@@ -210,17 +311,48 @@ fn walk_plain(
|
|
|
210
311
|
buf.push_str("- ");
|
|
211
312
|
}
|
|
212
313
|
}
|
|
213
|
-
walk_children(tag, parser, buf,
|
|
314
|
+
walk_children(tag, parser, buf, false, list_ctx, &child_state);
|
|
214
315
|
ensure_newline(buf);
|
|
215
316
|
}
|
|
216
317
|
_ if BLOCK_TAGS.contains(&tag_str) => {
|
|
217
318
|
ensure_blank_line(buf);
|
|
218
|
-
walk_children(tag, parser, buf,
|
|
319
|
+
walk_children(tag, parser, buf, in_pre, list_ctx, &child_state);
|
|
219
320
|
ensure_blank_line(buf);
|
|
220
321
|
}
|
|
221
322
|
_ => {
|
|
222
323
|
// Inline elements and structural containers (html, body, etc.)
|
|
223
|
-
walk_children(tag, parser, buf,
|
|
324
|
+
walk_children(tag, parser, buf, in_pre, list_ctx, &child_state);
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// --- visitor: element end ---
|
|
329
|
+
#[cfg(feature = "visitor")]
|
|
330
|
+
if let Some(visitor_handle) = state.visitor {
|
|
331
|
+
let attributes = collect_tag_attributes(tag);
|
|
332
|
+
let node_ctx = NodeContext {
|
|
333
|
+
node_type: NodeType::Element,
|
|
334
|
+
tag_name: tag_str.to_string(),
|
|
335
|
+
attributes,
|
|
336
|
+
depth: state.depth,
|
|
337
|
+
index_in_parent: 0,
|
|
338
|
+
parent_tag: None,
|
|
339
|
+
is_inline: !is_block_level_element(tag_str),
|
|
340
|
+
};
|
|
341
|
+
// Clamp safe_start in case children truncated the buffer.
|
|
342
|
+
let safe_start = element_output_start.min(buf.len());
|
|
343
|
+
let element_content = &buf[safe_start..];
|
|
344
|
+
let result = visitor_handle
|
|
345
|
+
.borrow_mut()
|
|
346
|
+
.visit_element_end(&node_ctx, element_content);
|
|
347
|
+
match result {
|
|
348
|
+
VisitResult::Custom(custom) => {
|
|
349
|
+
buf.truncate(safe_start);
|
|
350
|
+
buf.push_str(&custom);
|
|
351
|
+
}
|
|
352
|
+
VisitResult::Skip => {
|
|
353
|
+
buf.truncate(safe_start);
|
|
354
|
+
}
|
|
355
|
+
_ => {}
|
|
224
356
|
}
|
|
225
357
|
}
|
|
226
358
|
}
|
|
@@ -233,26 +365,19 @@ fn walk_children(
|
|
|
233
365
|
tag: &tl::HTMLTag,
|
|
234
366
|
parser: &tl::Parser,
|
|
235
367
|
buf: &mut String,
|
|
236
|
-
options: &ConversionOptions,
|
|
237
368
|
in_pre: bool,
|
|
238
369
|
list_ctx: &mut ListContext,
|
|
239
|
-
|
|
370
|
+
state: &WalkState<'_>,
|
|
240
371
|
) {
|
|
241
372
|
let children = tag.children();
|
|
242
373
|
let top = children.top();
|
|
243
374
|
for child in top.iter() {
|
|
244
|
-
walk_plain(child, parser, buf,
|
|
375
|
+
walk_plain(child, parser, buf, in_pre, list_ctx, state);
|
|
245
376
|
}
|
|
246
377
|
}
|
|
247
378
|
|
|
248
379
|
/// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
|
|
249
|
-
fn walk_table(
|
|
250
|
-
table_tag: &tl::HTMLTag,
|
|
251
|
-
parser: &tl::Parser,
|
|
252
|
-
buf: &mut String,
|
|
253
|
-
options: &ConversionOptions,
|
|
254
|
-
excluded_node_ids: &HashSet<u32>,
|
|
255
|
-
) {
|
|
380
|
+
fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, state: &WalkState<'_>) {
|
|
256
381
|
// Collect all <tr> node handles by recursing into the table
|
|
257
382
|
let mut row_handles = Vec::new();
|
|
258
383
|
collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
|
|
@@ -278,6 +403,7 @@ fn walk_table(
|
|
|
278
403
|
}
|
|
279
404
|
}
|
|
280
405
|
|
|
406
|
+
let cell_state = state.descend();
|
|
281
407
|
for (cell_idx, cell_handle) in cell_handles.iter().enumerate() {
|
|
282
408
|
if cell_idx > 0 {
|
|
283
409
|
buf.push('\t');
|
|
@@ -285,15 +411,7 @@ fn walk_table(
|
|
|
285
411
|
let mut cell_buf = String::new();
|
|
286
412
|
if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
|
|
287
413
|
let mut cell_list_ctx = ListContext::None;
|
|
288
|
-
walk_children(
|
|
289
|
-
cell_tag,
|
|
290
|
-
parser,
|
|
291
|
-
&mut cell_buf,
|
|
292
|
-
options,
|
|
293
|
-
false,
|
|
294
|
-
&mut cell_list_ctx,
|
|
295
|
-
excluded_node_ids,
|
|
296
|
-
);
|
|
414
|
+
walk_children(cell_tag, parser, &mut cell_buf, false, &mut cell_list_ctx, &cell_state);
|
|
297
415
|
}
|
|
298
416
|
buf.push_str(cell_buf.trim());
|
|
299
417
|
}
|
|
@@ -18,48 +18,61 @@ pub fn inline_ancestor_allows_block(tag_name: &str) -> bool {
|
|
|
18
18
|
///
|
|
19
19
|
/// Excludes elements inside `<pre>` or `<code>` blocks, as they have special
|
|
20
20
|
/// whitespace preservation rules and should not be repaired.
|
|
21
|
+
///
|
|
22
|
+
/// Also detects table structural elements (`td`, `tr`, `th`) nested under `<p>` —
|
|
23
|
+
/// a structural impossibility in valid HTML that signals the `tl` parser absorbed
|
|
24
|
+
/// a table into a paragraph because of an unclosed `<p>` (common in Word/Outlook
|
|
25
|
+
/// HTML such as `<p class='MsoNormal'>` cells). Issue #336.
|
|
21
26
|
pub fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
|
|
22
27
|
for handle in dom_ctx.node_map.iter().flatten() {
|
|
23
28
|
if let Some(tl::Node::Tag(_tag)) = handle.get(parser) {
|
|
24
|
-
let
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
29
|
+
let node_id = handle.get_inner();
|
|
30
|
+
let Some(info) = dom_ctx.tag_info(node_id, parser) else {
|
|
31
|
+
continue;
|
|
32
|
+
};
|
|
33
|
+
|
|
34
|
+
// Table elements under <p>: tl misparsed an unclosed <p> in <td>.
|
|
35
|
+
if matches!(info.name.as_str(), "td" | "tr" | "th") && has_p_ancestor(dom_ctx, parser, node_id) {
|
|
36
|
+
return true;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
if !info.is_block {
|
|
40
|
+
continue;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Check if this block element or any ancestor is pre/code
|
|
44
|
+
let mut check_parent = Some(node_id);
|
|
45
|
+
let mut inside_preformatted = false;
|
|
46
|
+
while let Some(check_id) = check_parent {
|
|
47
|
+
if let Some(info) = dom_ctx.tag_info(check_id, parser) {
|
|
48
|
+
if matches!(info.name.as_str(), "pre" | "code") {
|
|
49
|
+
inside_preformatted = true;
|
|
50
|
+
break;
|
|
38
51
|
}
|
|
39
|
-
check_parent = dom_ctx.parent_of(node_id);
|
|
40
52
|
}
|
|
53
|
+
check_parent = dom_ctx.parent_of(check_id);
|
|
54
|
+
}
|
|
41
55
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
56
|
+
// Skip misnesting check for elements inside pre/code blocks
|
|
57
|
+
if inside_preformatted {
|
|
58
|
+
continue;
|
|
59
|
+
}
|
|
46
60
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
61
|
+
let mut current = dom_ctx.parent_of(node_id);
|
|
62
|
+
while let Some(parent_id) = current {
|
|
63
|
+
if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {
|
|
64
|
+
if is_inline_element(&parent_info.name) && !inline_ancestor_allows_block(&parent_info.name) {
|
|
65
|
+
return true;
|
|
66
|
+
}
|
|
67
|
+
} else if let Some(parent_handle) = dom_ctx.node_handle(parent_id) {
|
|
68
|
+
if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
|
|
69
|
+
let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
|
|
70
|
+
if is_inline_element(&parent_name) && !inline_ancestor_allows_block(&parent_name) {
|
|
51
71
|
return true;
|
|
52
72
|
}
|
|
53
|
-
} else if let Some(parent_handle) = dom_ctx.node_handle(parent_id) {
|
|
54
|
-
if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
|
|
55
|
-
let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
|
|
56
|
-
if is_inline_element(&parent_name) && !inline_ancestor_allows_block(&parent_name) {
|
|
57
|
-
return true;
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
73
|
}
|
|
61
|
-
current = dom_ctx.parent_of(parent_id);
|
|
62
74
|
}
|
|
75
|
+
current = dom_ctx.parent_of(parent_id);
|
|
63
76
|
}
|
|
64
77
|
}
|
|
65
78
|
}
|
|
@@ -67,6 +80,26 @@ pub fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bo
|
|
|
67
80
|
false
|
|
68
81
|
}
|
|
69
82
|
|
|
83
|
+
/// Walk ancestors of `node_id` looking for a `<p>` element.
|
|
84
|
+
///
|
|
85
|
+
/// Stops ascending once it leaves the table hierarchy (`table`/`body`/`html`)
|
|
86
|
+
/// to avoid false positives where a `<p>` legitimately wraps a `<table>`.
|
|
87
|
+
fn has_p_ancestor(dom_ctx: &DomContext, parser: &tl::Parser, node_id: u32) -> bool {
|
|
88
|
+
let mut current = dom_ctx.parent_of(node_id);
|
|
89
|
+
while let Some(parent_id) = current {
|
|
90
|
+
if let Some(parent_info) = dom_ctx.tag_info(parent_id, parser) {
|
|
91
|
+
if parent_info.name == "p" {
|
|
92
|
+
return true;
|
|
93
|
+
}
|
|
94
|
+
if matches!(parent_info.name.as_str(), "table" | "body" | "html") {
|
|
95
|
+
return false;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
current = dom_ctx.parent_of(parent_id);
|
|
99
|
+
}
|
|
100
|
+
false
|
|
101
|
+
}
|
|
102
|
+
|
|
70
103
|
/// Determine if a node should be dropped during preprocessing.
|
|
71
104
|
///
|
|
72
105
|
/// Behavior depends on the [`PreprocessingPreset`]:
|
|
@@ -114,25 +114,19 @@ pub fn process_text_node(
|
|
|
114
114
|
let processed_text = if ctx.in_code || ctx.in_ruby {
|
|
115
115
|
text.into_owned()
|
|
116
116
|
} else if ctx.in_table_cell {
|
|
117
|
+
// Always escape * and _ in table cells to prevent unintended emphasis.
|
|
117
118
|
let escaped = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
|
|
118
119
|
let normalized_text = text::normalize_whitespace_cow(text.as_ref());
|
|
119
120
|
let escaped_result = text::escape(
|
|
120
121
|
normalized_text.as_ref(),
|
|
121
122
|
options.escape_misc,
|
|
122
|
-
|
|
123
|
-
|
|
123
|
+
true,
|
|
124
|
+
true,
|
|
124
125
|
options.escape_ascii,
|
|
125
126
|
);
|
|
126
127
|
escaped_result.into_owned()
|
|
127
128
|
} else {
|
|
128
|
-
text::escape(
|
|
129
|
-
text.as_ref(),
|
|
130
|
-
options.escape_misc,
|
|
131
|
-
options.escape_asterisks,
|
|
132
|
-
options.escape_underscores,
|
|
133
|
-
options.escape_ascii,
|
|
134
|
-
)
|
|
135
|
-
.into_owned()
|
|
129
|
+
text::escape(text.as_ref(), options.escape_misc, true, true, options.escape_ascii).into_owned()
|
|
136
130
|
};
|
|
137
131
|
if options.escape_misc {
|
|
138
132
|
escaped
|
|
@@ -323,6 +323,96 @@ pub fn normalize_bogus_comment_endings(input: &str) -> Cow<'_, str> {
|
|
|
323
323
|
}
|
|
324
324
|
}
|
|
325
325
|
|
|
326
|
+
/// Normalize closing tags whose `>` appears on a subsequent line.
///
/// Some HTML formatters (JSX-style) write closing tags as:
///
/// ```html
/// </a
/// >
/// ```
///
/// The `tl` parser does not handle end-tags with a line break before the closing
/// `>`, leaving the element unclosed so all subsequent siblings become children
/// of the open element. This pass collapses such patterns to a single-line
/// closing tag (`</a>`) before the document reaches `tl`.
///
/// A line break is either `\n` or `\r`, so LF, CRLF and CR-only documents are
/// all handled. Only the whitespace between the tag name and the closing `>`
/// is normalised; the rest of the document is untouched, and closing tags that
/// contain only spaces or tabs before `>` are left as they are.
///
/// Returns `Cow::Borrowed(input)` when nothing had to be rewritten, otherwise
/// a `Cow::Owned` copy with every split closing tag collapsed.
pub fn normalize_split_closing_tags(input: &str) -> Cow<'_, str> {
    let bytes = input.as_bytes();
    let len = bytes.len();

    // Fast path: a rewrite is only possible when the input contains a line
    // break. The scan below treats both `\n` and `\r` as line breaks, so the
    // fast path must accept either one (CR-only input would otherwise be
    // skipped even though the loop could normalise it).
    if len < 4 || !bytes.iter().any(|&b| matches!(b, b'\n' | b'\r')) {
        return Cow::Borrowed(input);
    }

    let mut idx = 0;
    // Start of the not-yet-copied tail of `input`.
    let mut last = 0;
    // Allocated lazily on the first rewrite so untouched input stays borrowed.
    let mut output: Option<String> = None;

    while idx + 2 < len {
        // Look for `</`
        if bytes[idx] != b'<' || bytes[idx + 1] != b'/' {
            idx += 1;
            continue;
        }

        // Scan tag name: ASCII letters, digits, hyphens (HTML5 allows hyphens in custom elements)
        let name_start = idx + 2;
        let mut name_end = name_start;
        while name_end < len && (bytes[name_end].is_ascii_alphanumeric() || bytes[name_end] == b'-') {
            name_end += 1;
        }

        if name_end == name_start {
            // No tag name — not a closing tag we care about.
            idx += 1;
            continue;
        }

        // After the tag name, skip any whitespace and remember whether a line
        // break was part of it.
        let mut ws_end = name_end;
        let mut has_newline = false;
        while ws_end < len && bytes[ws_end].is_ascii_whitespace() {
            if matches!(bytes[ws_end], b'\n' | b'\r') {
                has_newline = true;
            }
            ws_end += 1;
        }

        if !has_newline || ws_end >= len || bytes[ws_end] != b'>' {
            // Either the whitespace held no line break, or it is not directly
            // followed by `>`: leave this tag alone.
            idx += 1;
            continue;
        }

        // We have `</tagname [whitespace-with-line-break]>` — rewrite to `</tagname>`.
        // All slice boundaries sit next to ASCII bytes, so they are valid
        // UTF-8 char boundaries.
        let out = output.get_or_insert_with(|| String::with_capacity(len));
        out.push_str(&input[last..idx]);
        out.push_str("</");
        out.push_str(&input[name_start..name_end]);
        out.push('>');

        idx = ws_end + 1; // advance past the `>`
        last = idx;
    }

    match output {
        Some(mut out) => {
            // Copy whatever follows the last rewritten tag.
            out.push_str(&input[last..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
|
|
415
|
+
|
|
326
416
|
/// Preprocess HTML to normalize tags and fix common issues.
|
|
327
417
|
pub fn preprocess_html(input: &str) -> Cow<'_, str> {
|
|
328
418
|
const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
|
|
@@ -788,7 +878,7 @@ fn tag_has_hidden_attribute(tag: &str) -> bool {
|
|
|
788
878
|
|
|
789
879
|
#[cfg(test)]
|
|
790
880
|
mod tests {
|
|
791
|
-
use super::{normalize_bogus_comment_endings, sanitize_markdown_url};
|
|
881
|
+
use super::{normalize_bogus_comment_endings, normalize_split_closing_tags, sanitize_markdown_url};
|
|
792
882
|
|
|
793
883
|
// ── normalize_bogus_comment_endings ───────────────────────────────────────
|
|
794
884
|
|
|
@@ -841,6 +931,52 @@ mod tests {
|
|
|
841
931
|
assert_eq!(result.as_ref(), "");
|
|
842
932
|
}
|
|
843
933
|
|
|
934
|
+
// ── normalize_split_closing_tags ──────────────────────────────────────────
|
|
935
|
+
|
|
936
|
+
#[test]
|
|
937
|
+
fn normalize_split_closing_tags_collapses_newline_before_close_bracket() {
|
|
938
|
+
let input = "<a href=\"#x\">text</a\n>";
|
|
939
|
+
let result = normalize_split_closing_tags(input);
|
|
940
|
+
assert_eq!(result.as_ref(), "<a href=\"#x\">text</a>");
|
|
941
|
+
}
|
|
942
|
+
|
|
943
|
+
#[test]
|
|
944
|
+
fn normalize_split_closing_tags_collapses_indented_newline_before_close_bracket() {
|
|
945
|
+
let input = "<a href=\"#x\">text</a\n >";
|
|
946
|
+
let result = normalize_split_closing_tags(input);
|
|
947
|
+
assert_eq!(result.as_ref(), "<a href=\"#x\">text</a>");
|
|
948
|
+
}
|
|
949
|
+
|
|
950
|
+
#[test]
|
|
951
|
+
fn normalize_split_closing_tags_leaves_well_formed_closing_tags_unchanged() {
|
|
952
|
+
let input = "<a href=\"#x\">text</a>";
|
|
953
|
+
let result = normalize_split_closing_tags(input);
|
|
954
|
+
assert_eq!(result.as_ref(), input);
|
|
955
|
+
}
|
|
956
|
+
|
|
957
|
+
#[test]
|
|
958
|
+
fn normalize_split_closing_tags_handles_multiple_split_closing_tags() {
|
|
959
|
+
let input = "<li><a href=\"#a\">A</a\n >\n<a href=\"#b\">B</a\n>";
|
|
960
|
+
let result = normalize_split_closing_tags(input);
|
|
961
|
+
assert_eq!(result.as_ref(), "<li><a href=\"#a\">A</a>\n<a href=\"#b\">B</a>");
|
|
962
|
+
}
|
|
963
|
+
|
|
964
|
+
#[test]
|
|
965
|
+
fn normalize_split_closing_tags_does_not_collapse_inline_whitespace() {
|
|
966
|
+
// Only newlines trigger the normalisation; spaces alone must not.
|
|
967
|
+
let input = "<a href=\"#x\">text</a >";
|
|
968
|
+
let result = normalize_split_closing_tags(input);
|
|
969
|
+
// A space before > is actually valid HTML and tl handles it fine.
|
|
970
|
+
// We must not touch it to avoid over-normalising.
|
|
971
|
+
assert_eq!(result.as_ref(), input);
|
|
972
|
+
}
|
|
973
|
+
|
|
974
|
+
#[test]
|
|
975
|
+
fn normalize_split_closing_tags_empty_input() {
|
|
976
|
+
let result = normalize_split_closing_tags("");
|
|
977
|
+
assert_eq!(result.as_ref(), "");
|
|
978
|
+
}
|
|
979
|
+
|
|
844
980
|
// ── sanitize_markdown_url ─────────────────────────────────────────────────
|
|
845
981
|
|
|
846
982
|
#[test]
|
|
@@ -198,9 +198,13 @@ fn test_strikethrough() {
|
|
|
198
198
|
fn test_simple_table() {
|
|
199
199
|
let html = "<table><tr><th>Header</th></tr><tr><td>Cell</td></tr></table>";
|
|
200
200
|
let result = convert(html, None).unwrap();
|
|
201
|
-
assert!(result.contains("| Header |"));
|
|
202
|
-
|
|
203
|
-
assert!(
|
|
201
|
+
assert!(result.contains("| Header |"), "header row missing: {result}");
|
|
202
|
+
// Separator uses at least as many dashes as the widest cell ("Header" = 6).
|
|
203
|
+
assert!(
|
|
204
|
+
result.lines().any(|l| l.starts_with("| ----")),
|
|
205
|
+
"separator row missing: {result}"
|
|
206
|
+
);
|
|
207
|
+
assert!(result.contains("| Cell"), "cell row missing: {result}");
|
|
204
208
|
}
|
|
205
209
|
|
|
206
210
|
#[test]
|
|
@@ -221,7 +225,10 @@ fn test_table_rowspan() {
|
|
|
221
225
|
..Default::default()
|
|
222
226
|
};
|
|
223
227
|
let result = convert(html, Some(options)).unwrap();
|
|
224
|
-
|
|
228
|
+
// Columns are padded to the widest cell per column (rowspan accounted):
|
|
229
|
+
// col 0: max("Header 1"=8, "Spanning cell"=13, ""=0) = 13
|
|
230
|
+
// col 1: max("Header 2"=8, "First row content<br>Second line"=32, "Next row<br>More content"=24) = 32
|
|
231
|
+
let expected = "| Header 1 | Header 2 |\n| ------------- | -------------------------------- |\n| Spanning cell | First row content<br>Second line |\n| | Next row<br>More content |\n";
|
|
225
232
|
assert_eq!(result, expected);
|
|
226
233
|
}
|
|
227
234
|
|
|
@@ -534,7 +541,8 @@ fn test_ordered_list_with_heading_and_table() {
|
|
|
534
541
|
";
|
|
535
542
|
|
|
536
543
|
let result = convert(html, None).unwrap();
|
|
537
|
-
|
|
544
|
+
// Separator dashes match the column width ("blah" = 4 chars → 4 dashes).
|
|
545
|
+
let expected = "1. ### h3\n2. *table*\n\n | blah |\n | ---- |\n";
|
|
538
546
|
assert_eq!(result, expected);
|
|
539
547
|
}
|
|
540
548
|
|