html-to-markdown 3.4.0.pre.rc.44 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/html_to_markdown_rb/Cargo.toml +1 -1
- data/lib/bin/html-to-markdown +0 -0
- data/lib/html_to_markdown/native.rb +9 -5
- data/lib/html_to_markdown/version.rb +2 -2
- data/vendor/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +47 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +87 -12
- data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +87 -5
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +4 -4
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +19 -1
- data/vendor/html-to-markdown-rs/src/converter/main.rs +9 -1
- data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +158 -40
- data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +64 -31
- data/vendor/html-to-markdown-rs/src/converter/text_node.rs +4 -10
- data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +137 -1
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +13 -5
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +4 -2
- data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +53 -0
- data/vendor/html-to-markdown-rs/tests/issue_336_regressions.rs +74 -0
- data/vendor/html-to-markdown-rs/tests/issue_347_regressions.rs +154 -0
- data/vendor/html-to-markdown-rs/tests/issue_348_visitor_plain.rs +93 -0
- data/vendor/html-to-markdown-rs/tests/tables_test.rs +39 -23
- data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +74 -47
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e1296a51b6ca0f757a95487c567e9d1e0d2acda80a46656cd14002322a4b850b
|
|
4
|
+
data.tar.gz: dca253ecef2f4818aa9e39e56d48ca5bb9ad35f92759580ee893d17eb455d494
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 989e6b723c3670894b9e87ad87c1d752c27e3917f5cc60f0ba8f40175ee97c35ef4a2a40da53ab77908c6febb44abc0fe66064470494e73be3a5aae61d8498ae
|
|
7
|
+
data.tar.gz: 6275bc085c86f521114dbfd860e856dfdeac43ae92b755d81960e22e11a6d73ec4f25c89c837c37072d5500fd22af92f9507ace1673337708cac1a5376db926c
|
data/lib/bin/html-to-markdown
CHANGED
|
Binary file
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:b54e7bb2ab55cc6c25c9cac0e62ec66c35fd2d1956ef9ba5e3dc9e7ba5e666a5
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
@@ -23,7 +23,8 @@ end
|
|
|
23
23
|
class Hash
|
|
24
24
|
# Support internally-tagged enum accessors like format.excel, format.email, etc.
|
|
25
25
|
# Also support direct field access like format.sheet_count
|
|
26
|
-
|
|
26
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
27
|
+
def method_missing(method_name, *args, &block)
|
|
27
28
|
# Try symbol key first (how Magnus converts JSON keys)
|
|
28
29
|
return self[method_name] if key?(method_name)
|
|
29
30
|
|
|
@@ -31,22 +32,25 @@ class Hash
|
|
|
31
32
|
return self[method_name.to_s] if key?(method_name.to_s)
|
|
32
33
|
|
|
33
34
|
# Check if this hash has a 'format_type' field (indicating an internally-tagged enum)
|
|
34
|
-
format_type = self[:format_type] || self['format_type']
|
|
35
|
+
format_type = self[:'format_type'] || self['format_type']
|
|
35
36
|
return super unless format_type
|
|
36
37
|
|
|
37
38
|
# If the method name matches the format_type (snake_case), extract and return the variant's wrapped data
|
|
38
39
|
# Internally-tagged enums store variant data in the '_0' field (from alef's struct variant conversion)
|
|
39
40
|
# This allows format.excel to return the ExcelMetadata hash with sheet_count, sheet_names, etc.
|
|
40
41
|
snake_case_method = method_name.to_s.downcase
|
|
41
|
-
|
|
42
|
+
if snake_case_method == format_type.to_s.downcase
|
|
43
|
+
return self[:'_0'] || self['_0'] || self
|
|
44
|
+
end
|
|
42
45
|
|
|
43
46
|
super
|
|
44
47
|
end
|
|
48
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
45
49
|
|
|
46
50
|
def respond_to_missing?(method_name, include_private = false)
|
|
47
51
|
return true if key?(method_name) || key?(method_name.to_s)
|
|
48
52
|
|
|
49
|
-
format_type = self[:format_type] || self['format_type']
|
|
53
|
+
format_type = self[:'format_type'] || self['format_type']
|
|
50
54
|
return false unless format_type
|
|
51
55
|
|
|
52
56
|
snake_case_method = method_name.to_s.downcase
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# This file is auto-generated by alef — DO NOT EDIT.
|
|
2
|
-
# alef:hash:
|
|
2
|
+
# alef:hash:9c58cf63849e82246f03b4fcc3996c264d47f2b2c27e0e8ba6b93eb4a84cb279
|
|
3
3
|
# To regenerate: alef generate
|
|
4
4
|
# To verify freshness: alef verify --exit-code
|
|
5
5
|
# Issues & docs: https://github.com/kreuzberg-dev/alef
|
|
6
6
|
# frozen_string_literal: true
|
|
7
7
|
|
|
8
8
|
module HtmlToMarkdown
|
|
9
|
-
VERSION = '3.4.0.pre.rc.44'
|
|
9
|
+
VERSION = '3.4.0'
|
|
10
10
|
end
|
data/vendor/Cargo.toml
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
use std::borrow::Cow;
|
|
7
7
|
|
|
8
8
|
use super::cell::{collect_table_cells, get_colspan};
|
|
9
|
-
use super::cells::{append_layout_row, convert_table_row};
|
|
9
|
+
use super::cells::{append_layout_row, collect_row_cell_widths, convert_table_row};
|
|
10
10
|
use super::scanner::scan_table;
|
|
11
11
|
use super::utils::{is_tag_name, normalized_tag_name};
|
|
12
12
|
#[cfg(feature = "visitor")]
|
|
@@ -216,6 +216,50 @@ pub fn handle_table(
|
|
|
216
216
|
let mut rowspan_tracker = vec![None; total_cols];
|
|
217
217
|
let mut row_cells = Vec::new();
|
|
218
218
|
|
|
219
|
+
// Pre-pass: compute per-column max content widths for aligned padding.
|
|
220
|
+
// Uses a rowspan tracker so spanned columns are skipped just as they
|
|
221
|
+
// are in the render pass, keeping column indices correctly aligned.
|
|
222
|
+
let col_widths = {
|
|
223
|
+
let mut widths: Vec<usize> = Vec::new();
|
|
224
|
+
let mut prepass_rowspan: Vec<Option<usize>> = Vec::new();
|
|
225
|
+
let children = tag.children();
|
|
226
|
+
for child_handle in children.top().iter() {
|
|
227
|
+
if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
|
|
228
|
+
let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
|
|
229
|
+
match tag_name.as_ref() {
|
|
230
|
+
"thead" | "tbody" | "tfoot" => {
|
|
231
|
+
for row_handle in child_tag.children().top().iter() {
|
|
232
|
+
if is_tag_name(row_handle, parser, dom_ctx, "tr") {
|
|
233
|
+
collect_row_cell_widths(
|
|
234
|
+
row_handle,
|
|
235
|
+
parser,
|
|
236
|
+
options,
|
|
237
|
+
ctx,
|
|
238
|
+
dom_ctx,
|
|
239
|
+
&mut widths,
|
|
240
|
+
&mut prepass_rowspan,
|
|
241
|
+
);
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
"tr" | "row" => {
|
|
246
|
+
collect_row_cell_widths(
|
|
247
|
+
child_handle,
|
|
248
|
+
parser,
|
|
249
|
+
options,
|
|
250
|
+
ctx,
|
|
251
|
+
dom_ctx,
|
|
252
|
+
&mut widths,
|
|
253
|
+
&mut prepass_rowspan,
|
|
254
|
+
);
|
|
255
|
+
}
|
|
256
|
+
_ => {}
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
widths
|
|
261
|
+
};
|
|
262
|
+
|
|
219
263
|
let children = tag.children();
|
|
220
264
|
{
|
|
221
265
|
for child_handle in children.top().iter() {
|
|
@@ -282,6 +326,7 @@ pub fn handle_table(
|
|
|
282
326
|
dom_ctx,
|
|
283
327
|
depth + 1,
|
|
284
328
|
is_header_section,
|
|
329
|
+
&col_widths,
|
|
285
330
|
);
|
|
286
331
|
row_index += 1;
|
|
287
332
|
}
|
|
@@ -312,6 +357,7 @@ pub fn handle_table(
|
|
|
312
357
|
dom_ctx,
|
|
313
358
|
depth + 1,
|
|
314
359
|
row_index == 0,
|
|
360
|
+
&col_widths,
|
|
315
361
|
);
|
|
316
362
|
row_index += 1;
|
|
317
363
|
}
|
|
@@ -97,10 +97,86 @@ pub fn collect_table_cells(
|
|
|
97
97
|
}
|
|
98
98
|
}
|
|
99
99
|
|
|
100
|
+
/// Extract the text content of a table cell for column width calculation.
|
|
101
|
+
///
|
|
102
|
+
/// Returns the same text that would appear in the rendered cell, without
|
|
103
|
+
/// the surrounding pipe delimiters. Used in the first pass to compute
|
|
104
|
+
/// maximum column widths before rendering with padding.
|
|
105
|
+
///
|
|
106
|
+
/// # Arguments
|
|
107
|
+
/// * `node_handle` - Handle to the cell element
|
|
108
|
+
/// * `parser` - HTML parser instance
|
|
109
|
+
/// * `options` - Conversion options
|
|
110
|
+
/// * `ctx` - Conversion context
|
|
111
|
+
/// * `dom_ctx` - DOM context
|
|
112
|
+
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
113
|
+
pub fn cell_text_content(
|
|
114
|
+
node_handle: &tl::NodeHandle,
|
|
115
|
+
parser: &tl::Parser,
|
|
116
|
+
options: &crate::options::ConversionOptions,
|
|
117
|
+
ctx: &super::super::super::Context,
|
|
118
|
+
dom_ctx: &super::super::super::DomContext,
|
|
119
|
+
) -> String {
|
|
120
|
+
let mut text = String::with_capacity(64);
|
|
121
|
+
|
|
122
|
+
let cell_ctx = super::super::super::Context {
|
|
123
|
+
in_table_cell: true,
|
|
124
|
+
..ctx.clone()
|
|
125
|
+
};
|
|
126
|
+
|
|
127
|
+
if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
|
|
128
|
+
let children = tag.children();
|
|
129
|
+
let has_tag_child = children
|
|
130
|
+
.top()
|
|
131
|
+
.iter()
|
|
132
|
+
.any(|child_handle| matches!(child_handle.get(parser), Some(tl::Node::Tag(_))));
|
|
133
|
+
|
|
134
|
+
if has_tag_child {
|
|
135
|
+
for child_handle in children.top().iter() {
|
|
136
|
+
super::super::super::walk_node(child_handle, parser, &mut text, options, &cell_ctx, 0, dom_ctx);
|
|
137
|
+
}
|
|
138
|
+
} else {
|
|
139
|
+
let raw = dom_ctx.text_content(*node_handle, parser);
|
|
140
|
+
let normalized = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
|
|
141
|
+
crate::text::normalize_whitespace_cow(raw.as_str())
|
|
142
|
+
} else {
|
|
143
|
+
Cow::Borrowed(raw.as_str())
|
|
144
|
+
};
|
|
145
|
+
let escaped = escape_cell_text(normalized.as_ref(), options);
|
|
146
|
+
text = escaped;
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
let text = text.trim();
|
|
151
|
+
if options.br_in_tables {
|
|
152
|
+
text.to_string()
|
|
153
|
+
} else if text.contains('\n') {
|
|
154
|
+
text.replace('\n', " ")
|
|
155
|
+
} else {
|
|
156
|
+
text.to_string()
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/// Escape text for use inside a table cell.
|
|
161
|
+
///
|
|
162
|
+
/// Always escapes `*` and `_` (to prevent unintended emphasis inside cells),
|
|
163
|
+
/// applies `escape_misc` / `escape_ascii` per options, and escapes `|` (pipe)
|
|
164
|
+
/// when `escape_misc` is not already handling it.
|
|
165
|
+
fn escape_cell_text(text: &str, options: &crate::options::ConversionOptions) -> String {
|
|
166
|
+
// Always escape * and _ in table cells to prevent unintended emphasis.
|
|
167
|
+
let escaped = crate::text::escape(text, options.escape_misc, true, true, options.escape_ascii);
|
|
168
|
+
if options.escape_misc {
|
|
169
|
+
escaped.into_owned()
|
|
170
|
+
} else {
|
|
171
|
+
escaped.replace('|', r"\|")
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
|
|
100
175
|
/// Convert a table cell (td or th) to Markdown format.
|
|
101
176
|
///
|
|
102
177
|
/// Processes cell content and renders it with pipe delimiters for Markdown tables.
|
|
103
178
|
/// Handles colspan by adding extra pipes, and escapes pipes in cell content.
|
|
179
|
+
/// Always escapes `*` and `_` to prevent unintended emphasis inside cells.
|
|
104
180
|
///
|
|
105
181
|
/// # Arguments
|
|
106
182
|
/// * `node_handle` - Handle to the cell element
|
|
@@ -110,6 +186,7 @@ pub fn collect_table_cells(
|
|
|
110
186
|
/// * `ctx` - Conversion context (visitor, etc)
|
|
111
187
|
/// * `_tag_name` - Tag name (for consistency, not used)
|
|
112
188
|
/// * `dom_ctx` - DOM context for content extraction
|
|
189
|
+
/// * `col_width` - Optional target width for padding (None = no padding)
|
|
113
190
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
114
191
|
pub fn convert_table_cell(
|
|
115
192
|
node_handle: &tl::NodeHandle,
|
|
@@ -119,6 +196,7 @@ pub fn convert_table_cell(
|
|
|
119
196
|
ctx: &super::super::super::Context,
|
|
120
197
|
_tag_name: &str,
|
|
121
198
|
dom_ctx: &super::super::super::DomContext,
|
|
199
|
+
col_width: Option<usize>,
|
|
122
200
|
) {
|
|
123
201
|
let mut text = String::with_capacity(128);
|
|
124
202
|
|
|
@@ -145,18 +223,7 @@ pub fn convert_table_cell(
|
|
|
145
223
|
} else {
|
|
146
224
|
Cow::Borrowed(raw.as_str())
|
|
147
225
|
};
|
|
148
|
-
|
|
149
|
-
normalized.as_ref(),
|
|
150
|
-
options.escape_misc,
|
|
151
|
-
options.escape_asterisks,
|
|
152
|
-
options.escape_underscores,
|
|
153
|
-
options.escape_ascii,
|
|
154
|
-
);
|
|
155
|
-
if options.escape_misc {
|
|
156
|
-
text = escaped.into_owned();
|
|
157
|
-
} else {
|
|
158
|
-
text = escaped.replace('|', r"\|");
|
|
159
|
-
}
|
|
226
|
+
text = escape_cell_text(normalized.as_ref(), options);
|
|
160
227
|
}
|
|
161
228
|
}
|
|
162
229
|
|
|
@@ -175,6 +242,14 @@ pub fn convert_table_cell(
|
|
|
175
242
|
|
|
176
243
|
output.push(' ');
|
|
177
244
|
output.push_str(&text);
|
|
245
|
+
if let Some(width) = col_width {
|
|
246
|
+
let text_len = text.chars().count();
|
|
247
|
+
if text_len < width {
|
|
248
|
+
for _ in 0..(width - text_len) {
|
|
249
|
+
output.push(' ');
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
}
|
|
178
253
|
for _ in 0..colspan {
|
|
179
254
|
output.push_str(" |");
|
|
180
255
|
}
|
|
@@ -10,7 +10,7 @@ use crate::converter::utility::content::collect_tag_attributes;
|
|
|
10
10
|
use crate::converter::utility::content::normalized_tag_name;
|
|
11
11
|
use std::borrow::Cow;
|
|
12
12
|
|
|
13
|
-
use super::cell::{collect_table_cells, convert_table_cell, get_colspan_rowspan};
|
|
13
|
+
use super::cell::{cell_text_content, collect_table_cells, convert_table_cell, get_colspan_rowspan};
|
|
14
14
|
|
|
15
15
|
/// Maximum allowed table columns to prevent unbounded memory usage.
|
|
16
16
|
const MAX_TABLE_COLS: usize = 1000;
|
|
@@ -87,6 +87,75 @@ pub fn append_layout_row(
|
|
|
87
87
|
}
|
|
88
88
|
}
|
|
89
89
|
|
|
90
|
+
/// Collect the rendered text content of every cell in a row for width calculation.
|
|
91
|
+
///
|
|
92
|
+
/// `rowspan_tracker` mirrors the tracker in `convert_table_row` so that spanned
|
|
93
|
+
/// columns are skipped in the width pre-pass just as they are skipped in rendering.
|
|
94
|
+
/// Pass a shared tracker across all row calls to correctly handle multi-row spans.
|
|
95
|
+
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
96
|
+
pub fn collect_row_cell_widths(
|
|
97
|
+
node_handle: &tl::NodeHandle,
|
|
98
|
+
parser: &tl::Parser,
|
|
99
|
+
options: &crate::options::ConversionOptions,
|
|
100
|
+
ctx: &super::super::super::Context,
|
|
101
|
+
dom_ctx: &super::super::super::DomContext,
|
|
102
|
+
col_widths: &mut Vec<usize>,
|
|
103
|
+
rowspan_tracker: &mut Vec<Option<usize>>,
|
|
104
|
+
) {
|
|
105
|
+
let mut cells = Vec::new();
|
|
106
|
+
collect_table_cells(node_handle, parser, dom_ctx, &mut cells);
|
|
107
|
+
|
|
108
|
+
let mut col = 0usize;
|
|
109
|
+
let mut cell_iter = cells.iter();
|
|
110
|
+
|
|
111
|
+
loop {
|
|
112
|
+
// Skip columns that are filled by a rowspan from a previous row.
|
|
113
|
+
while col < rowspan_tracker.len() {
|
|
114
|
+
if let Some(Some(remaining)) = rowspan_tracker.get_mut(col) {
|
|
115
|
+
if *remaining > 0 {
|
|
116
|
+
*remaining -= 1;
|
|
117
|
+
if *remaining == 0 {
|
|
118
|
+
rowspan_tracker[col] = None;
|
|
119
|
+
}
|
|
120
|
+
col += 1;
|
|
121
|
+
continue;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
break;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
let Some(cell_handle) = cell_iter.next() else {
|
|
128
|
+
break;
|
|
129
|
+
};
|
|
130
|
+
|
|
131
|
+
let text = cell_text_content(cell_handle, parser, options, ctx, dom_ctx);
|
|
132
|
+
let width = text.chars().count();
|
|
133
|
+
|
|
134
|
+
// Grow the widths vec if needed.
|
|
135
|
+
if col >= col_widths.len() {
|
|
136
|
+
col_widths.resize(col + 1, 0);
|
|
137
|
+
}
|
|
138
|
+
if width > col_widths[col] {
|
|
139
|
+
col_widths[col] = width;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
|
|
143
|
+
|
|
144
|
+
// Record rowspan for future rows.
|
|
145
|
+
if rowspan > 1 {
|
|
146
|
+
if col >= rowspan_tracker.len() {
|
|
147
|
+
rowspan_tracker.resize(col + 1, None);
|
|
148
|
+
}
|
|
149
|
+
rowspan_tracker[col] = Some(rowspan - 1);
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
col = col.saturating_add(colspan);
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/// Minimum separator dash count per column (matches `---`).
|
|
157
|
+
const MIN_SEPARATOR_DASHES: usize = 3;
|
|
158
|
+
|
|
90
159
|
/// Convert a table row (tr) to Markdown format.
|
|
91
160
|
///
|
|
92
161
|
/// Processes all cells in a row, handling colspan and rowspan for proper
|
|
@@ -107,6 +176,7 @@ pub fn append_layout_row(
|
|
|
107
176
|
/// * `dom_ctx` - DOM context
|
|
108
177
|
/// * `depth` - Nesting depth
|
|
109
178
|
/// * `is_header` - Whether this is a header row
|
|
179
|
+
/// * `col_widths` - Per-column max content widths for padding (empty = no padding)
|
|
110
180
|
#[allow(clippy::too_many_arguments)]
|
|
111
181
|
#[cfg_attr(not(feature = "visitor"), allow(unused_variables))]
|
|
112
182
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
|
@@ -124,6 +194,7 @@ pub fn convert_table_row(
|
|
|
124
194
|
dom_ctx: &super::super::super::DomContext,
|
|
125
195
|
depth: usize,
|
|
126
196
|
is_header: bool,
|
|
197
|
+
col_widths: &[usize],
|
|
127
198
|
) {
|
|
128
199
|
let mut row_text = String::with_capacity(256);
|
|
129
200
|
let mut cells = Vec::new();
|
|
@@ -203,7 +274,13 @@ pub fn convert_table_row(
|
|
|
203
274
|
if col_index < total_cols {
|
|
204
275
|
if let Some(Some(remaining_rows)) = rowspan_tracker.get_mut(col_index) {
|
|
205
276
|
if *remaining_rows > 0 {
|
|
277
|
+
let width = col_widths.get(col_index).copied();
|
|
206
278
|
row_text.push(' ');
|
|
279
|
+
if let Some(w) = width {
|
|
280
|
+
for _ in 0..w {
|
|
281
|
+
row_text.push(' ');
|
|
282
|
+
}
|
|
283
|
+
}
|
|
207
284
|
row_text.push_str(" |");
|
|
208
285
|
*remaining_rows -= 1;
|
|
209
286
|
if *remaining_rows == 0 {
|
|
@@ -216,7 +293,8 @@ pub fn convert_table_row(
|
|
|
216
293
|
}
|
|
217
294
|
|
|
218
295
|
if let Some(cell_handle) = cell_iter.next() {
|
|
219
|
-
|
|
296
|
+
let col_width = col_widths.get(col_index).copied();
|
|
297
|
+
convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx, col_width);
|
|
220
298
|
|
|
221
299
|
let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
|
|
222
300
|
|
|
@@ -230,8 +308,9 @@ pub fn convert_table_row(
|
|
|
230
308
|
}
|
|
231
309
|
}
|
|
232
310
|
} else {
|
|
233
|
-
for cell_handle in
|
|
234
|
-
|
|
311
|
+
for (cell_idx, cell_handle) in cells.iter().enumerate() {
|
|
312
|
+
let col_width = col_widths.get(cell_idx).copied();
|
|
313
|
+
convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx, col_width);
|
|
235
314
|
}
|
|
236
315
|
}
|
|
237
316
|
|
|
@@ -247,7 +326,10 @@ pub fn convert_table_row(
|
|
|
247
326
|
if i > 0 {
|
|
248
327
|
output.push_str(" | ");
|
|
249
328
|
}
|
|
250
|
-
|
|
329
|
+
let dash_count = col_widths.get(i).copied().unwrap_or(0).max(MIN_SEPARATOR_DASHES);
|
|
330
|
+
for _ in 0..dash_count {
|
|
331
|
+
output.push('-');
|
|
332
|
+
}
|
|
251
333
|
}
|
|
252
334
|
output.push_str(" |\n");
|
|
253
335
|
}
|
|
@@ -117,11 +117,11 @@ pub fn handle_table_with_context(
|
|
|
117
117
|
let indented = layout::indent_table_for_list(&table_output, ctx.list_depth, options);
|
|
118
118
|
output.push_str(&indented);
|
|
119
119
|
} else {
|
|
120
|
-
if !output.ends_with("\n\n") {
|
|
121
|
-
if output.
|
|
122
|
-
output.push_str("\n\n");
|
|
123
|
-
} else {
|
|
120
|
+
if !output.is_empty() && !output.ends_with("\n\n") {
|
|
121
|
+
if output.ends_with('\n') {
|
|
124
122
|
output.push('\n');
|
|
123
|
+
} else {
|
|
124
|
+
output.push_str("\n\n");
|
|
125
125
|
}
|
|
126
126
|
}
|
|
127
127
|
output.push_str(&table_output);
|
|
@@ -253,7 +253,25 @@ fn format_image_markdown(
|
|
|
253
253
|
buf.push_str("![");
buf.push_str(alt);
buf.push_str("](");
|
|
256
|
-
|
|
256
|
+
|
|
257
|
+
if src.is_empty() {
|
|
258
|
+
buf.push_str("<>");
|
|
259
|
+
} else if src.contains(' ') || src.contains('\n') {
|
|
260
|
+
buf.push('<');
|
|
261
|
+
buf.push_str(src);
|
|
262
|
+
buf.push('>');
|
|
263
|
+
} else {
|
|
264
|
+
let open_count = src.chars().filter(|&c| c == '(').count();
|
|
265
|
+
let close_count = src.chars().filter(|&c| c == ')').count();
|
|
266
|
+
|
|
267
|
+
if open_count == close_count {
|
|
268
|
+
buf.push_str(src);
|
|
269
|
+
} else {
|
|
270
|
+
let escaped_src = src.replace('(', "\\(").replace(')', "\\)");
|
|
271
|
+
buf.push_str(&escaped_src);
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
|
|
257
275
|
if let Some(title_text) = title {
|
|
258
276
|
buf.push_str(" \"");
|
|
259
277
|
buf.push_str(title_text);
|
|
@@ -23,7 +23,8 @@ use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_d
|
|
|
23
23
|
use crate::converter::utility::caching::build_dom_context;
|
|
24
24
|
use crate::converter::utility::content::normalized_tag_name;
|
|
25
25
|
use crate::converter::utility::preprocessing::{
|
|
26
|
-
normalize_bogus_comment_endings, preprocess_html, strip_hidden_elements,
|
|
26
|
+
normalize_bogus_comment_endings, normalize_split_closing_tags, preprocess_html, strip_hidden_elements,
|
|
27
|
+
strip_script_and_style_tags,
|
|
27
28
|
};
|
|
28
29
|
use crate::converter::utility::serialization::serialize_tag_to_html;
|
|
29
30
|
use crate::options::OutputFormat;
|
|
@@ -66,6 +67,10 @@ pub fn convert_html_impl(
|
|
|
66
67
|
// Normalise bogus HTML comment endings (`--->`, `---->`, …) that cause the
|
|
67
68
|
// `tl` parser to silently discard all document content that follows them.
|
|
68
69
|
let stripped = normalize_bogus_comment_endings(&stripped);
|
|
70
|
+
// Normalise closing tags whose `>` is on a subsequent line (JSX-style `</a\n>`).
|
|
71
|
+
// The `tl` parser does not handle such end-tags and leaves the element unclosed,
|
|
72
|
+
// causing all subsequent siblings to be absorbed as children.
|
|
73
|
+
let stripped = normalize_split_closing_tags(&stripped);
|
|
69
74
|
let mut preprocessed = preprocess_html(&stripped).into_owned();
|
|
70
75
|
let mut preprocessed_len = preprocessed.len();
|
|
71
76
|
|
|
@@ -74,6 +79,7 @@ pub fn convert_html_impl(
|
|
|
74
79
|
let stripped = strip_script_and_style_tags(&repaired_html);
|
|
75
80
|
let stripped = strip_hidden_elements(&stripped);
|
|
76
81
|
let stripped = normalize_bogus_comment_endings(&stripped);
|
|
82
|
+
let stripped = normalize_split_closing_tags(&stripped);
|
|
77
83
|
let repaired = preprocess_html(&stripped).into_owned();
|
|
78
84
|
preprocessed = repaired;
|
|
79
85
|
preprocessed_len = preprocessed.len();
|
|
@@ -88,6 +94,7 @@ pub fn convert_html_impl(
|
|
|
88
94
|
let stripped = strip_script_and_style_tags(&repaired_html);
|
|
89
95
|
let stripped = strip_hidden_elements(&stripped);
|
|
90
96
|
let stripped = normalize_bogus_comment_endings(&stripped);
|
|
97
|
+
let stripped = normalize_split_closing_tags(&stripped);
|
|
91
98
|
preprocessed = preprocess_html(&stripped).into_owned();
|
|
92
99
|
preprocessed_len = preprocessed.len();
|
|
93
100
|
continue;
|
|
@@ -109,6 +116,7 @@ pub fn convert_html_impl(
|
|
|
109
116
|
let stripped = strip_script_and_style_tags(&repaired_html);
|
|
110
117
|
let stripped = strip_hidden_elements(&stripped);
|
|
111
118
|
let stripped = normalize_bogus_comment_endings(&stripped);
|
|
119
|
+
let stripped = normalize_split_closing_tags(&stripped);
|
|
112
120
|
preprocessed = preprocess_html(&stripped).into_owned();
|
|
113
121
|
preprocessed_len = preprocessed.len();
|
|
114
122
|
// Re-parse with repaired HTML
|