html-to-markdown 3.4.0.pre.rc.44 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. checksums.yaml +4 -4
  2. data/ext/html_to_markdown_rb/Cargo.toml +1 -1
  3. data/lib/bin/html-to-markdown +0 -0
  4. data/lib/html_to_markdown/native.rb +9 -5
  5. data/lib/html_to_markdown/version.rb +2 -2
  6. data/vendor/Cargo.toml +1 -1
  7. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  8. data/vendor/html-to-markdown-rs/src/converter/block/table/builder.rs +47 -1
  9. data/vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +87 -12
  10. data/vendor/html-to-markdown-rs/src/converter/block/table/cells.rs +87 -5
  11. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +4 -4
  12. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +19 -1
  13. data/vendor/html-to-markdown-rs/src/converter/main.rs +9 -1
  14. data/vendor/html-to-markdown-rs/src/converter/plain_text.rs +158 -40
  15. data/vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs +64 -31
  16. data/vendor/html-to-markdown-rs/src/converter/text_node.rs +4 -10
  17. data/vendor/html-to-markdown-rs/src/converter/utility/preprocessing.rs +137 -1
  18. data/vendor/html-to-markdown-rs/tests/integration_test.rs +13 -5
  19. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +4 -2
  20. data/vendor/html-to-markdown-rs/tests/issue_143_regressions.rs +53 -0
  21. data/vendor/html-to-markdown-rs/tests/issue_336_regressions.rs +74 -0
  22. data/vendor/html-to-markdown-rs/tests/issue_347_regressions.rs +154 -0
  23. data/vendor/html-to-markdown-rs/tests/issue_348_visitor_plain.rs +93 -0
  24. data/vendor/html-to-markdown-rs/tests/tables_test.rs +39 -23
  25. data/vendor/html-to-markdown-rs/tests/xml_tables_test.rs +74 -47
  26. metadata +4 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d5a2e4bc34ff80125912031e6e942772c52769430f5520f8ab312222cdf099cb
4
- data.tar.gz: e3985c9fa10a4507043670283f7965c2a04a92eb6073f917e48f1514b4dac428
3
+ metadata.gz: e1296a51b6ca0f757a95487c567e9d1e0d2acda80a46656cd14002322a4b850b
4
+ data.tar.gz: dca253ecef2f4818aa9e39e56d48ca5bb9ad35f92759580ee893d17eb455d494
5
5
  SHA512:
6
- metadata.gz: 4a179c7450759417b24669b8bda93462f707af97183634740a687205b7a16b2ce5169daa1369f402b17a3e9ae07a124603a6c2d3ce3f7dc5c2b1832fd42559f8
7
- data.tar.gz: b56307b497e4794a1dc08b9dee9d5fd61dc0df09fa9ea40a819b7b39b8b89daa658798c0b909c2030ab9472b4b72186319f4a73a2cb6fbb194f274e783943a04
6
+ metadata.gz: 989e6b723c3670894b9e87ad87c1d752c27e3917f5cc60f0ba8f40175ee97c35ef4a2a40da53ab77908c6febb44abc0fe66064470494e73be3a5aae61d8498ae
7
+ data.tar.gz: 6275bc085c86f521114dbfd860e856dfdeac43ae92b755d81960e22e11a6d73ec4f25c89c837c37072d5500fd22af92f9507ace1673337708cac1a5376db926c
@@ -1,7 +1,7 @@
1
1
 
2
2
  [package]
3
3
  name = "html-to-markdown-rb"
4
- version = "3.4.0-rc.44"
4
+ version = "3.4.0"
5
5
  edition = "2024"
6
6
  license = "MIT"
7
7
  [workspace]
Binary file
@@ -1,5 +1,5 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:1bd3c99db21c2ca078644f1e220656297c784bcca532f13d3b7efed3f45dbb0b
2
+ # alef:hash:b54e7bb2ab55cc6c25c9cac0e62ec66c35fd2d1956ef9ba5e3dc9e7ba5e666a5
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # Issues & docs: https://github.com/kreuzberg-dev/alef
@@ -23,7 +23,8 @@ end
23
23
  class Hash
24
24
  # Support internally-tagged enum accessors like format.excel, format.email, etc.
25
25
  # Also support direct field access like format.sheet_count
26
- def method_missing(method_name, *args, &)
26
+ # rubocop:disable Metrics/CyclomaticComplexity
27
+ def method_missing(method_name, *args, &block)
27
28
  # Try symbol key first (how Magnus converts JSON keys)
28
29
  return self[method_name] if key?(method_name)
29
30
 
@@ -31,22 +32,25 @@ class Hash
31
32
  return self[method_name.to_s] if key?(method_name.to_s)
32
33
 
33
34
  # Check if this hash has a 'format_type' field (indicating an internally-tagged enum)
34
- format_type = self[:format_type] || self['format_type']
35
+ format_type = self[:'format_type'] || self['format_type']
35
36
  return super unless format_type
36
37
 
37
38
  # If the method name matches the format_type (snake_case), extract and return the variant's wrapped data
38
39
  # Internally-tagged enums store variant data in the '_0' field (from alef's struct variant conversion)
39
40
  # This allows format.excel to return the ExcelMetadata hash with sheet_count, sheet_names, etc.
40
41
  snake_case_method = method_name.to_s.downcase
41
- return self[:_0] || self['_0'] || self if snake_case_method == format_type.to_s.downcase
42
+ if snake_case_method == format_type.to_s.downcase
43
+ return self[:'_0'] || self['_0'] || self
44
+ end
42
45
 
43
46
  super
44
47
  end
48
+ # rubocop:enable Metrics/CyclomaticComplexity
45
49
 
46
50
  def respond_to_missing?(method_name, include_private = false)
47
51
  return true if key?(method_name) || key?(method_name.to_s)
48
52
 
49
- format_type = self[:format_type] || self['format_type']
53
+ format_type = self[:'format_type'] || self['format_type']
50
54
  return false unless format_type
51
55
 
52
56
  snake_case_method = method_name.to_s.downcase
@@ -1,10 +1,10 @@
1
1
  # This file is auto-generated by alef — DO NOT EDIT.
2
- # alef:hash:4ce3ba64cbea5c72b082111f57a04e219e555fda52859a9673edcec555031706
2
+ # alef:hash:9c58cf63849e82246f03b4fcc3996c264d47f2b2c27e0e8ba6b93eb4a84cb279
3
3
  # To regenerate: alef generate
4
4
  # To verify freshness: alef verify --exit-code
5
5
  # Issues & docs: https://github.com/kreuzberg-dev/alef
6
6
  # frozen_string_literal: true
7
7
 
8
8
  module HtmlToMarkdown
9
- VERSION = '3.4.0.pre.rc.44'
9
+ VERSION = '3.4.0'
10
10
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "3.4.0-rc.44"
6
+ version = "3.4.0"
7
7
  edition = "2024"
8
8
  rust-version = "1.85"
9
9
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "3.4.0-rc.44"
3
+ version = "3.4.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -6,7 +6,7 @@
6
6
  use std::borrow::Cow;
7
7
 
8
8
  use super::cell::{collect_table_cells, get_colspan};
9
- use super::cells::{append_layout_row, convert_table_row};
9
+ use super::cells::{append_layout_row, collect_row_cell_widths, convert_table_row};
10
10
  use super::scanner::scan_table;
11
11
  use super::utils::{is_tag_name, normalized_tag_name};
12
12
  #[cfg(feature = "visitor")]
@@ -216,6 +216,50 @@ pub fn handle_table(
216
216
  let mut rowspan_tracker = vec![None; total_cols];
217
217
  let mut row_cells = Vec::new();
218
218
 
219
+ // Pre-pass: compute per-column max content widths for aligned padding.
220
+ // Uses a rowspan tracker so spanned columns are skipped just as they
221
+ // are in the render pass, keeping column indices correctly aligned.
222
+ let col_widths = {
223
+ let mut widths: Vec<usize> = Vec::new();
224
+ let mut prepass_rowspan: Vec<Option<usize>> = Vec::new();
225
+ let children = tag.children();
226
+ for child_handle in children.top().iter() {
227
+ if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
228
+ let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
229
+ match tag_name.as_ref() {
230
+ "thead" | "tbody" | "tfoot" => {
231
+ for row_handle in child_tag.children().top().iter() {
232
+ if is_tag_name(row_handle, parser, dom_ctx, "tr") {
233
+ collect_row_cell_widths(
234
+ row_handle,
235
+ parser,
236
+ options,
237
+ ctx,
238
+ dom_ctx,
239
+ &mut widths,
240
+ &mut prepass_rowspan,
241
+ );
242
+ }
243
+ }
244
+ }
245
+ "tr" | "row" => {
246
+ collect_row_cell_widths(
247
+ child_handle,
248
+ parser,
249
+ options,
250
+ ctx,
251
+ dom_ctx,
252
+ &mut widths,
253
+ &mut prepass_rowspan,
254
+ );
255
+ }
256
+ _ => {}
257
+ }
258
+ }
259
+ }
260
+ widths
261
+ };
262
+
219
263
  let children = tag.children();
220
264
  {
221
265
  for child_handle in children.top().iter() {
@@ -282,6 +326,7 @@ pub fn handle_table(
282
326
  dom_ctx,
283
327
  depth + 1,
284
328
  is_header_section,
329
+ &col_widths,
285
330
  );
286
331
  row_index += 1;
287
332
  }
@@ -312,6 +357,7 @@ pub fn handle_table(
312
357
  dom_ctx,
313
358
  depth + 1,
314
359
  row_index == 0,
360
+ &col_widths,
315
361
  );
316
362
  row_index += 1;
317
363
  }
@@ -97,10 +97,86 @@ pub fn collect_table_cells(
97
97
  }
98
98
  }
99
99
 
100
+ /// Extract the text content of a table cell for column width calculation.
101
+ ///
102
+ /// Returns the same text that would appear in the rendered cell, without
103
+ /// the surrounding pipe delimiters. Used in the first pass to compute
104
+ /// maximum column widths before rendering with padding.
105
+ ///
106
+ /// # Arguments
107
+ /// * `node_handle` - Handle to the cell element
108
+ /// * `parser` - HTML parser instance
109
+ /// * `options` - Conversion options
110
+ /// * `ctx` - Conversion context
111
+ /// * `dom_ctx` - DOM context
112
+ #[allow(clippy::trivially_copy_pass_by_ref)]
113
+ pub fn cell_text_content(
114
+ node_handle: &tl::NodeHandle,
115
+ parser: &tl::Parser,
116
+ options: &crate::options::ConversionOptions,
117
+ ctx: &super::super::super::Context,
118
+ dom_ctx: &super::super::super::DomContext,
119
+ ) -> String {
120
+ let mut text = String::with_capacity(64);
121
+
122
+ let cell_ctx = super::super::super::Context {
123
+ in_table_cell: true,
124
+ ..ctx.clone()
125
+ };
126
+
127
+ if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
128
+ let children = tag.children();
129
+ let has_tag_child = children
130
+ .top()
131
+ .iter()
132
+ .any(|child_handle| matches!(child_handle.get(parser), Some(tl::Node::Tag(_))));
133
+
134
+ if has_tag_child {
135
+ for child_handle in children.top().iter() {
136
+ super::super::super::walk_node(child_handle, parser, &mut text, options, &cell_ctx, 0, dom_ctx);
137
+ }
138
+ } else {
139
+ let raw = dom_ctx.text_content(*node_handle, parser);
140
+ let normalized = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
141
+ crate::text::normalize_whitespace_cow(raw.as_str())
142
+ } else {
143
+ Cow::Borrowed(raw.as_str())
144
+ };
145
+ let escaped = escape_cell_text(normalized.as_ref(), options);
146
+ text = escaped;
147
+ }
148
+ }
149
+
150
+ let text = text.trim();
151
+ if options.br_in_tables {
152
+ text.to_string()
153
+ } else if text.contains('\n') {
154
+ text.replace('\n', " ")
155
+ } else {
156
+ text.to_string()
157
+ }
158
+ }
159
+
160
+ /// Escape text for use inside a table cell.
161
+ ///
162
+ /// Always escapes `*` and `_` (to prevent unintended emphasis inside cells),
163
+ /// applies `escape_misc` / `escape_ascii` per options, and escapes `|` (pipe)
164
+ /// when `escape_misc` is not already handling it.
165
+ fn escape_cell_text(text: &str, options: &crate::options::ConversionOptions) -> String {
166
+ // Always escape * and _ in table cells to prevent unintended emphasis.
167
+ let escaped = crate::text::escape(text, options.escape_misc, true, true, options.escape_ascii);
168
+ if options.escape_misc {
169
+ escaped.into_owned()
170
+ } else {
171
+ escaped.replace('|', r"\|")
172
+ }
173
+ }
174
+
100
175
  /// Convert a table cell (td or th) to Markdown format.
101
176
  ///
102
177
  /// Processes cell content and renders it with pipe delimiters for Markdown tables.
103
178
  /// Handles colspan by adding extra pipes, and escapes pipes in cell content.
179
+ /// Always escapes `*` and `_` to prevent unintended emphasis inside cells.
104
180
  ///
105
181
  /// # Arguments
106
182
  /// * `node_handle` - Handle to the cell element
@@ -110,6 +186,7 @@ pub fn collect_table_cells(
110
186
  /// * `ctx` - Conversion context (visitor, etc)
111
187
  /// * `_tag_name` - Tag name (for consistency, not used)
112
188
  /// * `dom_ctx` - DOM context for content extraction
189
+ /// * `col_width` - Optional target width for padding (None = no padding)
113
190
  #[allow(clippy::trivially_copy_pass_by_ref)]
114
191
  pub fn convert_table_cell(
115
192
  node_handle: &tl::NodeHandle,
@@ -119,6 +196,7 @@ pub fn convert_table_cell(
119
196
  ctx: &super::super::super::Context,
120
197
  _tag_name: &str,
121
198
  dom_ctx: &super::super::super::DomContext,
199
+ col_width: Option<usize>,
122
200
  ) {
123
201
  let mut text = String::with_capacity(128);
124
202
 
@@ -145,18 +223,7 @@ pub fn convert_table_cell(
145
223
  } else {
146
224
  Cow::Borrowed(raw.as_str())
147
225
  };
148
- let escaped = crate::text::escape(
149
- normalized.as_ref(),
150
- options.escape_misc,
151
- options.escape_asterisks,
152
- options.escape_underscores,
153
- options.escape_ascii,
154
- );
155
- if options.escape_misc {
156
- text = escaped.into_owned();
157
- } else {
158
- text = escaped.replace('|', r"\|");
159
- }
226
+ text = escape_cell_text(normalized.as_ref(), options);
160
227
  }
161
228
  }
162
229
 
@@ -175,6 +242,14 @@ pub fn convert_table_cell(
175
242
 
176
243
  output.push(' ');
177
244
  output.push_str(&text);
245
+ if let Some(width) = col_width {
246
+ let text_len = text.chars().count();
247
+ if text_len < width {
248
+ for _ in 0..(width - text_len) {
249
+ output.push(' ');
250
+ }
251
+ }
252
+ }
178
253
  for _ in 0..colspan {
179
254
  output.push_str(" |");
180
255
  }
@@ -10,7 +10,7 @@ use crate::converter::utility::content::collect_tag_attributes;
10
10
  use crate::converter::utility::content::normalized_tag_name;
11
11
  use std::borrow::Cow;
12
12
 
13
- use super::cell::{collect_table_cells, convert_table_cell, get_colspan_rowspan};
13
+ use super::cell::{cell_text_content, collect_table_cells, convert_table_cell, get_colspan_rowspan};
14
14
 
15
15
  /// Maximum allowed table columns to prevent unbounded memory usage.
16
16
  const MAX_TABLE_COLS: usize = 1000;
@@ -87,6 +87,75 @@ pub fn append_layout_row(
87
87
  }
88
88
  }
89
89
 
90
+ /// Collect the rendered text content of every cell in a row for width calculation.
91
+ ///
92
+ /// `rowspan_tracker` mirrors the tracker in `convert_table_row` so that spanned
93
+ /// columns are skipped in the width pre-pass just as they are skipped in rendering.
94
+ /// Pass a shared tracker across all row calls to correctly handle multi-row spans.
95
+ #[allow(clippy::trivially_copy_pass_by_ref)]
96
+ pub fn collect_row_cell_widths(
97
+ node_handle: &tl::NodeHandle,
98
+ parser: &tl::Parser,
99
+ options: &crate::options::ConversionOptions,
100
+ ctx: &super::super::super::Context,
101
+ dom_ctx: &super::super::super::DomContext,
102
+ col_widths: &mut Vec<usize>,
103
+ rowspan_tracker: &mut Vec<Option<usize>>,
104
+ ) {
105
+ let mut cells = Vec::new();
106
+ collect_table_cells(node_handle, parser, dom_ctx, &mut cells);
107
+
108
+ let mut col = 0usize;
109
+ let mut cell_iter = cells.iter();
110
+
111
+ loop {
112
+ // Skip columns that are filled by a rowspan from a previous row.
113
+ while col < rowspan_tracker.len() {
114
+ if let Some(Some(remaining)) = rowspan_tracker.get_mut(col) {
115
+ if *remaining > 0 {
116
+ *remaining -= 1;
117
+ if *remaining == 0 {
118
+ rowspan_tracker[col] = None;
119
+ }
120
+ col += 1;
121
+ continue;
122
+ }
123
+ }
124
+ break;
125
+ }
126
+
127
+ let Some(cell_handle) = cell_iter.next() else {
128
+ break;
129
+ };
130
+
131
+ let text = cell_text_content(cell_handle, parser, options, ctx, dom_ctx);
132
+ let width = text.chars().count();
133
+
134
+ // Grow the widths vec if needed.
135
+ if col >= col_widths.len() {
136
+ col_widths.resize(col + 1, 0);
137
+ }
138
+ if width > col_widths[col] {
139
+ col_widths[col] = width;
140
+ }
141
+
142
+ let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
143
+
144
+ // Record rowspan for future rows.
145
+ if rowspan > 1 {
146
+ if col >= rowspan_tracker.len() {
147
+ rowspan_tracker.resize(col + 1, None);
148
+ }
149
+ rowspan_tracker[col] = Some(rowspan - 1);
150
+ }
151
+
152
+ col = col.saturating_add(colspan);
153
+ }
154
+ }
155
+
156
+ /// Minimum separator dash count per column (matches `---`).
157
+ const MIN_SEPARATOR_DASHES: usize = 3;
158
+
90
159
  /// Convert a table row (tr) to Markdown format.
91
160
  ///
92
161
  /// Processes all cells in a row, handling colspan and rowspan for proper
@@ -107,6 +176,7 @@ pub fn append_layout_row(
107
176
  /// * `dom_ctx` - DOM context
108
177
  /// * `depth` - Nesting depth
109
178
  /// * `is_header` - Whether this is a header row
179
+ /// * `col_widths` - Per-column max content widths for padding (empty = no padding)
110
180
  #[allow(clippy::too_many_arguments)]
111
181
  #[cfg_attr(not(feature = "visitor"), allow(unused_variables))]
112
182
  #[allow(clippy::trivially_copy_pass_by_ref)]
@@ -124,6 +194,7 @@ pub fn convert_table_row(
124
194
  dom_ctx: &super::super::super::DomContext,
125
195
  depth: usize,
126
196
  is_header: bool,
197
+ col_widths: &[usize],
127
198
  ) {
128
199
  let mut row_text = String::with_capacity(256);
129
200
  let mut cells = Vec::new();
@@ -203,7 +274,13 @@ pub fn convert_table_row(
203
274
  if col_index < total_cols {
204
275
  if let Some(Some(remaining_rows)) = rowspan_tracker.get_mut(col_index) {
205
276
  if *remaining_rows > 0 {
277
+ let width = col_widths.get(col_index).copied();
206
278
  row_text.push(' ');
279
+ if let Some(w) = width {
280
+ for _ in 0..w {
281
+ row_text.push(' ');
282
+ }
283
+ }
207
284
  row_text.push_str(" |");
208
285
  *remaining_rows -= 1;
209
286
  if *remaining_rows == 0 {
@@ -216,7 +293,8 @@ pub fn convert_table_row(
216
293
  }
217
294
 
218
295
  if let Some(cell_handle) = cell_iter.next() {
219
- convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx);
296
+ let col_width = col_widths.get(col_index).copied();
297
+ convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx, col_width);
220
298
 
221
299
  let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
222
300
 
@@ -230,8 +308,9 @@ pub fn convert_table_row(
230
308
  }
231
309
  }
232
310
  } else {
233
- for cell_handle in &cells {
234
- convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx);
311
+ for (cell_idx, cell_handle) in cells.iter().enumerate() {
312
+ let col_width = col_widths.get(cell_idx).copied();
313
+ convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx, col_width);
235
314
  }
236
315
  }
237
316
 
@@ -247,7 +326,10 @@ pub fn convert_table_row(
247
326
  if i > 0 {
248
327
  output.push_str(" | ");
249
328
  }
250
- output.push_str("---");
329
+ let dash_count = col_widths.get(i).copied().unwrap_or(0).max(MIN_SEPARATOR_DASHES);
330
+ for _ in 0..dash_count {
331
+ output.push('-');
332
+ }
251
333
  }
252
334
  output.push_str(" |\n");
253
335
  }
@@ -117,11 +117,11 @@ pub fn handle_table_with_context(
117
117
  let indented = layout::indent_table_for_list(&table_output, ctx.list_depth, options);
118
118
  output.push_str(&indented);
119
119
  } else {
120
- if !output.ends_with("\n\n") {
121
- if output.is_empty() || !output.ends_with('\n') {
122
- output.push_str("\n\n");
123
- } else {
120
+ if !output.is_empty() && !output.ends_with("\n\n") {
121
+ if output.ends_with('\n') {
124
122
  output.push('\n');
123
+ } else {
124
+ output.push_str("\n\n");
125
125
  }
126
126
  }
127
127
  output.push_str(&table_output);
@@ -253,7 +253,25 @@ fn format_image_markdown(
253
253
  buf.push_str("![");
254
254
  buf.push_str(alt);
255
255
  buf.push_str("](");
256
- buf.push_str(src);
256
+
257
+ if src.is_empty() {
258
+ buf.push_str("<>");
259
+ } else if src.contains(' ') || src.contains('\n') {
260
+ buf.push('<');
261
+ buf.push_str(src);
262
+ buf.push('>');
263
+ } else {
264
+ let open_count = src.chars().filter(|&c| c == '(').count();
265
+ let close_count = src.chars().filter(|&c| c == ')').count();
266
+
267
+ if open_count == close_count {
268
+ buf.push_str(src);
269
+ } else {
270
+ let escaped_src = src.replace('(', "\\(").replace(')', "\\)");
271
+ buf.push_str(&escaped_src);
272
+ }
273
+ }
274
+
257
275
  if let Some(title_text) = title {
258
276
  buf.push_str(" \"");
259
277
  buf.push_str(title_text);
@@ -23,7 +23,8 @@ use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_d
23
23
  use crate::converter::utility::caching::build_dom_context;
24
24
  use crate::converter::utility::content::normalized_tag_name;
25
25
  use crate::converter::utility::preprocessing::{
26
- normalize_bogus_comment_endings, preprocess_html, strip_hidden_elements, strip_script_and_style_tags,
26
+ normalize_bogus_comment_endings, normalize_split_closing_tags, preprocess_html, strip_hidden_elements,
27
+ strip_script_and_style_tags,
27
28
  };
28
29
  use crate::converter::utility::serialization::serialize_tag_to_html;
29
30
  use crate::options::OutputFormat;
@@ -66,6 +67,10 @@ pub fn convert_html_impl(
66
67
  // Normalise bogus HTML comment endings (`--->`, `---->`, …) that cause the
67
68
  // `tl` parser to silently discard all document content that follows them.
68
69
  let stripped = normalize_bogus_comment_endings(&stripped);
70
+ // Normalise closing tags whose `>` is on a subsequent line (JSX-style `</a\n>`).
71
+ // The `tl` parser does not handle such end-tags and leaves the element unclosed,
72
+ // causing all subsequent siblings to be absorbed as children.
73
+ let stripped = normalize_split_closing_tags(&stripped);
69
74
  let mut preprocessed = preprocess_html(&stripped).into_owned();
70
75
  let mut preprocessed_len = preprocessed.len();
71
76
 
@@ -74,6 +79,7 @@ pub fn convert_html_impl(
74
79
  let stripped = strip_script_and_style_tags(&repaired_html);
75
80
  let stripped = strip_hidden_elements(&stripped);
76
81
  let stripped = normalize_bogus_comment_endings(&stripped);
82
+ let stripped = normalize_split_closing_tags(&stripped);
77
83
  let repaired = preprocess_html(&stripped).into_owned();
78
84
  preprocessed = repaired;
79
85
  preprocessed_len = preprocessed.len();
@@ -88,6 +94,7 @@ pub fn convert_html_impl(
88
94
  let stripped = strip_script_and_style_tags(&repaired_html);
89
95
  let stripped = strip_hidden_elements(&stripped);
90
96
  let stripped = normalize_bogus_comment_endings(&stripped);
97
+ let stripped = normalize_split_closing_tags(&stripped);
91
98
  preprocessed = preprocess_html(&stripped).into_owned();
92
99
  preprocessed_len = preprocessed.len();
93
100
  continue;
@@ -109,6 +116,7 @@ pub fn convert_html_impl(
109
116
  let stripped = strip_script_and_style_tags(&repaired_html);
110
117
  let stripped = strip_hidden_elements(&stripped);
111
118
  let stripped = normalize_bogus_comment_endings(&stripped);
119
+ let stripped = normalize_split_closing_tags(&stripped);
112
120
  preprocessed = preprocess_html(&stripped).into_owned();
113
121
  preprocessed_len = preprocessed.len();
114
122
  // Re-parse with repaired HTML