html-to-markdown 2.26.3 → 2.27.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +20 -2
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/ext/html-to-markdown-rb/native/src/options.rs +1 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +14 -13
- data/rust-vendor/html-to-markdown-rs/src/converter/context.rs +0 -3
- data/rust-vendor/html-to-markdown-rs/src/converter/list/definition.rs +8 -67
- data/rust-vendor/html-to-markdown-rs/src/converter/main.rs +8 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/mod.rs +1 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs +265 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs +8 -86
- data/rust-vendor/html-to-markdown-rs/src/options/conversion.rs +2 -2
- data/rust-vendor/html-to-markdown-rs/src/options/validation.rs +4 -0
- data/rust-vendor/html-to-markdown-rs/tests/issue_200_regressions.rs +25 -4
- data/rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs +214 -0
- data/rust-vendor/pxfm/.cargo-checksum.json +1 -1
- data/rust-vendor/pxfm/.cargo_vcs_info.json +1 -1
- data/rust-vendor/pxfm/Cargo.lock +1 -19
- data/rust-vendor/pxfm/Cargo.toml +1 -4
- data/rust-vendor/pxfm/Cargo.toml.orig +1 -4
- data/rust-vendor/pxfm/src/common.rs +0 -44
- data/rust-vendor/pxfm/src/pow.rs +2 -1
- data/rust-vendor/pxfm/src/powf.rs +1 -0
- data/rust-vendor/pxfm/src/tangent/cotpi.rs +11 -11
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 41d0b097b3f46c377ddadf6da05274a283efeddf1142795a1766e908d7c78290
|
|
4
|
+
data.tar.gz: 2e6585fb07a8e8cf3fc1f474cbadf6b2a3135f00ef4707029d3e35dce02726c5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: df64f0ec15405f15043aa6594aa0560885dfa9c957fcf40ec5b4de35457f06442efd0cd3bb868d06a2a53c67a3a096719ce026e260173114857f94ab5c6249cb
|
|
7
|
+
data.tar.gz: cbaa15dee930c2940b9aaf4f768d63cd95373cc4aea4c093d58a049e6612c3e533645cdbeaf9a64aca0571ad4a2995873ecb1b98a3034de83d5771439673989c
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.26.3)
|
|
4
|
+
html-to-markdown (2.27.1)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -172,7 +172,7 @@ CHECKSUMS
|
|
|
172
172
|
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
|
|
173
173
|
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
|
|
174
174
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
175
|
-
html-to-markdown (2.26.3)
|
|
175
|
+
html-to-markdown (2.27.1)
|
|
176
176
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
177
177
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
178
178
|
json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
|
data/README.md
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
20
|
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
|
|
21
|
-
<img src="https://img.shields.io/badge/Go-v2.26.3-007ec6" alt="Go">
|
|
21
|
+
<img src="https://img.shields.io/badge/Go-v2.27.1-007ec6" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
@@ -144,7 +144,7 @@ Extract base64-encoded inline images with metadata.
|
|
|
144
144
|
- `wrap_width`: Wrap at column — default: `80`
|
|
145
145
|
- `code_language`: Default fenced code block language — default: none
|
|
146
146
|
- `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
|
|
147
|
-
- `output_format`: Output markup format (`"markdown"` | `"djot"`) — default: `"markdown"`
|
|
147
|
+
- `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
|
|
148
148
|
|
|
149
149
|
**`MetadataConfig`** – Selective metadata extraction:
|
|
150
150
|
- `extract_headers`: h1-h6 elements — default: `true`
|
|
@@ -191,6 +191,24 @@ djot = HtmlToMarkdown.convert(html, output_format: 'djot')
|
|
|
191
191
|
Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
|
|
192
192
|
|
|
193
193
|
|
|
194
|
+
## Plain Text Output
|
|
195
|
+
|
|
196
|
+
Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
```ruby
|
|
200
|
+
require 'html_to_markdown'
|
|
201
|
+
|
|
202
|
+
html = "<h1>Title</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
|
|
203
|
+
|
|
204
|
+
plain = HtmlToMarkdown.convert(html, output_format: 'plain')
|
|
205
|
+
# Result: "Title\n\nThis is bold and italic text."
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
|
|
210
|
+
|
|
211
|
+
|
|
194
212
|
|
|
195
213
|
## Metadata Extraction
|
|
196
214
|
|
|
@@ -65,6 +65,7 @@ pub fn parse_output_format(value: Value) -> Result<OutputFormat, Error> {
|
|
|
65
65
|
match symbol_to_string(value)?.as_str() {
|
|
66
66
|
"markdown" => Ok(OutputFormat::Markdown),
|
|
67
67
|
"djot" => Ok(OutputFormat::Djot),
|
|
68
|
+
"plain" => Ok(OutputFormat::Plain),
|
|
68
69
|
other => Err(arg_error(format!("invalid output_format: {other}"))),
|
|
69
70
|
}
|
|
70
71
|
}
|
|
@@ -562,19 +562,20 @@ fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
|
|
|
562
562
|
Cow::Borrowed(trimmed)
|
|
563
563
|
};
|
|
564
564
|
|
|
565
|
-
let escaped =
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
565
|
+
let escaped = if options.output_format == crate::options::OutputFormat::Plain {
|
|
566
|
+
normalized.into_owned()
|
|
567
|
+
} else if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
|
|
568
|
+
text::escape(
|
|
569
|
+
normalized.as_ref(),
|
|
570
|
+
options.escape_misc,
|
|
571
|
+
options.escape_asterisks,
|
|
572
|
+
options.escape_underscores,
|
|
573
|
+
options.escape_ascii,
|
|
574
|
+
)
|
|
575
|
+
.into_owned()
|
|
576
|
+
} else {
|
|
577
|
+
normalized.into_owned()
|
|
578
|
+
};
|
|
578
579
|
|
|
579
580
|
let mut output = String::with_capacity(escaped.len() + 1);
|
|
580
581
|
output.push_str(&escaped);
|
|
@@ -35,8 +35,6 @@ pub struct Context {
|
|
|
35
35
|
pub(crate) list_counter: usize,
|
|
36
36
|
/// Are we in an ordered list (vs unordered)?
|
|
37
37
|
pub(crate) in_ordered_list: bool,
|
|
38
|
-
/// Track if previous sibling in dl was a dt
|
|
39
|
-
pub(crate) last_was_dt: bool,
|
|
40
38
|
/// Blockquote nesting depth
|
|
41
39
|
pub(crate) blockquote_depth: usize,
|
|
42
40
|
/// Are we inside a table cell (td/th)?
|
|
@@ -142,7 +140,6 @@ impl Context {
|
|
|
142
140
|
in_code: false,
|
|
143
141
|
list_counter: 0,
|
|
144
142
|
in_ordered_list: false,
|
|
145
|
-
last_was_dt: false,
|
|
146
143
|
blockquote_depth: 0,
|
|
147
144
|
in_table_cell: false,
|
|
148
145
|
convert_as_inline: options.convert_as_inline,
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
//! Processes definition lists with:
|
|
4
4
|
//! - Definition terms (dt)
|
|
5
5
|
//! - Definition descriptions (dd)
|
|
6
|
-
//! -
|
|
6
|
+
//! - Plain block formatting (no Pandoc colon syntax)
|
|
7
7
|
|
|
8
8
|
use crate::options::ConversionOptions;
|
|
9
9
|
use tl;
|
|
@@ -24,12 +24,12 @@ pub(crate) fn handle_dl(
|
|
|
24
24
|
depth: usize,
|
|
25
25
|
dom_ctx: &DomContext,
|
|
26
26
|
) {
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
};
|
|
27
|
+
let tag = match node_handle.get(parser) {
|
|
28
|
+
Some(tl::Node::Tag(t)) => t,
|
|
29
|
+
_ => return,
|
|
30
|
+
};
|
|
32
31
|
|
|
32
|
+
if ctx.convert_as_inline {
|
|
33
33
|
let children = tag.children();
|
|
34
34
|
{
|
|
35
35
|
for child_handle in children.top().iter() {
|
|
@@ -40,48 +40,11 @@ pub(crate) fn handle_dl(
|
|
|
40
40
|
return;
|
|
41
41
|
}
|
|
42
42
|
|
|
43
|
-
let tag = match node_handle.get(parser) {
|
|
44
|
-
Some(tl::Node::Tag(t)) => t,
|
|
45
|
-
_ => return,
|
|
46
|
-
};
|
|
47
|
-
|
|
48
43
|
let mut content = String::new();
|
|
49
|
-
let mut in_dt_group = false;
|
|
50
44
|
let children = tag.children();
|
|
51
45
|
{
|
|
52
46
|
for child_handle in children.top().iter() {
|
|
53
|
-
|
|
54
|
-
if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
|
|
55
|
-
let tag_name = {
|
|
56
|
-
use crate::converter::normalized_tag_name;
|
|
57
|
-
normalized_tag_name(child_tag.name().as_utf8_str())
|
|
58
|
-
};
|
|
59
|
-
(tag_name == "dt", tag_name == "dd")
|
|
60
|
-
} else {
|
|
61
|
-
(false, false)
|
|
62
|
-
};
|
|
63
|
-
|
|
64
|
-
let child_ctx = Context {
|
|
65
|
-
last_was_dt: in_dt_group && is_definition_description,
|
|
66
|
-
..ctx.clone()
|
|
67
|
-
};
|
|
68
|
-
crate::converter::walk_node(child_handle, parser, &mut content, options, &child_ctx, depth, dom_ctx);
|
|
69
|
-
|
|
70
|
-
match child_handle.get(parser) {
|
|
71
|
-
Some(tl::Node::Tag(_)) => {
|
|
72
|
-
if is_definition_term {
|
|
73
|
-
in_dt_group = true;
|
|
74
|
-
} else if !is_definition_description {
|
|
75
|
-
in_dt_group = false;
|
|
76
|
-
}
|
|
77
|
-
}
|
|
78
|
-
Some(tl::Node::Raw(raw)) => {
|
|
79
|
-
if !raw.as_utf8_str().trim().is_empty() {
|
|
80
|
-
in_dt_group = false;
|
|
81
|
-
}
|
|
82
|
-
}
|
|
83
|
-
Some(tl::Node::Comment(_)) | None => {}
|
|
84
|
-
}
|
|
47
|
+
crate::converter::walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
|
|
85
48
|
}
|
|
86
49
|
}
|
|
87
50
|
|
|
@@ -132,8 +95,7 @@ pub(crate) fn handle_dt(
|
|
|
132
95
|
|
|
133
96
|
/// Handle definition description element (<dd>).
|
|
134
97
|
///
|
|
135
|
-
/// Outputs the description
|
|
136
|
-
/// or on its own with proper spacing.
|
|
98
|
+
/// Outputs the description as a plain block.
|
|
137
99
|
pub(crate) fn handle_dd(
|
|
138
100
|
node_handle: &tl::NodeHandle,
|
|
139
101
|
parser: &tl::Parser,
|
|
@@ -162,27 +124,6 @@ pub(crate) fn handle_dd(
|
|
|
162
124
|
if !trimmed.is_empty() {
|
|
163
125
|
output.push_str(trimmed);
|
|
164
126
|
}
|
|
165
|
-
} else if ctx.last_was_dt {
|
|
166
|
-
if trimmed.is_empty() {
|
|
167
|
-
output.push_str(": \n\n");
|
|
168
|
-
} else {
|
|
169
|
-
let mut lines = trimmed.lines();
|
|
170
|
-
if let Some(first) = lines.next() {
|
|
171
|
-
output.push_str(": ");
|
|
172
|
-
output.push_str(first);
|
|
173
|
-
output.push('\n');
|
|
174
|
-
}
|
|
175
|
-
for line in lines {
|
|
176
|
-
if line.is_empty() {
|
|
177
|
-
output.push('\n');
|
|
178
|
-
} else {
|
|
179
|
-
output.push_str(" ");
|
|
180
|
-
output.push_str(line);
|
|
181
|
-
output.push('\n');
|
|
182
|
-
}
|
|
183
|
-
}
|
|
184
|
-
output.push('\n');
|
|
185
|
-
}
|
|
186
127
|
} else if !trimmed.is_empty() {
|
|
187
128
|
output.push_str(trimmed);
|
|
188
129
|
output.push_str("\n\n");
|
|
@@ -18,11 +18,13 @@ use crate::converter::main_helpers::{
|
|
|
18
18
|
extract_head_metadata, format_metadata_frontmatter, handle_hocr_document, has_custom_element_tags,
|
|
19
19
|
repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace,
|
|
20
20
|
};
|
|
21
|
+
use crate::converter::plain_text::extract_plain_text;
|
|
21
22
|
use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing};
|
|
22
23
|
use crate::converter::utility::caching::build_dom_context;
|
|
23
24
|
use crate::converter::utility::content::normalized_tag_name;
|
|
24
25
|
use crate::converter::utility::preprocessing::{preprocess_html, strip_script_and_style_tags};
|
|
25
26
|
use crate::converter::utility::serialization::serialize_tag_to_html;
|
|
27
|
+
use crate::options::OutputFormat;
|
|
26
28
|
|
|
27
29
|
use crate::converter::handlers::{handle_blockquote, handle_code, handle_graphic, handle_img, handle_link, handle_pre};
|
|
28
30
|
use crate::error::Result;
|
|
@@ -134,6 +136,12 @@ pub(crate) fn convert_html_impl(
|
|
|
134
136
|
}
|
|
135
137
|
}
|
|
136
138
|
|
|
139
|
+
// Fast path for plain text output: skip the full conversion pipeline
|
|
140
|
+
if options.output_format == OutputFormat::Plain {
|
|
141
|
+
let plain = extract_plain_text(&dom, parser, options);
|
|
142
|
+
return Ok(plain);
|
|
143
|
+
}
|
|
144
|
+
|
|
137
145
|
let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
|
|
138
146
|
#[cfg(feature = "metadata")]
|
|
139
147
|
let wants_document = metadata_collector
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
//! Plain text extraction from parsed HTML DOM.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides a fast-path text extractor that walks the DOM tree collecting only
|
|
4
|
+
//! visible text content with structural whitespace, bypassing the full
|
|
5
|
+
//! Markdown/Djot conversion pipeline.
|
|
6
|
+
|
|
7
|
+
use crate::options::ConversionOptions;
|
|
8
|
+
use crate::text;
|
|
9
|
+
|
|
10
|
+
/// Tags whose content should be skipped entirely.
|
|
11
|
+
const SKIP_TAGS: &[&str] = &["script", "style", "head", "template", "noscript", "svg", "math"];
|
|
12
|
+
|
|
13
|
+
/// Block-level tags that should be separated by blank lines.
|
|
14
|
+
const BLOCK_TAGS: &[&str] = &[
|
|
15
|
+
"p",
|
|
16
|
+
"div",
|
|
17
|
+
"h1",
|
|
18
|
+
"h2",
|
|
19
|
+
"h3",
|
|
20
|
+
"h4",
|
|
21
|
+
"h5",
|
|
22
|
+
"h6",
|
|
23
|
+
"blockquote",
|
|
24
|
+
"section",
|
|
25
|
+
"article",
|
|
26
|
+
"aside",
|
|
27
|
+
"main",
|
|
28
|
+
"nav",
|
|
29
|
+
"header",
|
|
30
|
+
"footer",
|
|
31
|
+
"figure",
|
|
32
|
+
"figcaption",
|
|
33
|
+
"details",
|
|
34
|
+
"summary",
|
|
35
|
+
"address",
|
|
36
|
+
"hgroup",
|
|
37
|
+
"search",
|
|
38
|
+
];
|
|
39
|
+
|
|
40
|
+
/// Extract plain text from a parsed DOM tree.
|
|
41
|
+
///
|
|
42
|
+
/// Walks the tree collecting visible text with structural whitespace:
|
|
43
|
+
/// - Block elements get blank-line separation
|
|
44
|
+
/// - `<br>` becomes a newline, `<hr>` a blank line
|
|
45
|
+
/// - `<pre>` preserves internal whitespace
|
|
46
|
+
/// - `<img>` outputs alt text (unless `skip_images` is set)
|
|
47
|
+
/// - `<script>`, `<style>`, `<head>`, `<template>`, `<noscript>`, `<svg>`, `<math>` are skipped
|
|
48
|
+
/// - Tables: cells separated by tab, rows by newline
|
|
49
|
+
/// - Inline elements are recursed without markers
|
|
50
|
+
pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
|
|
51
|
+
let mut buf = String::with_capacity(1024);
|
|
52
|
+
|
|
53
|
+
for child_handle in dom.children() {
|
|
54
|
+
walk_plain(child_handle, parser, &mut buf, options, false);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
post_process(&mut buf);
|
|
58
|
+
buf
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/// Recursive plain-text walker.
|
|
62
|
+
fn walk_plain(
|
|
63
|
+
node_handle: &tl::NodeHandle,
|
|
64
|
+
parser: &tl::Parser,
|
|
65
|
+
buf: &mut String,
|
|
66
|
+
options: &ConversionOptions,
|
|
67
|
+
in_pre: bool,
|
|
68
|
+
) {
|
|
69
|
+
let Some(node) = node_handle.get(parser) else {
|
|
70
|
+
return;
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
match node {
|
|
74
|
+
tl::Node::Raw(bytes) => {
|
|
75
|
+
let raw = bytes.as_utf8_str();
|
|
76
|
+
let decoded = text::decode_html_entities_cow(raw.as_ref());
|
|
77
|
+
if in_pre {
|
|
78
|
+
buf.push_str(&decoded);
|
|
79
|
+
} else {
|
|
80
|
+
let normalized = text::normalize_whitespace_cow(&decoded);
|
|
81
|
+
if !normalized.is_empty() {
|
|
82
|
+
// Avoid leading space at start of a new line
|
|
83
|
+
if normalized.as_ref() == " " && buf.ends_with('\n') {
|
|
84
|
+
return;
|
|
85
|
+
}
|
|
86
|
+
buf.push_str(&normalized);
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
tl::Node::Tag(tag) => {
|
|
91
|
+
let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
|
|
92
|
+
let tag_str = tag_name.as_str();
|
|
93
|
+
|
|
94
|
+
// Skip invisible content
|
|
95
|
+
if SKIP_TAGS.contains(&tag_str) {
|
|
96
|
+
return;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
match tag_str {
|
|
100
|
+
"br" => {
|
|
101
|
+
buf.push('\n');
|
|
102
|
+
}
|
|
103
|
+
"hr" => {
|
|
104
|
+
ensure_blank_line(buf);
|
|
105
|
+
}
|
|
106
|
+
"pre" => {
|
|
107
|
+
ensure_blank_line(buf);
|
|
108
|
+
walk_children(tag, parser, buf, options, true);
|
|
109
|
+
ensure_blank_line(buf);
|
|
110
|
+
}
|
|
111
|
+
"img" => {
|
|
112
|
+
if !options.skip_images {
|
|
113
|
+
if let Some(Some(alt)) = tag.attributes().get("alt") {
|
|
114
|
+
let alt_text = alt.as_utf8_str();
|
|
115
|
+
if !alt_text.is_empty() {
|
|
116
|
+
buf.push_str(alt_text.as_ref());
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
"table" => {
|
|
122
|
+
ensure_blank_line(buf);
|
|
123
|
+
walk_table(tag, parser, buf, options);
|
|
124
|
+
ensure_blank_line(buf);
|
|
125
|
+
}
|
|
126
|
+
"li" => {
|
|
127
|
+
ensure_newline(buf);
|
|
128
|
+
walk_children(tag, parser, buf, options, false);
|
|
129
|
+
ensure_newline(buf);
|
|
130
|
+
}
|
|
131
|
+
_ if BLOCK_TAGS.contains(&tag_str) => {
|
|
132
|
+
ensure_blank_line(buf);
|
|
133
|
+
walk_children(tag, parser, buf, options, in_pre);
|
|
134
|
+
ensure_blank_line(buf);
|
|
135
|
+
}
|
|
136
|
+
_ => {
|
|
137
|
+
// Inline elements and structural containers (html, body, ul, ol, etc.)
|
|
138
|
+
walk_children(tag, parser, buf, options, in_pre);
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
tl::Node::Comment(_) => {}
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/// Walk all children of a tag.
|
|
147
|
+
fn walk_children(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions, in_pre: bool) {
|
|
148
|
+
let children = tag.children();
|
|
149
|
+
let top = children.top();
|
|
150
|
+
for child in top.iter() {
|
|
151
|
+
walk_plain(child, parser, buf, options, in_pre);
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
|
|
156
|
+
fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions) {
|
|
157
|
+
// Collect all <tr> node handles by recursing into the table
|
|
158
|
+
let mut row_handles = Vec::new();
|
|
159
|
+
collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
|
|
160
|
+
|
|
161
|
+
for (row_idx, row_handle) in row_handles.iter().enumerate() {
|
|
162
|
+
if row_idx > 0 {
|
|
163
|
+
buf.push('\n');
|
|
164
|
+
}
|
|
165
|
+
let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) else {
|
|
166
|
+
continue;
|
|
167
|
+
};
|
|
168
|
+
|
|
169
|
+
// Collect direct <th>/<td> children
|
|
170
|
+
let mut cell_handles = Vec::new();
|
|
171
|
+
let row_children = row_tag.children();
|
|
172
|
+
let row_top = row_children.top();
|
|
173
|
+
for child in row_top.iter() {
|
|
174
|
+
if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
|
|
175
|
+
let name = child_tag.name().as_utf8_str();
|
|
176
|
+
if name.eq_ignore_ascii_case("th") || name.eq_ignore_ascii_case("td") {
|
|
177
|
+
cell_handles.push(*child);
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
for (cell_idx, cell_handle) in cell_handles.iter().enumerate() {
|
|
183
|
+
if cell_idx > 0 {
|
|
184
|
+
buf.push('\t');
|
|
185
|
+
}
|
|
186
|
+
let mut cell_buf = String::new();
|
|
187
|
+
if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
|
|
188
|
+
walk_children(cell_tag, parser, &mut cell_buf, options, false);
|
|
189
|
+
}
|
|
190
|
+
buf.push_str(cell_buf.trim());
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
/// Recursively collect all descendant `NodeHandle`s matching `target_tag` (by cloning handles).
|
|
196
|
+
fn collect_descendant_handles(
|
|
197
|
+
tag: &tl::HTMLTag,
|
|
198
|
+
parser: &tl::Parser,
|
|
199
|
+
target_tag: &str,
|
|
200
|
+
result: &mut Vec<tl::NodeHandle>,
|
|
201
|
+
) {
|
|
202
|
+
let children = tag.children();
|
|
203
|
+
let top = children.top();
|
|
204
|
+
for child in top.iter() {
|
|
205
|
+
if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
|
|
206
|
+
if child_tag.name().as_utf8_str().eq_ignore_ascii_case(target_tag) {
|
|
207
|
+
result.push(*child);
|
|
208
|
+
} else {
|
|
209
|
+
collect_descendant_handles(child_tag, parser, target_tag, result);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
/// Ensure the buffer ends with a blank line (two newlines).
|
|
216
|
+
fn ensure_blank_line(buf: &mut String) {
|
|
217
|
+
if buf.is_empty() {
|
|
218
|
+
return;
|
|
219
|
+
}
|
|
220
|
+
// Strip trailing horizontal whitespace
|
|
221
|
+
while buf.ends_with(' ') || buf.ends_with('\t') {
|
|
222
|
+
buf.pop();
|
|
223
|
+
}
|
|
224
|
+
let current_newlines = buf.chars().rev().take_while(|&c| c == '\n').count();
|
|
225
|
+
for _ in current_newlines..2 {
|
|
226
|
+
buf.push('\n');
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
/// Ensure the buffer ends with at least one newline.
|
|
231
|
+
fn ensure_newline(buf: &mut String) {
|
|
232
|
+
if buf.is_empty() {
|
|
233
|
+
return;
|
|
234
|
+
}
|
|
235
|
+
if !buf.ends_with('\n') {
|
|
236
|
+
buf.push('\n');
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
/// Post-process: collapse 3+ newlines to 2, trim line-end whitespace, ensure single trailing newline.
|
|
241
|
+
fn post_process(buf: &mut String) {
|
|
242
|
+
// Collapse runs of 3+ newlines to exactly 2
|
|
243
|
+
while buf.contains("\n\n\n") {
|
|
244
|
+
*buf = buf.replace("\n\n\n", "\n\n");
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// Trim trailing whitespace from each line — collect owned strings to avoid borrow conflict
|
|
248
|
+
let lines: Vec<String> = buf.lines().map(|line| line.trim_end().to_string()).collect();
|
|
249
|
+
buf.clear();
|
|
250
|
+
for (i, line) in lines.iter().enumerate() {
|
|
251
|
+
buf.push_str(line);
|
|
252
|
+
if i < lines.len() - 1 {
|
|
253
|
+
buf.push('\n');
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
// Trim to single trailing newline
|
|
258
|
+
let keep = buf.trim_end_matches('\n').len();
|
|
259
|
+
if keep == 0 {
|
|
260
|
+
buf.clear();
|
|
261
|
+
} else {
|
|
262
|
+
buf.truncate(keep);
|
|
263
|
+
buf.push('\n');
|
|
264
|
+
}
|
|
265
|
+
}
|