html-to-markdown 2.26.2 → 2.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +24 -10
- data/README.md +17 -1
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/ext/html-to-markdown-rb/native/src/options.rs +1 -0
- data/lib/html_to_markdown/version.rb +1 -1
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +14 -13
- data/rust-vendor/html-to-markdown-rs/src/converter/inline/semantic/typography.rs +4 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/main.rs +8 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/mod.rs +1 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs +265 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs +6 -0
- data/rust-vendor/html-to-markdown-rs/src/options/conversion.rs +2 -2
- data/rust-vendor/html-to-markdown-rs/src/options/validation.rs +4 -0
- data/rust-vendor/html-to-markdown-rs/tests/integration_test.rs +60 -0
- data/rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs +214 -0
- data/spec/visitor_spec.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b359606f2fac17cda3721fd381e8717c5f32ad6b9cbbe7d3f691078521071c5e
|
|
4
|
+
data.tar.gz: 13bb3c8ba29a9270bd91d32bb1fe50c3353895cdd1bfd920ea9c0f79c52fbe9c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f0aea92dccbf209b90476ecabccd195252ae48b2e5aad1bdd54183b1e1686e8142a7f817d54a6513e816cf1038ba20ea8eebaffe45086ad7a05648e9314799a8
|
|
7
|
+
data.tar.gz: ed60ef47e31437ea2f459addd4d871d19f55fc7a121bfcf592535811c85441a5244b52d4d4f87f70caeeb368913c44a3c091aa900c631b074b87bb8c3c81d2f3
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.26.2)
|
|
4
|
+
html-to-markdown (2.27.0)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -20,6 +20,8 @@ GEM
|
|
|
20
20
|
securerandom (>= 0.3)
|
|
21
21
|
tzinfo (~> 2.0, >= 2.0.5)
|
|
22
22
|
uri (>= 0.13.1)
|
|
23
|
+
addressable (2.8.9)
|
|
24
|
+
public_suffix (>= 2.0.2, < 8.0)
|
|
23
25
|
ast (2.4.3)
|
|
24
26
|
base64 (0.3.0)
|
|
25
27
|
bigdecimal (4.0.1)
|
|
@@ -37,6 +39,9 @@ GEM
|
|
|
37
39
|
i18n (1.14.8)
|
|
38
40
|
concurrent-ruby (~> 1.0)
|
|
39
41
|
json (2.18.1)
|
|
42
|
+
json-schema (6.1.0)
|
|
43
|
+
addressable (~> 2.8)
|
|
44
|
+
bigdecimal (>= 3.1, < 5)
|
|
40
45
|
language_server-protocol (3.17.0.5)
|
|
41
46
|
lint_roller (1.1.0)
|
|
42
47
|
listen (3.10.0)
|
|
@@ -44,14 +49,18 @@ GEM
|
|
|
44
49
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
45
50
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
46
51
|
logger (1.7.0)
|
|
47
|
-
|
|
52
|
+
mcp (0.7.1)
|
|
53
|
+
json-schema (>= 4.1)
|
|
54
|
+
minitest (6.0.2)
|
|
55
|
+
drb (~> 2.0)
|
|
48
56
|
prism (~> 1.5)
|
|
49
57
|
mutex_m (0.3.0)
|
|
50
58
|
parallel (1.27.0)
|
|
51
|
-
parser (3.3.10.
|
|
59
|
+
parser (3.3.10.2)
|
|
52
60
|
ast (~> 2.4.1)
|
|
53
61
|
racc
|
|
54
62
|
prism (1.9.0)
|
|
63
|
+
public_suffix (7.0.2)
|
|
55
64
|
racc (1.8.1)
|
|
56
65
|
rainbow (3.1.1)
|
|
57
66
|
rake (13.3.1)
|
|
@@ -76,14 +85,15 @@ GEM
|
|
|
76
85
|
rspec-expectations (3.13.5)
|
|
77
86
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
78
87
|
rspec-support (~> 3.13.0)
|
|
79
|
-
rspec-mocks (3.13.
|
|
88
|
+
rspec-mocks (3.13.8)
|
|
80
89
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
81
90
|
rspec-support (~> 3.13.0)
|
|
82
91
|
rspec-support (3.13.7)
|
|
83
|
-
rubocop (1.
|
|
92
|
+
rubocop (1.85.0)
|
|
84
93
|
json (~> 2.3)
|
|
85
94
|
language_server-protocol (~> 3.17.0.2)
|
|
86
95
|
lint_roller (~> 1.1.0)
|
|
96
|
+
mcp (~> 0.6)
|
|
87
97
|
parallel (~> 1.10)
|
|
88
98
|
parser (>= 3.3.0.2)
|
|
89
99
|
rainbow (>= 2.2.2, < 4.0)
|
|
@@ -147,6 +157,7 @@ DEPENDENCIES
|
|
|
147
157
|
|
|
148
158
|
CHECKSUMS
|
|
149
159
|
activesupport (8.1.2) sha256=88842578ccd0d40f658289b0e8c842acfe9af751afee2e0744a7873f50b6fdae
|
|
160
|
+
addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
|
|
150
161
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
151
162
|
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
152
163
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
@@ -161,18 +172,21 @@ CHECKSUMS
|
|
|
161
172
|
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
|
|
162
173
|
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
|
|
163
174
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
164
|
-
html-to-markdown (2.26.2)
|
|
175
|
+
html-to-markdown (2.27.0)
|
|
165
176
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
166
177
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
178
|
+
json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
|
|
167
179
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
168
180
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
169
181
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
170
182
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
171
|
-
|
|
183
|
+
mcp (0.7.1) sha256=fa967895d6952bad0d981ea907731d8528d2c246d2079d56a9c8bae83d14f1c7
|
|
184
|
+
minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
|
|
172
185
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
173
186
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
174
|
-
parser (3.3.10.
|
|
187
|
+
parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
|
|
175
188
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
189
|
+
public_suffix (7.0.2) sha256=9114090c8e4e7135c1fd0e7acfea33afaab38101884320c65aaa0ffb8e26a857
|
|
176
190
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
177
191
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
178
192
|
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
@@ -186,9 +200,9 @@ CHECKSUMS
|
|
|
186
200
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
187
201
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
188
202
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
189
|
-
rspec-mocks (3.13.
|
|
203
|
+
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
190
204
|
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
191
|
-
rubocop (1.
|
|
205
|
+
rubocop (1.85.0) sha256=317407feb681a07d54f64d2f9e1d6b6af1ce7678e51cd658e3ad8bd66da48c01
|
|
192
206
|
rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
|
|
193
207
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
194
208
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
data/README.md
CHANGED
|
@@ -144,7 +144,7 @@ Extract base64-encoded inline images with metadata.
|
|
|
144
144
|
- `wrap_width`: Wrap at column — default: `80`
|
|
145
145
|
- `code_language`: Default fenced code block language — default: none
|
|
146
146
|
- `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
|
|
147
|
-
- `output_format`: Output markup format (`"markdown"` | `"djot"`) — default: `"markdown"`
|
|
147
|
+
- `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
|
|
148
148
|
|
|
149
149
|
**`MetadataConfig`** – Selective metadata extraction:
|
|
150
150
|
- `extract_headers`: h1-h6 elements — default: `true`
|
|
@@ -191,6 +191,22 @@ djot = HtmlToMarkdown.convert(html, output_format: 'djot')
|
|
|
191
191
|
Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
|
|
192
192
|
|
|
193
193
|
|
|
194
|
+
## Plain Text Output
|
|
195
|
+
|
|
196
|
+
Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
|
|
197
|
+
|
|
198
|
+
```ruby
|
|
199
|
+
require 'html_to_markdown'
|
|
200
|
+
|
|
201
|
+
html = "<h1>Title</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
|
|
202
|
+
|
|
203
|
+
plain = HtmlToMarkdown.convert(html, output_format: 'plain')
|
|
204
|
+
# Result: "Title\n\nThis is bold and italic text."
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
|
|
208
|
+
|
|
209
|
+
|
|
194
210
|
|
|
195
211
|
## Metadata Extraction
|
|
196
212
|
|
|
@@ -65,6 +65,7 @@ pub fn parse_output_format(value: Value) -> Result<OutputFormat, Error> {
|
|
|
65
65
|
match symbol_to_string(value)?.as_str() {
|
|
66
66
|
"markdown" => Ok(OutputFormat::Markdown),
|
|
67
67
|
"djot" => Ok(OutputFormat::Djot),
|
|
68
|
+
"plain" => Ok(OutputFormat::Plain),
|
|
68
69
|
other => Err(arg_error(format!("invalid output_format: {other}"))),
|
|
69
70
|
}
|
|
70
71
|
}
|
|
@@ -562,19 +562,20 @@ fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
|
|
|
562
562
|
Cow::Borrowed(trimmed)
|
|
563
563
|
};
|
|
564
564
|
|
|
565
|
-
let escaped =
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
565
|
+
let escaped = if options.output_format == crate::options::OutputFormat::Plain {
|
|
566
|
+
normalized.into_owned()
|
|
567
|
+
} else if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
|
|
568
|
+
text::escape(
|
|
569
|
+
normalized.as_ref(),
|
|
570
|
+
options.escape_misc,
|
|
571
|
+
options.escape_asterisks,
|
|
572
|
+
options.escape_underscores,
|
|
573
|
+
options.escape_ascii,
|
|
574
|
+
)
|
|
575
|
+
.into_owned()
|
|
576
|
+
} else {
|
|
577
|
+
normalized.into_owned()
|
|
578
|
+
};
|
|
578
579
|
|
|
579
580
|
let mut output = String::with_capacity(escaped.len() + 1);
|
|
580
581
|
output.push_str(&escaped);
|
|
@@ -87,6 +87,8 @@ pub fn handle_subscript(
|
|
|
87
87
|
} else {
|
|
88
88
|
output.push_str(&options.sub_symbol);
|
|
89
89
|
}
|
|
90
|
+
} else {
|
|
91
|
+
output.push_str(trimmed);
|
|
90
92
|
}
|
|
91
93
|
append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
|
|
92
94
|
}
|
|
@@ -139,6 +141,8 @@ pub fn handle_superscript(
|
|
|
139
141
|
} else {
|
|
140
142
|
output.push_str(&options.sup_symbol);
|
|
141
143
|
}
|
|
144
|
+
} else {
|
|
145
|
+
output.push_str(trimmed);
|
|
142
146
|
}
|
|
143
147
|
append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
|
|
144
148
|
}
|
|
@@ -18,11 +18,13 @@ use crate::converter::main_helpers::{
|
|
|
18
18
|
extract_head_metadata, format_metadata_frontmatter, handle_hocr_document, has_custom_element_tags,
|
|
19
19
|
repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace,
|
|
20
20
|
};
|
|
21
|
+
use crate::converter::plain_text::extract_plain_text;
|
|
21
22
|
use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing};
|
|
22
23
|
use crate::converter::utility::caching::build_dom_context;
|
|
23
24
|
use crate::converter::utility::content::normalized_tag_name;
|
|
24
25
|
use crate::converter::utility::preprocessing::{preprocess_html, strip_script_and_style_tags};
|
|
25
26
|
use crate::converter::utility::serialization::serialize_tag_to_html;
|
|
27
|
+
use crate::options::OutputFormat;
|
|
26
28
|
|
|
27
29
|
use crate::converter::handlers::{handle_blockquote, handle_code, handle_graphic, handle_img, handle_link, handle_pre};
|
|
28
30
|
use crate::error::Result;
|
|
@@ -134,6 +136,12 @@ pub(crate) fn convert_html_impl(
|
|
|
134
136
|
}
|
|
135
137
|
}
|
|
136
138
|
|
|
139
|
+
// Fast path for plain text output: skip the full conversion pipeline
|
|
140
|
+
if options.output_format == OutputFormat::Plain {
|
|
141
|
+
let plain = extract_plain_text(&dom, parser, options);
|
|
142
|
+
return Ok(plain);
|
|
143
|
+
}
|
|
144
|
+
|
|
137
145
|
let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
|
|
138
146
|
#[cfg(feature = "metadata")]
|
|
139
147
|
let wants_document = metadata_collector
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
//! Plain text extraction from parsed HTML DOM.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides a fast-path text extractor that walks the DOM tree collecting only
|
|
4
|
+
//! visible text content with structural whitespace, bypassing the full
|
|
5
|
+
//! Markdown/Djot conversion pipeline.
|
|
6
|
+
|
|
7
|
+
use crate::options::ConversionOptions;
|
|
8
|
+
use crate::text;
|
|
9
|
+
|
|
10
|
+
/// Tags whose content should be skipped entirely.
|
|
11
|
+
const SKIP_TAGS: &[&str] = &["script", "style", "head", "template", "noscript", "svg", "math"];
|
|
12
|
+
|
|
13
|
+
/// Block-level tags that should be separated by blank lines.
|
|
14
|
+
const BLOCK_TAGS: &[&str] = &[
|
|
15
|
+
"p",
|
|
16
|
+
"div",
|
|
17
|
+
"h1",
|
|
18
|
+
"h2",
|
|
19
|
+
"h3",
|
|
20
|
+
"h4",
|
|
21
|
+
"h5",
|
|
22
|
+
"h6",
|
|
23
|
+
"blockquote",
|
|
24
|
+
"section",
|
|
25
|
+
"article",
|
|
26
|
+
"aside",
|
|
27
|
+
"main",
|
|
28
|
+
"nav",
|
|
29
|
+
"header",
|
|
30
|
+
"footer",
|
|
31
|
+
"figure",
|
|
32
|
+
"figcaption",
|
|
33
|
+
"details",
|
|
34
|
+
"summary",
|
|
35
|
+
"address",
|
|
36
|
+
"hgroup",
|
|
37
|
+
"search",
|
|
38
|
+
];
|
|
39
|
+
|
|
40
|
+
/// Extract plain text from a parsed DOM tree.
|
|
41
|
+
///
|
|
42
|
+
/// Walks the tree collecting visible text with structural whitespace:
|
|
43
|
+
/// - Block elements get blank-line separation
|
|
44
|
+
/// - `<br>` becomes a newline, `<hr>` a blank line
|
|
45
|
+
/// - `<pre>` preserves internal whitespace
|
|
46
|
+
/// - `<img>` outputs alt text (unless `skip_images` is set)
|
|
47
|
+
/// - `<script>`, `<style>`, `<head>`, `<template>`, `<noscript>` are skipped
|
|
48
|
+
/// - Tables: cells separated by tab, rows by newline
|
|
49
|
+
/// - Inline elements are recursed without markers
|
|
50
|
+
pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
|
|
51
|
+
let mut buf = String::with_capacity(1024);
|
|
52
|
+
|
|
53
|
+
for child_handle in dom.children() {
|
|
54
|
+
walk_plain(child_handle, parser, &mut buf, options, false);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
post_process(&mut buf);
|
|
58
|
+
buf
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/// Recursive plain-text walker.
|
|
62
|
+
fn walk_plain(
|
|
63
|
+
node_handle: &tl::NodeHandle,
|
|
64
|
+
parser: &tl::Parser,
|
|
65
|
+
buf: &mut String,
|
|
66
|
+
options: &ConversionOptions,
|
|
67
|
+
in_pre: bool,
|
|
68
|
+
) {
|
|
69
|
+
let Some(node) = node_handle.get(parser) else {
|
|
70
|
+
return;
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
match node {
|
|
74
|
+
tl::Node::Raw(bytes) => {
|
|
75
|
+
let raw = bytes.as_utf8_str();
|
|
76
|
+
let decoded = text::decode_html_entities_cow(raw.as_ref());
|
|
77
|
+
if in_pre {
|
|
78
|
+
buf.push_str(&decoded);
|
|
79
|
+
} else {
|
|
80
|
+
let normalized = text::normalize_whitespace_cow(&decoded);
|
|
81
|
+
if !normalized.is_empty() {
|
|
82
|
+
// Avoid leading space at start of a new line
|
|
83
|
+
if normalized.as_ref() == " " && buf.ends_with('\n') {
|
|
84
|
+
return;
|
|
85
|
+
}
|
|
86
|
+
buf.push_str(&normalized);
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
tl::Node::Tag(tag) => {
|
|
91
|
+
let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
|
|
92
|
+
let tag_str = tag_name.as_str();
|
|
93
|
+
|
|
94
|
+
// Skip invisible content
|
|
95
|
+
if SKIP_TAGS.contains(&tag_str) {
|
|
96
|
+
return;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
match tag_str {
|
|
100
|
+
"br" => {
|
|
101
|
+
buf.push('\n');
|
|
102
|
+
}
|
|
103
|
+
"hr" => {
|
|
104
|
+
ensure_blank_line(buf);
|
|
105
|
+
}
|
|
106
|
+
"pre" => {
|
|
107
|
+
ensure_blank_line(buf);
|
|
108
|
+
walk_children(tag, parser, buf, options, true);
|
|
109
|
+
ensure_blank_line(buf);
|
|
110
|
+
}
|
|
111
|
+
"img" => {
|
|
112
|
+
if !options.skip_images {
|
|
113
|
+
if let Some(Some(alt)) = tag.attributes().get("alt") {
|
|
114
|
+
let alt_text = alt.as_utf8_str();
|
|
115
|
+
if !alt_text.is_empty() {
|
|
116
|
+
buf.push_str(alt_text.as_ref());
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
"table" => {
|
|
122
|
+
ensure_blank_line(buf);
|
|
123
|
+
walk_table(tag, parser, buf, options);
|
|
124
|
+
ensure_blank_line(buf);
|
|
125
|
+
}
|
|
126
|
+
"li" => {
|
|
127
|
+
ensure_newline(buf);
|
|
128
|
+
walk_children(tag, parser, buf, options, false);
|
|
129
|
+
ensure_newline(buf);
|
|
130
|
+
}
|
|
131
|
+
_ if BLOCK_TAGS.contains(&tag_str) => {
|
|
132
|
+
ensure_blank_line(buf);
|
|
133
|
+
walk_children(tag, parser, buf, options, in_pre);
|
|
134
|
+
ensure_blank_line(buf);
|
|
135
|
+
}
|
|
136
|
+
_ => {
|
|
137
|
+
// Inline elements and structural containers (html, body, ul, ol, etc.)
|
|
138
|
+
walk_children(tag, parser, buf, options, in_pre);
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
tl::Node::Comment(_) => {}
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/// Walk all children of a tag.
|
|
147
|
+
fn walk_children(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions, in_pre: bool) {
|
|
148
|
+
let children = tag.children();
|
|
149
|
+
let top = children.top();
|
|
150
|
+
for child in top.iter() {
|
|
151
|
+
walk_plain(child, parser, buf, options, in_pre);
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
|
|
156
|
+
fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions) {
|
|
157
|
+
// Collect all <tr> node handles by recursing into the table
|
|
158
|
+
let mut row_handles = Vec::new();
|
|
159
|
+
collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
|
|
160
|
+
|
|
161
|
+
for (row_idx, row_handle) in row_handles.iter().enumerate() {
|
|
162
|
+
if row_idx > 0 {
|
|
163
|
+
buf.push('\n');
|
|
164
|
+
}
|
|
165
|
+
let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) else {
|
|
166
|
+
continue;
|
|
167
|
+
};
|
|
168
|
+
|
|
169
|
+
// Collect direct <th>/<td> children
|
|
170
|
+
let mut cell_handles = Vec::new();
|
|
171
|
+
let row_children = row_tag.children();
|
|
172
|
+
let row_top = row_children.top();
|
|
173
|
+
for child in row_top.iter() {
|
|
174
|
+
if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
|
|
175
|
+
let name = child_tag.name().as_utf8_str();
|
|
176
|
+
if name.eq_ignore_ascii_case("th") || name.eq_ignore_ascii_case("td") {
|
|
177
|
+
cell_handles.push(*child);
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
for (cell_idx, cell_handle) in cell_handles.iter().enumerate() {
|
|
183
|
+
if cell_idx > 0 {
|
|
184
|
+
buf.push('\t');
|
|
185
|
+
}
|
|
186
|
+
let mut cell_buf = String::new();
|
|
187
|
+
if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
|
|
188
|
+
walk_children(cell_tag, parser, &mut cell_buf, options, false);
|
|
189
|
+
}
|
|
190
|
+
buf.push_str(cell_buf.trim());
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
/// Recursively collect all descendant `NodeHandle`s matching `target_tag` (by cloning handles).
|
|
196
|
+
fn collect_descendant_handles(
|
|
197
|
+
tag: &tl::HTMLTag,
|
|
198
|
+
parser: &tl::Parser,
|
|
199
|
+
target_tag: &str,
|
|
200
|
+
result: &mut Vec<tl::NodeHandle>,
|
|
201
|
+
) {
|
|
202
|
+
let children = tag.children();
|
|
203
|
+
let top = children.top();
|
|
204
|
+
for child in top.iter() {
|
|
205
|
+
if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
|
|
206
|
+
if child_tag.name().as_utf8_str().eq_ignore_ascii_case(target_tag) {
|
|
207
|
+
result.push(*child);
|
|
208
|
+
} else {
|
|
209
|
+
collect_descendant_handles(child_tag, parser, target_tag, result);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
/// Ensure the buffer ends with a blank line (two newlines).
|
|
216
|
+
fn ensure_blank_line(buf: &mut String) {
|
|
217
|
+
if buf.is_empty() {
|
|
218
|
+
return;
|
|
219
|
+
}
|
|
220
|
+
// Strip trailing horizontal whitespace
|
|
221
|
+
while buf.ends_with(' ') || buf.ends_with('\t') {
|
|
222
|
+
buf.pop();
|
|
223
|
+
}
|
|
224
|
+
let current_newlines = buf.chars().rev().take_while(|&c| c == '\n').count();
|
|
225
|
+
for _ in current_newlines..2 {
|
|
226
|
+
buf.push('\n');
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
/// Ensure the buffer ends with at least one newline.
|
|
231
|
+
fn ensure_newline(buf: &mut String) {
|
|
232
|
+
if buf.is_empty() {
|
|
233
|
+
return;
|
|
234
|
+
}
|
|
235
|
+
if !buf.ends_with('\n') {
|
|
236
|
+
buf.push('\n');
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
/// Post-process: collapse 3+ newlines to 2, trim line-end whitespace, ensure single trailing newline.
|
|
241
|
+
fn post_process(buf: &mut String) {
|
|
242
|
+
// Collapse runs of 3+ newlines to exactly 2
|
|
243
|
+
while buf.contains("\n\n\n") {
|
|
244
|
+
*buf = buf.replace("\n\n\n", "\n\n");
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
// Trim trailing whitespace from each line — collect owned strings to avoid borrow conflict
|
|
248
|
+
let lines: Vec<String> = buf.lines().map(|line| line.trim_end().to_string()).collect();
|
|
249
|
+
buf.clear();
|
|
250
|
+
for (i, line) in lines.iter().enumerate() {
|
|
251
|
+
buf.push_str(line);
|
|
252
|
+
if i < lines.len() - 1 {
|
|
253
|
+
buf.push('\n');
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
// Trim to single trailing newline
|
|
258
|
+
let keep = buf.trim_end_matches('\n').len();
|
|
259
|
+
if keep == 0 {
|
|
260
|
+
buf.clear();
|
|
261
|
+
} else {
|
|
262
|
+
buf.truncate(keep);
|
|
263
|
+
buf.push('\n');
|
|
264
|
+
}
|
|
265
|
+
}
|
|
@@ -82,6 +82,12 @@ pub fn process_text_node(
|
|
|
82
82
|
if !output.ends_with("\n\n") {
|
|
83
83
|
if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
|
|
84
84
|
if is_inline_element(next_tag) {
|
|
85
|
+
// Newlines between inline elements collapse to a single space
|
|
86
|
+
// in HTML rendering (per CSS white-space: normal). Preserve
|
|
87
|
+
// this word boundary so adjacent inline content doesn't merge.
|
|
88
|
+
if !output.ends_with(' ') && !output.ends_with('\n') {
|
|
89
|
+
output.push(' ');
|
|
90
|
+
}
|
|
85
91
|
return;
|
|
86
92
|
}
|
|
87
93
|
}
|
|
@@ -121,7 +121,7 @@ pub struct ConversionOptions {
|
|
|
121
121
|
/// Useful for text-only extraction or filtering out visual content.
|
|
122
122
|
pub skip_images: bool,
|
|
123
123
|
|
|
124
|
-
/// Output format for conversion (Markdown or Djot)
|
|
124
|
+
/// Output format for conversion (Markdown, Djot, or Plain)
|
|
125
125
|
pub output_format: OutputFormat,
|
|
126
126
|
}
|
|
127
127
|
|
|
@@ -233,7 +233,7 @@ pub struct ConversionOptionsUpdate {
|
|
|
233
233
|
/// Optional skip images override
|
|
234
234
|
pub skip_images: Option<bool>,
|
|
235
235
|
|
|
236
|
-
/// Optional output format override (Markdown or Djot)
|
|
236
|
+
/// Optional output format override (Markdown, Djot, or Plain)
|
|
237
237
|
pub output_format: Option<OutputFormat>,
|
|
238
238
|
}
|
|
239
239
|
|
|
@@ -182,6 +182,8 @@ pub enum OutputFormat {
|
|
|
182
182
|
Markdown,
|
|
183
183
|
/// Djot lightweight markup language.
|
|
184
184
|
Djot,
|
|
185
|
+
/// Plain text output (no markup, visible text only).
|
|
186
|
+
Plain,
|
|
185
187
|
}
|
|
186
188
|
|
|
187
189
|
impl OutputFormat {
|
|
@@ -193,6 +195,7 @@ impl OutputFormat {
|
|
|
193
195
|
pub fn parse(value: &str) -> Self {
|
|
194
196
|
match normalize_token(value).as_str() {
|
|
195
197
|
"djot" => Self::Djot,
|
|
198
|
+
"plain" | "plaintext" | "text" => Self::Plain,
|
|
196
199
|
_ => Self::Markdown,
|
|
197
200
|
}
|
|
198
201
|
}
|
|
@@ -329,6 +332,7 @@ mod serde_impls {
|
|
|
329
332
|
let s = match self {
|
|
330
333
|
Self::Markdown => "markdown",
|
|
331
334
|
Self::Djot => "djot",
|
|
335
|
+
Self::Plain => "plain",
|
|
332
336
|
};
|
|
333
337
|
serializer.serialize_str(s)
|
|
334
338
|
}
|
|
@@ -373,6 +373,66 @@ fn test_superscript_leading_whitespace() {
|
|
|
373
373
|
assert_eq!(result, "hello ^world^\n");
|
|
374
374
|
}
|
|
375
375
|
|
|
376
|
+
#[test]
|
|
377
|
+
fn test_subscript_default_passthrough() {
|
|
378
|
+
let html = "<p>H<sub>2</sub>O</p>";
|
|
379
|
+
let result = convert(html, None).unwrap();
|
|
380
|
+
assert_eq!(result, "H2O\n");
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
#[test]
|
|
384
|
+
fn test_superscript_default_passthrough() {
|
|
385
|
+
let html = "<p>x<sup>2</sup> + y<sup>3</sup></p>";
|
|
386
|
+
let result = convert(html, None).unwrap();
|
|
387
|
+
assert_eq!(result, "x2 + y3\n");
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
#[test]
|
|
391
|
+
fn test_subscript_superscript_combined_default() {
|
|
392
|
+
let html = "<p>CO<sub>2</sub><sup>*</sup></p>";
|
|
393
|
+
let result = convert(html, None).unwrap();
|
|
394
|
+
assert_eq!(result, "CO2*\n");
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
#[test]
|
|
398
|
+
fn test_subscript_html_tag_symbol() {
|
|
399
|
+
let html = "<p>H<sub>2</sub>O</p>";
|
|
400
|
+
let opts = ConversionOptions {
|
|
401
|
+
sub_symbol: "<sub>".to_string(),
|
|
402
|
+
..Default::default()
|
|
403
|
+
};
|
|
404
|
+
let result = convert(html, Some(opts)).unwrap();
|
|
405
|
+
assert_eq!(result, "H<sub>2</sub>O\n");
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
#[test]
|
|
409
|
+
fn test_adjacent_links_with_newline_separator() {
|
|
410
|
+
let html = "<p>\n<a href=\"/page1\">Link 1</a>\n<a href=\"/page2\">Link 2</a>\n</p>";
|
|
411
|
+
let result = convert(html, None).unwrap();
|
|
412
|
+
assert_eq!(result, "[Link 1](/page1) [Link 2](/page2)\n");
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
#[test]
|
|
416
|
+
fn test_adjacent_links_no_whitespace() {
|
|
417
|
+
let html = "<p><a href=\"/page1\">Link 1</a><a href=\"/page2\">Link 2</a></p>";
|
|
418
|
+
let result = convert(html, None).unwrap();
|
|
419
|
+
assert_eq!(result, "[Link 1](/page1)[Link 2](/page2)\n");
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
#[test]
|
|
423
|
+
fn test_adjacent_links_with_space() {
|
|
424
|
+
let html = "<p><a href=\"/page1\">Link 1</a> <a href=\"/page2\">Link 2</a></p>";
|
|
425
|
+
let result = convert(html, None).unwrap();
|
|
426
|
+
assert_eq!(result, "[Link 1](/page1) [Link 2](/page2)\n");
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
#[test]
|
|
430
|
+
fn test_adjacent_inline_elements_with_newline() {
|
|
431
|
+
let html = "<p><strong>bold</strong>\n<em>italic</em></p>";
|
|
432
|
+
let result = convert(html, None).unwrap();
|
|
433
|
+
assert_eq!(result, "**bold** *italic*\n");
|
|
434
|
+
}
|
|
435
|
+
|
|
376
436
|
#[test]
|
|
377
437
|
fn test_autolink() {
|
|
378
438
|
let html = "<p><a href=\"https://example.com\">https://example.com</a></p>";
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
//! Tests for plain text output format support.
|
|
2
|
+
|
|
3
|
+
use html_to_markdown_rs::{ConversionOptions, OutputFormat, convert};
|
|
4
|
+
|
|
5
|
+
fn plain_options() -> ConversionOptions {
|
|
6
|
+
ConversionOptions {
|
|
7
|
+
output_format: OutputFormat::Plain,
|
|
8
|
+
..Default::default()
|
|
9
|
+
}
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
#[test]
|
|
13
|
+
fn test_plain_basic_paragraph() {
|
|
14
|
+
let html = "<p>Hello world</p>";
|
|
15
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
16
|
+
assert_eq!(result, "Hello world\n");
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
#[test]
|
|
20
|
+
fn test_plain_no_strong_markers() {
|
|
21
|
+
let html = "<p>This is <strong>bold</strong> text</p>";
|
|
22
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
23
|
+
assert_eq!(result, "This is bold text\n");
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
#[test]
|
|
27
|
+
fn test_plain_no_emphasis_markers() {
|
|
28
|
+
let html = "<p>This is <em>italic</em> text</p>";
|
|
29
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
30
|
+
assert_eq!(result, "This is italic text\n");
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
#[test]
|
|
34
|
+
fn test_plain_link_text_only() {
|
|
35
|
+
let html = r#"<p>Visit <a href="https://example.com">our site</a> today</p>"#;
|
|
36
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
37
|
+
assert_eq!(result, "Visit our site today\n");
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
#[test]
|
|
41
|
+
fn test_plain_image_alt_text() {
|
|
42
|
+
let html = r#"<img alt="A cute cat">"#;
|
|
43
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
44
|
+
assert_eq!(result, "A cute cat\n");
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
#[test]
|
|
48
|
+
fn test_plain_image_skipped_when_option_set() {
|
|
49
|
+
let html = r#"<img alt="A cute cat">"#;
|
|
50
|
+
let mut opts = plain_options();
|
|
51
|
+
opts.skip_images = true;
|
|
52
|
+
let result = convert(html, Some(opts)).unwrap();
|
|
53
|
+
assert_eq!(result, "");
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
#[test]
|
|
57
|
+
fn test_plain_code_block() {
|
|
58
|
+
let html = "<pre><code>fn main() {}</code></pre>";
|
|
59
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
60
|
+
assert_eq!(result, "fn main() {}\n");
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
#[test]
|
|
64
|
+
fn test_plain_blockquote_no_prefix() {
|
|
65
|
+
let html = "<blockquote><p>Quoted text</p></blockquote>";
|
|
66
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
67
|
+
assert!(
|
|
68
|
+
!result.contains('>'),
|
|
69
|
+
"Plain text should not contain blockquote prefix, got: {result}"
|
|
70
|
+
);
|
|
71
|
+
assert!(result.contains("Quoted text"));
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
#[test]
|
|
75
|
+
fn test_plain_list_items_on_separate_lines() {
|
|
76
|
+
let html = "<ul><li>First</li><li>Second</li><li>Third</li></ul>";
|
|
77
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
78
|
+
assert!(result.contains("First"));
|
|
79
|
+
assert!(result.contains("Second"));
|
|
80
|
+
assert!(result.contains("Third"));
|
|
81
|
+
// Items should be on separate lines
|
|
82
|
+
let lines: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
|
|
83
|
+
assert!(lines.len() >= 3, "Expected at least 3 lines, got: {result}");
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
#[test]
|
|
87
|
+
fn test_plain_table_cells_extracted() {
|
|
88
|
+
let html = "<table><tr><td>A</td><td>B</td></tr><tr><td>C</td><td>D</td></tr></table>";
|
|
89
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
90
|
+
assert!(result.contains('A'));
|
|
91
|
+
assert!(result.contains('B'));
|
|
92
|
+
assert!(result.contains('C'));
|
|
93
|
+
assert!(result.contains('D'));
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
#[test]
|
|
97
|
+
fn test_plain_no_escaping() {
|
|
98
|
+
let html = "<p>* not a list</p>";
|
|
99
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
100
|
+
assert!(
|
|
101
|
+
result.contains("* not a list"),
|
|
102
|
+
"Plain text should not escape asterisks, got: {result}"
|
|
103
|
+
);
|
|
104
|
+
assert!(
|
|
105
|
+
!result.contains("\\*"),
|
|
106
|
+
"Plain text should not backslash-escape, got: {result}"
|
|
107
|
+
);
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
#[test]
|
|
111
|
+
fn test_plain_script_excluded() {
|
|
112
|
+
let html = "<p>Before</p><script>alert('xss')</script><p>After</p>";
|
|
113
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
114
|
+
assert!(
|
|
115
|
+
!result.contains("alert"),
|
|
116
|
+
"Script content should be excluded, got: {result}"
|
|
117
|
+
);
|
|
118
|
+
assert!(result.contains("Before"));
|
|
119
|
+
assert!(result.contains("After"));
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
#[test]
|
|
123
|
+
fn test_plain_style_excluded() {
|
|
124
|
+
let html = "<p>Hello</p><style>.foo { color: red; }</style>";
|
|
125
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
126
|
+
assert!(
|
|
127
|
+
!result.contains("color"),
|
|
128
|
+
"Style content should be excluded, got: {result}"
|
|
129
|
+
);
|
|
130
|
+
assert!(result.contains("Hello"));
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
#[test]
|
|
134
|
+
fn test_plain_br_becomes_newline() {
|
|
135
|
+
let html = "<p>Line one<br>Line two</p>";
|
|
136
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
137
|
+
assert!(
|
|
138
|
+
result.contains("Line one\nLine two"),
|
|
139
|
+
"Expected newline from <br>, got: {result}"
|
|
140
|
+
);
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
#[test]
|
|
144
|
+
fn test_plain_hr_becomes_blank_line() {
|
|
145
|
+
let html = "<p>Above</p><hr><p>Below</p>";
|
|
146
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
147
|
+
assert!(result.contains("Above"));
|
|
148
|
+
assert!(result.contains("Below"));
|
|
149
|
+
// Should have blank line between
|
|
150
|
+
assert!(result.contains("\n\n"), "Expected blank line from <hr>, got: {result}");
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
#[test]
|
|
154
|
+
fn test_plain_nested_inline_formatting_stripped() {
|
|
155
|
+
let html = "<p>Start <strong>bold <em>and italic</em></strong> end</p>";
|
|
156
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
157
|
+
assert_eq!(result, "Start bold and italic end\n");
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
#[test]
|
|
161
|
+
fn test_plain_heading_no_markers() {
|
|
162
|
+
let html = "<h1>Title</h1><p>Content</p>";
|
|
163
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
164
|
+
assert!(
|
|
165
|
+
!result.contains('#'),
|
|
166
|
+
"Plain text should not contain heading markers, got: {result}"
|
|
167
|
+
);
|
|
168
|
+
assert!(result.contains("Title"));
|
|
169
|
+
assert!(result.contains("Content"));
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
#[test]
|
|
173
|
+
fn test_plain_parse_variants() {
|
|
174
|
+
assert_eq!(OutputFormat::parse("plain"), OutputFormat::Plain);
|
|
175
|
+
assert_eq!(OutputFormat::parse("plaintext"), OutputFormat::Plain);
|
|
176
|
+
assert_eq!(OutputFormat::parse("text"), OutputFormat::Plain);
|
|
177
|
+
assert_eq!(OutputFormat::parse("Plain"), OutputFormat::Plain);
|
|
178
|
+
assert_eq!(OutputFormat::parse("PLAINTEXT"), OutputFormat::Plain);
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
#[test]
|
|
182
|
+
fn test_plain_empty_input() {
|
|
183
|
+
let html = "";
|
|
184
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
185
|
+
assert_eq!(result, "");
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
#[test]
|
|
189
|
+
fn test_plain_whitespace_only_html() {
|
|
190
|
+
let html = "<p> </p>";
|
|
191
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
192
|
+
assert_eq!(result, "");
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
#[test]
|
|
196
|
+
fn test_plain_inline_code_no_backticks() {
|
|
197
|
+
let html = "<p>Use <code>fmt.Println</code> to print</p>";
|
|
198
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
199
|
+
assert!(
|
|
200
|
+
!result.contains('`'),
|
|
201
|
+
"Plain text should not contain backticks, got: {result}"
|
|
202
|
+
);
|
|
203
|
+
assert!(result.contains("fmt.Println"));
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
#[test]
|
|
207
|
+
fn test_plain_pre_preserves_whitespace() {
|
|
208
|
+
let html = "<pre> indented\n more</pre>";
|
|
209
|
+
let result = convert(html, Some(plain_options())).unwrap();
|
|
210
|
+
assert!(
|
|
211
|
+
result.contains(" indented\n more"),
|
|
212
|
+
"Pre blocks should preserve whitespace, got: {result}"
|
|
213
|
+
);
|
|
214
|
+
}
|
data/spec/visitor_spec.rb
CHANGED
|
@@ -35,7 +35,7 @@ RSpec.describe HtmlToMarkdown do
|
|
|
35
35
|
visit_definition_list_end visit_form visit_input visit_button visit_audio visit_video
|
|
36
36
|
visit_iframe visit_details visit_summary visit_figure_start visit_figcaption
|
|
37
37
|
visit_figure_end
|
|
38
|
-
].
|
|
38
|
+
].to_h { |name| [name.to_sym, { type: :continue }] }
|
|
39
39
|
end
|
|
40
40
|
|
|
41
41
|
def create_visitor(**overrides)
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.26.2
|
|
4
|
+
version: 2.27.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-03-01 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -1852,6 +1852,7 @@ files:
|
|
|
1852
1852
|
- rust-vendor/html-to-markdown-rs/src/converter/media/svg.rs
|
|
1853
1853
|
- rust-vendor/html-to-markdown-rs/src/converter/metadata.rs
|
|
1854
1854
|
- rust-vendor/html-to-markdown-rs/src/converter/mod.rs
|
|
1855
|
+
- rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs
|
|
1855
1856
|
- rust-vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs
|
|
1856
1857
|
- rust-vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs
|
|
1857
1858
|
- rust-vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs
|
|
@@ -1949,6 +1950,7 @@ files:
|
|
|
1949
1950
|
- rust-vendor/html-to-markdown-rs/tests/issue_212_regressions.rs
|
|
1950
1951
|
- rust-vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs
|
|
1951
1952
|
- rust-vendor/html-to-markdown-rs/tests/lists_test.rs
|
|
1953
|
+
- rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs
|
|
1952
1954
|
- rust-vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
|
|
1953
1955
|
- rust-vendor/html-to-markdown-rs/tests/skip_images_test.rs
|
|
1954
1956
|
- rust-vendor/html-to-markdown-rs/tests/tables_test.rs
|