html-to-markdown 2.26.2 → 2.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 12d5559bda903dfbeb563dba48c733845621b2c67cbca597fb4111c094f33fe0
4
- data.tar.gz: 3ef2fdc3b30051c1eec6a956c42465f3d31670f81f345b67d98a84935f77e02c
3
+ metadata.gz: b359606f2fac17cda3721fd381e8717c5f32ad6b9cbbe7d3f691078521071c5e
4
+ data.tar.gz: 13bb3c8ba29a9270bd91d32bb1fe50c3353895cdd1bfd920ea9c0f79c52fbe9c
5
5
  SHA512:
6
- metadata.gz: dee90b55391d5f84466c2a2d3591a7d3565ebc88357118b0725d57ad2c06cc5e9f965a93f4ad6111bab89c589552cc0f3ecfb21701ae4f0e18b9b9d55e0aa3ef
7
- data.tar.gz: 7b0927d2fa482712bdfac03152a19375e1744e3e22b0121d7567967bc2fa215396d8e292f4d2974962477b839a644e96b01d828687a8fd520cc22171b3a83908
6
+ metadata.gz: f0aea92dccbf209b90476ecabccd195252ae48b2e5aad1bdd54183b1e1686e8142a7f817d54a6513e816cf1038ba20ea8eebaffe45086ad7a05648e9314799a8
7
+ data.tar.gz: ed60ef47e31437ea2f459addd4d871d19f55fc7a121bfcf592535811c85441a5244b52d4d4f87f70caeeb368913c44a3c091aa900c631b074b87bb8c3c81d2f3
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.26.2)
4
+ html-to-markdown (2.27.0)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -20,6 +20,8 @@ GEM
20
20
  securerandom (>= 0.3)
21
21
  tzinfo (~> 2.0, >= 2.0.5)
22
22
  uri (>= 0.13.1)
23
+ addressable (2.8.9)
24
+ public_suffix (>= 2.0.2, < 8.0)
23
25
  ast (2.4.3)
24
26
  base64 (0.3.0)
25
27
  bigdecimal (4.0.1)
@@ -37,6 +39,9 @@ GEM
37
39
  i18n (1.14.8)
38
40
  concurrent-ruby (~> 1.0)
39
41
  json (2.18.1)
42
+ json-schema (6.1.0)
43
+ addressable (~> 2.8)
44
+ bigdecimal (>= 3.1, < 5)
40
45
  language_server-protocol (3.17.0.5)
41
46
  lint_roller (1.1.0)
42
47
  listen (3.10.0)
@@ -44,14 +49,18 @@ GEM
44
49
  rb-fsevent (~> 0.10, >= 0.10.3)
45
50
  rb-inotify (~> 0.9, >= 0.9.10)
46
51
  logger (1.7.0)
47
- minitest (6.0.1)
52
+ mcp (0.7.1)
53
+ json-schema (>= 4.1)
54
+ minitest (6.0.2)
55
+ drb (~> 2.0)
48
56
  prism (~> 1.5)
49
57
  mutex_m (0.3.0)
50
58
  parallel (1.27.0)
51
- parser (3.3.10.1)
59
+ parser (3.3.10.2)
52
60
  ast (~> 2.4.1)
53
61
  racc
54
62
  prism (1.9.0)
63
+ public_suffix (7.0.2)
55
64
  racc (1.8.1)
56
65
  rainbow (3.1.1)
57
66
  rake (13.3.1)
@@ -76,14 +85,15 @@ GEM
76
85
  rspec-expectations (3.13.5)
77
86
  diff-lcs (>= 1.2.0, < 2.0)
78
87
  rspec-support (~> 3.13.0)
79
- rspec-mocks (3.13.7)
88
+ rspec-mocks (3.13.8)
80
89
  diff-lcs (>= 1.2.0, < 2.0)
81
90
  rspec-support (~> 3.13.0)
82
91
  rspec-support (3.13.7)
83
- rubocop (1.84.2)
92
+ rubocop (1.85.0)
84
93
  json (~> 2.3)
85
94
  language_server-protocol (~> 3.17.0.2)
86
95
  lint_roller (~> 1.1.0)
96
+ mcp (~> 0.6)
87
97
  parallel (~> 1.10)
88
98
  parser (>= 3.3.0.2)
89
99
  rainbow (>= 2.2.2, < 4.0)
@@ -147,6 +157,7 @@ DEPENDENCIES
147
157
 
148
158
  CHECKSUMS
149
159
  activesupport (8.1.2) sha256=88842578ccd0d40f658289b0e8c842acfe9af751afee2e0744a7873f50b6fdae
160
+ addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
150
161
  ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
151
162
  base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
152
163
  bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
@@ -161,18 +172,21 @@ CHECKSUMS
161
172
  ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
162
173
  ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
163
174
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
164
- html-to-markdown (2.26.2)
175
+ html-to-markdown (2.27.0)
165
176
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
166
177
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
178
+ json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
167
179
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
168
180
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
169
181
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
170
182
  logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
171
- minitest (6.0.1) sha256=7854c74f48e2e975969062833adc4013f249a4b212f5e7b9d5c040bf838d54bb
183
+ mcp (0.7.1) sha256=fa967895d6952bad0d981ea907731d8528d2c246d2079d56a9c8bae83d14f1c7
184
+ minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
172
185
  mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
173
186
  parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
174
- parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
187
+ parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
175
188
  prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
189
+ public_suffix (7.0.2) sha256=9114090c8e4e7135c1fd0e7acfea33afaab38101884320c65aaa0ffb8e26a857
176
190
  racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
177
191
  rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
178
192
  rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
@@ -186,9 +200,9 @@ CHECKSUMS
186
200
  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
187
201
  rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
188
202
  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
189
- rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
203
+ rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
190
204
  rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
191
- rubocop (1.84.2) sha256=5692cea54168f3dc8cb79a6fe95c5424b7ea893c707ad7a4307b0585e88dbf5f
205
+ rubocop (1.85.0) sha256=317407feb681a07d54f64d2f9e1d6b6af1ce7678e51cd658e3ad8bd66da48c01
192
206
  rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
193
207
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
194
208
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
data/README.md CHANGED
@@ -144,7 +144,7 @@ Extract base64-encoded inline images with metadata.
144
144
  - `wrap_width`: Wrap at column — default: `80`
145
145
  - `code_language`: Default fenced code block language — default: none
146
146
  - `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
147
- - `output_format`: Output markup format (`"markdown"` | `"djot"`) — default: `"markdown"`
147
+ - `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
148
148
 
149
149
  **`MetadataConfig`** – Selective metadata extraction:
150
150
  - `extract_headers`: h1-h6 elements — default: `true`
@@ -191,6 +191,22 @@ djot = HtmlToMarkdown.convert(html, output_format: 'djot')
191
191
  Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
192
192
 
193
193
 
194
+ ## Plain Text Output
195
+
196
+ Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
197
+
198
+ ```ruby
199
+ require 'html_to_markdown'
200
+
201
+ html = "<h1>Title</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
202
+
203
+ plain = HtmlToMarkdown.convert(html, output_format: 'plain')
204
+ # Result: "Title\n\nThis is bold and italic text."
205
+ ```
206
+
207
+ Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
208
+
209
+
194
210
 
195
211
  ## Metadata Extraction
196
212
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version ="2.26.2"
3
+ version ="2.27.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -65,6 +65,7 @@ pub fn parse_output_format(value: Value) -> Result<OutputFormat, Error> {
65
65
  match symbol_to_string(value)?.as_str() {
66
66
  "markdown" => Ok(OutputFormat::Markdown),
67
67
  "djot" => Ok(OutputFormat::Djot),
68
+ "plain" => Ok(OutputFormat::Plain),
68
69
  other => Err(arg_error(format!("invalid output_format: {other}"))),
69
70
  }
70
71
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.26.2'
4
+ VERSION = '2.27.0'
5
5
  end
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "2.26.2"
3
+ version = "2.27.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -562,19 +562,20 @@ fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
562
562
  Cow::Borrowed(trimmed)
563
563
  };
564
564
 
565
- let escaped =
566
- if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
567
- text::escape(
568
- normalized.as_ref(),
569
- options.escape_misc,
570
- options.escape_asterisks,
571
- options.escape_underscores,
572
- options.escape_ascii,
573
- )
574
- .into_owned()
575
- } else {
576
- normalized.into_owned()
577
- };
565
+ let escaped = if options.output_format == crate::options::OutputFormat::Plain {
566
+ normalized.into_owned()
567
+ } else if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
568
+ text::escape(
569
+ normalized.as_ref(),
570
+ options.escape_misc,
571
+ options.escape_asterisks,
572
+ options.escape_underscores,
573
+ options.escape_ascii,
574
+ )
575
+ .into_owned()
576
+ } else {
577
+ normalized.into_owned()
578
+ };
578
579
 
579
580
  let mut output = String::with_capacity(escaped.len() + 1);
580
581
  output.push_str(&escaped);
@@ -87,6 +87,8 @@ pub fn handle_subscript(
87
87
  } else {
88
88
  output.push_str(&options.sub_symbol);
89
89
  }
90
+ } else {
91
+ output.push_str(trimmed);
90
92
  }
91
93
  append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
92
94
  }
@@ -139,6 +141,8 @@ pub fn handle_superscript(
139
141
  } else {
140
142
  output.push_str(&options.sup_symbol);
141
143
  }
144
+ } else {
145
+ output.push_str(trimmed);
142
146
  }
143
147
  append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
144
148
  }
@@ -18,11 +18,13 @@ use crate::converter::main_helpers::{
18
18
  extract_head_metadata, format_metadata_frontmatter, handle_hocr_document, has_custom_element_tags,
19
19
  repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace,
20
20
  };
21
+ use crate::converter::plain_text::extract_plain_text;
21
22
  use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing};
22
23
  use crate::converter::utility::caching::build_dom_context;
23
24
  use crate::converter::utility::content::normalized_tag_name;
24
25
  use crate::converter::utility::preprocessing::{preprocess_html, strip_script_and_style_tags};
25
26
  use crate::converter::utility::serialization::serialize_tag_to_html;
27
+ use crate::options::OutputFormat;
26
28
 
27
29
  use crate::converter::handlers::{handle_blockquote, handle_code, handle_graphic, handle_img, handle_link, handle_pre};
28
30
  use crate::error::Result;
@@ -134,6 +136,12 @@ pub(crate) fn convert_html_impl(
134
136
  }
135
137
  }
136
138
 
139
+ // Fast path for plain text output: skip the full conversion pipeline
140
+ if options.output_format == OutputFormat::Plain {
141
+ let plain = extract_plain_text(&dom, parser, options);
142
+ return Ok(plain);
143
+ }
144
+
137
145
  let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
138
146
  #[cfg(feature = "metadata")]
139
147
  let wants_document = metadata_collector
@@ -102,6 +102,7 @@ pub mod main;
102
102
  mod main_helpers;
103
103
  pub mod media;
104
104
  mod metadata;
105
+ pub mod plain_text;
105
106
  pub mod preprocessing_helpers;
106
107
  pub mod semantic;
107
108
  pub mod text;
@@ -0,0 +1,265 @@
1
+ //! Plain text extraction from parsed HTML DOM.
2
+ //!
3
+ //! Provides a fast-path text extractor that walks the DOM tree collecting only
4
+ //! visible text content with structural whitespace, bypassing the full
5
+ //! Markdown/Djot conversion pipeline.
6
+
7
+ use crate::options::ConversionOptions;
8
+ use crate::text;
9
+
10
+ /// Tags whose content should be skipped entirely.
11
+ const SKIP_TAGS: &[&str] = &["script", "style", "head", "template", "noscript", "svg", "math"];
12
+
13
+ /// Block-level tags that should be separated by blank lines.
14
+ const BLOCK_TAGS: &[&str] = &[
15
+ "p",
16
+ "div",
17
+ "h1",
18
+ "h2",
19
+ "h3",
20
+ "h4",
21
+ "h5",
22
+ "h6",
23
+ "blockquote",
24
+ "section",
25
+ "article",
26
+ "aside",
27
+ "main",
28
+ "nav",
29
+ "header",
30
+ "footer",
31
+ "figure",
32
+ "figcaption",
33
+ "details",
34
+ "summary",
35
+ "address",
36
+ "hgroup",
37
+ "search",
38
+ ];
39
+
40
+ /// Extract plain text from a parsed DOM tree.
41
+ ///
42
+ /// Walks the tree collecting visible text with structural whitespace:
43
+ /// - Block elements get blank-line separation
44
+ /// - `<br>` becomes a newline, `<hr>` a blank line
45
+ /// - `<pre>` preserves internal whitespace
46
+ /// - `<img>` outputs alt text (unless `skip_images` is set)
47
+ /// - `<script>`, `<style>`, `<head>`, `<template>`, `<noscript>` are skipped
48
+ /// - Tables: cells separated by tab, rows by newline
49
+ /// - Inline elements are recursed without markers
50
+ pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
51
+ let mut buf = String::with_capacity(1024);
52
+
53
+ for child_handle in dom.children() {
54
+ walk_plain(child_handle, parser, &mut buf, options, false);
55
+ }
56
+
57
+ post_process(&mut buf);
58
+ buf
59
+ }
60
+
61
+ /// Recursive plain-text walker.
62
+ fn walk_plain(
63
+ node_handle: &tl::NodeHandle,
64
+ parser: &tl::Parser,
65
+ buf: &mut String,
66
+ options: &ConversionOptions,
67
+ in_pre: bool,
68
+ ) {
69
+ let Some(node) = node_handle.get(parser) else {
70
+ return;
71
+ };
72
+
73
+ match node {
74
+ tl::Node::Raw(bytes) => {
75
+ let raw = bytes.as_utf8_str();
76
+ let decoded = text::decode_html_entities_cow(raw.as_ref());
77
+ if in_pre {
78
+ buf.push_str(&decoded);
79
+ } else {
80
+ let normalized = text::normalize_whitespace_cow(&decoded);
81
+ if !normalized.is_empty() {
82
+ // Avoid leading space at start of a new line
83
+ if normalized.as_ref() == " " && buf.ends_with('\n') {
84
+ return;
85
+ }
86
+ buf.push_str(&normalized);
87
+ }
88
+ }
89
+ }
90
+ tl::Node::Tag(tag) => {
91
+ let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
92
+ let tag_str = tag_name.as_str();
93
+
94
+ // Skip invisible content
95
+ if SKIP_TAGS.contains(&tag_str) {
96
+ return;
97
+ }
98
+
99
+ match tag_str {
100
+ "br" => {
101
+ buf.push('\n');
102
+ }
103
+ "hr" => {
104
+ ensure_blank_line(buf);
105
+ }
106
+ "pre" => {
107
+ ensure_blank_line(buf);
108
+ walk_children(tag, parser, buf, options, true);
109
+ ensure_blank_line(buf);
110
+ }
111
+ "img" => {
112
+ if !options.skip_images {
113
+ if let Some(Some(alt)) = tag.attributes().get("alt") {
114
+ let alt_text = alt.as_utf8_str();
115
+ if !alt_text.is_empty() {
116
+ buf.push_str(alt_text.as_ref());
117
+ }
118
+ }
119
+ }
120
+ }
121
+ "table" => {
122
+ ensure_blank_line(buf);
123
+ walk_table(tag, parser, buf, options);
124
+ ensure_blank_line(buf);
125
+ }
126
+ "li" => {
127
+ ensure_newline(buf);
128
+ walk_children(tag, parser, buf, options, false);
129
+ ensure_newline(buf);
130
+ }
131
+ _ if BLOCK_TAGS.contains(&tag_str) => {
132
+ ensure_blank_line(buf);
133
+ walk_children(tag, parser, buf, options, in_pre);
134
+ ensure_blank_line(buf);
135
+ }
136
+ _ => {
137
+ // Inline elements and structural containers (html, body, ul, ol, etc.)
138
+ walk_children(tag, parser, buf, options, in_pre);
139
+ }
140
+ }
141
+ }
142
+ tl::Node::Comment(_) => {}
143
+ }
144
+ }
145
+
146
+ /// Walk all children of a tag.
147
+ fn walk_children(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions, in_pre: bool) {
148
+ let children = tag.children();
149
+ let top = children.top();
150
+ for child in top.iter() {
151
+ walk_plain(child, parser, buf, options, in_pre);
152
+ }
153
+ }
154
+
155
+ /// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
156
+ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions) {
157
+ // Collect all <tr> node handles by recursing into the table
158
+ let mut row_handles = Vec::new();
159
+ collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
160
+
161
+ for (row_idx, row_handle) in row_handles.iter().enumerate() {
162
+ if row_idx > 0 {
163
+ buf.push('\n');
164
+ }
165
+ let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) else {
166
+ continue;
167
+ };
168
+
169
+ // Collect direct <th>/<td> children
170
+ let mut cell_handles = Vec::new();
171
+ let row_children = row_tag.children();
172
+ let row_top = row_children.top();
173
+ for child in row_top.iter() {
174
+ if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
175
+ let name = child_tag.name().as_utf8_str();
176
+ if name.eq_ignore_ascii_case("th") || name.eq_ignore_ascii_case("td") {
177
+ cell_handles.push(*child);
178
+ }
179
+ }
180
+ }
181
+
182
+ for (cell_idx, cell_handle) in cell_handles.iter().enumerate() {
183
+ if cell_idx > 0 {
184
+ buf.push('\t');
185
+ }
186
+ let mut cell_buf = String::new();
187
+ if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
188
+ walk_children(cell_tag, parser, &mut cell_buf, options, false);
189
+ }
190
+ buf.push_str(cell_buf.trim());
191
+ }
192
+ }
193
+ }
194
+
195
+ /// Recursively collect all descendant `NodeHandle`s matching `target_tag` (by cloning handles).
196
+ fn collect_descendant_handles(
197
+ tag: &tl::HTMLTag,
198
+ parser: &tl::Parser,
199
+ target_tag: &str,
200
+ result: &mut Vec<tl::NodeHandle>,
201
+ ) {
202
+ let children = tag.children();
203
+ let top = children.top();
204
+ for child in top.iter() {
205
+ if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
206
+ if child_tag.name().as_utf8_str().eq_ignore_ascii_case(target_tag) {
207
+ result.push(*child);
208
+ } else {
209
+ collect_descendant_handles(child_tag, parser, target_tag, result);
210
+ }
211
+ }
212
+ }
213
+ }
214
+
215
+ /// Ensure the buffer ends with a blank line (two newlines).
216
+ fn ensure_blank_line(buf: &mut String) {
217
+ if buf.is_empty() {
218
+ return;
219
+ }
220
+ // Strip trailing horizontal whitespace
221
+ while buf.ends_with(' ') || buf.ends_with('\t') {
222
+ buf.pop();
223
+ }
224
+ let current_newlines = buf.chars().rev().take_while(|&c| c == '\n').count();
225
+ for _ in current_newlines..2 {
226
+ buf.push('\n');
227
+ }
228
+ }
229
+
230
+ /// Ensure the buffer ends with at least one newline.
231
+ fn ensure_newline(buf: &mut String) {
232
+ if buf.is_empty() {
233
+ return;
234
+ }
235
+ if !buf.ends_with('\n') {
236
+ buf.push('\n');
237
+ }
238
+ }
239
+
240
+ /// Post-process: collapse 3+ newlines to 2, trim line-end whitespace, ensure single trailing newline.
241
+ fn post_process(buf: &mut String) {
242
+ // Collapse runs of 3+ newlines to exactly 2
243
+ while buf.contains("\n\n\n") {
244
+ *buf = buf.replace("\n\n\n", "\n\n");
245
+ }
246
+
247
+ // Trim trailing whitespace from each line — collect owned strings to avoid borrow conflict
248
+ let lines: Vec<String> = buf.lines().map(|line| line.trim_end().to_string()).collect();
249
+ buf.clear();
250
+ for (i, line) in lines.iter().enumerate() {
251
+ buf.push_str(line);
252
+ if i < lines.len() - 1 {
253
+ buf.push('\n');
254
+ }
255
+ }
256
+
257
+ // Trim to single trailing newline
258
+ let keep = buf.trim_end_matches('\n').len();
259
+ if keep == 0 {
260
+ buf.clear();
261
+ } else {
262
+ buf.truncate(keep);
263
+ buf.push('\n');
264
+ }
265
+ }
@@ -82,6 +82,12 @@ pub fn process_text_node(
82
82
  if !output.ends_with("\n\n") {
83
83
  if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
84
84
  if is_inline_element(next_tag) {
85
+ // Newlines between inline elements collapse to a single space
86
+ // in HTML rendering (per CSS white-space: normal). Preserve
87
+ // this word boundary so adjacent inline content doesn't merge.
88
+ if !output.ends_with(' ') && !output.ends_with('\n') {
89
+ output.push(' ');
90
+ }
85
91
  return;
86
92
  }
87
93
  }
@@ -121,7 +121,7 @@ pub struct ConversionOptions {
121
121
  /// Useful for text-only extraction or filtering out visual content.
122
122
  pub skip_images: bool,
123
123
 
124
- /// Output format for conversion (Markdown or Djot)
124
+ /// Output format for conversion (Markdown, Djot, or Plain)
125
125
  pub output_format: OutputFormat,
126
126
  }
127
127
 
@@ -233,7 +233,7 @@ pub struct ConversionOptionsUpdate {
233
233
  /// Optional skip images override
234
234
  pub skip_images: Option<bool>,
235
235
 
236
- /// Optional output format override (Markdown or Djot)
236
+ /// Optional output format override (Markdown, Djot, or Plain)
237
237
  pub output_format: Option<OutputFormat>,
238
238
  }
239
239
 
@@ -182,6 +182,8 @@ pub enum OutputFormat {
182
182
  Markdown,
183
183
  /// Djot lightweight markup language.
184
184
  Djot,
185
+ /// Plain text output (no markup, visible text only).
186
+ Plain,
185
187
  }
186
188
 
187
189
  impl OutputFormat {
@@ -193,6 +195,7 @@ impl OutputFormat {
193
195
  pub fn parse(value: &str) -> Self {
194
196
  match normalize_token(value).as_str() {
195
197
  "djot" => Self::Djot,
198
+ "plain" | "plaintext" | "text" => Self::Plain,
196
199
  _ => Self::Markdown,
197
200
  }
198
201
  }
@@ -329,6 +332,7 @@ mod serde_impls {
329
332
  let s = match self {
330
333
  Self::Markdown => "markdown",
331
334
  Self::Djot => "djot",
335
+ Self::Plain => "plain",
332
336
  };
333
337
  serializer.serialize_str(s)
334
338
  }
@@ -373,6 +373,66 @@ fn test_superscript_leading_whitespace() {
373
373
  assert_eq!(result, "hello ^world^\n");
374
374
  }
375
375
 
376
+ #[test]
377
+ fn test_subscript_default_passthrough() {
378
+ let html = "<p>H<sub>2</sub>O</p>";
379
+ let result = convert(html, None).unwrap();
380
+ assert_eq!(result, "H2O\n");
381
+ }
382
+
383
+ #[test]
384
+ fn test_superscript_default_passthrough() {
385
+ let html = "<p>x<sup>2</sup> + y<sup>3</sup></p>";
386
+ let result = convert(html, None).unwrap();
387
+ assert_eq!(result, "x2 + y3\n");
388
+ }
389
+
390
+ #[test]
391
+ fn test_subscript_superscript_combined_default() {
392
+ let html = "<p>CO<sub>2</sub><sup>*</sup></p>";
393
+ let result = convert(html, None).unwrap();
394
+ assert_eq!(result, "CO2*\n");
395
+ }
396
+
397
+ #[test]
398
+ fn test_subscript_html_tag_symbol() {
399
+ let html = "<p>H<sub>2</sub>O</p>";
400
+ let opts = ConversionOptions {
401
+ sub_symbol: "<sub>".to_string(),
402
+ ..Default::default()
403
+ };
404
+ let result = convert(html, Some(opts)).unwrap();
405
+ assert_eq!(result, "H<sub>2</sub>O\n");
406
+ }
407
+
408
+ #[test]
409
+ fn test_adjacent_links_with_newline_separator() {
410
+ let html = "<p>\n<a href=\"/page1\">Link 1</a>\n<a href=\"/page2\">Link 2</a>\n</p>";
411
+ let result = convert(html, None).unwrap();
412
+ assert_eq!(result, "[Link 1](/page1) [Link 2](/page2)\n");
413
+ }
414
+
415
+ #[test]
416
+ fn test_adjacent_links_no_whitespace() {
417
+ let html = "<p><a href=\"/page1\">Link 1</a><a href=\"/page2\">Link 2</a></p>";
418
+ let result = convert(html, None).unwrap();
419
+ assert_eq!(result, "[Link 1](/page1)[Link 2](/page2)\n");
420
+ }
421
+
422
+ #[test]
423
+ fn test_adjacent_links_with_space() {
424
+ let html = "<p><a href=\"/page1\">Link 1</a> <a href=\"/page2\">Link 2</a></p>";
425
+ let result = convert(html, None).unwrap();
426
+ assert_eq!(result, "[Link 1](/page1) [Link 2](/page2)\n");
427
+ }
428
+
429
+ #[test]
430
+ fn test_adjacent_inline_elements_with_newline() {
431
+ let html = "<p><strong>bold</strong>\n<em>italic</em></p>";
432
+ let result = convert(html, None).unwrap();
433
+ assert_eq!(result, "**bold** *italic*\n");
434
+ }
435
+
376
436
  #[test]
377
437
  fn test_autolink() {
378
438
  let html = "<p><a href=\"https://example.com\">https://example.com</a></p>";
@@ -0,0 +1,214 @@
1
+ //! Tests for plain text output format support.
2
+
3
+ use html_to_markdown_rs::{ConversionOptions, OutputFormat, convert};
4
+
5
+ fn plain_options() -> ConversionOptions {
6
+ ConversionOptions {
7
+ output_format: OutputFormat::Plain,
8
+ ..Default::default()
9
+ }
10
+ }
11
+
12
+ #[test]
13
+ fn test_plain_basic_paragraph() {
14
+ let html = "<p>Hello world</p>";
15
+ let result = convert(html, Some(plain_options())).unwrap();
16
+ assert_eq!(result, "Hello world\n");
17
+ }
18
+
19
+ #[test]
20
+ fn test_plain_no_strong_markers() {
21
+ let html = "<p>This is <strong>bold</strong> text</p>";
22
+ let result = convert(html, Some(plain_options())).unwrap();
23
+ assert_eq!(result, "This is bold text\n");
24
+ }
25
+
26
+ #[test]
27
+ fn test_plain_no_emphasis_markers() {
28
+ let html = "<p>This is <em>italic</em> text</p>";
29
+ let result = convert(html, Some(plain_options())).unwrap();
30
+ assert_eq!(result, "This is italic text\n");
31
+ }
32
+
33
+ #[test]
34
+ fn test_plain_link_text_only() {
35
+ let html = r#"<p>Visit <a href="https://example.com">our site</a> today</p>"#;
36
+ let result = convert(html, Some(plain_options())).unwrap();
37
+ assert_eq!(result, "Visit our site today\n");
38
+ }
39
+
40
+ #[test]
41
+ fn test_plain_image_alt_text() {
42
+ let html = r#"<img alt="A cute cat">"#;
43
+ let result = convert(html, Some(plain_options())).unwrap();
44
+ assert_eq!(result, "A cute cat\n");
45
+ }
46
+
47
+ #[test]
48
+ fn test_plain_image_skipped_when_option_set() {
49
+ let html = r#"<img alt="A cute cat">"#;
50
+ let mut opts = plain_options();
51
+ opts.skip_images = true;
52
+ let result = convert(html, Some(opts)).unwrap();
53
+ assert_eq!(result, "");
54
+ }
55
+
56
+ #[test]
57
+ fn test_plain_code_block() {
58
+ let html = "<pre><code>fn main() {}</code></pre>";
59
+ let result = convert(html, Some(plain_options())).unwrap();
60
+ assert_eq!(result, "fn main() {}\n");
61
+ }
62
+
63
+ #[test]
64
+ fn test_plain_blockquote_no_prefix() {
65
+ let html = "<blockquote><p>Quoted text</p></blockquote>";
66
+ let result = convert(html, Some(plain_options())).unwrap();
67
+ assert!(
68
+ !result.contains('>'),
69
+ "Plain text should not contain blockquote prefix, got: {result}"
70
+ );
71
+ assert!(result.contains("Quoted text"));
72
+ }
73
+
74
+ #[test]
75
+ fn test_plain_list_items_on_separate_lines() {
76
+ let html = "<ul><li>First</li><li>Second</li><li>Third</li></ul>";
77
+ let result = convert(html, Some(plain_options())).unwrap();
78
+ assert!(result.contains("First"));
79
+ assert!(result.contains("Second"));
80
+ assert!(result.contains("Third"));
81
+ // Items should be on separate lines
82
+ let lines: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
83
+ assert!(lines.len() >= 3, "Expected at least 3 lines, got: {result}");
84
+ }
85
+
86
+ #[test]
87
+ fn test_plain_table_cells_extracted() {
88
+ let html = "<table><tr><td>A</td><td>B</td></tr><tr><td>C</td><td>D</td></tr></table>";
89
+ let result = convert(html, Some(plain_options())).unwrap();
90
+ assert!(result.contains('A'));
91
+ assert!(result.contains('B'));
92
+ assert!(result.contains('C'));
93
+ assert!(result.contains('D'));
94
+ }
95
+
96
+ #[test]
97
+ fn test_plain_no_escaping() {
98
+ let html = "<p>* not a list</p>";
99
+ let result = convert(html, Some(plain_options())).unwrap();
100
+ assert!(
101
+ result.contains("* not a list"),
102
+ "Plain text should not escape asterisks, got: {result}"
103
+ );
104
+ assert!(
105
+ !result.contains("\\*"),
106
+ "Plain text should not backslash-escape, got: {result}"
107
+ );
108
+ }
109
+
110
+ #[test]
111
+ fn test_plain_script_excluded() {
112
+ let html = "<p>Before</p><script>alert('xss')</script><p>After</p>";
113
+ let result = convert(html, Some(plain_options())).unwrap();
114
+ assert!(
115
+ !result.contains("alert"),
116
+ "Script content should be excluded, got: {result}"
117
+ );
118
+ assert!(result.contains("Before"));
119
+ assert!(result.contains("After"));
120
+ }
121
+
122
+ #[test]
123
+ fn test_plain_style_excluded() {
124
+ let html = "<p>Hello</p><style>.foo { color: red; }</style>";
125
+ let result = convert(html, Some(plain_options())).unwrap();
126
+ assert!(
127
+ !result.contains("color"),
128
+ "Style content should be excluded, got: {result}"
129
+ );
130
+ assert!(result.contains("Hello"));
131
+ }
132
+
133
+ #[test]
134
+ fn test_plain_br_becomes_newline() {
135
+ let html = "<p>Line one<br>Line two</p>";
136
+ let result = convert(html, Some(plain_options())).unwrap();
137
+ assert!(
138
+ result.contains("Line one\nLine two"),
139
+ "Expected newline from <br>, got: {result}"
140
+ );
141
+ }
142
+
143
+ #[test]
144
+ fn test_plain_hr_becomes_blank_line() {
145
+ let html = "<p>Above</p><hr><p>Below</p>";
146
+ let result = convert(html, Some(plain_options())).unwrap();
147
+ assert!(result.contains("Above"));
148
+ assert!(result.contains("Below"));
149
+ // Should have blank line between
150
+ assert!(result.contains("\n\n"), "Expected blank line from <hr>, got: {result}");
151
+ }
152
+
153
+ #[test]
154
+ fn test_plain_nested_inline_formatting_stripped() {
155
+ let html = "<p>Start <strong>bold <em>and italic</em></strong> end</p>";
156
+ let result = convert(html, Some(plain_options())).unwrap();
157
+ assert_eq!(result, "Start bold and italic end\n");
158
+ }
159
+
160
+ #[test]
161
+ fn test_plain_heading_no_markers() {
162
+ let html = "<h1>Title</h1><p>Content</p>";
163
+ let result = convert(html, Some(plain_options())).unwrap();
164
+ assert!(
165
+ !result.contains('#'),
166
+ "Plain text should not contain heading markers, got: {result}"
167
+ );
168
+ assert!(result.contains("Title"));
169
+ assert!(result.contains("Content"));
170
+ }
171
+
172
+ #[test]
173
+ fn test_plain_parse_variants() {
174
+ assert_eq!(OutputFormat::parse("plain"), OutputFormat::Plain);
175
+ assert_eq!(OutputFormat::parse("plaintext"), OutputFormat::Plain);
176
+ assert_eq!(OutputFormat::parse("text"), OutputFormat::Plain);
177
+ assert_eq!(OutputFormat::parse("Plain"), OutputFormat::Plain);
178
+ assert_eq!(OutputFormat::parse("PLAINTEXT"), OutputFormat::Plain);
179
+ }
180
+
181
+ #[test]
182
+ fn test_plain_empty_input() {
183
+ let html = "";
184
+ let result = convert(html, Some(plain_options())).unwrap();
185
+ assert_eq!(result, "");
186
+ }
187
+
188
+ #[test]
189
+ fn test_plain_whitespace_only_html() {
190
+ let html = "<p> </p>";
191
+ let result = convert(html, Some(plain_options())).unwrap();
192
+ assert_eq!(result, "");
193
+ }
194
+
195
+ #[test]
196
+ fn test_plain_inline_code_no_backticks() {
197
+ let html = "<p>Use <code>fmt.Println</code> to print</p>";
198
+ let result = convert(html, Some(plain_options())).unwrap();
199
+ assert!(
200
+ !result.contains('`'),
201
+ "Plain text should not contain backticks, got: {result}"
202
+ );
203
+ assert!(result.contains("fmt.Println"));
204
+ }
205
+
206
+ #[test]
207
+ fn test_plain_pre_preserves_whitespace() {
208
+ let html = "<pre> indented\n more</pre>";
209
+ let result = convert(html, Some(plain_options())).unwrap();
210
+ assert!(
211
+ result.contains(" indented\n more"),
212
+ "Pre blocks should preserve whitespace, got: {result}"
213
+ );
214
+ }
data/spec/visitor_spec.rb CHANGED
@@ -35,7 +35,7 @@ RSpec.describe HtmlToMarkdown do
35
35
  visit_definition_list_end visit_form visit_input visit_button visit_audio visit_video
36
36
  visit_iframe visit_details visit_summary visit_figure_start visit_figcaption
37
37
  visit_figure_end
38
- ].each_with_object({}) { |name, hash| hash[name.to_sym] = { type: :continue } }
38
+ ].to_h { |name| [name.to_sym, { type: :continue }] }
39
39
  end
40
40
 
41
41
  def create_visitor(**overrides)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.26.2
4
+ version: 2.27.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-28 00:00:00.000000000 Z
11
+ date: 2026-03-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -1852,6 +1852,7 @@ files:
1852
1852
  - rust-vendor/html-to-markdown-rs/src/converter/media/svg.rs
1853
1853
  - rust-vendor/html-to-markdown-rs/src/converter/metadata.rs
1854
1854
  - rust-vendor/html-to-markdown-rs/src/converter/mod.rs
1855
+ - rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs
1855
1856
  - rust-vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs
1856
1857
  - rust-vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs
1857
1858
  - rust-vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs
@@ -1949,6 +1950,7 @@ files:
1949
1950
  - rust-vendor/html-to-markdown-rs/tests/issue_212_regressions.rs
1950
1951
  - rust-vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs
1951
1952
  - rust-vendor/html-to-markdown-rs/tests/lists_test.rs
1953
+ - rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs
1952
1954
  - rust-vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
1953
1955
  - rust-vendor/html-to-markdown-rs/tests/skip_images_test.rs
1954
1956
  - rust-vendor/html-to-markdown-rs/tests/tables_test.rs