html-to-markdown 2.26.3 → 2.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 23d0242cd4fc575d8081e675fb8d16f09faa7fb1c6c0df9b18d21338c0391880
4
- data.tar.gz: cf86724440a34a26e1f17c134a232b1b321edaacb95758e42f9eab59dc710f8b
3
+ metadata.gz: b359606f2fac17cda3721fd381e8717c5f32ad6b9cbbe7d3f691078521071c5e
4
+ data.tar.gz: 13bb3c8ba29a9270bd91d32bb1fe50c3353895cdd1bfd920ea9c0f79c52fbe9c
5
5
  SHA512:
6
- metadata.gz: 63afe8bdf9d36f4cc225859e3a7ebb62452e97feafccc5ea1e20564a47b7e037900b6af7300508eec4313d4306608e3b00bc5f6a3115001ee250d5c560880bb6
7
- data.tar.gz: c8fcaa6e61fea4325b08ce39ebf0a2bd92ff4e6d58702497e38cc6a081c1eec3de095d72986ccdc828ee6e219697d53c12483087ee563d39f767fe989d72ffdb
6
+ metadata.gz: f0aea92dccbf209b90476ecabccd195252ae48b2e5aad1bdd54183b1e1686e8142a7f817d54a6513e816cf1038ba20ea8eebaffe45086ad7a05648e9314799a8
7
+ data.tar.gz: ed60ef47e31437ea2f459addd4d871d19f55fc7a121bfcf592535811c85441a5244b52d4d4f87f70caeeb368913c44a3c091aa900c631b074b87bb8c3c81d2f3
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.26.3)
4
+ html-to-markdown (2.27.0)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -172,7 +172,7 @@ CHECKSUMS
172
172
  ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
173
173
  ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
174
174
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
175
- html-to-markdown (2.26.3)
175
+ html-to-markdown (2.27.0)
176
176
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
177
177
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
178
178
  json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
data/README.md CHANGED
@@ -144,7 +144,7 @@ Extract base64-encoded inline images with metadata.
144
144
  - `wrap_width`: Wrap at column — default: `80`
145
145
  - `code_language`: Default fenced code block language — default: none
146
146
  - `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
147
- - `output_format`: Output markup format (`"markdown"` | `"djot"`) — default: `"markdown"`
147
+ - `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
148
148
 
149
149
  **`MetadataConfig`** – Selective metadata extraction:
150
150
  - `extract_headers`: h1-h6 elements — default: `true`
@@ -191,6 +191,22 @@ djot = HtmlToMarkdown.convert(html, output_format: 'djot')
191
191
  Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
192
192
 
193
193
 
194
+ ## Plain Text Output
195
+
196
+ Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
197
+
198
+ ```ruby
199
+ require 'html_to_markdown'
200
+
201
+ html = "<h1>Title</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
202
+
203
+ plain = HtmlToMarkdown.convert(html, output_format: 'plain')
204
+ # Result: "Title\n\nThis is bold and italic text."
205
+ ```
206
+
207
+ Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
208
+
209
+
194
210
 
195
211
  ## Metadata Extraction
196
212
 
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version ="2.26.3"
3
+ version ="2.27.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -65,6 +65,7 @@ pub fn parse_output_format(value: Value) -> Result<OutputFormat, Error> {
65
65
  match symbol_to_string(value)?.as_str() {
66
66
  "markdown" => Ok(OutputFormat::Markdown),
67
67
  "djot" => Ok(OutputFormat::Djot),
68
+ "plain" => Ok(OutputFormat::Plain),
68
69
  other => Err(arg_error(format!("invalid output_format: {other}"))),
69
70
  }
70
71
  }
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.26.3'
4
+ VERSION = '2.27.0'
5
5
  end
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "2.26.3"
3
+ version = "2.27.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -562,19 +562,20 @@ fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
562
562
  Cow::Borrowed(trimmed)
563
563
  };
564
564
 
565
- let escaped =
566
- if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
567
- text::escape(
568
- normalized.as_ref(),
569
- options.escape_misc,
570
- options.escape_asterisks,
571
- options.escape_underscores,
572
- options.escape_ascii,
573
- )
574
- .into_owned()
575
- } else {
576
- normalized.into_owned()
577
- };
565
+ let escaped = if options.output_format == crate::options::OutputFormat::Plain {
566
+ normalized.into_owned()
567
+ } else if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
568
+ text::escape(
569
+ normalized.as_ref(),
570
+ options.escape_misc,
571
+ options.escape_asterisks,
572
+ options.escape_underscores,
573
+ options.escape_ascii,
574
+ )
575
+ .into_owned()
576
+ } else {
577
+ normalized.into_owned()
578
+ };
578
579
 
579
580
  let mut output = String::with_capacity(escaped.len() + 1);
580
581
  output.push_str(&escaped);
@@ -18,11 +18,13 @@ use crate::converter::main_helpers::{
18
18
  extract_head_metadata, format_metadata_frontmatter, handle_hocr_document, has_custom_element_tags,
19
19
  repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace,
20
20
  };
21
+ use crate::converter::plain_text::extract_plain_text;
21
22
  use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing};
22
23
  use crate::converter::utility::caching::build_dom_context;
23
24
  use crate::converter::utility::content::normalized_tag_name;
24
25
  use crate::converter::utility::preprocessing::{preprocess_html, strip_script_and_style_tags};
25
26
  use crate::converter::utility::serialization::serialize_tag_to_html;
27
+ use crate::options::OutputFormat;
26
28
 
27
29
  use crate::converter::handlers::{handle_blockquote, handle_code, handle_graphic, handle_img, handle_link, handle_pre};
28
30
  use crate::error::Result;
@@ -134,6 +136,12 @@ pub(crate) fn convert_html_impl(
134
136
  }
135
137
  }
136
138
 
139
+ // Fast path for plain text output: skip the full conversion pipeline
140
+ if options.output_format == OutputFormat::Plain {
141
+ let plain = extract_plain_text(&dom, parser, options);
142
+ return Ok(plain);
143
+ }
144
+
137
145
  let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
138
146
  #[cfg(feature = "metadata")]
139
147
  let wants_document = metadata_collector
@@ -102,6 +102,7 @@ pub mod main;
102
102
  mod main_helpers;
103
103
  pub mod media;
104
104
  mod metadata;
105
+ pub mod plain_text;
105
106
  pub mod preprocessing_helpers;
106
107
  pub mod semantic;
107
108
  pub mod text;
@@ -0,0 +1,265 @@
1
+ //! Plain text extraction from parsed HTML DOM.
2
+ //!
3
+ //! Provides a fast-path text extractor that walks the DOM tree collecting only
4
+ //! visible text content with structural whitespace, bypassing the full
5
+ //! Markdown/Djot conversion pipeline.
6
+
7
+ use crate::options::ConversionOptions;
8
+ use crate::text;
9
+
10
+ /// Tags whose content should be skipped entirely.
11
+ const SKIP_TAGS: &[&str] = &["script", "style", "head", "template", "noscript", "svg", "math"];
12
+
13
+ /// Block-level tags that should be separated by blank lines.
14
+ const BLOCK_TAGS: &[&str] = &[
15
+ "p",
16
+ "div",
17
+ "h1",
18
+ "h2",
19
+ "h3",
20
+ "h4",
21
+ "h5",
22
+ "h6",
23
+ "blockquote",
24
+ "section",
25
+ "article",
26
+ "aside",
27
+ "main",
28
+ "nav",
29
+ "header",
30
+ "footer",
31
+ "figure",
32
+ "figcaption",
33
+ "details",
34
+ "summary",
35
+ "address",
36
+ "hgroup",
37
+ "search",
38
+ ];
39
+
40
+ /// Extract plain text from a parsed DOM tree.
41
+ ///
42
+ /// Walks the tree collecting visible text with structural whitespace:
43
+ /// - Block elements get blank-line separation
44
+ /// - `<br>` becomes a newline, `<hr>` a blank line
45
+ /// - `<pre>` preserves internal whitespace
46
+ /// - `<img>` outputs alt text (unless `skip_images` is set)
47
+ /// - `<script>`, `<style>`, `<head>`, `<template>`, `<noscript>` are skipped
48
+ /// - Tables: cells separated by tab, rows by newline
49
+ /// - Inline elements are recursed without markers
50
+ pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
51
+ let mut buf = String::with_capacity(1024);
52
+
53
+ for child_handle in dom.children() {
54
+ walk_plain(child_handle, parser, &mut buf, options, false);
55
+ }
56
+
57
+ post_process(&mut buf);
58
+ buf
59
+ }
60
+
61
+ /// Recursive plain-text walker.
62
+ fn walk_plain(
63
+ node_handle: &tl::NodeHandle,
64
+ parser: &tl::Parser,
65
+ buf: &mut String,
66
+ options: &ConversionOptions,
67
+ in_pre: bool,
68
+ ) {
69
+ let Some(node) = node_handle.get(parser) else {
70
+ return;
71
+ };
72
+
73
+ match node {
74
+ tl::Node::Raw(bytes) => {
75
+ let raw = bytes.as_utf8_str();
76
+ let decoded = text::decode_html_entities_cow(raw.as_ref());
77
+ if in_pre {
78
+ buf.push_str(&decoded);
79
+ } else {
80
+ let normalized = text::normalize_whitespace_cow(&decoded);
81
+ if !normalized.is_empty() {
82
+ // Avoid leading space at start of a new line
83
+ if normalized.as_ref() == " " && buf.ends_with('\n') {
84
+ return;
85
+ }
86
+ buf.push_str(&normalized);
87
+ }
88
+ }
89
+ }
90
+ tl::Node::Tag(tag) => {
91
+ let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
92
+ let tag_str = tag_name.as_str();
93
+
94
+ // Skip invisible content
95
+ if SKIP_TAGS.contains(&tag_str) {
96
+ return;
97
+ }
98
+
99
+ match tag_str {
100
+ "br" => {
101
+ buf.push('\n');
102
+ }
103
+ "hr" => {
104
+ ensure_blank_line(buf);
105
+ }
106
+ "pre" => {
107
+ ensure_blank_line(buf);
108
+ walk_children(tag, parser, buf, options, true);
109
+ ensure_blank_line(buf);
110
+ }
111
+ "img" => {
112
+ if !options.skip_images {
113
+ if let Some(Some(alt)) = tag.attributes().get("alt") {
114
+ let alt_text = alt.as_utf8_str();
115
+ if !alt_text.is_empty() {
116
+ buf.push_str(alt_text.as_ref());
117
+ }
118
+ }
119
+ }
120
+ }
121
+ "table" => {
122
+ ensure_blank_line(buf);
123
+ walk_table(tag, parser, buf, options);
124
+ ensure_blank_line(buf);
125
+ }
126
+ "li" => {
127
+ ensure_newline(buf);
128
+ walk_children(tag, parser, buf, options, false);
129
+ ensure_newline(buf);
130
+ }
131
+ _ if BLOCK_TAGS.contains(&tag_str) => {
132
+ ensure_blank_line(buf);
133
+ walk_children(tag, parser, buf, options, in_pre);
134
+ ensure_blank_line(buf);
135
+ }
136
+ _ => {
137
+ // Inline elements and structural containers (html, body, ul, ol, etc.)
138
+ walk_children(tag, parser, buf, options, in_pre);
139
+ }
140
+ }
141
+ }
142
+ tl::Node::Comment(_) => {}
143
+ }
144
+ }
145
+
146
+ /// Walk all children of a tag.
147
+ fn walk_children(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions, in_pre: bool) {
148
+ let children = tag.children();
149
+ let top = children.top();
150
+ for child in top.iter() {
151
+ walk_plain(child, parser, buf, options, in_pre);
152
+ }
153
+ }
154
+
155
+ /// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
156
+ fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions) {
157
+ // Collect all <tr> node handles by recursing into the table
158
+ let mut row_handles = Vec::new();
159
+ collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
160
+
161
+ for (row_idx, row_handle) in row_handles.iter().enumerate() {
162
+ if row_idx > 0 {
163
+ buf.push('\n');
164
+ }
165
+ let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) else {
166
+ continue;
167
+ };
168
+
169
+ // Collect direct <th>/<td> children
170
+ let mut cell_handles = Vec::new();
171
+ let row_children = row_tag.children();
172
+ let row_top = row_children.top();
173
+ for child in row_top.iter() {
174
+ if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
175
+ let name = child_tag.name().as_utf8_str();
176
+ if name.eq_ignore_ascii_case("th") || name.eq_ignore_ascii_case("td") {
177
+ cell_handles.push(*child);
178
+ }
179
+ }
180
+ }
181
+
182
+ for (cell_idx, cell_handle) in cell_handles.iter().enumerate() {
183
+ if cell_idx > 0 {
184
+ buf.push('\t');
185
+ }
186
+ let mut cell_buf = String::new();
187
+ if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
188
+ walk_children(cell_tag, parser, &mut cell_buf, options, false);
189
+ }
190
+ buf.push_str(cell_buf.trim());
191
+ }
192
+ }
193
+ }
194
+
195
+ /// Recursively collect all descendant `NodeHandle`s matching `target_tag` (by cloning handles).
196
+ fn collect_descendant_handles(
197
+ tag: &tl::HTMLTag,
198
+ parser: &tl::Parser,
199
+ target_tag: &str,
200
+ result: &mut Vec<tl::NodeHandle>,
201
+ ) {
202
+ let children = tag.children();
203
+ let top = children.top();
204
+ for child in top.iter() {
205
+ if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
206
+ if child_tag.name().as_utf8_str().eq_ignore_ascii_case(target_tag) {
207
+ result.push(*child);
208
+ } else {
209
+ collect_descendant_handles(child_tag, parser, target_tag, result);
210
+ }
211
+ }
212
+ }
213
+ }
214
+
215
/// Ensure the buffer ends with a blank line (two newlines).
///
/// Trailing spaces and tabs are removed first. If nothing is left after that
/// (the buffer was empty or held only horizontal whitespace) the buffer is
/// left empty, so output never starts with blank lines. Otherwise newlines are
/// appended until the buffer ends with at least two; existing longer runs are
/// left untouched.
fn ensure_blank_line(buf: &mut String) {
    // Strip trailing horizontal whitespace
    let kept = buf.trim_end_matches(|c: char| c == ' ' || c == '\t').len();
    buf.truncate(kept);

    // Checked *after* stripping: a whitespace-only buffer must not turn into "\n\n".
    if buf.is_empty() {
        return;
    }

    let current_newlines = buf.bytes().rev().take_while(|&b| b == b'\n').count();
    for _ in current_newlines..2 {
        buf.push('\n');
    }
}
229
+
230
/// Ensure the buffer ends with at least one newline.
///
/// Trailing spaces and tabs are removed first, mirroring `ensure_blank_line`.
/// If the buffer is empty after that (it was empty or held only horizontal
/// whitespace) nothing is appended, so output never starts with a newline.
fn ensure_newline(buf: &mut String) {
    // Strip trailing horizontal whitespace so the line break follows the text directly.
    let kept = buf.trim_end_matches(|c: char| c == ' ' || c == '\t').len();
    buf.truncate(kept);

    if !buf.is_empty() && !buf.ends_with('\n') {
        buf.push('\n');
    }
}
239
+
240
/// Post-process: collapse 3+ newlines to 2, trim line-end whitespace, ensure single trailing newline.
///
/// Works in a single pass over the lines of `buf`:
/// 1. every line has its trailing whitespace removed;
/// 2. runs of newlines are capped at two (one blank line), counted *after*
///    trimming, so lines that held only whitespace cannot leave extra blank
///    lines behind;
/// 3. the result ends with exactly one newline, or is empty when no visible
///    text remains.
///
/// Up to two leading newlines are retained if the buffer starts with them.
fn post_process(buf: &mut String) {
    let mut out = String::with_capacity(buf.len() + 1);
    // Number of consecutive '\n' currently at the end of `out`.
    let mut newline_run = 0usize;

    for line in buf.lines() {
        let line = line.trim_end();
        if line.is_empty() {
            // A blank line adds one newline to the current run; cap the run at two.
            if newline_run < 2 {
                out.push('\n');
                newline_run += 1;
            }
        } else {
            out.push_str(line);
            out.push('\n');
            newline_run = 1;
        }
    }

    // Trim to single trailing newline (or nothing at all if only newlines remain).
    let keep = out.trim_end_matches('\n').len();
    out.truncate(keep);
    if !out.is_empty() {
        out.push('\n');
    }

    *buf = out;
}
@@ -121,7 +121,7 @@ pub struct ConversionOptions {
121
121
  /// Useful for text-only extraction or filtering out visual content.
122
122
  pub skip_images: bool,
123
123
 
124
- /// Output format for conversion (Markdown or Djot)
124
+ /// Output format for conversion (Markdown, Djot, or Plain)
125
125
  pub output_format: OutputFormat,
126
126
  }
127
127
 
@@ -233,7 +233,7 @@ pub struct ConversionOptionsUpdate {
233
233
  /// Optional skip images override
234
234
  pub skip_images: Option<bool>,
235
235
 
236
- /// Optional output format override (Markdown or Djot)
236
+ /// Optional output format override (Markdown, Djot, or Plain)
237
237
  pub output_format: Option<OutputFormat>,
238
238
  }
239
239
 
@@ -182,6 +182,8 @@ pub enum OutputFormat {
182
182
  Markdown,
183
183
  /// Djot lightweight markup language.
184
184
  Djot,
185
+ /// Plain text output (no markup, visible text only).
186
+ Plain,
185
187
  }
186
188
 
187
189
  impl OutputFormat {
@@ -193,6 +195,7 @@ impl OutputFormat {
193
195
  pub fn parse(value: &str) -> Self {
194
196
  match normalize_token(value).as_str() {
195
197
  "djot" => Self::Djot,
198
+ "plain" | "plaintext" | "text" => Self::Plain,
196
199
  _ => Self::Markdown,
197
200
  }
198
201
  }
@@ -329,6 +332,7 @@ mod serde_impls {
329
332
  let s = match self {
330
333
  Self::Markdown => "markdown",
331
334
  Self::Djot => "djot",
335
+ Self::Plain => "plain",
332
336
  };
333
337
  serializer.serialize_str(s)
334
338
  }
@@ -0,0 +1,214 @@
1
+ //! Tests for plain text output format support.
2
+
3
+ use html_to_markdown_rs::{ConversionOptions, OutputFormat, convert};
4
+
5
+ fn plain_options() -> ConversionOptions {
6
+ ConversionOptions {
7
+ output_format: OutputFormat::Plain,
8
+ ..Default::default()
9
+ }
10
+ }
11
+
12
+ #[test]
13
+ fn test_plain_basic_paragraph() {
14
+ let html = "<p>Hello world</p>";
15
+ let result = convert(html, Some(plain_options())).unwrap();
16
+ assert_eq!(result, "Hello world\n");
17
+ }
18
+
19
+ #[test]
20
+ fn test_plain_no_strong_markers() {
21
+ let html = "<p>This is <strong>bold</strong> text</p>";
22
+ let result = convert(html, Some(plain_options())).unwrap();
23
+ assert_eq!(result, "This is bold text\n");
24
+ }
25
+
26
+ #[test]
27
+ fn test_plain_no_emphasis_markers() {
28
+ let html = "<p>This is <em>italic</em> text</p>";
29
+ let result = convert(html, Some(plain_options())).unwrap();
30
+ assert_eq!(result, "This is italic text\n");
31
+ }
32
+
33
+ #[test]
34
+ fn test_plain_link_text_only() {
35
+ let html = r#"<p>Visit <a href="https://example.com">our site</a> today</p>"#;
36
+ let result = convert(html, Some(plain_options())).unwrap();
37
+ assert_eq!(result, "Visit our site today\n");
38
+ }
39
+
40
+ #[test]
41
+ fn test_plain_image_alt_text() {
42
+ let html = r#"<img alt="A cute cat">"#;
43
+ let result = convert(html, Some(plain_options())).unwrap();
44
+ assert_eq!(result, "A cute cat\n");
45
+ }
46
+
47
+ #[test]
48
+ fn test_plain_image_skipped_when_option_set() {
49
+ let html = r#"<img alt="A cute cat">"#;
50
+ let mut opts = plain_options();
51
+ opts.skip_images = true;
52
+ let result = convert(html, Some(opts)).unwrap();
53
+ assert_eq!(result, "");
54
+ }
55
+
56
+ #[test]
57
+ fn test_plain_code_block() {
58
+ let html = "<pre><code>fn main() {}</code></pre>";
59
+ let result = convert(html, Some(plain_options())).unwrap();
60
+ assert_eq!(result, "fn main() {}\n");
61
+ }
62
+
63
+ #[test]
64
+ fn test_plain_blockquote_no_prefix() {
65
+ let html = "<blockquote><p>Quoted text</p></blockquote>";
66
+ let result = convert(html, Some(plain_options())).unwrap();
67
+ assert!(
68
+ !result.contains('>'),
69
+ "Plain text should not contain blockquote prefix, got: {result}"
70
+ );
71
+ assert!(result.contains("Quoted text"));
72
+ }
73
+
74
+ #[test]
75
+ fn test_plain_list_items_on_separate_lines() {
76
+ let html = "<ul><li>First</li><li>Second</li><li>Third</li></ul>";
77
+ let result = convert(html, Some(plain_options())).unwrap();
78
+ assert!(result.contains("First"));
79
+ assert!(result.contains("Second"));
80
+ assert!(result.contains("Third"));
81
+ // Items should be on separate lines
82
+ let lines: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
83
+ assert!(lines.len() >= 3, "Expected at least 3 lines, got: {result}");
84
+ }
85
+
86
+ #[test]
87
+ fn test_plain_table_cells_extracted() {
88
+ let html = "<table><tr><td>A</td><td>B</td></tr><tr><td>C</td><td>D</td></tr></table>";
89
+ let result = convert(html, Some(plain_options())).unwrap();
90
+ assert!(result.contains('A'));
91
+ assert!(result.contains('B'));
92
+ assert!(result.contains('C'));
93
+ assert!(result.contains('D'));
94
+ }
95
+
96
+ #[test]
97
+ fn test_plain_no_escaping() {
98
+ let html = "<p>* not a list</p>";
99
+ let result = convert(html, Some(plain_options())).unwrap();
100
+ assert!(
101
+ result.contains("* not a list"),
102
+ "Plain text should not escape asterisks, got: {result}"
103
+ );
104
+ assert!(
105
+ !result.contains("\\*"),
106
+ "Plain text should not backslash-escape, got: {result}"
107
+ );
108
+ }
109
+
110
+ #[test]
111
+ fn test_plain_script_excluded() {
112
+ let html = "<p>Before</p><script>alert('xss')</script><p>After</p>";
113
+ let result = convert(html, Some(plain_options())).unwrap();
114
+ assert!(
115
+ !result.contains("alert"),
116
+ "Script content should be excluded, got: {result}"
117
+ );
118
+ assert!(result.contains("Before"));
119
+ assert!(result.contains("After"));
120
+ }
121
+
122
+ #[test]
123
+ fn test_plain_style_excluded() {
124
+ let html = "<p>Hello</p><style>.foo { color: red; }</style>";
125
+ let result = convert(html, Some(plain_options())).unwrap();
126
+ assert!(
127
+ !result.contains("color"),
128
+ "Style content should be excluded, got: {result}"
129
+ );
130
+ assert!(result.contains("Hello"));
131
+ }
132
+
133
+ #[test]
134
+ fn test_plain_br_becomes_newline() {
135
+ let html = "<p>Line one<br>Line two</p>";
136
+ let result = convert(html, Some(plain_options())).unwrap();
137
+ assert!(
138
+ result.contains("Line one\nLine two"),
139
+ "Expected newline from <br>, got: {result}"
140
+ );
141
+ }
142
+
143
+ #[test]
144
+ fn test_plain_hr_becomes_blank_line() {
145
+ let html = "<p>Above</p><hr><p>Below</p>";
146
+ let result = convert(html, Some(plain_options())).unwrap();
147
+ assert!(result.contains("Above"));
148
+ assert!(result.contains("Below"));
149
+ // Should have blank line between
150
+ assert!(result.contains("\n\n"), "Expected blank line from <hr>, got: {result}");
151
+ }
152
+
153
+ #[test]
154
+ fn test_plain_nested_inline_formatting_stripped() {
155
+ let html = "<p>Start <strong>bold <em>and italic</em></strong> end</p>";
156
+ let result = convert(html, Some(plain_options())).unwrap();
157
+ assert_eq!(result, "Start bold and italic end\n");
158
+ }
159
+
160
+ #[test]
161
+ fn test_plain_heading_no_markers() {
162
+ let html = "<h1>Title</h1><p>Content</p>";
163
+ let result = convert(html, Some(plain_options())).unwrap();
164
+ assert!(
165
+ !result.contains('#'),
166
+ "Plain text should not contain heading markers, got: {result}"
167
+ );
168
+ assert!(result.contains("Title"));
169
+ assert!(result.contains("Content"));
170
+ }
171
+
172
+ #[test]
173
+ fn test_plain_parse_variants() {
174
+ assert_eq!(OutputFormat::parse("plain"), OutputFormat::Plain);
175
+ assert_eq!(OutputFormat::parse("plaintext"), OutputFormat::Plain);
176
+ assert_eq!(OutputFormat::parse("text"), OutputFormat::Plain);
177
+ assert_eq!(OutputFormat::parse("Plain"), OutputFormat::Plain);
178
+ assert_eq!(OutputFormat::parse("PLAINTEXT"), OutputFormat::Plain);
179
+ }
180
+
181
+ #[test]
182
+ fn test_plain_empty_input() {
183
+ let html = "";
184
+ let result = convert(html, Some(plain_options())).unwrap();
185
+ assert_eq!(result, "");
186
+ }
187
+
188
+ #[test]
189
+ fn test_plain_whitespace_only_html() {
190
+ let html = "<p> </p>";
191
+ let result = convert(html, Some(plain_options())).unwrap();
192
+ assert_eq!(result, "");
193
+ }
194
+
195
+ #[test]
196
+ fn test_plain_inline_code_no_backticks() {
197
+ let html = "<p>Use <code>fmt.Println</code> to print</p>";
198
+ let result = convert(html, Some(plain_options())).unwrap();
199
+ assert!(
200
+ !result.contains('`'),
201
+ "Plain text should not contain backticks, got: {result}"
202
+ );
203
+ assert!(result.contains("fmt.Println"));
204
+ }
205
+
206
+ #[test]
207
+ fn test_plain_pre_preserves_whitespace() {
208
+ let html = "<pre> indented\n more</pre>";
209
+ let result = convert(html, Some(plain_options())).unwrap();
210
+ assert!(
211
+ result.contains(" indented\n more"),
212
+ "Pre blocks should preserve whitespace, got: {result}"
213
+ );
214
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.26.3
4
+ version: 2.27.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-28 00:00:00.000000000 Z
11
+ date: 2026-03-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -1852,6 +1852,7 @@ files:
1852
1852
  - rust-vendor/html-to-markdown-rs/src/converter/media/svg.rs
1853
1853
  - rust-vendor/html-to-markdown-rs/src/converter/metadata.rs
1854
1854
  - rust-vendor/html-to-markdown-rs/src/converter/mod.rs
1855
+ - rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs
1855
1856
  - rust-vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs
1856
1857
  - rust-vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs
1857
1858
  - rust-vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs
@@ -1949,6 +1950,7 @@ files:
1949
1950
  - rust-vendor/html-to-markdown-rs/tests/issue_212_regressions.rs
1950
1951
  - rust-vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs
1951
1952
  - rust-vendor/html-to-markdown-rs/tests/lists_test.rs
1953
+ - rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs
1952
1954
  - rust-vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
1953
1955
  - rust-vendor/html-to-markdown-rs/tests/skip_images_test.rs
1954
1956
  - rust-vendor/html-to-markdown-rs/tests/tables_test.rs