RubyGems - html-to-markdown - Versions diffs - 2.26.3 → 2.27.0 - Mend

html-to-markdown 2.26.3 → 2.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +17 -1
data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
data/ext/html-to-markdown-rb/native/src/options.rs +1 -0
data/lib/html_to_markdown/version.rb +1 -1
data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +14 -13
data/rust-vendor/html-to-markdown-rs/src/converter/main.rs +8 -0
data/rust-vendor/html-to-markdown-rs/src/converter/mod.rs +1 -0
data/rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs +265 -0
data/rust-vendor/html-to-markdown-rs/src/options/conversion.rs +2 -2
data/rust-vendor/html-to-markdown-rs/src/options/validation.rs +4 -0
data/rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs +214 -0
metadata +4 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 23d0242cd4fc575d8081e675fb8d16f09faa7fb1c6c0df9b18d21338c0391880
-  data.tar.gz: cf86724440a34a26e1f17c134a232b1b321edaacb95758e42f9eab59dc710f8b
+  metadata.gz: b359606f2fac17cda3721fd381e8717c5f32ad6b9cbbe7d3f691078521071c5e
+  data.tar.gz: 13bb3c8ba29a9270bd91d32bb1fe50c3353895cdd1bfd920ea9c0f79c52fbe9c
 SHA512:
-  metadata.gz: 63afe8bdf9d36f4cc225859e3a7ebb62452e97feafccc5ea1e20564a47b7e037900b6af7300508eec4313d4306608e3b00bc5f6a3115001ee250d5c560880bb6
-  data.tar.gz: c8fcaa6e61fea4325b08ce39ebf0a2bd92ff4e6d58702497e38cc6a081c1eec3de095d72986ccdc828ee6e219697d53c12483087ee563d39f767fe989d72ffdb
+  metadata.gz: f0aea92dccbf209b90476ecabccd195252ae48b2e5aad1bdd54183b1e1686e8142a7f817d54a6513e816cf1038ba20ea8eebaffe45086ad7a05648e9314799a8
+  data.tar.gz: ed60ef47e31437ea2f459addd4d871d19f55fc7a121bfcf592535811c85441a5244b52d4d4f87f70caeeb368913c44a3c091aa900c631b074b87bb8c3c81d2f3

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    html-to-markdown (2.26.3)
+    html-to-markdown (2.27.0)
       rb_sys (>= 0.9, < 1.0)
 GEM
@@ -172,7 +172,7 @@ CHECKSUMS
   ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
   ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
   fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
-  html-to-markdown (2.26.3)
+  html-to-markdown (2.27.0)
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
   json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702

data/README.md CHANGED Viewed

@@ -144,7 +144,7 @@ Extract base64-encoded inline images with metadata.
 - `wrap_width`: Wrap at column — default: `80`
 - `code_language`: Default fenced code block language — default: none
 - `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
-- `output_format`: Output markup format (`"markdown"` | `"djot"`) — default: `"markdown"`
+- `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
 **`MetadataConfig`** – Selective metadata extraction:
 - `extract_headers`: h1-h6 elements — default: `true`
@@ -191,6 +191,22 @@ djot = HtmlToMarkdown.convert(html, output_format: 'djot')
 Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
+## Plain Text Output
+Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
+```ruby
+require 'html_to_markdown'
+html = "<h1>Title</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
+plain = HtmlToMarkdown.convert(html, output_format: 'plain')
+# Result: "Title\n\nThis is bold and italic text.\n"
+```
+Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
 ## Metadata Extraction

data/ext/html-to-markdown-rb/native/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "html-to-markdown-rb"
-version ="2.26.3"
+version ="2.27.0"
 edition = "2024"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
 license = "MIT"

data/ext/html-to-markdown-rb/native/src/options.rs CHANGED Viewed

@@ -65,6 +65,7 @@ pub fn parse_output_format(value: Value) -> Result<OutputFormat, Error> {
     match symbol_to_string(value)?.as_str() {
         "markdown" => Ok(OutputFormat::Markdown),
         "djot" => Ok(OutputFormat::Djot),
+        "plain" => Ok(OutputFormat::Plain),
         other => Err(arg_error(format!("invalid output_format: {other}"))),
     }
 }

data/lib/html_to_markdown/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module HtmlToMarkdown
-  VERSION = '2.26.3'
+  VERSION = '2.27.0'
 end

data/rust-vendor/html-to-markdown-rs/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "html-to-markdown-rs"
-version = "2.26.3"
+version = "2.27.0"
 edition = "2024"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
 license = "MIT"

data/rust-vendor/html-to-markdown-rs/src/convert_api.rs CHANGED Viewed

@@ -562,19 +562,20 @@ fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
         Cow::Borrowed(trimmed)
     };
-    let escaped =
-        if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
-            text::escape(
-                normalized.as_ref(),
-                options.escape_misc,
-                options.escape_asterisks,
-                options.escape_underscores,
-                options.escape_ascii,
-            )
-            .into_owned()
-        } else {
-            normalized.into_owned()
-        };
+    let escaped = if options.output_format == crate::options::OutputFormat::Plain {
+        normalized.into_owned()
+    } else if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
+        text::escape(
+            normalized.as_ref(),
+            options.escape_misc,
+            options.escape_asterisks,
+            options.escape_underscores,
+            options.escape_ascii,
+        )
+        .into_owned()
+    } else {
+        normalized.into_owned()
+    };
     let mut output = String::with_capacity(escaped.len() + 1);
     output.push_str(&escaped);

data/rust-vendor/html-to-markdown-rs/src/converter/main.rs CHANGED Viewed

@@ -18,11 +18,13 @@ use crate::converter::main_helpers::{
     extract_head_metadata, format_metadata_frontmatter, handle_hocr_document, has_custom_element_tags,
     repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace,
 };
+use crate::converter::plain_text::extract_plain_text;
 use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing};
 use crate::converter::utility::caching::build_dom_context;
 use crate::converter::utility::content::normalized_tag_name;
 use crate::converter::utility::preprocessing::{preprocess_html, strip_script_and_style_tags};
 use crate::converter::utility::serialization::serialize_tag_to_html;
+use crate::options::OutputFormat;
 use crate::converter::handlers::{handle_blockquote, handle_code, handle_graphic, handle_img, handle_link, handle_pre};
 use crate::error::Result;
@@ -134,6 +136,12 @@ pub(crate) fn convert_html_impl(
         }
     }
+    // Fast path for plain text output: skip the full conversion pipeline
+    if options.output_format == OutputFormat::Plain {
+        let plain = extract_plain_text(&dom, parser, options);
+        return Ok(plain);
+    }
     let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
     #[cfg(feature = "metadata")]
     let wants_document = metadata_collector

data/rust-vendor/html-to-markdown-rs/src/converter/mod.rs CHANGED Viewed

@@ -102,6 +102,7 @@ pub mod main;
 mod main_helpers;
 pub mod media;
 mod metadata;
+pub mod plain_text;
 pub mod preprocessing_helpers;
 pub mod semantic;
 pub mod text;

data/rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs ADDED Viewed

@@ -0,0 +1,265 @@
+//! Plain text extraction from parsed HTML DOM.
+//!
+//! Provides a fast-path text extractor that walks the DOM tree collecting only
+//! visible text content with structural whitespace, bypassing the full
+//! Markdown/Djot conversion pipeline.
+use crate::options::ConversionOptions;
+use crate::text;
+/// Tags whose content should be skipped entirely.
+///
+/// Entries are lowercase: `walk_plain` lowercases each tag name before the
+/// lookup and returns without visiting the element's children on a match.
+const SKIP_TAGS: &[&str] = &["script", "style", "head", "template", "noscript", "svg", "math"];
+/// Block-level tags that should be separated by blank lines.
+///
+/// `br`, `hr`, `pre`, `img`, `table` and `li` are deliberately absent:
+/// `walk_plain` matches those names explicitly before it consults this list.
+const BLOCK_TAGS: &[&str] = &[
+    "p",
+    "div",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "blockquote",
+    "section",
+    "article",
+    "aside",
+    "main",
+    "nav",
+    "header",
+    "footer",
+    "figure",
+    "figcaption",
+    "details",
+    "summary",
+    "address",
+    "hgroup",
+    "search",
+];
+/// Extract plain text from a parsed DOM tree.
+///
+/// Walks the tree collecting visible text with structural whitespace:
+/// - Block elements get blank-line separation
+/// - `<br>` becomes a newline, `<hr>` a blank line
+/// - `<pre>` content is appended without whitespace normalization (the final
+///   post-processing pass still trims line-end whitespace and collapses long
+///   newline runs over the whole buffer, `<pre>` content included)
+/// - `<img>` outputs alt text (unless `skip_images` is set)
+/// - `<script>`, `<style>`, `<head>`, `<template>`, `<noscript>`, `<svg>` and
+///   `<math>` are skipped (see `SKIP_TAGS`)
+/// - Tables: cells separated by tab, rows by newline
+/// - Inline elements are recursed without markers
+///
+/// Returns the collected text after `post_process`, i.e. either an empty
+/// string or text terminated by exactly one `\n`.
+pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
+    // Initial capacity is only a starting guess; the buffer grows as needed.
+    let mut buf = String::with_capacity(1024);
+    for child_handle in dom.children() {
+        // Top-level nodes are never inside a <pre>, hence `in_pre = false`.
+        walk_plain(child_handle, parser, &mut buf, options, false);
+    }
+    post_process(&mut buf);
+    buf
+}
+/// Recursive plain-text walker.
+///
+/// Appends the visible text of `node_handle` (and, for tags, of its
+/// descendants) to `buf`.
+///
+/// - Text nodes are entity-decoded; outside `<pre>` their whitespace is
+///   normalized, and a text node that normalizes to a single space is dropped
+///   when the buffer is empty or already ends with a newline.
+/// - Tags listed in `SKIP_TAGS` contribute nothing.
+/// - `br`, `hr`, `pre`, `img`, `table` and `li` have dedicated handling; tags in
+///   `BLOCK_TAGS` are wrapped in blank lines; every other tag is transparent.
+/// - Comments are ignored.
+///
+/// `in_pre` is `true` while walking inside a `<pre>` element. Handles that do
+/// not resolve through `parser` are silently skipped.
+fn walk_plain(
+    node_handle: &tl::NodeHandle,
+    parser: &tl::Parser,
+    buf: &mut String,
+    options: &ConversionOptions,
+    in_pre: bool,
+) {
+    let Some(node) = node_handle.get(parser) else {
+        return;
+    };
+    match node {
+        tl::Node::Raw(bytes) => {
+            let raw = bytes.as_utf8_str();
+            let decoded = text::decode_html_entities_cow(raw.as_ref());
+            if in_pre {
+                buf.push_str(&decoded);
+            } else {
+                let normalized = text::normalize_whitespace_cow(&decoded);
+                if !normalized.is_empty() {
+                    // Avoid a leading space at the very start of the output or
+                    // at the start of a new line. Treating the empty buffer as
+                    // a line start keeps a whitespace-only node from seeding
+                    // the buffer and later turning into leading blank lines.
+                    let at_line_start = buf.is_empty() || buf.ends_with('\n');
+                    if normalized.as_ref() == " " && at_line_start {
+                        return;
+                    }
+                    buf.push_str(&normalized);
+                }
+            }
+        }
+        tl::Node::Tag(tag) => {
+            let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
+            let tag_str = tag_name.as_str();
+            // Skip invisible content
+            if SKIP_TAGS.contains(&tag_str) {
+                return;
+            }
+            match tag_str {
+                "br" => {
+                    buf.push('\n');
+                }
+                "hr" => {
+                    ensure_blank_line(buf);
+                }
+                "pre" => {
+                    ensure_blank_line(buf);
+                    walk_children(tag, parser, buf, options, true);
+                    ensure_blank_line(buf);
+                }
+                "img" => {
+                    if !options.skip_images {
+                        if let Some(Some(alt)) = tag.attributes().get("alt") {
+                            let alt_text = alt.as_utf8_str();
+                            if !alt_text.is_empty() {
+                                // Attribute values are raw like text nodes, so
+                                // decode entities here too; otherwise
+                                // `alt="Tom &amp; Jerry"` leaks `&amp;` into
+                                // the plain-text output.
+                                let decoded_alt = text::decode_html_entities_cow(alt_text.as_ref());
+                                buf.push_str(&decoded_alt);
+                            }
+                        }
+                    }
+                }
+                "table" => {
+                    ensure_blank_line(buf);
+                    walk_table(tag, parser, buf, options);
+                    ensure_blank_line(buf);
+                }
+                "li" => {
+                    // List items sit on their own line but are not separated
+                    // by blank lines.
+                    ensure_newline(buf);
+                    walk_children(tag, parser, buf, options, false);
+                    ensure_newline(buf);
+                }
+                _ if BLOCK_TAGS.contains(&tag_str) => {
+                    ensure_blank_line(buf);
+                    walk_children(tag, parser, buf, options, in_pre);
+                    ensure_blank_line(buf);
+                }
+                _ => {
+                    // Inline elements and structural containers (html, body, ul, ol, etc.)
+                    walk_children(tag, parser, buf, options, in_pre);
+                }
+            }
+        }
+        tl::Node::Comment(_) => {}
+    }
+}
+/// Visit every direct child of `tag` in document order.
+///
+/// Each child is handed to `walk_plain` with the same `buf`, `options` and
+/// `in_pre` flag that were passed in.
+fn walk_children(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions, in_pre: bool) {
+    tag.children()
+        .top()
+        .iter()
+        .for_each(|handle| walk_plain(handle, parser, buf, options, in_pre));
+}
+/// Render a `<table>` element as tab-separated cells and newline-separated rows.
+///
+/// Rows are every `<tr>` found by `collect_descendant_handles`; cells are the
+/// direct `<th>`/`<td>` children of each row. Each cell is rendered into its
+/// own scratch buffer and trimmed before being appended to `buf`.
+fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions) {
+    let mut rows = Vec::new();
+    collect_descendant_handles(table_tag, parser, "tr", &mut rows);
+    for (row_no, row) in rows.iter().enumerate() {
+        // The row separator is written before the handle is resolved, so an
+        // unresolvable row still contributes its newline.
+        if row_no != 0 {
+            buf.push('\n');
+        }
+        let row_tag = match row.get(parser) {
+            Some(tl::Node::Tag(found)) => found,
+            _ => continue,
+        };
+        let row_kids = row_tag.children();
+        // Keep only the direct children that are header or data cells.
+        let cells = row_kids.top().iter().filter_map(|kid| match kid.get(parser) {
+            Some(tl::Node::Tag(cell)) => {
+                let cell_name = cell.name().as_utf8_str();
+                let is_cell = cell_name.eq_ignore_ascii_case("th") || cell_name.eq_ignore_ascii_case("td");
+                is_cell.then_some(cell)
+            }
+            _ => None,
+        });
+        for (col_no, cell) in cells.enumerate() {
+            if col_no != 0 {
+                buf.push('\t');
+            }
+            let mut scratch = String::new();
+            walk_children(cell, parser, &mut scratch, options, false);
+            buf.push_str(scratch.trim());
+        }
+    }
+}
+/// Gather the handles of all descendants of `tag` named `target_tag`.
+///
+/// The name comparison ignores ASCII case. Handles are copied into `result`
+/// in document order. A matching element is recorded but not descended into,
+/// so matches nested inside another match are not collected by this call.
+fn collect_descendant_handles(
+    tag: &tl::HTMLTag,
+    parser: &tl::Parser,
+    target_tag: &str,
+    result: &mut Vec<tl::NodeHandle>,
+) {
+    for handle in tag.children().top().iter() {
+        // Text and comment nodes, and handles that fail to resolve, are ignored.
+        let Some(tl::Node::Tag(inner)) = handle.get(parser) else {
+            continue;
+        };
+        if inner.name().as_utf8_str().eq_ignore_ascii_case(target_tag) {
+            result.push(*handle);
+            continue;
+        }
+        collect_descendant_handles(inner, parser, target_tag, result);
+    }
+}
+/// Ensure the buffer ends with a blank line (two newlines).
+///
+/// Trailing spaces and tabs are removed first. If the buffer is empty after
+/// that, nothing is appended, so the output never starts with blank lines just
+/// because only whitespace had been collected so far. Otherwise newlines are
+/// appended until the buffer ends with at least two; buffers that already end
+/// with two or more newlines are left unchanged.
+fn ensure_blank_line(buf: &mut String) {
+    // Strip trailing horizontal whitespace before the emptiness check, so a
+    // whitespace-only buffer is treated exactly like an empty one.
+    let kept = buf.trim_end_matches(|c: char| c == ' ' || c == '\t').len();
+    buf.truncate(kept);
+    if buf.is_empty() {
+        return;
+    }
+    // '\n' is a single byte in UTF-8, so counting bytes from the end is safe.
+    let trailing_newlines = buf.bytes().rev().take_while(|&b| b == b'\n').count();
+    for _ in trailing_newlines..2 {
+        buf.push('\n');
+    }
+}
+/// Terminate the current line if there is one.
+///
+/// An empty buffer and a buffer that already ends in `\n` are left untouched;
+/// in every other case a single `\n` is appended.
+fn ensure_newline(buf: &mut String) {
+    match buf.chars().next_back() {
+        None | Some('\n') => {}
+        Some(_) => buf.push('\n'),
+    }
+}
+/// Post-process: trim line-end whitespace, collapse 3+ newlines to 2, ensure single trailing newline.
+///
+/// Works in a single pass over the lines of `buf`:
+/// - trailing whitespace is removed from every line;
+/// - a run of newlines between two lines that still have content after
+///   trimming is capped at two. Trimming happens first, so whitespace-only
+///   lines count as part of the run instead of surviving as extra blank lines;
+/// - blank lines at the end are dropped and, if any content remains, exactly
+///   one `\n` terminates the result. A buffer with no content becomes empty.
+fn post_process(buf: &mut String) {
+    let mut cleaned = String::with_capacity(buf.len() + 1);
+    // Newlines seen since the last line that still had content after trimming.
+    let mut pending_newlines = 0usize;
+    for (idx, line) in buf.lines().enumerate() {
+        if idx > 0 {
+            pending_newlines += 1;
+        }
+        let trimmed = line.trim_end();
+        if trimmed.is_empty() {
+            // Blank line: only remembered through the pending counter.
+            continue;
+        }
+        for _ in 0..pending_newlines.min(2) {
+            cleaned.push('\n');
+        }
+        pending_newlines = 0;
+        cleaned.push_str(trimmed);
+    }
+    // Newlines are only flushed ahead of content, so a non-empty result always
+    // ends with content here and needs its single terminating newline.
+    if !cleaned.is_empty() {
+        cleaned.push('\n');
+    }
+    *buf = cleaned;
+}

data/rust-vendor/html-to-markdown-rs/src/options/conversion.rs CHANGED Viewed

@@ -121,7 +121,7 @@ pub struct ConversionOptions {
     /// Useful for text-only extraction or filtering out visual content.
     pub skip_images: bool,
-    /// Output format for conversion (Markdown or Djot)
+    /// Output format for conversion (Markdown, Djot, or Plain)
     pub output_format: OutputFormat,
 }
@@ -233,7 +233,7 @@ pub struct ConversionOptionsUpdate {
     /// Optional skip images override
     pub skip_images: Option<bool>,
-    /// Optional output format override (Markdown or Djot)
+    /// Optional output format override (Markdown, Djot, or Plain)
     pub output_format: Option<OutputFormat>,
 }

data/rust-vendor/html-to-markdown-rs/src/options/validation.rs CHANGED Viewed

@@ -182,6 +182,8 @@ pub enum OutputFormat {
     Markdown,
     /// Djot lightweight markup language.
     Djot,
+    /// Plain text output (no markup, visible text only).
+    Plain,
 }
 impl OutputFormat {
@@ -193,6 +195,7 @@ impl OutputFormat {
     pub fn parse(value: &str) -> Self {
         match normalize_token(value).as_str() {
             "djot" => Self::Djot,
+            "plain" | "plaintext" | "text" => Self::Plain,
             _ => Self::Markdown,
         }
     }
@@ -329,6 +332,7 @@ mod serde_impls {
             let s = match self {
                 Self::Markdown => "markdown",
                 Self::Djot => "djot",
+                Self::Plain => "plain",
             };
             serializer.serialize_str(s)
         }

data/rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs ADDED Viewed

@@ -0,0 +1,214 @@
+//! Tests for plain text output format support.
+use html_to_markdown_rs::{ConversionOptions, OutputFormat, convert};
+/// Default conversion options with only `output_format` switched to `Plain`.
+fn plain_options() -> ConversionOptions {
+    ConversionOptions {
+        output_format: OutputFormat::Plain,
+        ..Default::default()
+    }
+}
+// A single paragraph yields its text followed by exactly one newline.
+#[test]
+fn test_plain_basic_paragraph() {
+    let html = "<p>Hello world</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "Hello world\n");
+}
+// `<strong>` content stays inline and no emphasis markers are emitted.
+#[test]
+fn test_plain_no_strong_markers() {
+    let html = "<p>This is <strong>bold</strong> text</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "This is bold text\n");
+}
+// `<em>` content stays inline and no emphasis markers are emitted.
+#[test]
+fn test_plain_no_emphasis_markers() {
+    let html = "<p>This is <em>italic</em> text</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "This is italic text\n");
+}
+// Only the anchor text of a link is kept; the `href` does not appear.
+#[test]
+fn test_plain_link_text_only() {
+    let html = r#"<p>Visit <a href="https://example.com">our site</a> today</p>"#;
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "Visit our site today\n");
+}
+// With default options an `<img>` is represented by its `alt` text.
+#[test]
+fn test_plain_image_alt_text() {
+    let html = r#"<img alt="A cute cat">"#;
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "A cute cat\n");
+}
+// With `skip_images` set the image contributes nothing, so the output is empty.
+#[test]
+fn test_plain_image_skipped_when_option_set() {
+    let html = r#"<img alt="A cute cat">"#;
+    let mut opts = plain_options();
+    opts.skip_images = true;
+    let result = convert(html, Some(opts)).unwrap();
+    assert_eq!(result, "");
+}
+// `<pre><code>` content is emitted verbatim, without fences or indentation.
+#[test]
+fn test_plain_code_block() {
+    let html = "<pre><code>fn main() {}</code></pre>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "fn main() {}\n");
+}
+// A blockquote keeps its text but must not gain a `>` quote prefix.
+#[test]
+fn test_plain_blockquote_no_prefix() {
+    let html = "<blockquote><p>Quoted text</p></blockquote>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains('>'),
+        "Plain text should not contain blockquote prefix, got: {result}"
+    );
+    assert!(result.contains("Quoted text"));
+}
+// Every list item's text is present and the output has at least three
+// non-empty lines.
+// NOTE(review): the line count does not check that each item sits on its own
+// line in the original order.
+#[test]
+fn test_plain_list_items_on_separate_lines() {
+    let html = "<ul><li>First</li><li>Second</li><li>Third</li></ul>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(result.contains("First"));
+    assert!(result.contains("Second"));
+    assert!(result.contains("Third"));
+    // Items should be on separate lines
+    let lines: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
+    assert!(lines.len() >= 3, "Expected at least 3 lines, got: {result}");
+}
+// All four cell texts of a 2x2 table appear in the output.
+// NOTE(review): only the presence of the cell text is checked; the tab
+// separator between cells and the newline between rows are not asserted.
+#[test]
+fn test_plain_table_cells_extracted() {
+    let html = "<table><tr><td>A</td><td>B</td></tr><tr><td>C</td><td>D</td></tr></table>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(result.contains('A'));
+    assert!(result.contains('B'));
+    assert!(result.contains('C'));
+    assert!(result.contains('D'));
+}
+// Characters that are special in Markdown (here `*`) pass through unescaped.
+#[test]
+fn test_plain_no_escaping() {
+    let html = "<p>* not a list</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        result.contains("* not a list"),
+        "Plain text should not escape asterisks, got: {result}"
+    );
+    assert!(
+        !result.contains("\\*"),
+        "Plain text should not backslash-escape, got: {result}"
+    );
+}
+// `<script>` content is dropped while the surrounding paragraphs survive.
+#[test]
+fn test_plain_script_excluded() {
+    let html = "<p>Before</p><script>alert('xss')</script><p>After</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains("alert"),
+        "Script content should be excluded, got: {result}"
+    );
+    assert!(result.contains("Before"));
+    assert!(result.contains("After"));
+}
+// `<style>` content is dropped while the paragraph text survives.
+#[test]
+fn test_plain_style_excluded() {
+    let html = "<p>Hello</p><style>.foo { color: red; }</style>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains("color"),
+        "Style content should be excluded, got: {result}"
+    );
+    assert!(result.contains("Hello"));
+}
+// A `<br>` inside a paragraph separates the two halves by a single newline.
+#[test]
+fn test_plain_br_becomes_newline() {
+    let html = "<p>Line one<br>Line two</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        result.contains("Line one\nLine two"),
+        "Expected newline from <br>, got: {result}"
+    );
+}
+// The text on both sides of an `<hr>` is kept and a blank line is present.
+// NOTE(review): the two `<p>` elements are presumably already separated by a
+// blank line on their own, so the `\n\n` check may pass even if `<hr>`
+// contributed nothing — confirm, and consider an input where `<hr>` is the
+// only separator.
+#[test]
+fn test_plain_hr_becomes_blank_line() {
+    let html = "<p>Above</p><hr><p>Below</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(result.contains("Above"));
+    assert!(result.contains("Below"));
+    // Should have blank line between
+    assert!(result.contains("\n\n"), "Expected blank line from <hr>, got: {result}");
+}
+// Nested inline formatting (`<strong>` around `<em>`) collapses to plain text.
+#[test]
+fn test_plain_nested_inline_formatting_stripped() {
+    let html = "<p>Start <strong>bold <em>and italic</em></strong> end</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "Start bold and italic end\n");
+}
+// Headings keep their text but must not gain `#` markers.
+#[test]
+fn test_plain_heading_no_markers() {
+    let html = "<h1>Title</h1><p>Content</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains('#'),
+        "Plain text should not contain heading markers, got: {result}"
+    );
+    assert!(result.contains("Title"));
+    assert!(result.contains("Content"));
+}
+// `OutputFormat::parse` maps "plain", "plaintext" and "text" to `Plain`; the
+// last two assertions check capitalised and upper-case spellings.
+#[test]
+fn test_plain_parse_variants() {
+    assert_eq!(OutputFormat::parse("plain"), OutputFormat::Plain);
+    assert_eq!(OutputFormat::parse("plaintext"), OutputFormat::Plain);
+    assert_eq!(OutputFormat::parse("text"), OutputFormat::Plain);
+    assert_eq!(OutputFormat::parse("Plain"), OutputFormat::Plain);
+    assert_eq!(OutputFormat::parse("PLAINTEXT"), OutputFormat::Plain);
+}
+// Empty input yields an empty string, not a lone newline.
+#[test]
+fn test_plain_empty_input() {
+    let html = "";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "");
+}
+// A paragraph containing only spaces yields an empty string.
+#[test]
+fn test_plain_whitespace_only_html() {
+    let html = "<p>   </p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "");
+}
+// Inline `<code>` keeps its text but must not be wrapped in backticks.
+#[test]
+fn test_plain_inline_code_no_backticks() {
+    let html = "<p>Use <code>fmt.Println</code> to print</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains('`'),
+        "Plain text should not contain backticks, got: {result}"
+    );
+    assert!(result.contains("fmt.Println"));
+}
+// Leading indentation and the line break inside `<pre>` survive unchanged.
+#[test]
+fn test_plain_pre_preserves_whitespace() {
+    let html = "<pre>  indented\n    more</pre>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        result.contains("  indented\n    more"),
+        "Pre blocks should preserve whitespace, got: {result}"
+    );
+}

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html-to-markdown
 version: !ruby/object:Gem::Version
-  version: 2.26.3
+  version: 2.27.0
 platform: ruby
 authors:
 - Na'aman Hirschfeld
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-02-28 00:00:00.000000000 Z
+date: 2026-03-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -1852,6 +1852,7 @@ files:
 - rust-vendor/html-to-markdown-rs/src/converter/media/svg.rs
 - rust-vendor/html-to-markdown-rs/src/converter/metadata.rs
 - rust-vendor/html-to-markdown-rs/src/converter/mod.rs
+- rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs
 - rust-vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs
 - rust-vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs
 - rust-vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs
@@ -1949,6 +1950,7 @@ files:
 - rust-vendor/html-to-markdown-rs/tests/issue_212_regressions.rs
 - rust-vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs
 - rust-vendor/html-to-markdown-rs/tests/lists_test.rs
+- rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs
 - rust-vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
 - rust-vendor/html-to-markdown-rs/tests/skip_images_test.rs
 - rust-vendor/html-to-markdown-rs/tests/tables_test.rs