RubyGems - html-to-markdown - Versions diffs - 2.26.2 → 2.27.0 - Mend

html-to-markdown 2.26.2 → 2.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +4 -4
data/Gemfile.lock +24 -10
data/README.md +17 -1
data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
data/ext/html-to-markdown-rb/native/src/options.rs +1 -0
data/lib/html_to_markdown/version.rb +1 -1
data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +14 -13
data/rust-vendor/html-to-markdown-rs/src/converter/inline/semantic/typography.rs +4 -0
data/rust-vendor/html-to-markdown-rs/src/converter/main.rs +8 -0
data/rust-vendor/html-to-markdown-rs/src/converter/mod.rs +1 -0
data/rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs +265 -0
data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs +6 -0
data/rust-vendor/html-to-markdown-rs/src/options/conversion.rs +2 -2
data/rust-vendor/html-to-markdown-rs/src/options/validation.rs +4 -0
data/rust-vendor/html-to-markdown-rs/tests/integration_test.rs +60 -0
data/rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs +214 -0
data/spec/visitor_spec.rb +1 -1
metadata +4 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 12d5559bda903dfbeb563dba48c733845621b2c67cbca597fb4111c094f33fe0
-  data.tar.gz: 3ef2fdc3b30051c1eec6a956c42465f3d31670f81f345b67d98a84935f77e02c
+  metadata.gz: b359606f2fac17cda3721fd381e8717c5f32ad6b9cbbe7d3f691078521071c5e
+  data.tar.gz: 13bb3c8ba29a9270bd91d32bb1fe50c3353895cdd1bfd920ea9c0f79c52fbe9c
 SHA512:
-  metadata.gz: dee90b55391d5f84466c2a2d3591a7d3565ebc88357118b0725d57ad2c06cc5e9f965a93f4ad6111bab89c589552cc0f3ecfb21701ae4f0e18b9b9d55e0aa3ef
-  data.tar.gz: 7b0927d2fa482712bdfac03152a19375e1744e3e22b0121d7567967bc2fa215396d8e292f4d2974962477b839a644e96b01d828687a8fd520cc22171b3a83908
+  metadata.gz: f0aea92dccbf209b90476ecabccd195252ae48b2e5aad1bdd54183b1e1686e8142a7f817d54a6513e816cf1038ba20ea8eebaffe45086ad7a05648e9314799a8
+  data.tar.gz: ed60ef47e31437ea2f459addd4d871d19f55fc7a121bfcf592535811c85441a5244b52d4d4f87f70caeeb368913c44a3c091aa900c631b074b87bb8c3c81d2f3

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    html-to-markdown (2.26.2)
+    html-to-markdown (2.27.0)
       rb_sys (>= 0.9, < 1.0)
 GEM
@@ -20,6 +20,8 @@ GEM
       securerandom (>= 0.3)
       tzinfo (~> 2.0, >= 2.0.5)
       uri (>= 0.13.1)
+    addressable (2.8.9)
+      public_suffix (>= 2.0.2, < 8.0)
     ast (2.4.3)
     base64 (0.3.0)
     bigdecimal (4.0.1)
@@ -37,6 +39,9 @@ GEM
     i18n (1.14.8)
       concurrent-ruby (~> 1.0)
     json (2.18.1)
+    json-schema (6.1.0)
+      addressable (~> 2.8)
+      bigdecimal (>= 3.1, < 5)
     language_server-protocol (3.17.0.5)
     lint_roller (1.1.0)
     listen (3.10.0)
@@ -44,14 +49,18 @@ GEM
       rb-fsevent (~> 0.10, >= 0.10.3)
       rb-inotify (~> 0.9, >= 0.9.10)
     logger (1.7.0)
-    minitest (6.0.1)
+    mcp (0.7.1)
+      json-schema (>= 4.1)
+    minitest (6.0.2)
+      drb (~> 2.0)
       prism (~> 1.5)
     mutex_m (0.3.0)
     parallel (1.27.0)
-    parser (3.3.10.1)
+    parser (3.3.10.2)
       ast (~> 2.4.1)
       racc
     prism (1.9.0)
+    public_suffix (7.0.2)
     racc (1.8.1)
     rainbow (3.1.1)
     rake (13.3.1)
@@ -76,14 +85,15 @@ GEM
     rspec-expectations (3.13.5)
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.13.0)
-    rspec-mocks (3.13.7)
+    rspec-mocks (3.13.8)
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.13.0)
     rspec-support (3.13.7)
-    rubocop (1.84.2)
+    rubocop (1.85.0)
       json (~> 2.3)
       language_server-protocol (~> 3.17.0.2)
       lint_roller (~> 1.1.0)
+      mcp (~> 0.6)
       parallel (~> 1.10)
       parser (>= 3.3.0.2)
       rainbow (>= 2.2.2, < 4.0)
@@ -147,6 +157,7 @@ DEPENDENCIES
 CHECKSUMS
   activesupport (8.1.2) sha256=88842578ccd0d40f658289b0e8c842acfe9af751afee2e0744a7873f50b6fdae
+  addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
   ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
   base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
   bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
@@ -161,18 +172,21 @@ CHECKSUMS
   ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
   ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
   fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
-  html-to-markdown (2.26.2)
+  html-to-markdown (2.27.0)
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
+  json-schema (6.1.0) sha256=6bf70a2cfb6dfd5a06da28093fa8190f324c88eabd36a7f47097f227321dc702
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
   logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
-  minitest (6.0.1) sha256=7854c74f48e2e975969062833adc4013f249a4b212f5e7b9d5c040bf838d54bb
+  mcp (0.7.1) sha256=fa967895d6952bad0d981ea907731d8528d2c246d2079d56a9c8bae83d14f1c7
+  minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
   mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
   parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
-  parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
+  parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
   prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
+  public_suffix (7.0.2) sha256=9114090c8e4e7135c1fd0e7acfea33afaab38101884320c65aaa0ffb8e26a857
   racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
   rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
   rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
@@ -186,9 +200,9 @@ CHECKSUMS
   rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
   rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
   rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
-  rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
+  rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
   rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
-  rubocop (1.84.2) sha256=5692cea54168f3dc8cb79a6fe95c5424b7ea893c707ad7a4307b0585e88dbf5f
+  rubocop (1.85.0) sha256=317407feb681a07d54f64d2f9e1d6b6af1ce7678e51cd658e3ad8bd66da48c01
   rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
   rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
   ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33

data/README.md CHANGED Viewed

@@ -144,7 +144,7 @@ Extract base64-encoded inline images with metadata.
 - `wrap_width`: Wrap at column — default: `80`
 - `code_language`: Default fenced code block language — default: none
 - `extract_metadata`: Embed metadata as YAML frontmatter — default: `false`
-- `output_format`: Output markup format (`"markdown"` | `"djot"`) — default: `"markdown"`
+- `output_format`: Output markup format (`"markdown"` | `"djot"` | `"plain"`) — default: `"markdown"`
 **`MetadataConfig`** – Selective metadata extraction:
 - `extract_headers`: h1-h6 elements — default: `true`
@@ -191,6 +191,22 @@ djot = HtmlToMarkdown.convert(html, output_format: 'djot')
 Djot's extended syntax allows you to express more semantic meaning in lightweight text, making it useful for documents that require strikethrough, insertion tracking, or mathematical notation.
+## Plain Text Output
+Set `output_format` to `"plain"` to strip all markup and return only visible text. This bypasses the Markdown conversion pipeline entirely for maximum speed.
+```ruby
+require 'html_to_markdown'
+html = "<h1>Title</h1><p>This is <strong>bold</strong> and <em>italic</em> text.</p>"
+plain = HtmlToMarkdown.convert(html, output_format: 'plain')
+# Result: "Title\n\nThis is bold and italic text."
+```
+Plain text mode is useful for search indexing, text extraction, and feeding content to LLMs.
 ## Metadata Extraction

data/ext/html-to-markdown-rb/native/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "html-to-markdown-rb"
-version ="2.26.2"
+version ="2.27.0"
 edition = "2024"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
 license = "MIT"

data/ext/html-to-markdown-rb/native/src/options.rs CHANGED Viewed

@@ -65,6 +65,7 @@ pub fn parse_output_format(value: Value) -> Result<OutputFormat, Error> {
     match symbol_to_string(value)?.as_str() {
         "markdown" => Ok(OutputFormat::Markdown),
         "djot" => Ok(OutputFormat::Djot),
+        "plain" => Ok(OutputFormat::Plain),
         other => Err(arg_error(format!("invalid output_format: {other}"))),
     }
 }

data/lib/html_to_markdown/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module HtmlToMarkdown
-  VERSION = '2.26.2'
+  VERSION = '2.27.0'
 end

data/rust-vendor/html-to-markdown-rs/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "html-to-markdown-rs"
-version = "2.26.2"
+version = "2.27.0"
 edition = "2024"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
 license = "MIT"

data/rust-vendor/html-to-markdown-rs/src/convert_api.rs CHANGED Viewed

@@ -562,19 +562,20 @@ fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
         Cow::Borrowed(trimmed)
     };
-    let escaped =
-        if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
-            text::escape(
-                normalized.as_ref(),
-                options.escape_misc,
-                options.escape_asterisks,
-                options.escape_underscores,
-                options.escape_ascii,
-            )
-            .into_owned()
-        } else {
-            normalized.into_owned()
-        };
+    let escaped = if options.output_format == crate::options::OutputFormat::Plain {
+        normalized.into_owned()
+    } else if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
+        text::escape(
+            normalized.as_ref(),
+            options.escape_misc,
+            options.escape_asterisks,
+            options.escape_underscores,
+            options.escape_ascii,
+        )
+        .into_owned()
+    } else {
+        normalized.into_owned()
+    };
     let mut output = String::with_capacity(escaped.len() + 1);
     output.push_str(&escaped);

data/rust-vendor/html-to-markdown-rs/src/converter/inline/semantic/typography.rs CHANGED Viewed

@@ -87,6 +87,8 @@ pub fn handle_subscript(
             } else {
                 output.push_str(&options.sub_symbol);
             }
+        } else {
+            output.push_str(trimmed);
         }
         append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
     }
@@ -139,6 +141,8 @@ pub fn handle_superscript(
             } else {
                 output.push_str(&options.sup_symbol);
             }
+        } else {
+            output.push_str(trimmed);
         }
         append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
     }

data/rust-vendor/html-to-markdown-rs/src/converter/main.rs CHANGED Viewed

@@ -18,11 +18,13 @@ use crate::converter::main_helpers::{
     extract_head_metadata, format_metadata_frontmatter, handle_hocr_document, has_custom_element_tags,
     repair_with_html5ever, trim_line_end_whitespace, trim_trailing_whitespace,
 };
+use crate::converter::plain_text::extract_plain_text;
 use crate::converter::preprocessing_helpers::{has_inline_block_misnest, should_drop_for_preprocessing};
 use crate::converter::utility::caching::build_dom_context;
 use crate::converter::utility::content::normalized_tag_name;
 use crate::converter::utility::preprocessing::{preprocess_html, strip_script_and_style_tags};
 use crate::converter::utility::serialization::serialize_tag_to_html;
+use crate::options::OutputFormat;
 use crate::converter::handlers::{handle_blockquote, handle_code, handle_graphic, handle_img, handle_link, handle_pre};
 use crate::error::Result;
@@ -134,6 +136,12 @@ pub(crate) fn convert_html_impl(
         }
     }
+    // Fast path for plain text output: skip the full conversion pipeline
+    if options.output_format == OutputFormat::Plain {
+        let plain = extract_plain_text(&dom, parser, options);
+        return Ok(plain);
+    }
     let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
     #[cfg(feature = "metadata")]
     let wants_document = metadata_collector

data/rust-vendor/html-to-markdown-rs/src/converter/mod.rs CHANGED Viewed

@@ -102,6 +102,7 @@ pub mod main;
 mod main_helpers;
 pub mod media;
 mod metadata;
+pub mod plain_text;
 pub mod preprocessing_helpers;
 pub mod semantic;
 pub mod text;

data/rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs ADDED Viewed

@@ -0,0 +1,265 @@
+//! Plain text extraction from parsed HTML DOM.
+//!
+//! Provides a fast-path text extractor that walks the DOM tree collecting only
+//! visible text content with structural whitespace, bypassing the full
+//! Markdown/Djot conversion pipeline.
+use crate::options::ConversionOptions;
+use crate::text;
+/// Tags whose content should be skipped entirely.
+///
+/// `walk_plain` compares these against the ASCII-lowercased tag name and
+/// returns before visiting any child, so nothing nested inside such an
+/// element reaches the output buffer.
+const SKIP_TAGS: &[&str] = &["script", "style", "head", "template", "noscript", "svg", "math"];
+/// Block-level tags that should be separated by blank lines.
+///
+/// `walk_plain` calls `ensure_blank_line` before and after the children of
+/// these elements. `br`, `hr`, `pre`, `img`, `table` and `li` are matched by
+/// dedicated arms before this list is consulted, which is why they are not
+/// listed here.
+const BLOCK_TAGS: &[&str] = &[
+    "p",
+    "div",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "blockquote",
+    "section",
+    "article",
+    "aside",
+    "main",
+    "nav",
+    "header",
+    "footer",
+    "figure",
+    "figcaption",
+    "details",
+    "summary",
+    "address",
+    "hgroup",
+    "search",
+];
+/// Extract plain text from a parsed DOM tree.
+///
+/// Walks the tree collecting visible text with structural whitespace:
+/// - Block elements get blank-line separation
+/// - `<br>` becomes a newline, `<hr>` a blank line
+/// - `<pre>` preserves internal whitespace (the final clean-up pass still
+///   trims line-end whitespace and collapses runs of 3+ newlines)
+/// - `<img>` outputs alt text (unless `skip_images` is set)
+/// - `<script>`, `<style>`, `<head>`, `<template>`, `<noscript>`, `<svg>`
+///   and `<math>` are skipped
+/// - `<li>` items each start on their own line
+/// - Tables: cells separated by tab, rows by newline
+/// - Inline elements are recursed without markers
+///
+/// The returned string is either empty or ends with exactly one newline.
+pub fn extract_plain_text(dom: &tl::VDom, parser: &tl::Parser, options: &ConversionOptions) -> String {
+    // Start with 1 KiB of capacity; the buffer grows on demand.
+    let mut buf = String::with_capacity(1024);
+    for child_handle in dom.children() {
+        // Top-level nodes are never inside a `<pre>`, hence `in_pre = false`.
+        walk_plain(child_handle, parser, &mut buf, options, false);
+    }
+    // Normalise blank lines and trailing whitespace over the whole buffer.
+    post_process(&mut buf);
+    buf
+}
+/// Recursive plain-text walker.
+///
+/// Appends the visible text of `node_handle` (and, for tags, of its
+/// descendants) to `buf`.
+///
+/// * `node_handle` / `parser` - the node to visit and the parser that owns
+///   it; a handle that does not resolve to a node is ignored.
+/// * `buf` - output accumulator shared by the whole walk.
+/// * `options` - `skip_images` decides whether `<img>` alt text is emitted;
+///   the value is otherwise only passed down to the children.
+/// * `in_pre` - `true` while inside a `<pre>`; text nodes are then copied
+///   after entity decoding without whitespace normalisation.
+fn walk_plain(
+    node_handle: &tl::NodeHandle,
+    parser: &tl::Parser,
+    buf: &mut String,
+    options: &ConversionOptions,
+    in_pre: bool,
+) {
+    let Some(node) = node_handle.get(parser) else {
+        return;
+    };
+    match node {
+        tl::Node::Raw(bytes) => {
+            let raw = bytes.as_utf8_str();
+            let decoded = text::decode_html_entities_cow(raw.as_ref());
+            if in_pre {
+                buf.push_str(&decoded);
+            } else {
+                let normalized = text::normalize_whitespace_cow(&decoded);
+                if !normalized.is_empty() {
+                    // Avoid leading space at start of a new line
+                    if normalized.as_ref() == " " && buf.ends_with('\n') {
+                        return;
+                    }
+                    buf.push_str(&normalized);
+                }
+            }
+        }
+        tl::Node::Tag(tag) => {
+            let tag_name = tag.name().as_utf8_str().to_ascii_lowercase();
+            let tag_str = tag_name.as_str();
+            // Skip invisible content
+            if SKIP_TAGS.contains(&tag_str) {
+                return;
+            }
+            match tag_str {
+                "br" => {
+                    buf.push('\n');
+                }
+                "hr" => {
+                    ensure_blank_line(buf);
+                }
+                "pre" => {
+                    ensure_blank_line(buf);
+                    walk_children(tag, parser, buf, options, true);
+                    ensure_blank_line(buf);
+                }
+                "img" => {
+                    if !options.skip_images {
+                        if let Some(Some(alt)) = tag.attributes().get("alt") {
+                            // Decode character references the same way text nodes
+                            // are decoded, so `alt="Tom &amp; Jerry"` is emitted
+                            // as `Tom & Jerry` rather than leaking `&amp;`.
+                            let alt_raw = alt.as_utf8_str();
+                            let alt_text = text::decode_html_entities_cow(alt_raw.as_ref());
+                            if !alt_text.is_empty() {
+                                buf.push_str(&alt_text);
+                            }
+                        }
+                    }
+                }
+                "table" => {
+                    ensure_blank_line(buf);
+                    walk_table(tag, parser, buf, options);
+                    ensure_blank_line(buf);
+                }
+                "li" => {
+                    // List items are line-separated, not blank-line-separated.
+                    ensure_newline(buf);
+                    walk_children(tag, parser, buf, options, false);
+                    ensure_newline(buf);
+                }
+                _ if BLOCK_TAGS.contains(&tag_str) => {
+                    ensure_blank_line(buf);
+                    walk_children(tag, parser, buf, options, in_pre);
+                    ensure_blank_line(buf);
+                }
+                _ => {
+                    // Inline elements and structural containers (html, body, ul, ol, etc.)
+                    walk_children(tag, parser, buf, options, in_pre);
+                }
+            }
+        }
+        tl::Node::Comment(_) => {}
+    }
+}
+/// Visit every direct child of `tag` in document order, forwarding the
+/// shared buffer, options and `in_pre` flag to `walk_plain`.
+fn walk_children(tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions, in_pre: bool) {
+    // Keep the `Children` value alive in a local while its top-level slice is iterated.
+    let kids = tag.children();
+    kids.top()
+        .iter()
+        .for_each(|kid| walk_plain(kid, parser, buf, options, in_pre));
+}
+/// Walk a `<table>` element, extracting cells as tab-separated, rows as newline-separated.
+///
+/// Rows are every `<tr>` found beneath `table_tag` (through wrappers such as
+/// `<tbody>`), in document order. Within a row only direct `<th>`/`<td>`
+/// children count as cells. No newline is written after the last row; the
+/// caller is responsible for separating the table from what follows.
+fn walk_table(table_tag: &tl::HTMLTag, parser: &tl::Parser, buf: &mut String, options: &ConversionOptions) {
+    // Collect all <tr> node handles by recursing into the table
+    let mut row_handles = Vec::new();
+    collect_descendant_handles(table_tag, parser, "tr", &mut row_handles);
+    for (row_idx, row_handle) in row_handles.iter().enumerate() {
+        // Separator goes *between* rows, so it is emitted before every row but the first.
+        if row_idx > 0 {
+            buf.push('\n');
+        }
+        let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) else {
+            continue;
+        };
+        // Collect direct <th>/<td> children
+        let mut cell_handles = Vec::new();
+        let row_children = row_tag.children();
+        let row_top = row_children.top();
+        for child in row_top.iter() {
+            if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
+                let name = child_tag.name().as_utf8_str();
+                if name.eq_ignore_ascii_case("th") || name.eq_ignore_ascii_case("td") {
+                    cell_handles.push(*child);
+                }
+            }
+        }
+        for (cell_idx, cell_handle) in cell_handles.iter().enumerate() {
+            if cell_idx > 0 {
+                buf.push('\t');
+            }
+            // Render each cell into a scratch buffer so its leading/trailing
+            // whitespace can be trimmed before it is appended to the row.
+            let mut cell_buf = String::new();
+            if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
+                walk_children(cell_tag, parser, &mut cell_buf, options, false);
+            }
+            buf.push_str(cell_buf.trim());
+        }
+    }
+}
+/// Recursively collect the `NodeHandle`s of descendants of `tag` whose name
+/// matches `target_tag` (ASCII case-insensitive), copying each handle into
+/// `result` in depth-first document order.
+///
+/// The search does not descend into an element that already matched, so
+/// matches nested inside another match are not collected by this call.
+/// Non-tag children (text, comments) are ignored.
+fn collect_descendant_handles(
+    tag: &tl::HTMLTag,
+    parser: &tl::Parser,
+    target_tag: &str,
+    result: &mut Vec<tl::NodeHandle>,
+) {
+    let children = tag.children();
+    let top = children.top();
+    for child in top.iter() {
+        if let Some(tl::Node::Tag(child_tag)) = child.get(parser) {
+            if child_tag.name().as_utf8_str().eq_ignore_ascii_case(target_tag) {
+                result.push(*child);
+            } else {
+                // Only keep searching below elements that did not match themselves.
+                collect_descendant_handles(child_tag, parser, target_tag, result);
+            }
+        }
+    }
+}
+/// Make `buf` end with a blank line, i.e. at least two `\n` characters.
+///
+/// An empty buffer is left untouched so output never starts with blank
+/// lines. Trailing spaces and tabs are dropped first; newlines already
+/// present are counted and only the missing ones are appended.
+fn ensure_blank_line(buf: &mut String) {
+    if buf.is_empty() {
+        return;
+    }
+    // Drop trailing horizontal whitespace in one step.
+    let kept = buf.trim_end_matches([' ', '\t']).len();
+    buf.truncate(kept);
+    // `\n` is a single byte in UTF-8, so counting bytes is equivalent to counting chars.
+    let trailing = buf.bytes().rev().take_while(|&b| b == b'\n').count();
+    let missing = 2usize.saturating_sub(trailing);
+    for _ in 0..missing {
+        buf.push('\n');
+    }
+}
+/// Terminate the current line: append `\n` unless `buf` is empty or already
+/// ends with one.
+fn ensure_newline(buf: &mut String) {
+    let needs_break = !buf.is_empty() && !buf.ends_with('\n');
+    if needs_break {
+        buf.push('\n');
+    }
+}
+/// Final clean-up of the accumulated text, applied in place:
+/// 1. runs of three or more newlines shrink to exactly two,
+/// 2. trailing whitespace is removed from every line,
+/// 3. the result is left empty, or ending in exactly one newline.
+fn post_process(buf: &mut String) {
+    // Repeat until stable: a single `replace` pass can leave a shorter run behind.
+    while buf.contains("\n\n\n") {
+        *buf = buf.replace("\n\n\n", "\n\n");
+    }
+    // Build the cleaned text in a separate string so `buf` is not borrowed while rewritten.
+    let cleaned = buf.lines().map(str::trim_end).collect::<Vec<_>>().join("\n");
+    let body = cleaned.trim_end_matches('\n');
+    buf.clear();
+    if !body.is_empty() {
+        buf.push_str(body);
+        buf.push('\n');
+    }
+}

data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs CHANGED Viewed

@@ -82,6 +82,12 @@ pub fn process_text_node(
             if !output.ends_with("\n\n") {
                 if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
                     if is_inline_element(next_tag) {
+                        // Newlines between inline elements collapse to a single space
+                        // in HTML rendering (per CSS white-space: normal). Preserve
+                        // this word boundary so adjacent inline content doesn't merge.
+                        if !output.ends_with(' ') && !output.ends_with('\n') {
+                            output.push(' ');
+                        }
                         return;
                     }
                 }

data/rust-vendor/html-to-markdown-rs/src/options/conversion.rs CHANGED Viewed

@@ -121,7 +121,7 @@ pub struct ConversionOptions {
     /// Useful for text-only extraction or filtering out visual content.
     pub skip_images: bool,
-    /// Output format for conversion (Markdown or Djot)
+    /// Output format for conversion (Markdown, Djot, or Plain)
     pub output_format: OutputFormat,
 }
@@ -233,7 +233,7 @@ pub struct ConversionOptionsUpdate {
     /// Optional skip images override
     pub skip_images: Option<bool>,
-    /// Optional output format override (Markdown or Djot)
+    /// Optional output format override (Markdown, Djot, or Plain)
     pub output_format: Option<OutputFormat>,
 }

data/rust-vendor/html-to-markdown-rs/src/options/validation.rs CHANGED Viewed

@@ -182,6 +182,8 @@ pub enum OutputFormat {
     Markdown,
     /// Djot lightweight markup language.
     Djot,
+    /// Plain text output (no markup, visible text only).
+    Plain,
 }
 impl OutputFormat {
@@ -193,6 +195,7 @@ impl OutputFormat {
     pub fn parse(value: &str) -> Self {
         match normalize_token(value).as_str() {
             "djot" => Self::Djot,
+            "plain" | "plaintext" | "text" => Self::Plain,
             _ => Self::Markdown,
         }
     }
@@ -329,6 +332,7 @@ mod serde_impls {
             let s = match self {
                 Self::Markdown => "markdown",
                 Self::Djot => "djot",
+                Self::Plain => "plain",
             };
             serializer.serialize_str(s)
         }

data/rust-vendor/html-to-markdown-rs/tests/integration_test.rs CHANGED Viewed

@@ -373,6 +373,66 @@ fn test_superscript_leading_whitespace() {
     assert_eq!(result, "hello ^world^\n");
 }
+#[test]
+fn test_subscript_default_passthrough() {
+    let html = "<p>H<sub>2</sub>O</p>";
+    let result = convert(html, None).unwrap();
+    assert_eq!(result, "H2O\n");
+}
+#[test]
+fn test_superscript_default_passthrough() {
+    let html = "<p>x<sup>2</sup> + y<sup>3</sup></p>";
+    let result = convert(html, None).unwrap();
+    assert_eq!(result, "x2 + y3\n");
+}
+#[test]
+fn test_subscript_superscript_combined_default() {
+    let html = "<p>CO<sub>2</sub><sup>*</sup></p>";
+    let result = convert(html, None).unwrap();
+    assert_eq!(result, "CO2*\n");
+}
+#[test]
+fn test_subscript_html_tag_symbol() {
+    let html = "<p>H<sub>2</sub>O</p>";
+    let opts = ConversionOptions {
+        sub_symbol: "<sub>".to_string(),
+        ..Default::default()
+    };
+    let result = convert(html, Some(opts)).unwrap();
+    assert_eq!(result, "H<sub>2</sub>O\n");
+}
+#[test]
+fn test_adjacent_links_with_newline_separator() {
+    let html = "<p>\n<a href=\"/page1\">Link 1</a>\n<a href=\"/page2\">Link 2</a>\n</p>";
+    let result = convert(html, None).unwrap();
+    assert_eq!(result, "[Link 1](/page1) [Link 2](/page2)\n");
+}
+#[test]
+fn test_adjacent_links_no_whitespace() {
+    let html = "<p><a href=\"/page1\">Link 1</a><a href=\"/page2\">Link 2</a></p>";
+    let result = convert(html, None).unwrap();
+    assert_eq!(result, "[Link 1](/page1)[Link 2](/page2)\n");
+}
+#[test]
+fn test_adjacent_links_with_space() {
+    let html = "<p><a href=\"/page1\">Link 1</a> <a href=\"/page2\">Link 2</a></p>";
+    let result = convert(html, None).unwrap();
+    assert_eq!(result, "[Link 1](/page1) [Link 2](/page2)\n");
+}
+#[test]
+fn test_adjacent_inline_elements_with_newline() {
+    let html = "<p><strong>bold</strong>\n<em>italic</em></p>";
+    let result = convert(html, None).unwrap();
+    assert_eq!(result, "**bold** *italic*\n");
+}
 #[test]
 fn test_autolink() {
     let html = "<p><a href=\"https://example.com\">https://example.com</a></p>";

data/rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs ADDED Viewed

@@ -0,0 +1,214 @@
+//! Tests for plain text output format support.
+use html_to_markdown_rs::{ConversionOptions, OutputFormat, convert};
+fn plain_options() -> ConversionOptions {
+    ConversionOptions {
+        output_format: OutputFormat::Plain,
+        ..Default::default()
+    }
+}
+#[test]
+fn test_plain_basic_paragraph() {
+    let html = "<p>Hello world</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "Hello world\n");
+}
+#[test]
+fn test_plain_no_strong_markers() {
+    let html = "<p>This is <strong>bold</strong> text</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "This is bold text\n");
+}
+#[test]
+fn test_plain_no_emphasis_markers() {
+    let html = "<p>This is <em>italic</em> text</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "This is italic text\n");
+}
+#[test]
+fn test_plain_link_text_only() {
+    let html = r#"<p>Visit <a href="https://example.com">our site</a> today</p>"#;
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "Visit our site today\n");
+}
+#[test]
+fn test_plain_image_alt_text() {
+    let html = r#"<img alt="A cute cat">"#;
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "A cute cat\n");
+}
+#[test]
+fn test_plain_image_skipped_when_option_set() {
+    let html = r#"<img alt="A cute cat">"#;
+    let mut opts = plain_options();
+    opts.skip_images = true;
+    let result = convert(html, Some(opts)).unwrap();
+    assert_eq!(result, "");
+}
+#[test]
+fn test_plain_code_block() {
+    let html = "<pre><code>fn main() {}</code></pre>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "fn main() {}\n");
+}
+#[test]
+fn test_plain_blockquote_no_prefix() {
+    let html = "<blockquote><p>Quoted text</p></blockquote>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains('>'),
+        "Plain text should not contain blockquote prefix, got: {result}"
+    );
+    assert!(result.contains("Quoted text"));
+}
+#[test]
+fn test_plain_list_items_on_separate_lines() {
+    let html = "<ul><li>First</li><li>Second</li><li>Third</li></ul>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(result.contains("First"));
+    assert!(result.contains("Second"));
+    assert!(result.contains("Third"));
+    // Items should be on separate lines
+    let lines: Vec<&str> = result.lines().filter(|l| !l.is_empty()).collect();
+    assert!(lines.len() >= 3, "Expected at least 3 lines, got: {result}");
+}
+#[test]
+fn test_plain_table_cells_extracted() {
+    let html = "<table><tr><td>A</td><td>B</td></tr><tr><td>C</td><td>D</td></tr></table>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(result.contains('A'));
+    assert!(result.contains('B'));
+    assert!(result.contains('C'));
+    assert!(result.contains('D'));
+}
+#[test]
+fn test_plain_no_escaping() {
+    let html = "<p>* not a list</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        result.contains("* not a list"),
+        "Plain text should not escape asterisks, got: {result}"
+    );
+    assert!(
+        !result.contains("\\*"),
+        "Plain text should not backslash-escape, got: {result}"
+    );
+}
+#[test]
+fn test_plain_script_excluded() {
+    let html = "<p>Before</p><script>alert('xss')</script><p>After</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains("alert"),
+        "Script content should be excluded, got: {result}"
+    );
+    assert!(result.contains("Before"));
+    assert!(result.contains("After"));
+}
+#[test]
+fn test_plain_style_excluded() {
+    let html = "<p>Hello</p><style>.foo { color: red; }</style>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains("color"),
+        "Style content should be excluded, got: {result}"
+    );
+    assert!(result.contains("Hello"));
+}
+#[test]
+fn test_plain_br_becomes_newline() {
+    let html = "<p>Line one<br>Line two</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        result.contains("Line one\nLine two"),
+        "Expected newline from <br>, got: {result}"
+    );
+}
+#[test]
+fn test_plain_hr_becomes_blank_line() {
+    let html = "<p>Above</p><hr><p>Below</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(result.contains("Above"));
+    assert!(result.contains("Below"));
+    // Should have blank line between
+    assert!(result.contains("\n\n"), "Expected blank line from <hr>, got: {result}");
+}
+#[test]
+fn test_plain_nested_inline_formatting_stripped() {
+    let html = "<p>Start <strong>bold <em>and italic</em></strong> end</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "Start bold and italic end\n");
+}
+#[test]
+fn test_plain_heading_no_markers() {
+    let html = "<h1>Title</h1><p>Content</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains('#'),
+        "Plain text should not contain heading markers, got: {result}"
+    );
+    assert!(result.contains("Title"));
+    assert!(result.contains("Content"));
+}
+#[test]
+fn test_plain_parse_variants() {
+    assert_eq!(OutputFormat::parse("plain"), OutputFormat::Plain);
+    assert_eq!(OutputFormat::parse("plaintext"), OutputFormat::Plain);
+    assert_eq!(OutputFormat::parse("text"), OutputFormat::Plain);
+    assert_eq!(OutputFormat::parse("Plain"), OutputFormat::Plain);
+    assert_eq!(OutputFormat::parse("PLAINTEXT"), OutputFormat::Plain);
+}
+#[test]
+fn test_plain_empty_input() {
+    let html = "";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "");
+}
+#[test]
+fn test_plain_whitespace_only_html() {
+    let html = "<p>   </p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert_eq!(result, "");
+}
+#[test]
+fn test_plain_inline_code_no_backticks() {
+    let html = "<p>Use <code>fmt.Println</code> to print</p>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        !result.contains('`'),
+        "Plain text should not contain backticks, got: {result}"
+    );
+    assert!(result.contains("fmt.Println"));
+}
+#[test]
+fn test_plain_pre_preserves_whitespace() {
+    let html = "<pre>  indented\n    more</pre>";
+    let result = convert(html, Some(plain_options())).unwrap();
+    assert!(
+        result.contains("  indented\n    more"),
+        "Pre blocks should preserve whitespace, got: {result}"
+    );
+}

data/spec/visitor_spec.rb CHANGED Viewed

@@ -35,7 +35,7 @@ RSpec.describe HtmlToMarkdown do
         visit_definition_list_end visit_form visit_input visit_button visit_audio visit_video
         visit_iframe visit_details visit_summary visit_figure_start visit_figcaption
         visit_figure_end
-      ].each_with_object({}) { |name, hash| hash[name.to_sym] = { type: :continue } }
+      ].to_h { |name| [name.to_sym, { type: :continue }] }
     end
     def create_visitor(**overrides)

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html-to-markdown
 version: !ruby/object:Gem::Version
-  version: 2.26.2
+  version: 2.27.0
 platform: ruby
 authors:
 - Na'aman Hirschfeld
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-02-28 00:00:00.000000000 Z
+date: 2026-03-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -1852,6 +1852,7 @@ files:
 - rust-vendor/html-to-markdown-rs/src/converter/media/svg.rs
 - rust-vendor/html-to-markdown-rs/src/converter/metadata.rs
 - rust-vendor/html-to-markdown-rs/src/converter/mod.rs
+- rust-vendor/html-to-markdown-rs/src/converter/plain_text.rs
 - rust-vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs
 - rust-vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs
 - rust-vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs
@@ -1949,6 +1950,7 @@ files:
 - rust-vendor/html-to-markdown-rs/tests/issue_212_regressions.rs
 - rust-vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs
 - rust-vendor/html-to-markdown-rs/tests/lists_test.rs
+- rust-vendor/html-to-markdown-rs/tests/plain_output_test.rs
 - rust-vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
 - rust-vendor/html-to-markdown-rs/tests/skip_images_test.rs
 - rust-vendor/html-to-markdown-rs/tests/tables_test.rs