html-to-markdown 2.26.1 → 2.26.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 48fbfb1438c5ad97a8984bbd111b90332abc08e7c2540709dfc6c3f02daea6c5
4
- data.tar.gz: 7d579f3f883a4fb9d99ce6b8838e0b51c22ecb0c0a8759ca7bb3d01b723aa7d9
3
+ metadata.gz: 12d5559bda903dfbeb563dba48c733845621b2c67cbca597fb4111c094f33fe0
4
+ data.tar.gz: 3ef2fdc3b30051c1eec6a956c42465f3d31670f81f345b67d98a84935f77e02c
5
5
  SHA512:
6
- metadata.gz: 37ada8ad47408d91d98f065cf598ddfe04baa46900a92939f19f1290f54991cf59e4471b855d12889171d4e478ce2f9fc49bff938d814f8c3a3f4cd8ba009ebc
7
- data.tar.gz: 3b3653ffc39d693111aa35fb8d7a255338801256856b55fe4e7933373b4cf06fd69b838d4177c046ee41ffc7654e36075dc6d18b1a58fa1e44b83ce6d225abb6
6
+ metadata.gz: dee90b55391d5f84466c2a2d3591a7d3565ebc88357118b0725d57ad2c06cc5e9f965a93f4ad6111bab89c589552cc0f3ecfb21701ae4f0e18b9b9d55e0aa3ef
7
+ data.tar.gz: 7b0927d2fa482712bdfac03152a19375e1744e3e22b0121d7567967bc2fa215396d8e292f4d2974962477b839a644e96b01d828687a8fd520cc22171b3a83908
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.26.1)
4
+ html-to-markdown (2.26.2)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -161,7 +161,7 @@ CHECKSUMS
161
161
  ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
162
162
  ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
163
163
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
164
- html-to-markdown (2.26.1)
164
+ html-to-markdown (2.26.2)
165
165
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
166
166
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
167
167
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version ="2.26.1"
3
+ version ="2.26.2"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.26.1'
4
+ VERSION = '2.26.2'
5
5
  end
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "2.26.1"
3
+ version = "2.26.2"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -61,6 +61,7 @@ pub(crate) fn handle(
61
61
 
62
62
  let p_ctx = Context {
63
63
  in_paragraph: true,
64
+ block_content_start: output.len(),
64
65
  ..ctx.clone()
65
66
  };
66
67
 
@@ -63,6 +63,10 @@ pub struct Context {
63
63
  pub(crate) heading_allow_inline_images: bool,
64
64
  /// Are we inside a paragraph element?
65
65
  pub(crate) in_paragraph: bool,
66
+ /// Output buffer position where the current block's content starts.
67
+ /// Used to distinguish paragraph-break newlines from a previous block
68
+ /// vs. newlines generated within the current block.
69
+ pub(crate) block_content_start: usize,
66
70
  /// Are we inside a ruby element?
67
71
  pub(crate) in_ruby: bool,
68
72
  /// Are we inside a `<strong>` / `<b>` element?
@@ -152,6 +156,7 @@ impl Context {
152
156
  in_heading: false,
153
157
  heading_allow_inline_images: false,
154
158
  in_paragraph: false,
159
+ block_content_start: 0,
155
160
  in_ruby: false,
156
161
  in_strong: false,
157
162
  in_link: false,
@@ -178,7 +178,12 @@ pub fn process_text_node(
178
178
  if !suffix.is_empty() {
179
179
  final_text.push_str(suffix);
180
180
  } else if has_trailing_single_newline {
181
- let at_paragraph_break = output.ends_with("\n\n");
181
+ // Check if the "\n\n" at the end of the output buffer came from within
182
+ // the current block's content, not from a previous block's closing.
183
+ // Without this distinction, the second paragraph after a "\n\n" boundary
184
+ // would incorrectly suppress the trailing space before inline elements.
185
+ let current_block_output = &output[ctx.block_content_start..];
186
+ let at_paragraph_break = current_block_output.ends_with("\n\n");
182
187
  if !at_paragraph_break {
183
188
  if has_double_newline {
184
189
  final_text.push('\n');
@@ -0,0 +1,63 @@
1
+ #![allow(missing_docs)]
2
+
3
+ use html_to_markdown_rs::convert;
4
+
5
+ /// Regression test for https://github.com/kreuzberg-dev/html-to-markdown/issues/212
6
+ ///
7
+ /// When `\n` precedes an `<a>` tag inside a `<p>`, the whitespace was
8
+ /// inconsistently handled between the first and subsequent paragraphs.
9
+ /// The bug was stateful: identical HTML structures produced different
10
+ /// results depending on their position in the document.
11
+ #[test]
12
+ fn consistent_whitespace_before_link_across_paragraphs_issue_212() {
13
+ let html = r#"<p>text before
14
+ <a href="https://example.com">the link</a>
15
+ after</p>
16
+ <p>text before
17
+ <a href="https://example.com">the link</a>
18
+ after</p>"#;
19
+
20
+ let result = convert(html, None).unwrap();
21
+ assert_eq!(
22
+ result,
23
+ "text before [the link](https://example.com) after\n\ntext before [the link](https://example.com) after\n"
24
+ );
25
+ }
26
+
27
+ /// Same bug but with three paragraphs to ensure no accumulation effects.
28
+ #[test]
29
+ fn consistent_whitespace_three_paragraphs_issue_212() {
30
+ let html = r#"<p>click
31
+ <a href="/a">here</a></p>
32
+ <p>click
33
+ <a href="/b">here</a></p>
34
+ <p>click
35
+ <a href="/c">here</a></p>"#;
36
+
37
+ let result = convert(html, None).unwrap();
38
+ assert_eq!(result, "click [here](/a)\n\nclick [here](/b)\n\nclick [here](/c)\n");
39
+ }
40
+
41
+ /// Verify the fix also gives consistent whitespace before non-link inline elements (`<strong>`) across paragraphs.
42
+ #[test]
43
+ fn newline_before_inline_elements_consistent_issue_212() {
44
+ let html = r#"<p>before
45
+ <strong>bold</strong> after</p>
46
+ <p>before
47
+ <strong>bold</strong> after</p>"#;
48
+
49
+ let result = convert(html, None).unwrap();
50
+ assert_eq!(result, "before **bold** after\n\nbefore **bold** after\n");
51
+ }
52
+
53
+ /// Verify with `<em>` tags across multiple paragraphs.
54
+ #[test]
55
+ fn newline_before_em_across_paragraphs_issue_212() {
56
+ let html = r#"<p>some text
57
+ <em>emphasized</em> end</p>
58
+ <p>some text
59
+ <em>emphasized</em> end</p>"#;
60
+
61
+ let result = convert(html, None).unwrap();
62
+ assert_eq!(result, "some text *emphasized* end\n\nsome text *emphasized* end\n");
63
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.26.1
4
+ version: 2.26.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-02-27 00:00:00.000000000 Z
11
+ date: 2026-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -1946,6 +1946,7 @@ files:
1946
1946
  - rust-vendor/html-to-markdown-rs/tests/issue_190_regressions.rs
1947
1947
  - rust-vendor/html-to-markdown-rs/tests/issue_199_regressions.rs
1948
1948
  - rust-vendor/html-to-markdown-rs/tests/issue_200_regressions.rs
1949
+ - rust-vendor/html-to-markdown-rs/tests/issue_212_regressions.rs
1949
1950
  - rust-vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs
1950
1951
  - rust-vendor/html-to-markdown-rs/tests/lists_test.rs
1951
1952
  - rust-vendor/html-to-markdown-rs/tests/preprocessing_tests.rs