html-to-markdown 2.26.1 → 2.26.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/rust-vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +1 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/context.rs +5 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs +6 -1
- data/rust-vendor/html-to-markdown-rs/tests/issue_212_regressions.rs +63 -0
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 12d5559bda903dfbeb563dba48c733845621b2c67cbca597fb4111c094f33fe0
|
|
4
|
+
data.tar.gz: 3ef2fdc3b30051c1eec6a956c42465f3d31670f81f345b67d98a84935f77e02c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: dee90b55391d5f84466c2a2d3591a7d3565ebc88357118b0725d57ad2c06cc5e9f965a93f4ad6111bab89c589552cc0f3ecfb21701ae4f0e18b9b9d55e0aa3ef
|
|
7
|
+
data.tar.gz: 7b0927d2fa482712bdfac03152a19375e1744e3e22b0121d7567967bc2fa215396d8e292f4d2974962477b839a644e96b01d828687a8fd520cc22171b3a83908
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.26.1)
|
|
4
|
+
html-to-markdown (2.26.2)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -161,7 +161,7 @@ CHECKSUMS
|
|
|
161
161
|
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
|
|
162
162
|
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
|
|
163
163
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
164
|
-
html-to-markdown (2.26.1)
|
|
164
|
+
html-to-markdown (2.26.2)
|
|
165
165
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
166
166
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
167
167
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
data/rust-vendor/html-to-markdown-rs/src/converter/context.rs
CHANGED
|
@@ -63,6 +63,10 @@ pub struct Context {
|
|
|
63
63
|
pub(crate) heading_allow_inline_images: bool,
|
|
64
64
|
/// Are we inside a paragraph element?
|
|
65
65
|
pub(crate) in_paragraph: bool,
|
|
66
|
+
/// Output buffer position where the current block's content starts.
|
|
67
|
+
/// Used to distinguish paragraph-break newlines from a previous block
|
|
68
|
+
/// vs. newlines generated within the current block.
|
|
69
|
+
pub(crate) block_content_start: usize,
|
|
66
70
|
/// Are we inside a ruby element?
|
|
67
71
|
pub(crate) in_ruby: bool,
|
|
68
72
|
/// Are we inside a `<strong>` / `<b>` element?
|
|
@@ -152,6 +156,7 @@ impl Context {
|
|
|
152
156
|
in_heading: false,
|
|
153
157
|
heading_allow_inline_images: false,
|
|
154
158
|
in_paragraph: false,
|
|
159
|
+
block_content_start: 0,
|
|
155
160
|
in_ruby: false,
|
|
156
161
|
in_strong: false,
|
|
157
162
|
in_link: false,
|
|
data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs
CHANGED
|
@@ -178,7 +178,12 @@ pub fn process_text_node(
|
|
|
178
178
|
if !suffix.is_empty() {
|
|
179
179
|
final_text.push_str(suffix);
|
|
180
180
|
} else if has_trailing_single_newline {
|
|
181
|
-
let at_paragraph_break = output.ends_with("\n\n");
|
|
181
|
+
// Check if the "\n\n" at the end of the output buffer came from within
|
|
182
|
+
// the current block's content, not from a previous block's closing.
|
|
183
|
+
// Without this distinction, the second paragraph after a "\n\n" boundary
|
|
184
|
+
// would incorrectly suppress the trailing space before inline elements.
|
|
185
|
+
let current_block_output = &output[ctx.block_content_start..];
|
|
186
|
+
let at_paragraph_break = current_block_output.ends_with("\n\n");
|
|
182
187
|
if !at_paragraph_break {
|
|
183
188
|
if has_double_newline {
|
|
184
189
|
final_text.push('\n');
|
|
data/rust-vendor/html-to-markdown-rs/tests/issue_212_regressions.rs
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
|
|
3
|
+
use html_to_markdown_rs::convert;
|
|
4
|
+
|
|
5
|
+
/// Regression test for https://github.com/kreuzberg-dev/html-to-markdown/issues/212
|
|
6
|
+
///
|
|
7
|
+
/// When `\n` precedes an `<a>` tag inside a `<p>`, the whitespace was
|
|
8
|
+
/// inconsistently handled between the first and subsequent paragraphs.
|
|
9
|
+
/// The bug was stateful: identical HTML structures produced different
|
|
10
|
+
/// results depending on their position in the document.
|
|
11
|
+
#[test]
|
|
12
|
+
fn consistent_whitespace_before_link_across_paragraphs_issue_212() {
|
|
13
|
+
let html = r#"<p>text before
|
|
14
|
+
<a href="https://example.com">the link</a>
|
|
15
|
+
after</p>
|
|
16
|
+
<p>text before
|
|
17
|
+
<a href="https://example.com">the link</a>
|
|
18
|
+
after</p>"#;
|
|
19
|
+
|
|
20
|
+
let result = convert(html, None).unwrap();
|
|
21
|
+
assert_eq!(
|
|
22
|
+
result,
|
|
23
|
+
"text before [the link](https://example.com) after\n\ntext before [the link](https://example.com) after\n"
|
|
24
|
+
);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
/// Same bug but with three paragraphs to ensure no accumulation effects.
|
|
28
|
+
#[test]
|
|
29
|
+
fn consistent_whitespace_three_paragraphs_issue_212() {
|
|
30
|
+
let html = r#"<p>click
|
|
31
|
+
<a href="/a">here</a></p>
|
|
32
|
+
<p>click
|
|
33
|
+
<a href="/b">here</a></p>
|
|
34
|
+
<p>click
|
|
35
|
+
<a href="/c">here</a></p>"#;
|
|
36
|
+
|
|
37
|
+
let result = convert(html, None).unwrap();
|
|
38
|
+
assert_eq!(result, "click [here](/a)\n\nclick [here](/b)\n\nclick [here](/c)\n");
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/// Verify the fix doesn't break whitespace between text nodes without links.
|
|
42
|
+
#[test]
|
|
43
|
+
fn newline_before_inline_elements_consistent_issue_212() {
|
|
44
|
+
let html = r#"<p>before
|
|
45
|
+
<strong>bold</strong> after</p>
|
|
46
|
+
<p>before
|
|
47
|
+
<strong>bold</strong> after</p>"#;
|
|
48
|
+
|
|
49
|
+
let result = convert(html, None).unwrap();
|
|
50
|
+
assert_eq!(result, "before **bold** after\n\nbefore **bold** after\n");
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/// Verify with `<em>` tags across multiple paragraphs.
|
|
54
|
+
#[test]
|
|
55
|
+
fn newline_before_em_across_paragraphs_issue_212() {
|
|
56
|
+
let html = r#"<p>some text
|
|
57
|
+
<em>emphasized</em> end</p>
|
|
58
|
+
<p>some text
|
|
59
|
+
<em>emphasized</em> end</p>"#;
|
|
60
|
+
|
|
61
|
+
let result = convert(html, None).unwrap();
|
|
62
|
+
assert_eq!(result, "some text *emphasized* end\n\nsome text *emphasized* end\n");
|
|
63
|
+
}
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.26.1
|
|
4
|
+
version: 2.26.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-02-
|
|
11
|
+
date: 2026-02-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -1946,6 +1946,7 @@ files:
|
|
|
1946
1946
|
- rust-vendor/html-to-markdown-rs/tests/issue_190_regressions.rs
|
|
1947
1947
|
- rust-vendor/html-to-markdown-rs/tests/issue_199_regressions.rs
|
|
1948
1948
|
- rust-vendor/html-to-markdown-rs/tests/issue_200_regressions.rs
|
|
1949
|
+
- rust-vendor/html-to-markdown-rs/tests/issue_212_regressions.rs
|
|
1949
1950
|
- rust-vendor/html-to-markdown-rs/tests/json_ld_script_extraction.rs
|
|
1950
1951
|
- rust-vendor/html-to-markdown-rs/tests/lists_test.rs
|
|
1951
1952
|
- rust-vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
|