html-to-markdown 2.24.3 → 2.24.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/html-to-markdown-rb/native/Cargo.lock +2 -1
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/rust-vendor/html-to-markdown-rs/Cargo.toml +1 -0
- data/rust-vendor/html-to-markdown-rs/src/convert_api.rs +1 -0
- data/rust-vendor/html-to-markdown-rs/src/converter/block/div.rs +3 -2
- data/rust-vendor/html-to-markdown-rs/src/converter/block/heading.rs +23 -7
- data/rust-vendor/html-to-markdown-rs/src/converter/block/paragraph.rs +3 -2
- data/rust-vendor/html-to-markdown-rs/src/converter/block/table/cell.rs +4 -2
- data/rust-vendor/html-to-markdown-rs/src/converter/block/table/layout.rs +3 -1
- data/rust-vendor/html-to-markdown-rs/src/converter/form/elements.rs +6 -2
- data/rust-vendor/html-to-markdown-rs/src/converter/inline/semantic/marks.rs +3 -1
- data/rust-vendor/html-to-markdown-rs/src/converter/list/utils.rs +22 -6
- data/rust-vendor/html-to-markdown-rs/src/converter/semantic/summary.rs +3 -1
- data/rust-vendor/html-to-markdown-rs/src/converter/text/processing.rs +23 -22
- data/rust-vendor/html-to-markdown-rs/src/converter/text_node.rs +5 -2
- data/rust-vendor/html-to-markdown-rs/src/hocr/types.rs +1 -1
- data/rust-vendor/html-to-markdown-rs/src/text.rs +7 -7
- data/rust-vendor/html-to-markdown-rs/tests/issue_139_regressions.rs +2 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7aba0e152474e36acd5ca4a9abb8cd891a6edd671086048415eec13bb25a69af
|
|
4
|
+
data.tar.gz: 55c7d8340aa73d115005745d8e93894c7494588a1458f55427b765ff392f1cd6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8a591d0bbbba4cc5682cc8d51e6f851199a70065294352251b4d6b7794ce53d257ab66c873b79da9c10f15b74da7670cf77caa3ddd1eec5d93593435aa6979b7
|
|
7
|
+
data.tar.gz: b979428207c4ebc1e7a7ea0f9d4e4859f11c50558bd11f9fae2e7e61d6901538c09594e7307bfce3910f4abfa3d9a4fe230f1a61312c592c86c5c30693ed3b5b
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (2.24.3)
|
|
4
|
+
html-to-markdown (2.24.4)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -161,7 +161,7 @@ CHECKSUMS
|
|
|
161
161
|
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
|
|
162
162
|
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
|
|
163
163
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
164
|
-
html-to-markdown (2.24.3)
|
|
164
|
+
html-to-markdown (2.24.4)
|
|
165
165
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
166
166
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
167
167
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
data/README.md
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
20
|
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
|
|
21
|
-
<img src="https://img.shields.io/badge/Go-v2.24.3-007ec6" alt="Go">
|
|
21
|
+
<img src="https://img.shields.io/badge/Go-v2.24.4-007ec6" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
@@ -384,7 +384,7 @@ dependencies = [
|
|
|
384
384
|
|
|
385
385
|
[[package]]
|
|
386
386
|
name = "html-to-markdown-rb"
|
|
387
|
-
version = "2.24.3"
|
|
387
|
+
version = "2.24.4"
|
|
388
388
|
dependencies = [
|
|
389
389
|
"html-to-markdown-rs",
|
|
390
390
|
"magnus",
|
|
@@ -396,6 +396,7 @@ dependencies = [
|
|
|
396
396
|
name = "html-to-markdown-rs"
|
|
397
397
|
version = "2.23.0"
|
|
398
398
|
dependencies = [
|
|
399
|
+
"ahash",
|
|
399
400
|
"astral-tl",
|
|
400
401
|
"base64",
|
|
401
402
|
"html-escape",
|
|
@@ -31,6 +31,7 @@ regex = "1.12"
|
|
|
31
31
|
once_cell = "1.21"
|
|
32
32
|
thiserror = "2.0"
|
|
33
33
|
base64 = "0.22"
|
|
34
|
+
ahash = "0.8"
|
|
34
35
|
html-escape = "0.2.13"
|
|
35
36
|
image = { version = "0.25", default-features = false, features = ["gif", "jpeg", "png", "bmp", "webp"], optional = true }
|
|
36
37
|
html5ever = "0.36"
|
|
@@ -216,28 +216,40 @@ pub(crate) fn push_heading(output: &mut String, ctx: &Context, options: &Convers
|
|
|
216
216
|
if level == 1 {
|
|
217
217
|
output.push_str(text);
|
|
218
218
|
output.push('\n');
|
|
219
|
-
|
|
219
|
+
for _ in 0..text.len() {
|
|
220
|
+
output.push('=');
|
|
221
|
+
}
|
|
220
222
|
} else if level == 2 {
|
|
221
223
|
output.push_str(text);
|
|
222
224
|
output.push('\n');
|
|
223
|
-
|
|
225
|
+
for _ in 0..text.len() {
|
|
226
|
+
output.push('-');
|
|
227
|
+
}
|
|
224
228
|
} else {
|
|
225
|
-
|
|
229
|
+
for _ in 0..level {
|
|
230
|
+
output.push('#');
|
|
231
|
+
}
|
|
226
232
|
output.push(' ');
|
|
227
233
|
output.push_str(text);
|
|
228
234
|
}
|
|
229
235
|
}
|
|
230
236
|
HeadingStyle::Atx => {
|
|
231
|
-
|
|
237
|
+
for _ in 0..level {
|
|
238
|
+
output.push('#');
|
|
239
|
+
}
|
|
232
240
|
output.push(' ');
|
|
233
241
|
output.push_str(text);
|
|
234
242
|
}
|
|
235
243
|
HeadingStyle::AtxClosed => {
|
|
236
|
-
|
|
244
|
+
for _ in 0..level {
|
|
245
|
+
output.push('#');
|
|
246
|
+
}
|
|
237
247
|
output.push(' ');
|
|
238
248
|
output.push_str(text);
|
|
239
249
|
output.push(' ');
|
|
240
|
-
|
|
250
|
+
for _ in 0..level {
|
|
251
|
+
output.push('#');
|
|
252
|
+
}
|
|
241
253
|
}
|
|
242
254
|
}
|
|
243
255
|
output.push_str(heading_suffix);
|
|
@@ -248,7 +260,11 @@ fn continuation_indent_string(list_depth: usize, _options: &ConversionOptions) -
|
|
|
248
260
|
if list_depth == 0 {
|
|
249
261
|
return None;
|
|
250
262
|
}
|
|
251
|
-
|
|
263
|
+
let mut indent = String::new();
|
|
264
|
+
for _ in 0..(4 * list_depth) {
|
|
265
|
+
indent.push(' ');
|
|
266
|
+
}
|
|
267
|
+
Some(indent)
|
|
252
268
|
}
|
|
253
269
|
|
|
254
270
|
/// Process heading with visitor callback if available.
|
|
@@ -107,8 +107,9 @@ fn add_list_continuation_indent(
|
|
|
107
107
|
if needs_space && !output.ends_with(' ') && !output.ends_with('\n') {
|
|
108
108
|
output.push(' ');
|
|
109
109
|
}
|
|
110
|
-
|
|
111
|
-
|
|
110
|
+
for _ in 0..(4 * list_depth) {
|
|
111
|
+
output.push(' ');
|
|
112
|
+
}
|
|
112
113
|
}
|
|
113
114
|
|
|
114
115
|
/// Check if an element is empty (has no text content).
|
|
@@ -153,7 +153,7 @@ pub fn convert_table_cell(
|
|
|
153
153
|
options.escape_ascii,
|
|
154
154
|
);
|
|
155
155
|
if options.escape_misc {
|
|
156
|
-
text = escaped;
|
|
156
|
+
text = escaped.into_owned();
|
|
157
157
|
} else {
|
|
158
158
|
text = escaped.replace('|', r"\|");
|
|
159
159
|
}
|
|
@@ -175,5 +175,7 @@ pub fn convert_table_cell(
|
|
|
175
175
|
|
|
176
176
|
output.push(' ');
|
|
177
177
|
output.push_str(&text);
|
|
178
|
-
|
|
178
|
+
for _ in 0..colspan {
|
|
179
|
+
output.push_str(" |");
|
|
180
|
+
}
|
|
179
181
|
}
|
|
@@ -33,7 +33,9 @@ pub(crate) fn indent_table_for_list(
|
|
|
33
33
|
if matches!(options.list_indent_type, ListIndentType::Spaces) {
|
|
34
34
|
let space_count = indent.chars().filter(|c| *c == ' ').count();
|
|
35
35
|
if space_count < 4 {
|
|
36
|
-
|
|
36
|
+
for _ in 0..(4 - space_count) {
|
|
37
|
+
indent.push(' ');
|
|
38
|
+
}
|
|
37
39
|
}
|
|
38
40
|
}
|
|
39
41
|
|
|
@@ -176,7 +176,9 @@ pub fn handle_legend(
|
|
|
176
176
|
if ctx.convert_as_inline {
|
|
177
177
|
output.push_str(trimmed);
|
|
178
178
|
} else {
|
|
179
|
-
let symbol =
|
|
179
|
+
let mut symbol = String::with_capacity(2);
|
|
180
|
+
symbol.push(options.strong_em_symbol);
|
|
181
|
+
symbol.push(options.strong_em_symbol);
|
|
180
182
|
output.push_str(&symbol);
|
|
181
183
|
output.push_str(trimmed);
|
|
182
184
|
output.push_str(&symbol);
|
|
@@ -381,7 +383,9 @@ pub fn handle_optgroup(
|
|
|
381
383
|
.map_or(Cow::Borrowed(""), |v| v.as_utf8_str());
|
|
382
384
|
|
|
383
385
|
if !label.is_empty() {
|
|
384
|
-
let symbol =
|
|
386
|
+
let mut symbol = String::with_capacity(2);
|
|
387
|
+
symbol.push(options.strong_em_symbol);
|
|
388
|
+
symbol.push(options.strong_em_symbol);
|
|
385
389
|
output.push_str(&symbol);
|
|
386
390
|
output.push_str(&label);
|
|
387
391
|
output.push_str(&symbol);
|
|
@@ -70,7 +70,9 @@ pub fn handle_mark(
|
|
|
70
70
|
output.push_str("</mark>");
|
|
71
71
|
}
|
|
72
72
|
HighlightStyle::Bold => {
|
|
73
|
-
let symbol =
|
|
73
|
+
let mut symbol = String::with_capacity(2);
|
|
74
|
+
symbol.push(options.strong_em_symbol);
|
|
75
|
+
symbol.push(options.strong_em_symbol);
|
|
74
76
|
output.push_str(&symbol);
|
|
75
77
|
let bold_ctx = Context {
|
|
76
78
|
in_strong: true,
|
|
@@ -157,9 +157,16 @@ pub fn add_list_continuation_indent(
|
|
|
157
157
|
let indent_level = calculate_list_continuation_indent(list_depth);
|
|
158
158
|
let indent_char = match options.list_indent_type {
|
|
159
159
|
ListIndentType::Tabs => "\t",
|
|
160
|
-
ListIndentType::Spaces =>
|
|
160
|
+
ListIndentType::Spaces => {
|
|
161
|
+
for _ in 0..options.list_indent_width {
|
|
162
|
+
output.push(' ');
|
|
163
|
+
}
|
|
164
|
+
return;
|
|
165
|
+
}
|
|
161
166
|
};
|
|
162
|
-
|
|
167
|
+
for _ in 0..indent_level {
|
|
168
|
+
output.push_str(indent_char);
|
|
169
|
+
}
|
|
163
170
|
}
|
|
164
171
|
|
|
165
172
|
/// Calculate the indentation string for list continuations based on depth and options.
|
|
@@ -169,10 +176,19 @@ pub fn continuation_indent_string(list_depth: usize, options: &ConversionOptions
|
|
|
169
176
|
return None;
|
|
170
177
|
}
|
|
171
178
|
|
|
172
|
-
let indent =
|
|
173
|
-
|
|
174
|
-
ListIndentType::
|
|
175
|
-
|
|
179
|
+
let mut indent = String::new();
|
|
180
|
+
match options.list_indent_type {
|
|
181
|
+
ListIndentType::Tabs => {
|
|
182
|
+
for _ in 0..indent_level {
|
|
183
|
+
indent.push('\t');
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
ListIndentType::Spaces => {
|
|
187
|
+
for _ in 0..(options.list_indent_width * indent_level) {
|
|
188
|
+
indent.push(' ');
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
}
|
|
176
192
|
Some(indent)
|
|
177
193
|
}
|
|
178
194
|
|
|
@@ -126,7 +126,9 @@ pub fn handle_summary(
|
|
|
126
126
|
output.push_str(trimmed);
|
|
127
127
|
} else {
|
|
128
128
|
// Block mode: output with strong markers
|
|
129
|
-
let symbol =
|
|
129
|
+
let mut symbol = String::with_capacity(2);
|
|
130
|
+
symbol.push(options.strong_em_symbol);
|
|
131
|
+
symbol.push(options.strong_em_symbol);
|
|
130
132
|
output.push_str(&symbol);
|
|
131
133
|
output.push_str(trimmed);
|
|
132
134
|
output.push_str(&symbol);
|
|
@@ -28,28 +28,29 @@ pub fn dedent_code_block(content: &str) -> String {
|
|
|
28
28
|
.min()
|
|
29
29
|
.unwrap_or(0);
|
|
30
30
|
|
|
31
|
-
lines
|
|
32
|
-
.
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
31
|
+
lines.iter().fold(String::new(), |mut acc, line| {
|
|
32
|
+
if !acc.is_empty() {
|
|
33
|
+
acc.push('\n');
|
|
34
|
+
}
|
|
35
|
+
let processed = if line.trim().is_empty() {
|
|
36
|
+
*line
|
|
37
|
+
} else {
|
|
38
|
+
let mut remaining = min_indent;
|
|
39
|
+
let mut cut = 0;
|
|
40
|
+
for (idx, ch) in line.char_indices() {
|
|
41
|
+
if remaining == 0 {
|
|
42
|
+
break;
|
|
43
|
+
}
|
|
44
|
+
if ch.is_whitespace() {
|
|
45
|
+
remaining -= 1;
|
|
46
|
+
cut = idx + ch.len_utf8();
|
|
47
|
+
} else {
|
|
48
|
+
break;
|
|
49
49
|
}
|
|
50
|
-
&line[cut..]
|
|
51
50
|
}
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
.
|
|
51
|
+
&line[cut..]
|
|
52
|
+
};
|
|
53
|
+
acc.push_str(processed);
|
|
54
|
+
acc
|
|
55
|
+
})
|
|
55
56
|
}
|
|
@@ -110,13 +110,14 @@ pub fn process_text_node(
|
|
|
110
110
|
} else if ctx.in_table_cell {
|
|
111
111
|
let escaped = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
|
|
112
112
|
let normalized_text = text::normalize_whitespace_cow(text.as_ref());
|
|
113
|
-
text::escape(
|
|
113
|
+
let escaped_result = text::escape(
|
|
114
114
|
normalized_text.as_ref(),
|
|
115
115
|
options.escape_misc,
|
|
116
116
|
options.escape_asterisks,
|
|
117
117
|
options.escape_underscores,
|
|
118
118
|
options.escape_ascii,
|
|
119
|
-
)
|
|
119
|
+
);
|
|
120
|
+
escaped_result.into_owned()
|
|
120
121
|
} else {
|
|
121
122
|
text::escape(
|
|
122
123
|
text.as_ref(),
|
|
@@ -125,6 +126,7 @@ pub fn process_text_node(
|
|
|
125
126
|
options.escape_underscores,
|
|
126
127
|
options.escape_ascii,
|
|
127
128
|
)
|
|
129
|
+
.into_owned()
|
|
128
130
|
};
|
|
129
131
|
if options.escape_misc {
|
|
130
132
|
escaped
|
|
@@ -139,6 +141,7 @@ pub fn process_text_node(
|
|
|
139
141
|
options.escape_underscores,
|
|
140
142
|
options.escape_ascii,
|
|
141
143
|
)
|
|
144
|
+
.into_owned()
|
|
142
145
|
} else {
|
|
143
146
|
let has_double_newline = text.contains("\n\n") || text.contains("\r\n\r\n");
|
|
144
147
|
let has_trailing_single_newline =
|
|
@@ -36,13 +36,13 @@ pub fn escape(
|
|
|
36
36
|
escape_asterisks: bool,
|
|
37
37
|
escape_underscores: bool,
|
|
38
38
|
escape_ascii: bool,
|
|
39
|
-
) ->
|
|
39
|
+
) -> Cow<'_, str> {
|
|
40
40
|
if text.is_empty() {
|
|
41
|
-
return
|
|
41
|
+
return Cow::Borrowed("");
|
|
42
42
|
}
|
|
43
43
|
|
|
44
44
|
if !escape_misc && !escape_asterisks && !escape_underscores && !escape_ascii {
|
|
45
|
-
return text
|
|
45
|
+
return Cow::Borrowed(text);
|
|
46
46
|
}
|
|
47
47
|
|
|
48
48
|
if escape_ascii
|
|
@@ -83,7 +83,7 @@ pub fn escape(
|
|
|
83
83
|
)
|
|
84
84
|
})
|
|
85
85
|
{
|
|
86
|
-
return text
|
|
86
|
+
return Cow::Borrowed(text);
|
|
87
87
|
}
|
|
88
88
|
|
|
89
89
|
if !escape_ascii && escape_misc && !escape_asterisks && !escape_underscores {
|
|
@@ -95,7 +95,7 @@ pub fn escape(
|
|
|
95
95
|
});
|
|
96
96
|
let needs_numbered = text.as_bytes().iter().any(|b| matches!(b, b'.' | b')'));
|
|
97
97
|
if !needs_misc && !needs_numbered {
|
|
98
|
-
return text
|
|
98
|
+
return Cow::Borrowed(text);
|
|
99
99
|
}
|
|
100
100
|
}
|
|
101
101
|
|
|
@@ -103,7 +103,7 @@ pub fn escape(
|
|
|
103
103
|
|
|
104
104
|
if escape_ascii {
|
|
105
105
|
result = ESCAPE_ASCII_RE.replace_all(&result, r"\$1").to_string();
|
|
106
|
-
return result;
|
|
106
|
+
return Cow::Owned(result);
|
|
107
107
|
}
|
|
108
108
|
|
|
109
109
|
if escape_misc {
|
|
@@ -120,7 +120,7 @@ pub fn escape(
|
|
|
120
120
|
result = result.replace('_', r"\_");
|
|
121
121
|
}
|
|
122
122
|
|
|
123
|
-
result
|
|
123
|
+
Cow::Owned(result)
|
|
124
124
|
}
|
|
125
125
|
|
|
126
126
|
/// Extract boundary whitespace from text (chomp).
|
|
@@ -10,10 +10,10 @@ fn long_multibyte_link_label_does_not_panic() {
|
|
|
10
10
|
html.push_str("</a>");
|
|
11
11
|
|
|
12
12
|
let markdown = convert(&html, Some(ConversionOptions::default())).unwrap();
|
|
13
|
-
let expected_label = format!("{}
|
|
13
|
+
let expected_label = format!("{}👍", "a".repeat(511));
|
|
14
14
|
|
|
15
15
|
assert!(
|
|
16
16
|
markdown.contains(&format!("[{}]", expected_label)),
|
|
17
|
-
"expected
|
|
17
|
+
"expected full label to appear in markdown output; got: {markdown}"
|
|
18
18
|
);
|
|
19
19
|
}
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.24.3
|
|
4
|
+
version: 2.24.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01
|
|
11
|
+
date: 2026-02-01 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|