html-to-markdown 2.24.3 → 2.24.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 803d11193f6053eee9f8a73696e99148b74ecc73b237b7f587dcb56f206f7805
4
- data.tar.gz: f7950a3e47d0c50bf5777ef4359ee45dbc794fe7dcff3306d9b97c8fc175e05b
3
+ metadata.gz: 7aba0e152474e36acd5ca4a9abb8cd891a6edd671086048415eec13bb25a69af
4
+ data.tar.gz: 55c7d8340aa73d115005745d8e93894c7494588a1458f55427b765ff392f1cd6
5
5
  SHA512:
6
- metadata.gz: '049e92eb25488dc40d9ee107cc13de347123472ed9632667d8f3958399735589688d15c7001c636153af363c80fcac86b4b8ab953a75ab622cf346bc03e237e5'
7
- data.tar.gz: c5fd6a3378383efe8a9581129e238ce292b6755607aebc8228d2183fa1adee1a84247d3559e00f6143e48503752e0b9517693110063ca32e4ff95bdc9c412591
6
+ metadata.gz: 8a591d0bbbba4cc5682cc8d51e6f851199a70065294352251b4d6b7794ce53d257ab66c873b79da9c10f15b74da7670cf77caa3ddd1eec5d93593435aa6979b7
7
+ data.tar.gz: b979428207c4ebc1e7a7ea0f9d4e4859f11c50558bd11f9fae2e7e61d6901538c09594e7307bfce3910f4abfa3d9a4fe230f1a61312c592c86c5c30693ed3b5b
data/.gitignore CHANGED
@@ -1 +1,3 @@
1
1
  vendor/
2
+ .cargo/
3
+ rust-vendor/
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (2.24.3)
4
+ html-to-markdown (2.24.4)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -161,7 +161,7 @@ CHECKSUMS
161
161
  ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
162
162
  ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
163
163
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
164
- html-to-markdown (2.24.3)
164
+ html-to-markdown (2.24.4)
165
165
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
166
166
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
167
167
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
data/README.md CHANGED
@@ -18,7 +18,7 @@
18
18
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
19
  </a>
20
20
  <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v2/htmltomarkdown">
21
- <img src="https://img.shields.io/badge/Go-v2.24.1-007ec6" alt="Go">
21
+ <img src="https://img.shields.io/badge/Go-v2.24.4-007ec6" alt="Go">
22
22
  </a>
23
23
  <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
24
  <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
@@ -384,7 +384,7 @@ dependencies = [
384
384
 
385
385
  [[package]]
386
386
  name = "html-to-markdown-rb"
387
- version = "2.24.3"
387
+ version = "2.24.4"
388
388
  dependencies = [
389
389
  "html-to-markdown-rs",
390
390
  "magnus",
@@ -396,6 +396,7 @@ dependencies = [
396
396
  name = "html-to-markdown-rs"
397
397
  version = "2.23.0"
398
398
  dependencies = [
399
+ "ahash",
399
400
  "astral-tl",
400
401
  "base64",
401
402
  "html-escape",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version ="2.24.3"
3
+ version ="2.24.4"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '2.24.3'
4
+ VERSION = '2.24.4'
5
5
  end
@@ -31,6 +31,7 @@ regex = "1.12"
31
31
  once_cell = "1.21"
32
32
  thiserror = "2.0"
33
33
  base64 = "0.22"
34
+ ahash = "0.8"
34
35
  html-escape = "0.2.13"
35
36
  image = { version = "0.25", default-features = false, features = ["gif", "jpeg", "png", "bmp", "webp"], optional = true }
36
37
  html5ever = "0.36"
@@ -567,6 +567,7 @@ fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
567
567
  options.escape_underscores,
568
568
  options.escape_ascii,
569
569
  )
570
+ .into_owned()
570
571
  } else {
571
572
  normalized.into_owned()
572
573
  };
@@ -149,6 +149,7 @@ fn add_list_continuation_indent(
149
149
  output.push('\n');
150
150
  }
151
151
 
152
- let indent = " ".repeat(list_depth);
153
- output.push_str(&indent);
152
+ for _ in 0..list_depth {
153
+ output.push_str(" ");
154
+ }
154
155
  }
@@ -216,28 +216,40 @@ pub(crate) fn push_heading(output: &mut String, ctx: &Context, options: &Convers
216
216
  if level == 1 {
217
217
  output.push_str(text);
218
218
  output.push('\n');
219
- output.push_str(&"=".repeat(text.len()));
219
+ for _ in 0..text.len() {
220
+ output.push('=');
221
+ }
220
222
  } else if level == 2 {
221
223
  output.push_str(text);
222
224
  output.push('\n');
223
- output.push_str(&"-".repeat(text.len()));
225
+ for _ in 0..text.len() {
226
+ output.push('-');
227
+ }
224
228
  } else {
225
- output.push_str(&"#".repeat(level));
229
+ for _ in 0..level {
230
+ output.push('#');
231
+ }
226
232
  output.push(' ');
227
233
  output.push_str(text);
228
234
  }
229
235
  }
230
236
  HeadingStyle::Atx => {
231
- output.push_str(&"#".repeat(level));
237
+ for _ in 0..level {
238
+ output.push('#');
239
+ }
232
240
  output.push(' ');
233
241
  output.push_str(text);
234
242
  }
235
243
  HeadingStyle::AtxClosed => {
236
- output.push_str(&"#".repeat(level));
244
+ for _ in 0..level {
245
+ output.push('#');
246
+ }
237
247
  output.push(' ');
238
248
  output.push_str(text);
239
249
  output.push(' ');
240
- output.push_str(&"#".repeat(level));
250
+ for _ in 0..level {
251
+ output.push('#');
252
+ }
241
253
  }
242
254
  }
243
255
  output.push_str(heading_suffix);
@@ -248,7 +260,11 @@ fn continuation_indent_string(list_depth: usize, _options: &ConversionOptions) -
248
260
  if list_depth == 0 {
249
261
  return None;
250
262
  }
251
- Some(" ".repeat(4 * list_depth))
263
+ let mut indent = String::new();
264
+ for _ in 0..(4 * list_depth) {
265
+ indent.push(' ');
266
+ }
267
+ Some(indent)
252
268
  }
253
269
 
254
270
  /// Process heading with visitor callback if available.
@@ -107,8 +107,9 @@ fn add_list_continuation_indent(
107
107
  if needs_space && !output.ends_with(' ') && !output.ends_with('\n') {
108
108
  output.push(' ');
109
109
  }
110
- let indent = " ".repeat(4 * list_depth);
111
- output.push_str(&indent);
110
+ for _ in 0..(4 * list_depth) {
111
+ output.push(' ');
112
+ }
112
113
  }
113
114
 
114
115
  /// Check if an element is empty (has no text content).
@@ -153,7 +153,7 @@ pub fn convert_table_cell(
153
153
  options.escape_ascii,
154
154
  );
155
155
  if options.escape_misc {
156
- text = escaped;
156
+ text = escaped.into_owned();
157
157
  } else {
158
158
  text = escaped.replace('|', r"\|");
159
159
  }
@@ -175,5 +175,7 @@ pub fn convert_table_cell(
175
175
 
176
176
  output.push(' ');
177
177
  output.push_str(&text);
178
- output.push_str(&" |".repeat(colspan));
178
+ for _ in 0..colspan {
179
+ output.push_str(" |");
180
+ }
179
181
  }
@@ -33,7 +33,9 @@ pub(crate) fn indent_table_for_list(
33
33
  if matches!(options.list_indent_type, ListIndentType::Spaces) {
34
34
  let space_count = indent.chars().filter(|c| *c == ' ').count();
35
35
  if space_count < 4 {
36
- indent.push_str(&" ".repeat(4 - space_count));
36
+ for _ in 0..(4 - space_count) {
37
+ indent.push(' ');
38
+ }
37
39
  }
38
40
  }
39
41
 
@@ -176,7 +176,9 @@ pub fn handle_legend(
176
176
  if ctx.convert_as_inline {
177
177
  output.push_str(trimmed);
178
178
  } else {
179
- let symbol = options.strong_em_symbol.to_string().repeat(2);
179
+ let mut symbol = String::with_capacity(2);
180
+ symbol.push(options.strong_em_symbol);
181
+ symbol.push(options.strong_em_symbol);
180
182
  output.push_str(&symbol);
181
183
  output.push_str(trimmed);
182
184
  output.push_str(&symbol);
@@ -381,7 +383,9 @@ pub fn handle_optgroup(
381
383
  .map_or(Cow::Borrowed(""), |v| v.as_utf8_str());
382
384
 
383
385
  if !label.is_empty() {
384
- let symbol = options.strong_em_symbol.to_string().repeat(2);
386
+ let mut symbol = String::with_capacity(2);
387
+ symbol.push(options.strong_em_symbol);
388
+ symbol.push(options.strong_em_symbol);
385
389
  output.push_str(&symbol);
386
390
  output.push_str(&label);
387
391
  output.push_str(&symbol);
@@ -70,7 +70,9 @@ pub fn handle_mark(
70
70
  output.push_str("</mark>");
71
71
  }
72
72
  HighlightStyle::Bold => {
73
- let symbol = options.strong_em_symbol.to_string().repeat(2);
73
+ let mut symbol = String::with_capacity(2);
74
+ symbol.push(options.strong_em_symbol);
75
+ symbol.push(options.strong_em_symbol);
74
76
  output.push_str(&symbol);
75
77
  let bold_ctx = Context {
76
78
  in_strong: true,
@@ -157,9 +157,16 @@ pub fn add_list_continuation_indent(
157
157
  let indent_level = calculate_list_continuation_indent(list_depth);
158
158
  let indent_char = match options.list_indent_type {
159
159
  ListIndentType::Tabs => "\t",
160
- ListIndentType::Spaces => &" ".repeat(options.list_indent_width),
160
+ ListIndentType::Spaces => {
161
+ for _ in 0..options.list_indent_width {
162
+ output.push(' ');
163
+ }
164
+ return;
165
+ }
161
166
  };
162
- output.push_str(&indent_char.repeat(indent_level));
167
+ for _ in 0..indent_level {
168
+ output.push_str(indent_char);
169
+ }
163
170
  }
164
171
 
165
172
  /// Calculate the indentation string for list continuations based on depth and options.
@@ -169,10 +176,19 @@ pub fn continuation_indent_string(list_depth: usize, options: &ConversionOptions
169
176
  return None;
170
177
  }
171
178
 
172
- let indent = match options.list_indent_type {
173
- ListIndentType::Tabs => "\t".repeat(indent_level),
174
- ListIndentType::Spaces => " ".repeat(options.list_indent_width * indent_level),
175
- };
179
+ let mut indent = String::new();
180
+ match options.list_indent_type {
181
+ ListIndentType::Tabs => {
182
+ for _ in 0..indent_level {
183
+ indent.push('\t');
184
+ }
185
+ }
186
+ ListIndentType::Spaces => {
187
+ for _ in 0..(options.list_indent_width * indent_level) {
188
+ indent.push(' ');
189
+ }
190
+ }
191
+ }
176
192
  Some(indent)
177
193
  }
178
194
 
@@ -126,7 +126,9 @@ pub fn handle_summary(
126
126
  output.push_str(trimmed);
127
127
  } else {
128
128
  // Block mode: output with strong markers
129
- let symbol = options.strong_em_symbol.to_string().repeat(2);
129
+ let mut symbol = String::with_capacity(2);
130
+ symbol.push(options.strong_em_symbol);
131
+ symbol.push(options.strong_em_symbol);
130
132
  output.push_str(&symbol);
131
133
  output.push_str(trimmed);
132
134
  output.push_str(&symbol);
@@ -28,28 +28,29 @@ pub fn dedent_code_block(content: &str) -> String {
28
28
  .min()
29
29
  .unwrap_or(0);
30
30
 
31
- lines
32
- .iter()
33
- .map(|line| {
34
- if line.trim().is_empty() {
35
- *line
36
- } else {
37
- let mut remaining = min_indent;
38
- let mut cut = 0;
39
- for (idx, ch) in line.char_indices() {
40
- if remaining == 0 {
41
- break;
42
- }
43
- if ch.is_whitespace() {
44
- remaining -= 1;
45
- cut = idx + ch.len_utf8();
46
- } else {
47
- break;
48
- }
31
+ lines.iter().fold(String::new(), |mut acc, line| {
32
+ if !acc.is_empty() {
33
+ acc.push('\n');
34
+ }
35
+ let processed = if line.trim().is_empty() {
36
+ *line
37
+ } else {
38
+ let mut remaining = min_indent;
39
+ let mut cut = 0;
40
+ for (idx, ch) in line.char_indices() {
41
+ if remaining == 0 {
42
+ break;
43
+ }
44
+ if ch.is_whitespace() {
45
+ remaining -= 1;
46
+ cut = idx + ch.len_utf8();
47
+ } else {
48
+ break;
49
49
  }
50
- &line[cut..]
51
50
  }
52
- })
53
- .collect::<Vec<_>>()
54
- .join("\n")
51
+ &line[cut..]
52
+ };
53
+ acc.push_str(processed);
54
+ acc
55
+ })
55
56
  }
@@ -110,13 +110,14 @@ pub fn process_text_node(
110
110
  } else if ctx.in_table_cell {
111
111
  let escaped = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
112
112
  let normalized_text = text::normalize_whitespace_cow(text.as_ref());
113
- text::escape(
113
+ let escaped_result = text::escape(
114
114
  normalized_text.as_ref(),
115
115
  options.escape_misc,
116
116
  options.escape_asterisks,
117
117
  options.escape_underscores,
118
118
  options.escape_ascii,
119
- )
119
+ );
120
+ escaped_result.into_owned()
120
121
  } else {
121
122
  text::escape(
122
123
  text.as_ref(),
@@ -125,6 +126,7 @@ pub fn process_text_node(
125
126
  options.escape_underscores,
126
127
  options.escape_ascii,
127
128
  )
129
+ .into_owned()
128
130
  };
129
131
  if options.escape_misc {
130
132
  escaped
@@ -139,6 +141,7 @@ pub fn process_text_node(
139
141
  options.escape_underscores,
140
142
  options.escape_ascii,
141
143
  )
144
+ .into_owned()
142
145
  } else {
143
146
  let has_double_newline = text.contains("\n\n") || text.contains("\r\n\r\n");
144
147
  let has_trailing_single_newline =
@@ -3,7 +3,7 @@
3
3
  //!
4
4
  //! Complete type system for hOCR 1.2 specification elements and properties.
5
5
 
6
- use std::collections::HashMap;
6
+ use ahash::AHashMap as HashMap;
7
7
 
8
8
  /// All hOCR 1.2 element types
9
9
  #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -36,13 +36,13 @@ pub fn escape(
36
36
  escape_asterisks: bool,
37
37
  escape_underscores: bool,
38
38
  escape_ascii: bool,
39
- ) -> String {
39
+ ) -> Cow<'_, str> {
40
40
  if text.is_empty() {
41
- return String::new();
41
+ return Cow::Borrowed("");
42
42
  }
43
43
 
44
44
  if !escape_misc && !escape_asterisks && !escape_underscores && !escape_ascii {
45
- return text.to_string();
45
+ return Cow::Borrowed(text);
46
46
  }
47
47
 
48
48
  if escape_ascii
@@ -83,7 +83,7 @@ pub fn escape(
83
83
  )
84
84
  })
85
85
  {
86
- return text.to_string();
86
+ return Cow::Borrowed(text);
87
87
  }
88
88
 
89
89
  if !escape_ascii && escape_misc && !escape_asterisks && !escape_underscores {
@@ -95,7 +95,7 @@ pub fn escape(
95
95
  });
96
96
  let needs_numbered = text.as_bytes().iter().any(|b| matches!(b, b'.' | b')'));
97
97
  if !needs_misc && !needs_numbered {
98
- return text.to_string();
98
+ return Cow::Borrowed(text);
99
99
  }
100
100
  }
101
101
 
@@ -103,7 +103,7 @@ pub fn escape(
103
103
 
104
104
  if escape_ascii {
105
105
  result = ESCAPE_ASCII_RE.replace_all(&result, r"\$1").to_string();
106
- return result;
106
+ return Cow::Owned(result);
107
107
  }
108
108
 
109
109
  if escape_misc {
@@ -120,7 +120,7 @@ pub fn escape(
120
120
  result = result.replace('_', r"\_");
121
121
  }
122
122
 
123
- result
123
+ Cow::Owned(result)
124
124
  }
125
125
 
126
126
  /// Extract boundary whitespace from text (chomp).
@@ -10,10 +10,10 @@ fn long_multibyte_link_label_does_not_panic() {
10
10
  html.push_str("</a>");
11
11
 
12
12
  let markdown = convert(&html, Some(ConversionOptions::default())).unwrap();
13
- let expected_label = format!("{}", "a".repeat(511));
13
+ let expected_label = format!("{}👍", "a".repeat(511));
14
14
 
15
15
  assert!(
16
16
  markdown.contains(&format!("[{}]", expected_label)),
17
- "expected truncated label to appear in markdown output; got: {markdown}"
17
+ "expected full label to appear in markdown output; got: {markdown}"
18
18
  );
19
19
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.24.3
4
+ version: 2.24.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-31 00:00:00.000000000 Z
11
+ date: 2026-02-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys