html-to-markdown 3.0.1 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7681da88f184d981337443e5fd5ba46746c3d51690aea94cd4ed33a903081996
4
- data.tar.gz: e0073d5087e894e76588d03853f94d618412e650d8d3c6b62831555ede8f5230
3
+ metadata.gz: e2761dc167e2c7f7e0e27da4367660d7dc4d18f853b5b5add976019434a37de0
4
+ data.tar.gz: 8d6822eb08fc782524c4ec35446c48ff017bd2aa640f2515e79c05e4d89508b7
5
5
  SHA512:
6
- metadata.gz: 94d123fc5f89d7d83ed6e05feed6fe9d6a084822d42c257a41e2af2bfdde92905f852956dea05e924483284b1fde84eaf1b0e0230ab542c5db56fd64d5a553bf
7
- data.tar.gz: e474ddd110717cada39806796d8c7b1a2bd8900193d4413058392f69424ca7a964c3fa9b41feed493526ccbb6d7f3a6c36d3cc0553504c28c6933488abdbe9a9
6
+ metadata.gz: 93b26fafdae4c4beca9fc6134a3ba2948369210900c8145e1ad72b1714db86e453523361e39308b8d78d5b94d8dc83e33259c2ae2eca727c1b20f3c1629b81c7
7
+ data.tar.gz: 69d16a9ff3a3d67a3cd267d9c0ff7ed48b9f3736e01fa87f5ec7ae2dd1b79263dea40d9e54392ab50546d12ccfd0c9486b3a728e45344009769a1f1fd2a65bd2
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (3.0.1)
4
+ html-to-markdown (3.0.2)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -29,6 +29,7 @@ GEM
29
29
  diff-lcs (1.6.2)
30
30
  drb (2.2.3)
31
31
  ffi (1.17.4-arm64-darwin)
32
+ ffi (1.17.4-x86_64-linux-gnu)
32
33
  fileutils (1.8.0)
33
34
  i18n (1.14.8)
34
35
  concurrent-ruby (~> 1.0)
@@ -40,7 +41,7 @@ GEM
40
41
  rb-fsevent (~> 0.10, >= 0.10.3)
41
42
  rb-inotify (~> 0.9, >= 0.9.10)
42
43
  logger (1.7.0)
43
- minitest (6.0.2)
44
+ minitest (6.0.3)
44
45
  drb (~> 2.0)
45
46
  prism (~> 1.5)
46
47
  mutex_m (0.3.0)
@@ -58,7 +59,8 @@ GEM
58
59
  rb-fsevent (0.11.2)
59
60
  rb-inotify (0.11.1)
60
61
  ffi (~> 1.0)
61
- rb_sys (0.9.124)
62
+ rb_sys (0.9.125)
63
+ json (>= 2)
62
64
  rake-compiler-dock (= 1.11.0)
63
65
  rbs (3.10.4)
64
66
  logger
@@ -126,6 +128,7 @@ GEM
126
128
 
127
129
  PLATFORMS
128
130
  arm64-darwin
131
+ x86_64-linux
129
132
 
130
133
  DEPENDENCIES
131
134
  html-to-markdown!
@@ -148,15 +151,16 @@ CHECKSUMS
148
151
  diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
149
152
  drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
150
153
  ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
154
+ ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
151
155
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
152
- html-to-markdown (3.0.1)
156
+ html-to-markdown (3.0.2)
153
157
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
154
158
  json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
155
159
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
156
160
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
157
161
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
158
162
  logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
159
- minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
163
+ minitest (6.0.3) sha256=88ac8a1de36c00692420e7cb3cc11a0773bbcb126aee1c249f320160a7d11411
160
164
  mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
161
165
  parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
162
166
  parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
@@ -168,7 +172,7 @@ CHECKSUMS
168
172
  rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
169
173
  rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
170
174
  rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
171
- rb_sys (0.9.124) sha256=513476557b12eaf73764b3da9f8746024558fe8699bda785fb548c9aa3877ae7
175
+ rb_sys (0.9.125) sha256=14efd4e07eaf7c07edb1bab548d2a4767869a47a8821fc5ea52d9bf982ef00a8
172
176
  rbs (3.10.4) sha256=b17d7c4be4bb31a11a3b529830f0aa206a807ca42f2e7921a3027dfc6b7e5ce8
173
177
  regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
174
178
  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
data/README.md CHANGED
@@ -18,7 +18,7 @@
18
18
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
19
  </a>
20
20
  <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
21
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.1" alt="Go">
21
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.2" alt="Go">
22
22
  </a>
23
23
  <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
24
  <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
@@ -59,16 +59,14 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
59
59
 
60
60
  [[package]]
61
61
  name = "bindgen"
62
- version = "0.69.5"
62
+ version = "0.72.1"
63
63
  source = "registry+https://github.com/rust-lang/crates.io-index"
64
- checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
64
+ checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
65
65
  dependencies = [
66
66
  "bitflags",
67
67
  "cexpr",
68
68
  "clang-sys",
69
69
  "itertools",
70
- "lazy_static",
71
- "lazycell",
72
70
  "proc-macro2",
73
71
  "quote",
74
72
  "regex",
@@ -260,7 +258,7 @@ dependencies = [
260
258
 
261
259
  [[package]]
262
260
  name = "html-to-markdown-rb"
263
- version = "3.0.0"
261
+ version = "3.0.1"
264
262
  dependencies = [
265
263
  "html-to-markdown-rs",
266
264
  "magnus",
@@ -269,7 +267,7 @@ dependencies = [
269
267
 
270
268
  [[package]]
271
269
  name = "html-to-markdown-rs"
272
- version = "3.0.0"
270
+ version = "3.0.1"
273
271
  dependencies = [
274
272
  "ahash",
275
273
  "astral-tl",
@@ -326,9 +324,9 @@ dependencies = [
326
324
 
327
325
  [[package]]
328
326
  name = "itertools"
329
- version = "0.12.1"
327
+ version = "0.13.0"
330
328
  source = "registry+https://github.com/rust-lang/crates.io-index"
331
- checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
329
+ checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
332
330
  dependencies = [
333
331
  "either",
334
332
  ]
@@ -345,12 +343,6 @@ version = "1.5.0"
345
343
  source = "registry+https://github.com/rust-lang/crates.io-index"
346
344
  checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
347
345
 
348
- [[package]]
349
- name = "lazycell"
350
- version = "1.3.0"
351
- source = "registry+https://github.com/rust-lang/crates.io-index"
352
- checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
353
-
354
346
  [[package]]
355
347
  name = "libc"
356
348
  version = "0.2.183"
@@ -609,18 +601,18 @@ dependencies = [
609
601
 
610
602
  [[package]]
611
603
  name = "rb-sys"
612
- version = "0.9.124"
604
+ version = "0.9.125"
613
605
  source = "registry+https://github.com/rust-lang/crates.io-index"
614
- checksum = "c85c4188462601e2aa1469def389c17228566f82ea72f137ed096f21591bc489"
606
+ checksum = "85b37650fabd8ba515910a0dc089dcb6348eb3c35fbf91698cb226435be2babc"
615
607
  dependencies = [
616
608
  "rb-sys-build",
617
609
  ]
618
610
 
619
611
  [[package]]
620
612
  name = "rb-sys-build"
621
- version = "0.9.124"
613
+ version = "0.9.125"
622
614
  source = "registry+https://github.com/rust-lang/crates.io-index"
623
- checksum = "568068db4102230882e6d4ae8de6632e224ca75fe5970f6e026a04e91ed635d3"
615
+ checksum = "c73b806faa66006e491458b48a78725621c1ac5a2a6efe2614c90711a7780b80"
624
616
  dependencies = [
625
617
  "bindgen",
626
618
  "lazy_static",
@@ -677,9 +669,9 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
677
669
 
678
670
  [[package]]
679
671
  name = "rustc-hash"
680
- version = "1.1.0"
672
+ version = "2.1.2"
681
673
  source = "registry+https://github.com/rust-lang/crates.io-index"
682
- checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
674
+ checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
683
675
 
684
676
  [[package]]
685
677
  name = "scopeguard"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version = "3.0.1"
3
+ version = "3.0.2"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '3.0.1'
4
+ VERSION = '3.0.2'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "3.0.1"
6
+ version = "3.0.2"
7
7
  edition = "2024"
8
8
  rust-version = "1.85"
9
9
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "3.0.1"
3
+ version = "3.0.2"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -128,16 +128,20 @@ pub(crate) fn handle(
128
128
  }
129
129
 
130
130
  // Notify the structure collector if present.
131
- if let Some(ref sc) = ctx.structure_collector {
132
- if let Some(node) = node_handle.get(parser) {
133
- if let tl::Node::Tag(tag) = node {
134
- let id = tag
135
- .attributes()
136
- .get("id")
137
- .flatten()
138
- .map(|v| v.as_utf8_str().to_string());
139
- sc.borrow_mut()
140
- .push_heading(level as u8, normalized.as_ref(), id.as_deref());
131
+ // Skip headings inside table cells — they are part of the table content,
132
+ // not standalone structural headings.
133
+ if !ctx.in_table_cell {
134
+ if let Some(ref sc) = ctx.structure_collector {
135
+ if let Some(node) = node_handle.get(parser) {
136
+ if let tl::Node::Tag(tag) = node {
137
+ let id = tag
138
+ .attributes()
139
+ .get("id")
140
+ .flatten()
141
+ .map(|v| v.as_utf8_str().to_string());
142
+ sc.borrow_mut()
143
+ .push_heading(level as u8, normalized.as_ref(), id.as_deref());
144
+ }
141
145
  }
142
146
  }
143
147
  }
@@ -301,6 +301,10 @@ pub fn handle_pre(
301
301
  {
302
302
  format_code_block(&processed_content, language.as_deref(), output, options, ctx);
303
303
  }
304
+
305
+ if let Some(ref sc) = ctx.structure_collector {
306
+ sc.borrow_mut().push_code(&processed_content, language.as_deref());
307
+ }
304
308
  }
305
309
  }
306
310
 
@@ -192,6 +192,12 @@ pub fn handle_img(
192
192
  }
193
193
  }
194
194
  }
195
+
196
+ if let Some(ref sc) = ctx.structure_collector {
197
+ let src_opt = if src.is_empty() { None } else { Some(src.as_ref()) };
198
+ let alt_opt = if alt.is_empty() { None } else { Some(alt.as_ref()) };
199
+ sc.borrow_mut().push_image(src_opt, alt_opt);
200
+ }
195
201
  }
196
202
 
197
203
  /// Format an image as Markdown syntax.
@@ -204,6 +204,8 @@ pub(crate) fn handle_li(
204
204
  }
205
205
  }
206
206
 
207
+ let item_start_pos = output.len();
208
+
207
209
  let children = tag.children();
208
210
  {
209
211
  for child_handle in children.top().iter() {
@@ -213,6 +215,18 @@ pub(crate) fn handle_li(
213
215
 
214
216
  trim_trailing_whitespace(output);
215
217
 
218
+ if !ctx.in_table_cell {
219
+ if let Some(ref sc) = ctx.structure_collector {
220
+ if item_start_pos <= output.len() && output.is_char_boundary(item_start_pos) {
221
+ let rendered = &output[item_start_pos..];
222
+ let content = rendered.trim();
223
+ if !content.is_empty() {
224
+ sc.borrow_mut().push_list_item(content);
225
+ }
226
+ }
227
+ }
228
+ }
229
+
216
230
  #[cfg(feature = "visitor")]
217
231
  if let Some(ref visitor_handle) = ctx.visitor {
218
232
  use crate::visitor::{NodeContext, NodeType, VisitResult};
@@ -107,6 +107,12 @@ pub(crate) fn handle_ol(
107
107
  }
108
108
  }
109
109
 
110
+ if !ctx.in_table_cell {
111
+ if let Some(ref sc) = ctx.structure_collector {
112
+ sc.borrow_mut().push_list_start(true);
113
+ }
114
+ }
115
+
110
116
  process_list_children(
111
117
  *node_handle,
112
118
  parser,
@@ -121,6 +127,12 @@ pub(crate) fn handle_ol(
121
127
  dom_ctx,
122
128
  );
123
129
 
130
+ if !ctx.in_table_cell {
131
+ if let Some(ref sc) = ctx.structure_collector {
132
+ sc.borrow_mut().push_list_end();
133
+ }
134
+ }
135
+
124
136
  add_nested_list_trailing_separator(output, ctx);
125
137
 
126
138
  #[cfg(feature = "visitor")]
@@ -101,6 +101,12 @@ pub(crate) fn handle_ul(
101
101
  }
102
102
  }
103
103
 
104
+ if !ctx.in_table_cell {
105
+ if let Some(ref sc) = ctx.structure_collector {
106
+ sc.borrow_mut().push_list_start(false);
107
+ }
108
+ }
109
+
104
110
  process_list_children(
105
111
  *node_handle,
106
112
  parser,
@@ -115,6 +121,12 @@ pub(crate) fn handle_ul(
115
121
  dom_ctx,
116
122
  );
117
123
 
124
+ if !ctx.in_table_cell {
125
+ if let Some(ref sc) = ctx.structure_collector {
126
+ sc.borrow_mut().push_list_end();
127
+ }
128
+ }
129
+
118
130
  add_nested_list_trailing_separator(output, ctx);
119
131
 
120
132
  #[cfg(feature = "visitor")]
@@ -226,7 +226,8 @@ fn make_node_id(node_type: &str, text: &str, index: usize) -> String {
226
226
  let mut hasher = DefaultHasher::new();
227
227
  node_type.hash(&mut hasher);
228
228
  // Only hash a prefix of the text to keep cost bounded.
229
- text[..text.len().min(64)].hash(&mut hasher);
229
+ let end = crate::converter::utility::content::floor_char_boundary(text, text.len().min(64));
230
+ text[..end].hash(&mut hasher);
230
231
  index.hash(&mut hasher);
231
232
  let digest = hasher.finish();
232
233
  format!("{node_type}-{digest:016x}")
@@ -347,7 +347,8 @@ impl StructureCollector {
347
347
 
348
348
  let mut hasher = DefaultHasher::new();
349
349
  node_type.hash(&mut hasher);
350
- text[..text.len().min(64)].hash(&mut hasher);
350
+ let end = crate::converter::utility::content::floor_char_boundary(text, text.len().min(64));
351
+ text[..end].hash(&mut hasher);
351
352
  index.hash(&mut hasher);
352
353
  let digest = hasher.finish();
353
354
  format!("{node_type}-{digest:016x}")
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.1
4
+ version: 3.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-31 00:00:00.000000000 Z
11
+ date: 2026-04-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys