html-to-markdown 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +10 -6
  3. data/README.md +1 -1
  4. data/ext/html-to-markdown-rb/native/Cargo.lock +12 -20
  5. data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
  6. data/lib/html_to_markdown/version.rb +1 -1
  7. data/vendor/Cargo.toml +1 -1
  8. data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
  9. data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +14 -10
  10. data/vendor/html-to-markdown-rs/src/converter/context.rs +5 -0
  11. data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +4 -0
  12. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +38 -14
  13. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +62 -17
  14. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +11 -0
  15. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +17 -0
  16. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +14 -0
  17. data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +12 -0
  18. data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +12 -0
  19. data/vendor/html-to-markdown-rs/src/converter/main.rs +25 -0
  20. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +42 -15
  21. data/vendor/html-to-markdown-rs/src/converter/mod.rs +1 -0
  22. data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +69 -0
  23. data/vendor/html-to-markdown-rs/src/exports.rs +3 -2
  24. data/vendor/html-to-markdown-rs/src/options/conversion.rs +8 -1
  25. data/vendor/html-to-markdown-rs/src/options/mod.rs +1 -1
  26. data/vendor/html-to-markdown-rs/src/options/validation.rs +43 -1
  27. data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +2 -1
  28. data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +2 -1
  29. data/vendor/html-to-markdown-rs/tests/integration_test.rs +24 -0
  30. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +169 -0
  31. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7681da88f184d981337443e5fd5ba46746c3d51690aea94cd4ed33a903081996
4
- data.tar.gz: e0073d5087e894e76588d03853f94d618412e650d8d3c6b62831555ede8f5230
3
+ metadata.gz: c23b51454716c4f5224bc9a0b6cfcfcf3f9935709379395662d9d89cab96f223
4
+ data.tar.gz: '0878f8bad06ca970013d87f6064150bed2db8b5e12d087474acaa4dd17a00559'
5
5
  SHA512:
6
- metadata.gz: 94d123fc5f89d7d83ed6e05feed6fe9d6a084822d42c257a41e2af2bfdde92905f852956dea05e924483284b1fde84eaf1b0e0230ab542c5db56fd64d5a553bf
7
- data.tar.gz: e474ddd110717cada39806796d8c7b1a2bd8900193d4413058392f69424ca7a964c3fa9b41feed493526ccbb6d7f3a6c36d3cc0553504c28c6933488abdbe9a9
6
+ metadata.gz: e21bd6d2ec9cbd40df454f2b441cb2da333b1c73a062686f830c4bd3368dad2dacec3bb0953f5c8902ad2ab411453c690597d64f0c86103d65b71438c647a7f1
7
+ data.tar.gz: 38cf61f5035e6becae227f4117f10208eb9b0ca2d99b805b6e8feefdc8bf2611e44605c967218ea69892df3af8a417026fb70e20cb3ab9a9e28771c3ecc723c9
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (3.0.1)
4
+ html-to-markdown (3.1.0)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -29,6 +29,7 @@ GEM
29
29
  diff-lcs (1.6.2)
30
30
  drb (2.2.3)
31
31
  ffi (1.17.4-arm64-darwin)
32
+ ffi (1.17.4-x86_64-linux-gnu)
32
33
  fileutils (1.8.0)
33
34
  i18n (1.14.8)
34
35
  concurrent-ruby (~> 1.0)
@@ -40,7 +41,7 @@ GEM
40
41
  rb-fsevent (~> 0.10, >= 0.10.3)
41
42
  rb-inotify (~> 0.9, >= 0.9.10)
42
43
  logger (1.7.0)
43
- minitest (6.0.2)
44
+ minitest (6.0.3)
44
45
  drb (~> 2.0)
45
46
  prism (~> 1.5)
46
47
  mutex_m (0.3.0)
@@ -58,7 +59,8 @@ GEM
58
59
  rb-fsevent (0.11.2)
59
60
  rb-inotify (0.11.1)
60
61
  ffi (~> 1.0)
61
- rb_sys (0.9.124)
62
+ rb_sys (0.9.125)
63
+ json (>= 2)
62
64
  rake-compiler-dock (= 1.11.0)
63
65
  rbs (3.10.4)
64
66
  logger
@@ -126,6 +128,7 @@ GEM
126
128
 
127
129
  PLATFORMS
128
130
  arm64-darwin
131
+ x86_64-linux
129
132
 
130
133
  DEPENDENCIES
131
134
  html-to-markdown!
@@ -148,15 +151,16 @@ CHECKSUMS
148
151
  diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
149
152
  drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
150
153
  ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
154
+ ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
151
155
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
152
- html-to-markdown (3.0.1)
156
+ html-to-markdown (3.1.0)
153
157
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
154
158
  json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
155
159
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
156
160
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
157
161
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
158
162
  logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
159
- minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
163
+ minitest (6.0.3) sha256=88ac8a1de36c00692420e7cb3cc11a0773bbcb126aee1c249f320160a7d11411
160
164
  mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
161
165
  parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
162
166
  parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
@@ -168,7 +172,7 @@ CHECKSUMS
168
172
  rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
169
173
  rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
170
174
  rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
171
- rb_sys (0.9.124) sha256=513476557b12eaf73764b3da9f8746024558fe8699bda785fb548c9aa3877ae7
175
+ rb_sys (0.9.125) sha256=14efd4e07eaf7c07edb1bab548d2a4767869a47a8821fc5ea52d9bf982ef00a8
172
176
  rbs (3.10.4) sha256=b17d7c4be4bb31a11a3b529830f0aa206a807ca42f2e7921a3027dfc6b7e5ce8
173
177
  regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
174
178
  rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
data/README.md CHANGED
@@ -18,7 +18,7 @@
18
18
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
19
  </a>
20
20
  <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
21
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.1" alt="Go">
21
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.1.0" alt="Go">
22
22
  </a>
23
23
  <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
24
  <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
@@ -59,16 +59,14 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
59
59
 
60
60
  [[package]]
61
61
  name = "bindgen"
62
- version = "0.69.5"
62
+ version = "0.72.1"
63
63
  source = "registry+https://github.com/rust-lang/crates.io-index"
64
- checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
64
+ checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
65
65
  dependencies = [
66
66
  "bitflags",
67
67
  "cexpr",
68
68
  "clang-sys",
69
69
  "itertools",
70
- "lazy_static",
71
- "lazycell",
72
70
  "proc-macro2",
73
71
  "quote",
74
72
  "regex",
@@ -260,7 +258,7 @@ dependencies = [
260
258
 
261
259
  [[package]]
262
260
  name = "html-to-markdown-rb"
263
- version = "3.0.0"
261
+ version = "3.0.1"
264
262
  dependencies = [
265
263
  "html-to-markdown-rs",
266
264
  "magnus",
@@ -269,7 +267,7 @@ dependencies = [
269
267
 
270
268
  [[package]]
271
269
  name = "html-to-markdown-rs"
272
- version = "3.0.0"
270
+ version = "3.0.1"
273
271
  dependencies = [
274
272
  "ahash",
275
273
  "astral-tl",
@@ -326,9 +324,9 @@ dependencies = [
326
324
 
327
325
  [[package]]
328
326
  name = "itertools"
329
- version = "0.12.1"
327
+ version = "0.13.0"
330
328
  source = "registry+https://github.com/rust-lang/crates.io-index"
331
- checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
329
+ checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
332
330
  dependencies = [
333
331
  "either",
334
332
  ]
@@ -345,12 +343,6 @@ version = "1.5.0"
345
343
  source = "registry+https://github.com/rust-lang/crates.io-index"
346
344
  checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
347
345
 
348
- [[package]]
349
- name = "lazycell"
350
- version = "1.3.0"
351
- source = "registry+https://github.com/rust-lang/crates.io-index"
352
- checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
353
-
354
346
  [[package]]
355
347
  name = "libc"
356
348
  version = "0.2.183"
@@ -609,18 +601,18 @@ dependencies = [
609
601
 
610
602
  [[package]]
611
603
  name = "rb-sys"
612
- version = "0.9.124"
604
+ version = "0.9.125"
613
605
  source = "registry+https://github.com/rust-lang/crates.io-index"
614
- checksum = "c85c4188462601e2aa1469def389c17228566f82ea72f137ed096f21591bc489"
606
+ checksum = "85b37650fabd8ba515910a0dc089dcb6348eb3c35fbf91698cb226435be2babc"
615
607
  dependencies = [
616
608
  "rb-sys-build",
617
609
  ]
618
610
 
619
611
  [[package]]
620
612
  name = "rb-sys-build"
621
- version = "0.9.124"
613
+ version = "0.9.125"
622
614
  source = "registry+https://github.com/rust-lang/crates.io-index"
623
- checksum = "568068db4102230882e6d4ae8de6632e224ca75fe5970f6e026a04e91ed635d3"
615
+ checksum = "c73b806faa66006e491458b48a78725621c1ac5a2a6efe2614c90711a7780b80"
624
616
  dependencies = [
625
617
  "bindgen",
626
618
  "lazy_static",
@@ -677,9 +669,9 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
677
669
 
678
670
  [[package]]
679
671
  name = "rustc-hash"
680
- version = "1.1.0"
672
+ version = "2.1.2"
681
673
  source = "registry+https://github.com/rust-lang/crates.io-index"
682
- checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
674
+ checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
683
675
 
684
676
  [[package]]
685
677
  name = "scopeguard"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version = "3.0.1"
3
+ version = "3.1.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '3.0.1'
4
+ VERSION = '3.1.0'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "3.0.1"
6
+ version = "3.1.0"
7
7
  edition = "2024"
8
8
  rust-version = "1.85"
9
9
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "3.0.1"
3
+ version = "3.1.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -128,16 +128,20 @@ pub(crate) fn handle(
128
128
  }
129
129
 
130
130
  // Notify the structure collector if present.
131
- if let Some(ref sc) = ctx.structure_collector {
132
- if let Some(node) = node_handle.get(parser) {
133
- if let tl::Node::Tag(tag) = node {
134
- let id = tag
135
- .attributes()
136
- .get("id")
137
- .flatten()
138
- .map(|v| v.as_utf8_str().to_string());
139
- sc.borrow_mut()
140
- .push_heading(level as u8, normalized.as_ref(), id.as_deref());
131
+ // Skip headings inside table cells — they are part of the table content,
132
+ // not standalone structural headings.
133
+ if !ctx.in_table_cell {
134
+ if let Some(ref sc) = ctx.structure_collector {
135
+ if let Some(node) = node_handle.get(parser) {
136
+ if let tl::Node::Tag(tag) = node {
137
+ let id = tag
138
+ .attributes()
139
+ .get("id")
140
+ .flatten()
141
+ .map(|v| v.as_utf8_str().to_string());
142
+ sc.borrow_mut()
143
+ .push_heading(level as u8, normalized.as_ref(), id.as_deref());
144
+ }
141
145
  }
142
146
  }
143
147
  }
@@ -12,6 +12,7 @@ use std::rc::Rc;
12
12
  #[cfg(feature = "inline-images")]
13
13
  use crate::inline_images::InlineImageCollector;
14
14
 
15
+ use crate::converter::reference_collector::ReferenceCollectorHandle;
15
16
  use crate::types::structure_collector::StructureCollectorHandle;
16
17
 
17
18
  /// Handle type for inline image collector when feature is enabled.
@@ -105,6 +106,8 @@ pub struct Context {
105
106
  ///
106
107
  /// Populated when `options.include_document_structure == true`.
107
108
  pub(crate) structure_collector: Option<StructureCollectorHandle>,
109
+ /// Optional reference collector for reference-style links.
110
+ pub(crate) reference_collector: Option<ReferenceCollectorHandle>,
108
111
  }
109
112
 
110
113
  impl Context {
@@ -122,6 +125,7 @@ impl Context {
122
125
  #[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
123
126
  #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
124
127
  structure_collector: Option<StructureCollectorHandle>,
128
+ reference_collector: Option<ReferenceCollectorHandle>,
125
129
  ) -> Self {
126
130
  #[cfg(feature = "metadata")]
127
131
  let (
@@ -186,6 +190,7 @@ impl Context {
186
190
  #[cfg(feature = "visitor")]
187
191
  visitor_error: Rc::new(RefCell::new(None)),
188
192
  structure_collector,
193
+ reference_collector,
189
194
  }
190
195
  }
191
196
  }
@@ -301,6 +301,10 @@ pub fn handle_pre(
301
301
  {
302
302
  format_code_block(&processed_content, language.as_deref(), output, options, ctx);
303
303
  }
304
+
305
+ if let Some(ref sc) = ctx.structure_collector {
306
+ sc.borrow_mut().push_code(&processed_content, language.as_deref());
307
+ }
304
308
  }
305
309
  }
306
310
 
@@ -128,6 +128,8 @@ pub fn handle_graphic(
128
128
  &alt,
129
129
  title.as_deref(),
130
130
  should_use_alt_text,
131
+ options.link_style,
132
+ ctx.reference_collector.as_ref(),
131
133
  )),
132
134
  VisitResult::Custom(custom) => Some(custom),
133
135
  VisitResult::Skip => None,
@@ -145,6 +147,8 @@ pub fn handle_graphic(
145
147
  &alt,
146
148
  title.as_deref(),
147
149
  should_use_alt_text,
150
+ options.link_style,
151
+ ctx.reference_collector.as_ref(),
148
152
  ))
149
153
  };
150
154
 
@@ -154,6 +158,8 @@ pub fn handle_graphic(
154
158
  &alt,
155
159
  title.as_deref(),
156
160
  should_use_alt_text,
161
+ options.link_style,
162
+ ctx.reference_collector.as_ref(),
157
163
  ));
158
164
 
159
165
  if !options.skip_images {
@@ -189,21 +195,39 @@ pub fn handle_graphic(
189
195
  ///
190
196
  /// If `use_alt_only` is true, returns just the alt text.
191
197
  /// Otherwise returns the full `![alt](src "title")` syntax.
192
- fn format_graphic_markdown(src: &str, alt: &str, title: Option<&str>, use_alt_only: bool) -> String {
198
+ fn format_graphic_markdown(
199
+ src: &str,
200
+ alt: &str,
201
+ title: Option<&str>,
202
+ use_alt_only: bool,
203
+ link_style: crate::options::validation::LinkStyle,
204
+ reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
205
+ ) -> String {
193
206
  if use_alt_only {
194
- alt.to_string()
195
- } else {
196
- let mut buf = String::with_capacity(src.len() + alt.len() + 10);
197
- buf.push_str("![");
198
- buf.push_str(alt);
199
- buf.push_str("](");
200
- buf.push_str(src);
201
- if let Some(title_text) = title {
202
- buf.push_str(" \"");
203
- buf.push_str(title_text);
204
- buf.push('"');
207
+ return alt.to_string();
208
+ }
209
+ if link_style == crate::options::validation::LinkStyle::Reference {
210
+ if let Some(collector) = reference_collector {
211
+ let ref_num = collector.borrow_mut().get_or_insert(src, title);
212
+ let mut buf = String::with_capacity(alt.len() + 10);
213
+ buf.push_str("![");
214
+ buf.push_str(alt);
215
+ buf.push_str("][");
216
+ buf.push_str(&ref_num.to_string());
217
+ buf.push(']');
218
+ return buf;
205
219
  }
206
- buf.push(')');
207
- buf
208
220
  }
221
+ let mut buf = String::with_capacity(src.len() + alt.len() + 10);
222
+ buf.push_str("![");
223
+ buf.push_str(alt);
224
+ buf.push_str("](");
225
+ buf.push_str(src);
226
+ if let Some(title_text) = title {
227
+ buf.push_str(" \"");
228
+ buf.push_str(title_text);
229
+ buf.push('"');
230
+ }
231
+ buf.push(')');
232
+ buf
209
233
  }
@@ -146,7 +146,14 @@ pub fn handle_img(
146
146
  visitor.visit_image(&node_ctx, &src, &alt, title.as_deref())
147
147
  };
148
148
  match visit_result {
149
- VisitResult::Continue => Some(format_image_markdown(&src, &alt, title.as_deref(), should_use_alt_text)),
149
+ VisitResult::Continue => Some(format_image_markdown(
150
+ &src,
151
+ &alt,
152
+ title.as_deref(),
153
+ should_use_alt_text,
154
+ options.link_style,
155
+ ctx.reference_collector.as_ref(),
156
+ )),
150
157
  VisitResult::Custom(custom) => Some(custom),
151
158
  VisitResult::Skip => None,
152
159
  VisitResult::Error(err) => {
@@ -158,11 +165,25 @@ pub fn handle_img(
158
165
  VisitResult::PreserveHtml => Some(serialize_node(node_handle, parser)),
159
166
  }
160
167
  } else {
161
- Some(format_image_markdown(&src, &alt, title.as_deref(), should_use_alt_text))
168
+ Some(format_image_markdown(
169
+ &src,
170
+ &alt,
171
+ title.as_deref(),
172
+ should_use_alt_text,
173
+ options.link_style,
174
+ ctx.reference_collector.as_ref(),
175
+ ))
162
176
  };
163
177
 
164
178
  #[cfg(not(feature = "visitor"))]
165
- let image_output = Some(format_image_markdown(&src, &alt, title.as_deref(), should_use_alt_text));
179
+ let image_output = Some(format_image_markdown(
180
+ &src,
181
+ &alt,
182
+ title.as_deref(),
183
+ should_use_alt_text,
184
+ options.link_style,
185
+ ctx.reference_collector.as_ref(),
186
+ ));
166
187
 
167
188
  // Only output image if skip_images is not enabled
168
189
  if !options.skip_images {
@@ -192,27 +213,51 @@ pub fn handle_img(
192
213
  }
193
214
  }
194
215
  }
216
+
217
+ if let Some(ref sc) = ctx.structure_collector {
218
+ let src_opt = if src.is_empty() { None } else { Some(src.as_ref()) };
219
+ let alt_opt = if alt.is_empty() { None } else { Some(alt.as_ref()) };
220
+ sc.borrow_mut().push_image(src_opt, alt_opt);
221
+ }
195
222
  }
196
223
 
197
224
  /// Format an image as Markdown syntax.
198
225
  ///
199
226
  /// If `use_alt_only` is true, returns just the alt text.
200
227
  /// Otherwise returns the full `![alt](src "title")` syntax.
201
- fn format_image_markdown(src: &str, alt: &str, title: Option<&str>, use_alt_only: bool) -> String {
228
+ fn format_image_markdown(
229
+ src: &str,
230
+ alt: &str,
231
+ title: Option<&str>,
232
+ use_alt_only: bool,
233
+ link_style: crate::options::validation::LinkStyle,
234
+ reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
235
+ ) -> String {
202
236
  if use_alt_only {
203
- alt.to_string()
204
- } else {
205
- let mut buf = String::with_capacity(src.len() + alt.len() + 10);
206
- buf.push_str("![");
207
- buf.push_str(alt);
208
- buf.push_str("](");
209
- buf.push_str(src);
210
- if let Some(title_text) = title {
211
- buf.push_str(" \"");
212
- buf.push_str(title_text);
213
- buf.push('"');
237
+ return alt.to_string();
238
+ }
239
+ if link_style == crate::options::validation::LinkStyle::Reference {
240
+ if let Some(collector) = reference_collector {
241
+ let ref_num = collector.borrow_mut().get_or_insert(src, title);
242
+ let mut buf = String::with_capacity(alt.len() + 10);
243
+ buf.push_str("![");
244
+ buf.push_str(alt);
245
+ buf.push_str("][");
246
+ buf.push_str(&ref_num.to_string());
247
+ buf.push(']');
248
+ return buf;
214
249
  }
215
- buf.push(')');
216
- buf
217
250
  }
251
+ let mut buf = String::with_capacity(src.len() + alt.len() + 10);
252
+ buf.push_str("![");
253
+ buf.push_str(alt);
254
+ buf.push_str("](");
255
+ buf.push_str(src);
256
+ if let Some(title_text) = title {
257
+ buf.push_str(" \"");
258
+ buf.push_str(title_text);
259
+ buf.push('"');
260
+ }
261
+ buf.push(')');
262
+ buf
218
263
  }
@@ -115,6 +115,7 @@ pub fn handle_link(
115
115
  title.as_deref(),
116
116
  raw_text.as_str(),
117
117
  options,
118
+ ctx.reference_collector.as_ref(),
118
119
  );
119
120
  push_heading(output, ctx, options, heading_level, link_buffer.as_str());
120
121
  return;
@@ -190,6 +191,13 @@ pub fn handle_link(
190
191
  label = href.clone();
191
192
  }
192
193
 
194
+ // Normalize Wikipedia-style back-reference links: <a href="#cite_ref-N">^</a>
195
+ // These produce `[^](#cite_ref-N)` which is confusing (looks like a footnote).
196
+ // Convert to `[↑](#cite_ref-N)` to avoid ambiguity with markdown footnote syntax.
197
+ if label == "^" && href.starts_with('#') {
198
+ label = "↑".to_string();
199
+ }
200
+
193
201
  let escaped_label = escape_link_label(&label);
194
202
 
195
203
  #[cfg(feature = "visitor")]
@@ -226,6 +234,7 @@ pub fn handle_link(
226
234
  title.as_deref(),
227
235
  label.as_str(),
228
236
  options,
237
+ ctx.reference_collector.as_ref(),
229
238
  );
230
239
  Some(buf)
231
240
  }
@@ -248,6 +257,7 @@ pub fn handle_link(
248
257
  title.as_deref(),
249
258
  label.as_str(),
250
259
  options,
260
+ ctx.reference_collector.as_ref(),
251
261
  );
252
262
  Some(buf)
253
263
  };
@@ -262,6 +272,7 @@ pub fn handle_link(
262
272
  title.as_deref(),
263
273
  label.as_str(),
264
274
  options,
275
+ ctx.reference_collector.as_ref(),
265
276
  );
266
277
  Some(buf)
267
278
  };
@@ -145,6 +145,7 @@ pub(crate) fn handle(
145
145
  title.as_deref(),
146
146
  raw_text.as_str(),
147
147
  options,
148
+ ctx.reference_collector.as_ref(),
148
149
  );
149
150
  push_heading(output, ctx, options, heading_level, link_buffer.as_str());
150
151
  return;
@@ -262,6 +263,7 @@ pub(crate) fn handle(
262
263
  title.as_deref(),
263
264
  label.as_str(),
264
265
  options,
266
+ ctx.reference_collector.as_ref(),
265
267
  );
266
268
  Some(buf)
267
269
  }
@@ -284,6 +286,7 @@ pub(crate) fn handle(
284
286
  title.as_deref(),
285
287
  label.as_str(),
286
288
  options,
289
+ ctx.reference_collector.as_ref(),
287
290
  );
288
291
  Some(buf)
289
292
  };
@@ -298,6 +301,7 @@ pub(crate) fn handle(
298
301
  title.as_deref(),
299
302
  label.as_str(),
300
303
  options,
304
+ ctx.reference_collector.as_ref(),
301
305
  );
302
306
  Some(buf)
303
307
  };
@@ -363,7 +367,20 @@ pub(crate) fn append_markdown_link(
363
367
  title: Option<&str>,
364
368
  raw_text: &str,
365
369
  options: &ConversionOptions,
370
+ reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
366
371
  ) {
372
+ if options.link_style == crate::options::validation::LinkStyle::Reference && !href.is_empty() {
373
+ if let Some(collector) = reference_collector {
374
+ let ref_num = collector.borrow_mut().get_or_insert(href, title);
375
+ output.push('[');
376
+ output.push_str(label);
377
+ output.push_str("][");
378
+ output.push_str(&ref_num.to_string());
379
+ output.push(']');
380
+ return;
381
+ }
382
+ }
383
+
367
384
  output.push('[');
368
385
  output.push_str(label);
369
386
  output.push_str("](");
@@ -204,6 +204,8 @@ pub(crate) fn handle_li(
204
204
  }
205
205
  }
206
206
 
207
+ let item_start_pos = output.len();
208
+
207
209
  let children = tag.children();
208
210
  {
209
211
  for child_handle in children.top().iter() {
@@ -213,6 +215,18 @@ pub(crate) fn handle_li(
213
215
 
214
216
  trim_trailing_whitespace(output);
215
217
 
218
+ if !ctx.in_table_cell {
219
+ if let Some(ref sc) = ctx.structure_collector {
220
+ if item_start_pos <= output.len() && output.is_char_boundary(item_start_pos) {
221
+ let rendered = &output[item_start_pos..];
222
+ let content = rendered.trim();
223
+ if !content.is_empty() {
224
+ sc.borrow_mut().push_list_item(content);
225
+ }
226
+ }
227
+ }
228
+ }
229
+
216
230
  #[cfg(feature = "visitor")]
217
231
  if let Some(ref visitor_handle) = ctx.visitor {
218
232
  use crate::visitor::{NodeContext, NodeType, VisitResult};
@@ -107,6 +107,12 @@ pub(crate) fn handle_ol(
107
107
  }
108
108
  }
109
109
 
110
+ if !ctx.in_table_cell {
111
+ if let Some(ref sc) = ctx.structure_collector {
112
+ sc.borrow_mut().push_list_start(true);
113
+ }
114
+ }
115
+
110
116
  process_list_children(
111
117
  *node_handle,
112
118
  parser,
@@ -121,6 +127,12 @@ pub(crate) fn handle_ol(
121
127
  dom_ctx,
122
128
  );
123
129
 
130
+ if !ctx.in_table_cell {
131
+ if let Some(ref sc) = ctx.structure_collector {
132
+ sc.borrow_mut().push_list_end();
133
+ }
134
+ }
135
+
124
136
  add_nested_list_trailing_separator(output, ctx);
125
137
 
126
138
  #[cfg(feature = "visitor")]
@@ -101,6 +101,12 @@ pub(crate) fn handle_ul(
101
101
  }
102
102
  }
103
103
 
104
+ if !ctx.in_table_cell {
105
+ if let Some(ref sc) = ctx.structure_collector {
106
+ sc.borrow_mut().push_list_start(false);
107
+ }
108
+ }
109
+
104
110
  process_list_children(
105
111
  *node_handle,
106
112
  parser,
@@ -115,6 +121,12 @@ pub(crate) fn handle_ul(
115
121
  dom_ctx,
116
122
  );
117
123
 
124
+ if !ctx.in_table_cell {
125
+ if let Some(ref sc) = ctx.structure_collector {
126
+ sc.borrow_mut().push_list_end();
127
+ }
128
+ }
129
+
118
130
  add_nested_list_trailing_separator(output, ctx);
119
131
 
120
132
  #[cfg(feature = "visitor")]
@@ -196,6 +196,14 @@ pub(crate) fn convert_html_impl(
196
196
  }
197
197
  }
198
198
 
199
+ let reference_collector = if options.link_style == crate::options::LinkStyle::Reference {
200
+ Some(std::rc::Rc::new(std::cell::RefCell::new(
201
+ crate::converter::reference_collector::ReferenceCollector::new(),
202
+ )))
203
+ } else {
204
+ None
205
+ };
206
+
199
207
  #[cfg(all(feature = "metadata", feature = "visitor"))]
200
208
  let ctx = Context::new(
201
209
  options,
@@ -203,6 +211,7 @@ pub(crate) fn convert_html_impl(
203
211
  metadata_collector,
204
212
  visitor,
205
213
  structure_collector.as_ref().map(std::rc::Rc::clone),
214
+ reference_collector.as_ref().map(std::rc::Rc::clone),
206
215
  );
207
216
  #[cfg(all(feature = "metadata", not(feature = "visitor")))]
208
217
  let ctx = Context::new(
@@ -211,6 +220,7 @@ pub(crate) fn convert_html_impl(
211
220
  metadata_collector,
212
221
  _visitor,
213
222
  structure_collector.as_ref().map(std::rc::Rc::clone),
223
+ reference_collector.as_ref().map(std::rc::Rc::clone),
214
224
  );
215
225
  #[cfg(all(not(feature = "metadata"), feature = "visitor"))]
216
226
  let ctx = Context::new(
@@ -219,6 +229,7 @@ pub(crate) fn convert_html_impl(
219
229
  _metadata_collector,
220
230
  visitor,
221
231
  structure_collector.as_ref().map(std::rc::Rc::clone),
232
+ reference_collector.as_ref().map(std::rc::Rc::clone),
222
233
  );
223
234
  #[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
224
235
  let ctx = Context::new(
@@ -227,6 +238,7 @@ pub(crate) fn convert_html_impl(
227
238
  _metadata_collector,
228
239
  _visitor,
229
240
  structure_collector.as_ref().map(std::rc::Rc::clone),
241
+ reference_collector.as_ref().map(std::rc::Rc::clone),
230
242
  );
231
243
 
232
244
  for child_handle in dom.children() {
@@ -242,6 +254,19 @@ pub(crate) fn convert_html_impl(
242
254
  // reference to the same collector, and Rc::try_unwrap requires exactly one reference.
243
255
  drop(ctx);
244
256
 
257
+ // Append reference-style link definitions if any were collected
258
+ if let Some(rc) = reference_collector {
259
+ if let Ok(collector) = std::rc::Rc::try_unwrap(rc) {
260
+ let ref_section = collector.into_inner().finish();
261
+ if !ref_section.is_empty() {
262
+ let trimmed_len = output.trim_end_matches('\n').len();
263
+ output.truncate(trimmed_len);
264
+ output.push_str("\n\n");
265
+ output.push_str(&ref_section);
266
+ }
267
+ }
268
+ }
269
+
245
270
  // If plain text was requested, discard the markdown output and return plain text.
246
271
  // The full pipeline was still run above so that metadata + visitor callbacks fire.
247
272
  if is_plain_text {
@@ -78,11 +78,20 @@ pub(crate) fn handle_audio(
78
78
  };
79
79
 
80
80
  if should_output_media_link(&src) {
81
- output.push('[');
82
- output.push_str(&src);
83
- output.push_str("](");
84
- output.push_str(&src);
85
- output.push(')');
81
+ if let Some(ref collector) = ctx.reference_collector {
82
+ let ref_num = collector.borrow_mut().get_or_insert(&src, None);
83
+ output.push('[');
84
+ output.push_str(&src);
85
+ output.push_str("][");
86
+ output.push_str(&ref_num.to_string());
87
+ output.push(']');
88
+ } else {
89
+ output.push('[');
90
+ output.push_str(&src);
91
+ output.push_str("](");
92
+ output.push_str(&src);
93
+ output.push(')');
94
+ }
86
95
  if !ctx.in_paragraph && !ctx.convert_as_inline {
87
96
  output.push_str("\n\n");
88
97
  }
@@ -132,11 +141,20 @@ pub(crate) fn handle_video(
132
141
  };
133
142
 
134
143
  if should_output_media_link(&src) {
135
- output.push('[');
136
- output.push_str(&src);
137
- output.push_str("](");
138
- output.push_str(&src);
139
- output.push(')');
144
+ if let Some(ref collector) = ctx.reference_collector {
145
+ let ref_num = collector.borrow_mut().get_or_insert(&src, None);
146
+ output.push('[');
147
+ output.push_str(&src);
148
+ output.push_str("][");
149
+ output.push_str(&ref_num.to_string());
150
+ output.push(']');
151
+ } else {
152
+ output.push('[');
153
+ output.push_str(&src);
154
+ output.push_str("](");
155
+ output.push_str(&src);
156
+ output.push(')');
157
+ }
140
158
  if !ctx.in_paragraph && !ctx.convert_as_inline {
141
159
  output.push_str("\n\n");
142
160
  }
@@ -199,11 +217,20 @@ pub(crate) fn handle_iframe(tag: &HTMLTag, output: &mut String, ctx: &Context) {
199
217
  .map_or(Cow::Borrowed(""), |v| v.as_utf8_str());
200
218
 
201
219
  if !src.is_empty() {
202
- output.push('[');
203
- output.push_str(&src);
204
- output.push_str("](");
205
- output.push_str(&src);
206
- output.push(')');
220
+ if let Some(ref collector) = ctx.reference_collector {
221
+ let ref_num = collector.borrow_mut().get_or_insert(&src, None);
222
+ output.push('[');
223
+ output.push_str(&src);
224
+ output.push_str("][");
225
+ output.push_str(&ref_num.to_string());
226
+ output.push(']');
227
+ } else {
228
+ output.push('[');
229
+ output.push_str(&src);
230
+ output.push_str("](");
231
+ output.push_str(&src);
232
+ output.push(')');
233
+ }
207
234
  if !ctx.in_paragraph && !ctx.convert_as_inline {
208
235
  output.push_str("\n\n");
209
236
  }
@@ -103,6 +103,7 @@ pub mod media;
103
103
  mod metadata;
104
104
  pub mod plain_text;
105
105
  pub mod preprocessing_helpers;
106
+ pub mod reference_collector;
106
107
  pub mod semantic;
107
108
  pub mod text;
108
109
  mod text_node;
@@ -0,0 +1,69 @@
1
+ //! Collector for reference-style link definitions.
2
+
3
+ use std::cell::RefCell;
4
+ use std::collections::HashMap;
5
+ use std::rc::Rc;
6
+
7
+ /// Shared handle for passing the collector through the conversion context.
8
+ pub type ReferenceCollectorHandle = Rc<RefCell<ReferenceCollector>>;
9
+
10
/// Deduplication key: a reference is identified by its URL together with its
/// optional title, so the same URL with two different titles gets two numbers.
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
struct ReferenceKey {
    url: String,
    title: Option<String>,
}

/// Collects link/image references during conversion and produces a reference
/// definitions section at the end of the document.
#[derive(Debug, Default)]
pub struct ReferenceCollector {
    /// Lookup from a URL+title pair to the number already assigned to it.
    map: HashMap<ReferenceKey, usize>,
    /// Registered references as `(number, url, title)`, in first-seen order.
    entries: Vec<(usize, String, Option<String>)>,
}

impl ReferenceCollector {
    /// Create a new, empty reference collector.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Register a URL (and optional title) and return its 1-based reference number.
    ///
    /// If the same URL+title pair was already registered, the existing number is
    /// returned and nothing new is recorded.
    pub fn get_or_insert(&mut self, url: &str, title: Option<&str>) -> usize {
        let key = ReferenceKey {
            url: url.to_string(),
            title: title.map(String::from),
        };
        // The entry API resolves "already known?" and "insert" with a single hash lookup.
        match self.map.entry(key) {
            std::collections::hash_map::Entry::Occupied(slot) => *slot.get(),
            std::collections::hash_map::Entry::Vacant(slot) => {
                // Numbers are handed out sequentially in registration order.
                let num = self.entries.len() + 1;
                let key = slot.key();
                self.entries.push((num, key.url.clone(), key.title.clone()));
                slot.insert(num);
                num
            }
        }
    }

    /// Produce the reference definitions section.
    ///
    /// Each registered reference is written on its own line as `[n]: url`, followed
    /// by ` "title"` when a title is present. Backslashes and double quotes inside
    /// the title are backslash-escaped so the title cannot terminate early or
    /// swallow its own closing quote.
    ///
    /// Returns an empty string when no references were collected.
    #[must_use]
    pub fn finish(&self) -> String {
        let mut out = String::new();
        for (num, url, title) in &self.entries {
            out.push('[');
            out.push_str(&num.to_string());
            out.push_str("]: ");
            out.push_str(url);
            if let Some(text) = title {
                out.push_str(" \"");
                for ch in text.chars() {
                    // Escaping `\` as well as `"` keeps a literal `\"` or a trailing
                    // `\` in the title from corrupting the quoted string.
                    if matches!(ch, '\\' | '"') {
                        out.push('\\');
                    }
                    out.push(ch);
                }
                out.push('"');
            }
            out.push('\n');
        }
        out
    }
}
@@ -18,6 +18,7 @@ pub use crate::metadata::{
18
18
  };
19
19
 
20
20
  pub use crate::options::{
21
- CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
22
- NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
21
+ CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, LinkStyle,
22
+ ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset,
23
+ WhitespaceMode,
23
24
  };
@@ -4,7 +4,7 @@
4
4
 
5
5
  use crate::options::preprocessing::PreprocessingOptions;
6
6
  use crate::options::validation::{
7
- CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
7
+ CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
8
8
  };
9
9
 
10
10
  /// Main conversion options for HTML to Markdown conversion.
@@ -94,6 +94,8 @@ pub struct ConversionOptions {
94
94
  pub preserve_tags: Vec<String>,
95
95
  /// Skip conversion of `<img>` elements (omit images from output).
96
96
  pub skip_images: bool,
97
+ /// Link rendering style (inline or reference).
98
+ pub link_style: LinkStyle,
97
99
  /// Target output format (Markdown, plain text, etc.).
98
100
  pub output_format: OutputFormat,
99
101
  /// Include structured document tree in result.
@@ -142,6 +144,7 @@ impl Default for ConversionOptions {
142
144
  strip_tags: Vec::new(),
143
145
  preserve_tags: Vec::new(),
144
146
  skip_images: false,
147
+ link_style: LinkStyle::default(),
145
148
  output_format: OutputFormat::default(),
146
149
  include_document_structure: false,
147
150
  extract_images: false,
@@ -207,6 +210,7 @@ impl ConversionOptionsBuilder {
207
210
  builder_setter!(newline_style, NewlineStyle);
208
211
  builder_setter!(highlight_style, HighlightStyle);
209
212
  builder_setter_into!(code_language, String);
213
+ builder_setter!(link_style, LinkStyle);
210
214
  builder_setter!(autolinks, bool);
211
215
  builder_setter!(default_title, bool);
212
216
  builder_setter!(br_in_tables, bool);
@@ -356,6 +360,8 @@ pub struct ConversionOptionsUpdate {
356
360
  pub preserve_tags: Option<Vec<String>>,
357
361
  /// Optional override for [`ConversionOptions::skip_images`].
358
362
  pub skip_images: Option<bool>,
363
+ /// Optional override for [`ConversionOptions::link_style`].
364
+ pub link_style: Option<LinkStyle>,
359
365
  /// Optional override for [`ConversionOptions::output_format`].
360
366
  pub output_format: Option<OutputFormat>,
361
367
  /// Optional override for [`ConversionOptions::include_document_structure`].
@@ -410,6 +416,7 @@ impl ConversionOptions {
410
416
  apply!(strip_tags);
411
417
  apply!(preserve_tags);
412
418
  apply!(skip_images);
419
+ apply!(link_style);
413
420
  apply!(output_format);
414
421
  apply!(include_document_structure);
415
422
  apply!(extract_images);
@@ -13,7 +13,7 @@ pub mod validation;
13
13
  pub use conversion::{ConversionOptions, ConversionOptionsUpdate};
14
14
  pub use preprocessing::{PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset};
15
15
  pub use validation::{
16
- CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
16
+ CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
17
17
  };
18
18
 
19
19
  // Note: InlineImageConfig is re-exported from the inline_images module,
@@ -172,6 +172,33 @@ impl HighlightStyle {
172
172
  }
173
173
  }
174
174
 
175
+ /// Link rendering style in Markdown output.
176
+ ///
177
+ /// Controls whether links and images use inline `[text](url)` syntax or
178
+ /// reference-style `[text][1]` syntax with definitions collected at the end.
179
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
180
+ pub enum LinkStyle {
181
+ /// Inline links: `[text](url)`. Default.
182
+ #[default]
183
+ Inline,
184
+ /// Reference-style links: `[text][1]` with `[1]: url` at end of document.
185
+ Reference,
186
+ }
187
+
188
+ impl LinkStyle {
189
+ /// Parse a link style from a string.
190
+ ///
191
+ /// Accepts "reference" or defaults to Inline.
192
+ /// Input is normalized (lowercased, alphanumeric only).
193
+ #[must_use]
194
+ pub fn parse(value: &str) -> Self {
195
+ match normalize_token(value).as_str() {
196
+ "reference" => Self::Reference,
197
+ _ => Self::Inline,
198
+ }
199
+ }
200
+ }
201
+
175
202
  /// Output format for conversion.
176
203
  ///
177
204
  /// Specifies the target markup language format for the conversion output.
@@ -215,7 +242,8 @@ pub(crate) fn normalize_token(value: &str) -> String {
215
242
  #[cfg(any(feature = "serde", feature = "metadata"))]
216
243
  mod serde_impls {
217
244
  use super::{
218
- CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
245
+ CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat,
246
+ WhitespaceMode,
219
247
  };
220
248
  use serde::{Deserialize, Serialize, Serializer};
221
249
 
@@ -239,6 +267,7 @@ mod serde_impls {
239
267
  impl_deserialize_from_parse!(NewlineStyle, NewlineStyle::parse);
240
268
  impl_deserialize_from_parse!(CodeBlockStyle, CodeBlockStyle::parse);
241
269
  impl_deserialize_from_parse!(HighlightStyle, HighlightStyle::parse);
270
+ impl_deserialize_from_parse!(LinkStyle, LinkStyle::parse);
242
271
  impl_deserialize_from_parse!(OutputFormat, OutputFormat::parse);
243
272
 
244
273
  // Serialize implementations that convert enum variants to their string representations
@@ -324,6 +353,19 @@ mod serde_impls {
324
353
  }
325
354
  }
326
355
 
356
+ impl Serialize for LinkStyle {
357
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
358
+ where
359
+ S: Serializer,
360
+ {
361
+ let s = match self {
362
+ Self::Inline => "inline",
363
+ Self::Reference => "reference",
364
+ };
365
+ serializer.serialize_str(s)
366
+ }
367
+ }
368
+
327
369
  impl Serialize for OutputFormat {
328
370
  fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
329
371
  where
@@ -226,7 +226,8 @@ fn make_node_id(node_type: &str, text: &str, index: usize) -> String {
226
226
  let mut hasher = DefaultHasher::new();
227
227
  node_type.hash(&mut hasher);
228
228
  // Only hash a prefix of the text to keep cost bounded.
229
- text[..text.len().min(64)].hash(&mut hasher);
229
+ let end = crate::converter::utility::content::floor_char_boundary(text, text.len().min(64));
230
+ text[..end].hash(&mut hasher);
230
231
  index.hash(&mut hasher);
231
232
  let digest = hasher.finish();
232
233
  format!("{node_type}-{digest:016x}")
@@ -347,7 +347,8 @@ impl StructureCollector {
347
347
 
348
348
  let mut hasher = DefaultHasher::new();
349
349
  node_type.hash(&mut hasher);
350
- text[..text.len().min(64)].hash(&mut hasher);
350
+ let end = crate::converter::utility::content::floor_char_boundary(text, text.len().min(64));
351
+ text[..end].hash(&mut hasher);
351
352
  index.hash(&mut hasher);
352
353
  let digest = hasher.finish();
353
354
  format!("{node_type}-{digest:016x}")
@@ -591,6 +591,30 @@ fn q_element_produces_quotes() {
591
591
  assert!(result.contains(r#""hello""#), "q element should add quotes: {result}");
592
592
  }
593
593
 
594
/// Wikipedia back-references (`<a href="#cite_ref-N">^</a>`) must render with `↑`
/// rather than `^`, so the link text cannot be confused with Markdown footnote syntax.
#[test]
fn test_wikipedia_back_reference_caret_normalized() {
    let markup = r##"<p>Some text<sup><a href="#cite_ref-1">^</a></sup> more text</p>"##;
    let result = convert(markup, None).unwrap();

    let has_arrow_link = result.contains("[↑](#cite_ref-1)");
    let has_raw_caret = result.contains("[^]");

    assert!(
        has_arrow_link,
        "Back-reference caret should be normalized to ↑: {result}"
    );
    assert!(
        !has_raw_caret,
        "Should not produce [^] which looks like footnote syntax: {result}"
    );
}
609
+
610
/// A `^` link whose href is not a `#` anchor keeps its caret text untouched.
#[test]
fn test_regular_caret_link_not_affected() {
    let markup = r#"<a href="https://example.com">^</a>"#;
    let result = convert(markup, None).unwrap();
    let keeps_caret = result.contains("[^]");
    assert!(keeps_caret, "Non-anchor caret links should keep ^: {result}");
}
617
+
594
618
  fn convert(
595
619
  html: &str,
596
620
  opts: Option<html_to_markdown_rs::ConversionOptions>,
@@ -0,0 +1,169 @@
1
+ #![allow(missing_docs)]
2
+
3
+ use html_to_markdown_rs::{ConversionOptions, LinkStyle};
4
+
5
+ fn convert(html: &str, options: Option<ConversionOptions>) -> String {
6
+ html_to_markdown_rs::convert(html, options)
7
+ .unwrap()
8
+ .content
9
+ .unwrap_or_default()
10
+ }
11
+
12
+ fn ref_options() -> ConversionOptions {
13
+ ConversionOptions {
14
+ link_style: LinkStyle::Reference,
15
+ ..Default::default()
16
+ }
17
+ }
18
+
19
/// A plain anchor becomes `[text][1]` plus a matching `[1]: url` definition.
#[test]
fn basic_reference_link() {
    let markup = r#"<a href="https://example.com">Click here</a>"#;
    let result = convert(markup, Some(ref_options()));

    let has_link = result.contains("[Click here][1]");
    let has_definition = result.contains("[1]: https://example.com");

    assert!(has_link, "Expected reference-style link, got: {result}");
    assert!(has_definition, "Expected reference definition, got: {result}");
}
32
+
33
/// The anchor's `title` attribute is carried into the reference definition.
#[test]
fn reference_link_with_title() {
    let markup = r#"<a href="https://example.com" title="Example">Click</a>"#;
    let result = convert(markup, Some(ref_options()));

    let has_link = result.contains("[Click][1]");
    let has_titled_definition = result.contains(r#"[1]: https://example.com "Example""#);

    assert!(has_link, "Expected reference-style link, got: {result}");
    assert!(
        has_titled_definition,
        "Expected reference definition with title, got: {result}"
    );
}
46
+
47
/// Two links to the same URL share one reference number and one definition.
#[test]
fn url_deduplication() {
    let markup = r#"<a href="https://example.com">First</a> <a href="https://example.com">Second</a>"#;
    let result = convert(markup, Some(ref_options()));

    let first_uses_ref_one = result.contains("[First][1]");
    let second_uses_ref_one = result.contains("[Second][1]");
    assert!(first_uses_ref_one, "Expected first link with ref 1, got: {result}");
    assert!(
        second_uses_ref_one,
        "Expected second link reusing ref 1, got: {result}"
    );

    // The shared URL must be defined exactly once.
    let definitions = result.matches("[1]: https://example.com").count();
    assert_eq!(definitions, 1, "Expected exactly one definition, got: {result}");
}
63
+
64
/// The same URL with two different titles is assigned two distinct reference numbers.
#[test]
fn different_titles_different_refs() {
    let markup =
        r#"<a href="https://example.com" title="A">First</a> <a href="https://example.com" title="B">Second</a>"#;
    let result = convert(markup, Some(ref_options()));

    let first_is_ref_one = result.contains("[First][1]");
    let second_is_ref_two = result.contains("[Second][2]");

    assert!(first_is_ref_one, "Expected first link ref 1, got: {result}");
    assert!(
        second_is_ref_two,
        "Expected second link ref 2 (different title), got: {result}"
    );
}
78
+
79
/// Images use reference syntax too: `![alt][1]` plus a `[1]: src` definition.
#[test]
fn image_reference_style() {
    let markup = r#"<img src="https://example.com/img.png" alt="A photo">"#;
    let result = convert(markup, Some(ref_options()));

    let has_image = result.contains("![A photo][1]");
    let has_definition = result.contains("[1]: https://example.com/img.png");

    assert!(has_image, "Expected reference-style image, got: {result}");
    assert!(
        has_definition,
        "Expected image reference definition, got: {result}"
    );
}
92
+
93
/// Links and images draw their numbers from one shared sequence.
#[test]
fn mixed_links_and_images_share_numbering() {
    let markup = r#"<a href="https://a.com">Link</a><img src="https://b.com/img.png" alt="Img">"#;
    let result = convert(markup, Some(ref_options()));

    let link_is_first = result.contains("[Link][1]");
    let image_is_second = result.contains("![Img][2]");

    assert!(link_is_first, "Expected link as ref 1, got: {result}");
    assert!(image_is_second, "Expected image as ref 2, got: {result}");
}
100
+
101
/// With `autolinks` enabled, a link whose text equals its URL still renders as `<url>`
/// even when reference style is selected.
#[test]
fn autolinks_unaffected() {
    let markup = r#"<a href="https://example.com">https://example.com</a>"#;
    let options = ConversionOptions {
        autolinks: true,
        link_style: LinkStyle::Reference,
        ..ConversionOptions::default()
    };
    let result = convert(markup, Some(options));

    // Autolinks should still render as <url>
    let is_autolink = result.contains("<https://example.com>");
    assert!(
        is_autolink,
        "Autolinks should not be affected by reference style, got: {result}"
    );
}
116
+
117
/// Without explicit options, links keep the inline `[text](url)` form.
#[test]
fn default_inline_unchanged() {
    let markup = r#"<a href="https://example.com">Click</a>"#;
    let result = convert(markup, None);
    let is_inline = result.contains("[Click](https://example.com)");
    assert!(is_inline, "Default should use inline style, got: {result}");
}
126
+
127
/// Reference definitions are emitted after all document content, not after the
/// paragraph that first uses them.
#[test]
fn multiple_paragraphs_references_at_end() {
    let html = r#"<p><a href="https://a.com">A</a></p><p><a href="https://b.com">B</a></p>"#;
    let result = convert(html, Some(ref_options()));

    // The first in-text reference must be present...
    result.find("[A][1]").expect("Should have inline ref");
    // ...and the definitions must start after the LAST in-text reference. Comparing
    // against the first one only would also pass if the definitions were emitted
    // between the two paragraphs.
    let content_end = result.rfind("[B][2]").expect("Should have second inline ref");
    let ref_section_start = result.find("[1]:").expect("Should have ref section");
    let second_definition = result.find("[2]:").expect("Should have second definition");

    assert!(
        ref_section_start > content_end,
        "Reference section should be after content"
    );
    assert!(
        second_definition > ref_section_start,
        "Definitions should be listed in reference order"
    );
}
139
+
140
/// An anchor with an empty `href` must not produce a reference definition.
#[test]
fn empty_href_no_reference() {
    let markup = r#"<a href="">Empty</a>"#;
    let result = convert(markup, Some(ref_options()));

    // Empty href should not create a reference
    let has_definition = result.contains("[1]:");
    assert!(
        !has_definition,
        "Empty href should not create reference, got: {result}"
    );
}
150
+
151
/// Double quotes inside a title are backslash-escaped in the reference definition.
#[test]
fn title_with_quotes_escaped() {
    let markup = r#"<a href="https://example.com" title='Say "hello"'>Link</a>"#;
    let result = convert(markup, Some(ref_options()));

    let expected_definition = r#"[1]: https://example.com "Say \"hello\"""#;
    let has_escaped_title = result.contains(expected_definition);
    assert!(
        has_escaped_title,
        "Quotes in title should be escaped, got: {result}"
    );
}
160
+
161
/// A `<video>` source is registered as a reference and receives a definition line.
#[test]
fn media_elements_reference_style() {
    let markup = r#"<video src="https://example.com/video.mp4"></video>"#;
    let result = convert(markup, Some(ref_options()));

    let has_definition = result.contains("[1]: https://example.com/video.mp4");
    assert!(has_definition, "Video should use reference style, got: {result}");
}
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.1
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-03-31 00:00:00.000000000 Z
11
+ date: 2026-04-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -142,6 +142,7 @@ files:
142
142
  - vendor/html-to-markdown-rs/src/converter/mod.rs
143
143
  - vendor/html-to-markdown-rs/src/converter/plain_text.rs
144
144
  - vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs
145
+ - vendor/html-to-markdown-rs/src/converter/reference_collector.rs
145
146
  - vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs
146
147
  - vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs
147
148
  - vendor/html-to-markdown-rs/src/converter/semantic/figure.rs
@@ -224,6 +225,7 @@ files:
224
225
  - vendor/html-to-markdown-rs/tests/lists_test.rs
225
226
  - vendor/html-to-markdown-rs/tests/plain_output_test.rs
226
227
  - vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
228
+ - vendor/html-to-markdown-rs/tests/reference_links_test.rs
227
229
  - vendor/html-to-markdown-rs/tests/skip_images_test.rs
228
230
  - vendor/html-to-markdown-rs/tests/tables_test.rs
229
231
  - vendor/html-to-markdown-rs/tests/test_custom_elements.rs