html-to-markdown 3.0.2 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e2761dc167e2c7f7e0e27da4367660d7dc4d18f853b5b5add976019434a37de0
4
- data.tar.gz: 8d6822eb08fc782524c4ec35446c48ff017bd2aa640f2515e79c05e4d89508b7
3
+ metadata.gz: c23b51454716c4f5224bc9a0b6cfcfcf3f9935709379395662d9d89cab96f223
4
+ data.tar.gz: '0878f8bad06ca970013d87f6064150bed2db8b5e12d087474acaa4dd17a00559'
5
5
  SHA512:
6
- metadata.gz: 93b26fafdae4c4beca9fc6134a3ba2948369210900c8145e1ad72b1714db86e453523361e39308b8d78d5b94d8dc83e33259c2ae2eca727c1b20f3c1629b81c7
7
- data.tar.gz: 69d16a9ff3a3d67a3cd267d9c0ff7ed48b9f3736e01fa87f5ec7ae2dd1b79263dea40d9e54392ab50546d12ccfd0c9486b3a728e45344009769a1f1fd2a65bd2
6
+ metadata.gz: e21bd6d2ec9cbd40df454f2b441cb2da333b1c73a062686f830c4bd3368dad2dacec3bb0953f5c8902ad2ab411453c690597d64f0c86103d65b71438c647a7f1
7
+ data.tar.gz: 38cf61f5035e6becae227f4117f10208eb9b0ca2d99b805b6e8feefdc8bf2611e44605c967218ea69892df3af8a417026fb70e20cb3ab9a9e28771c3ecc723c9
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html-to-markdown (3.0.2)
4
+ html-to-markdown (3.1.0)
5
5
  rb_sys (>= 0.9, < 1.0)
6
6
 
7
7
  GEM
@@ -153,7 +153,7 @@ CHECKSUMS
153
153
  ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
154
154
  ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
155
155
  fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
156
- html-to-markdown (3.0.2)
156
+ html-to-markdown (3.1.0)
157
157
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
158
158
  json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
159
159
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
data/README.md CHANGED
@@ -18,7 +18,7 @@
18
18
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
19
19
  </a>
20
20
  <a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
21
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.2" alt="Go">
21
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.1.0" alt="Go">
22
22
  </a>
23
23
  <a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
24
24
  <img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rb"
3
- version = "3.0.2"
3
+ version = "3.1.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '3.0.2'
4
+ VERSION = '3.1.0'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "3.0.2"
6
+ version = "3.1.0"
7
7
  edition = "2024"
8
8
  rust-version = "1.85"
9
9
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "3.0.2"
3
+ version = "3.1.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -12,6 +12,7 @@ use std::rc::Rc;
12
12
  #[cfg(feature = "inline-images")]
13
13
  use crate::inline_images::InlineImageCollector;
14
14
 
15
+ use crate::converter::reference_collector::ReferenceCollectorHandle;
15
16
  use crate::types::structure_collector::StructureCollectorHandle;
16
17
 
17
18
  /// Handle type for inline image collector when feature is enabled.
@@ -105,6 +106,8 @@ pub struct Context {
105
106
  ///
106
107
  /// Populated when `options.include_document_structure == true`.
107
108
  pub(crate) structure_collector: Option<StructureCollectorHandle>,
109
+ /// Optional reference collector for reference-style links.
110
+ pub(crate) reference_collector: Option<ReferenceCollectorHandle>,
108
111
  }
109
112
 
110
113
  impl Context {
@@ -122,6 +125,7 @@ impl Context {
122
125
  #[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
123
126
  #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
124
127
  structure_collector: Option<StructureCollectorHandle>,
128
+ reference_collector: Option<ReferenceCollectorHandle>,
125
129
  ) -> Self {
126
130
  #[cfg(feature = "metadata")]
127
131
  let (
@@ -186,6 +190,7 @@ impl Context {
186
190
  #[cfg(feature = "visitor")]
187
191
  visitor_error: Rc::new(RefCell::new(None)),
188
192
  structure_collector,
193
+ reference_collector,
189
194
  }
190
195
  }
191
196
  }
@@ -128,6 +128,8 @@ pub fn handle_graphic(
128
128
  &alt,
129
129
  title.as_deref(),
130
130
  should_use_alt_text,
131
+ options.link_style,
132
+ ctx.reference_collector.as_ref(),
131
133
  )),
132
134
  VisitResult::Custom(custom) => Some(custom),
133
135
  VisitResult::Skip => None,
@@ -145,6 +147,8 @@ pub fn handle_graphic(
145
147
  &alt,
146
148
  title.as_deref(),
147
149
  should_use_alt_text,
150
+ options.link_style,
151
+ ctx.reference_collector.as_ref(),
148
152
  ))
149
153
  };
150
154
 
@@ -154,6 +158,8 @@ pub fn handle_graphic(
154
158
  &alt,
155
159
  title.as_deref(),
156
160
  should_use_alt_text,
161
+ options.link_style,
162
+ ctx.reference_collector.as_ref(),
157
163
  ));
158
164
 
159
165
  if !options.skip_images {
@@ -189,21 +195,39 @@ pub fn handle_graphic(
189
195
  ///
190
196
  /// If `use_alt_only` is true, returns just the alt text.
191
197
  /// Otherwise returns the full `![alt](src "title")` syntax.
192
- fn format_graphic_markdown(src: &str, alt: &str, title: Option<&str>, use_alt_only: bool) -> String {
198
+ fn format_graphic_markdown(
199
+ src: &str,
200
+ alt: &str,
201
+ title: Option<&str>,
202
+ use_alt_only: bool,
203
+ link_style: crate::options::validation::LinkStyle,
204
+ reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
205
+ ) -> String {
193
206
  if use_alt_only {
194
- alt.to_string()
195
- } else {
196
- let mut buf = String::with_capacity(src.len() + alt.len() + 10);
197
- buf.push_str("![");
198
- buf.push_str(alt);
199
- buf.push_str("](");
200
- buf.push_str(src);
201
- if let Some(title_text) = title {
202
- buf.push_str(" \"");
203
- buf.push_str(title_text);
204
- buf.push('"');
207
+ return alt.to_string();
208
+ }
209
+ if link_style == crate::options::validation::LinkStyle::Reference {
210
+ if let Some(collector) = reference_collector {
211
+ let ref_num = collector.borrow_mut().get_or_insert(src, title);
212
+ let mut buf = String::with_capacity(alt.len() + 10);
213
+ buf.push_str("![");
214
+ buf.push_str(alt);
215
+ buf.push_str("][");
216
+ buf.push_str(&ref_num.to_string());
217
+ buf.push(']');
218
+ return buf;
205
219
  }
206
- buf.push(')');
207
- buf
208
220
  }
221
+ let mut buf = String::with_capacity(src.len() + alt.len() + 10);
222
+ buf.push_str("![");
223
+ buf.push_str(alt);
224
+ buf.push_str("](");
225
+ buf.push_str(src);
226
+ if let Some(title_text) = title {
227
+ buf.push_str(" \"");
228
+ buf.push_str(title_text);
229
+ buf.push('"');
230
+ }
231
+ buf.push(')');
232
+ buf
209
233
  }
@@ -146,7 +146,14 @@ pub fn handle_img(
146
146
  visitor.visit_image(&node_ctx, &src, &alt, title.as_deref())
147
147
  };
148
148
  match visit_result {
149
- VisitResult::Continue => Some(format_image_markdown(&src, &alt, title.as_deref(), should_use_alt_text)),
149
+ VisitResult::Continue => Some(format_image_markdown(
150
+ &src,
151
+ &alt,
152
+ title.as_deref(),
153
+ should_use_alt_text,
154
+ options.link_style,
155
+ ctx.reference_collector.as_ref(),
156
+ )),
150
157
  VisitResult::Custom(custom) => Some(custom),
151
158
  VisitResult::Skip => None,
152
159
  VisitResult::Error(err) => {
@@ -158,11 +165,25 @@ pub fn handle_img(
158
165
  VisitResult::PreserveHtml => Some(serialize_node(node_handle, parser)),
159
166
  }
160
167
  } else {
161
- Some(format_image_markdown(&src, &alt, title.as_deref(), should_use_alt_text))
168
+ Some(format_image_markdown(
169
+ &src,
170
+ &alt,
171
+ title.as_deref(),
172
+ should_use_alt_text,
173
+ options.link_style,
174
+ ctx.reference_collector.as_ref(),
175
+ ))
162
176
  };
163
177
 
164
178
  #[cfg(not(feature = "visitor"))]
165
- let image_output = Some(format_image_markdown(&src, &alt, title.as_deref(), should_use_alt_text));
179
+ let image_output = Some(format_image_markdown(
180
+ &src,
181
+ &alt,
182
+ title.as_deref(),
183
+ should_use_alt_text,
184
+ options.link_style,
185
+ ctx.reference_collector.as_ref(),
186
+ ));
166
187
 
167
188
  // Only output image if skip_images is not enabled
168
189
  if !options.skip_images {
@@ -204,21 +225,39 @@ pub fn handle_img(
204
225
  ///
205
226
  /// If `use_alt_only` is true, returns just the alt text.
206
227
  /// Otherwise returns the full `![alt](src "title")` syntax.
207
- fn format_image_markdown(src: &str, alt: &str, title: Option<&str>, use_alt_only: bool) -> String {
228
+ fn format_image_markdown(
229
+ src: &str,
230
+ alt: &str,
231
+ title: Option<&str>,
232
+ use_alt_only: bool,
233
+ link_style: crate::options::validation::LinkStyle,
234
+ reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
235
+ ) -> String {
208
236
  if use_alt_only {
209
- alt.to_string()
210
- } else {
211
- let mut buf = String::with_capacity(src.len() + alt.len() + 10);
212
- buf.push_str("![");
213
- buf.push_str(alt);
214
- buf.push_str("](");
215
- buf.push_str(src);
216
- if let Some(title_text) = title {
217
- buf.push_str(" \"");
218
- buf.push_str(title_text);
219
- buf.push('"');
237
+ return alt.to_string();
238
+ }
239
+ if link_style == crate::options::validation::LinkStyle::Reference {
240
+ if let Some(collector) = reference_collector {
241
+ let ref_num = collector.borrow_mut().get_or_insert(src, title);
242
+ let mut buf = String::with_capacity(alt.len() + 10);
243
+ buf.push_str("![");
244
+ buf.push_str(alt);
245
+ buf.push_str("][");
246
+ buf.push_str(&ref_num.to_string());
247
+ buf.push(']');
248
+ return buf;
220
249
  }
221
- buf.push(')');
222
- buf
223
250
  }
251
+ let mut buf = String::with_capacity(src.len() + alt.len() + 10);
252
+ buf.push_str("![");
253
+ buf.push_str(alt);
254
+ buf.push_str("](");
255
+ buf.push_str(src);
256
+ if let Some(title_text) = title {
257
+ buf.push_str(" \"");
258
+ buf.push_str(title_text);
259
+ buf.push('"');
260
+ }
261
+ buf.push(')');
262
+ buf
224
263
  }
@@ -115,6 +115,7 @@ pub fn handle_link(
115
115
  title.as_deref(),
116
116
  raw_text.as_str(),
117
117
  options,
118
+ ctx.reference_collector.as_ref(),
118
119
  );
119
120
  push_heading(output, ctx, options, heading_level, link_buffer.as_str());
120
121
  return;
@@ -190,6 +191,13 @@ pub fn handle_link(
190
191
  label = href.clone();
191
192
  }
192
193
 
194
+ // Normalize Wikipedia-style back-reference links: <a href="#cite_ref-N">^</a>
195
+ // These produce `[^](#cite_ref-N)` which is confusing (looks like a footnote).
196
+ // Convert to `[↑](#cite_ref-N)` to avoid ambiguity with markdown footnote syntax.
197
+ if label == "^" && href.starts_with('#') {
198
+ label = "↑".to_string();
199
+ }
200
+
193
201
  let escaped_label = escape_link_label(&label);
194
202
 
195
203
  #[cfg(feature = "visitor")]
@@ -226,6 +234,7 @@ pub fn handle_link(
226
234
  title.as_deref(),
227
235
  label.as_str(),
228
236
  options,
237
+ ctx.reference_collector.as_ref(),
229
238
  );
230
239
  Some(buf)
231
240
  }
@@ -248,6 +257,7 @@ pub fn handle_link(
248
257
  title.as_deref(),
249
258
  label.as_str(),
250
259
  options,
260
+ ctx.reference_collector.as_ref(),
251
261
  );
252
262
  Some(buf)
253
263
  };
@@ -262,6 +272,7 @@ pub fn handle_link(
262
272
  title.as_deref(),
263
273
  label.as_str(),
264
274
  options,
275
+ ctx.reference_collector.as_ref(),
265
276
  );
266
277
  Some(buf)
267
278
  };
@@ -145,6 +145,7 @@ pub(crate) fn handle(
145
145
  title.as_deref(),
146
146
  raw_text.as_str(),
147
147
  options,
148
+ ctx.reference_collector.as_ref(),
148
149
  );
149
150
  push_heading(output, ctx, options, heading_level, link_buffer.as_str());
150
151
  return;
@@ -262,6 +263,7 @@ pub(crate) fn handle(
262
263
  title.as_deref(),
263
264
  label.as_str(),
264
265
  options,
266
+ ctx.reference_collector.as_ref(),
265
267
  );
266
268
  Some(buf)
267
269
  }
@@ -284,6 +286,7 @@ pub(crate) fn handle(
284
286
  title.as_deref(),
285
287
  label.as_str(),
286
288
  options,
289
+ ctx.reference_collector.as_ref(),
287
290
  );
288
291
  Some(buf)
289
292
  };
@@ -298,6 +301,7 @@ pub(crate) fn handle(
298
301
  title.as_deref(),
299
302
  label.as_str(),
300
303
  options,
304
+ ctx.reference_collector.as_ref(),
301
305
  );
302
306
  Some(buf)
303
307
  };
@@ -363,7 +367,20 @@ pub(crate) fn append_markdown_link(
363
367
  title: Option<&str>,
364
368
  raw_text: &str,
365
369
  options: &ConversionOptions,
370
+ reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
366
371
  ) {
372
+ if options.link_style == crate::options::validation::LinkStyle::Reference && !href.is_empty() {
373
+ if let Some(collector) = reference_collector {
374
+ let ref_num = collector.borrow_mut().get_or_insert(href, title);
375
+ output.push('[');
376
+ output.push_str(label);
377
+ output.push_str("][");
378
+ output.push_str(&ref_num.to_string());
379
+ output.push(']');
380
+ return;
381
+ }
382
+ }
383
+
367
384
  output.push('[');
368
385
  output.push_str(label);
369
386
  output.push_str("](");
@@ -196,6 +196,14 @@ pub(crate) fn convert_html_impl(
196
196
  }
197
197
  }
198
198
 
199
+ let reference_collector = if options.link_style == crate::options::LinkStyle::Reference {
200
+ Some(std::rc::Rc::new(std::cell::RefCell::new(
201
+ crate::converter::reference_collector::ReferenceCollector::new(),
202
+ )))
203
+ } else {
204
+ None
205
+ };
206
+
199
207
  #[cfg(all(feature = "metadata", feature = "visitor"))]
200
208
  let ctx = Context::new(
201
209
  options,
@@ -203,6 +211,7 @@ pub(crate) fn convert_html_impl(
203
211
  metadata_collector,
204
212
  visitor,
205
213
  structure_collector.as_ref().map(std::rc::Rc::clone),
214
+ reference_collector.as_ref().map(std::rc::Rc::clone),
206
215
  );
207
216
  #[cfg(all(feature = "metadata", not(feature = "visitor")))]
208
217
  let ctx = Context::new(
@@ -211,6 +220,7 @@ pub(crate) fn convert_html_impl(
211
220
  metadata_collector,
212
221
  _visitor,
213
222
  structure_collector.as_ref().map(std::rc::Rc::clone),
223
+ reference_collector.as_ref().map(std::rc::Rc::clone),
214
224
  );
215
225
  #[cfg(all(not(feature = "metadata"), feature = "visitor"))]
216
226
  let ctx = Context::new(
@@ -219,6 +229,7 @@ pub(crate) fn convert_html_impl(
219
229
  _metadata_collector,
220
230
  visitor,
221
231
  structure_collector.as_ref().map(std::rc::Rc::clone),
232
+ reference_collector.as_ref().map(std::rc::Rc::clone),
222
233
  );
223
234
  #[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
224
235
  let ctx = Context::new(
@@ -227,6 +238,7 @@ pub(crate) fn convert_html_impl(
227
238
  _metadata_collector,
228
239
  _visitor,
229
240
  structure_collector.as_ref().map(std::rc::Rc::clone),
241
+ reference_collector.as_ref().map(std::rc::Rc::clone),
230
242
  );
231
243
 
232
244
  for child_handle in dom.children() {
@@ -242,6 +254,19 @@ pub(crate) fn convert_html_impl(
242
254
  // reference to the same collector, and Rc::try_unwrap requires exactly one reference.
243
255
  drop(ctx);
244
256
 
257
+ // Append reference-style link definitions if any were collected
258
+ if let Some(rc) = reference_collector {
259
+ if let Ok(collector) = std::rc::Rc::try_unwrap(rc) {
260
+ let ref_section = collector.into_inner().finish();
261
+ if !ref_section.is_empty() {
262
+ let trimmed_len = output.trim_end_matches('\n').len();
263
+ output.truncate(trimmed_len);
264
+ output.push_str("\n\n");
265
+ output.push_str(&ref_section);
266
+ }
267
+ }
268
+ }
269
+
245
270
  // If plain text was requested, discard the markdown output and return plain text.
246
271
  // The full pipeline was still run above so that metadata + visitor callbacks fire.
247
272
  if is_plain_text {
@@ -78,11 +78,20 @@ pub(crate) fn handle_audio(
78
78
  };
79
79
 
80
80
  if should_output_media_link(&src) {
81
- output.push('[');
82
- output.push_str(&src);
83
- output.push_str("](");
84
- output.push_str(&src);
85
- output.push(')');
81
+ if let Some(ref collector) = ctx.reference_collector {
82
+ let ref_num = collector.borrow_mut().get_or_insert(&src, None);
83
+ output.push('[');
84
+ output.push_str(&src);
85
+ output.push_str("][");
86
+ output.push_str(&ref_num.to_string());
87
+ output.push(']');
88
+ } else {
89
+ output.push('[');
90
+ output.push_str(&src);
91
+ output.push_str("](");
92
+ output.push_str(&src);
93
+ output.push(')');
94
+ }
86
95
  if !ctx.in_paragraph && !ctx.convert_as_inline {
87
96
  output.push_str("\n\n");
88
97
  }
@@ -132,11 +141,20 @@ pub(crate) fn handle_video(
132
141
  };
133
142
 
134
143
  if should_output_media_link(&src) {
135
- output.push('[');
136
- output.push_str(&src);
137
- output.push_str("](");
138
- output.push_str(&src);
139
- output.push(')');
144
+ if let Some(ref collector) = ctx.reference_collector {
145
+ let ref_num = collector.borrow_mut().get_or_insert(&src, None);
146
+ output.push('[');
147
+ output.push_str(&src);
148
+ output.push_str("][");
149
+ output.push_str(&ref_num.to_string());
150
+ output.push(']');
151
+ } else {
152
+ output.push('[');
153
+ output.push_str(&src);
154
+ output.push_str("](");
155
+ output.push_str(&src);
156
+ output.push(')');
157
+ }
140
158
  if !ctx.in_paragraph && !ctx.convert_as_inline {
141
159
  output.push_str("\n\n");
142
160
  }
@@ -199,11 +217,20 @@ pub(crate) fn handle_iframe(tag: &HTMLTag, output: &mut String, ctx: &Context) {
199
217
  .map_or(Cow::Borrowed(""), |v| v.as_utf8_str());
200
218
 
201
219
  if !src.is_empty() {
202
- output.push('[');
203
- output.push_str(&src);
204
- output.push_str("](");
205
- output.push_str(&src);
206
- output.push(')');
220
+ if let Some(ref collector) = ctx.reference_collector {
221
+ let ref_num = collector.borrow_mut().get_or_insert(&src, None);
222
+ output.push('[');
223
+ output.push_str(&src);
224
+ output.push_str("][");
225
+ output.push_str(&ref_num.to_string());
226
+ output.push(']');
227
+ } else {
228
+ output.push('[');
229
+ output.push_str(&src);
230
+ output.push_str("](");
231
+ output.push_str(&src);
232
+ output.push(')');
233
+ }
207
234
  if !ctx.in_paragraph && !ctx.convert_as_inline {
208
235
  output.push_str("\n\n");
209
236
  }
@@ -103,6 +103,7 @@ pub mod media;
103
103
  mod metadata;
104
104
  pub mod plain_text;
105
105
  pub mod preprocessing_helpers;
106
+ pub mod reference_collector;
106
107
  pub mod semantic;
107
108
  pub mod text;
108
109
  mod text_node;
@@ -0,0 +1,69 @@
1
+ //! Collector for reference-style link definitions.
2
+
3
+ use std::cell::RefCell;
4
+ use std::collections::HashMap;
5
+ use std::rc::Rc;
6
+
7
+ /// Shared handle for passing the collector through the conversion context.
8
+ pub type ReferenceCollectorHandle = Rc<RefCell<ReferenceCollector>>;
9
+
10
+ #[derive(Debug, Clone, Hash, Eq, PartialEq)]
11
+ struct ReferenceKey {
12
+ url: String,
13
+ title: Option<String>,
14
+ }
15
+
16
+ /// Collects link/image references during conversion and produces a reference
17
+ /// definitions section at the end of the document.
18
+ #[derive(Debug, Default)]
19
+ pub struct ReferenceCollector {
20
+ map: HashMap<ReferenceKey, usize>,
21
+ entries: Vec<(usize, String, Option<String>)>,
22
+ }
23
+
24
+ impl ReferenceCollector {
25
+ /// Create a new, empty reference collector.
26
+ pub fn new() -> Self {
27
+ Self::default()
28
+ }
29
+
30
+ /// Register a URL (and optional title) and return its 1-based reference number.
31
+ ///
32
+ /// If the same URL+title pair was already registered, the existing number is returned.
33
+ pub fn get_or_insert(&mut self, url: &str, title: Option<&str>) -> usize {
34
+ let key = ReferenceKey {
35
+ url: url.to_string(),
36
+ title: title.map(String::from),
37
+ };
38
+ if let Some(&num) = self.map.get(&key) {
39
+ return num;
40
+ }
41
+ let num = self.entries.len() + 1;
42
+ self.map.insert(key, num);
43
+ self.entries.push((num, url.to_string(), title.map(String::from)));
44
+ num
45
+ }
46
+
47
+ /// Produce the reference definitions section.
48
+ ///
49
+ /// Returns an empty string when no references were collected.
50
+ pub fn finish(&self) -> String {
51
+ if self.entries.is_empty() {
52
+ return String::new();
53
+ }
54
+ let mut out = String::new();
55
+ for (num, url, title) in &self.entries {
56
+ out.push('[');
57
+ out.push_str(&num.to_string());
58
+ out.push_str("]: ");
59
+ out.push_str(url);
60
+ if let Some(t) = title {
61
+ out.push_str(" \"");
62
+ out.push_str(&t.replace('"', "\\\""));
63
+ out.push('"');
64
+ }
65
+ out.push('\n');
66
+ }
67
+ out
68
+ }
69
+ }
@@ -18,6 +18,7 @@ pub use crate::metadata::{
18
18
  };
19
19
 
20
20
  pub use crate::options::{
21
- CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
22
- NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
21
+ CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, LinkStyle,
22
+ ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset,
23
+ WhitespaceMode,
23
24
  };
@@ -4,7 +4,7 @@
4
4
 
5
5
  use crate::options::preprocessing::PreprocessingOptions;
6
6
  use crate::options::validation::{
7
- CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
7
+ CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
8
8
  };
9
9
 
10
10
  /// Main conversion options for HTML to Markdown conversion.
@@ -94,6 +94,8 @@ pub struct ConversionOptions {
94
94
  pub preserve_tags: Vec<String>,
95
95
  /// Skip conversion of `<img>` elements (omit images from output).
96
96
  pub skip_images: bool,
97
+ /// Link rendering style (inline or reference).
98
+ pub link_style: LinkStyle,
97
99
  /// Target output format (Markdown, plain text, etc.).
98
100
  pub output_format: OutputFormat,
99
101
  /// Include structured document tree in result.
@@ -142,6 +144,7 @@ impl Default for ConversionOptions {
142
144
  strip_tags: Vec::new(),
143
145
  preserve_tags: Vec::new(),
144
146
  skip_images: false,
147
+ link_style: LinkStyle::default(),
145
148
  output_format: OutputFormat::default(),
146
149
  include_document_structure: false,
147
150
  extract_images: false,
@@ -207,6 +210,7 @@ impl ConversionOptionsBuilder {
207
210
  builder_setter!(newline_style, NewlineStyle);
208
211
  builder_setter!(highlight_style, HighlightStyle);
209
212
  builder_setter_into!(code_language, String);
213
+ builder_setter!(link_style, LinkStyle);
210
214
  builder_setter!(autolinks, bool);
211
215
  builder_setter!(default_title, bool);
212
216
  builder_setter!(br_in_tables, bool);
@@ -356,6 +360,8 @@ pub struct ConversionOptionsUpdate {
356
360
  pub preserve_tags: Option<Vec<String>>,
357
361
  /// Optional override for [`ConversionOptions::skip_images`].
358
362
  pub skip_images: Option<bool>,
363
+ /// Optional override for [`ConversionOptions::link_style`].
364
+ pub link_style: Option<LinkStyle>,
359
365
  /// Optional override for [`ConversionOptions::output_format`].
360
366
  pub output_format: Option<OutputFormat>,
361
367
  /// Optional override for [`ConversionOptions::include_document_structure`].
@@ -410,6 +416,7 @@ impl ConversionOptions {
410
416
  apply!(strip_tags);
411
417
  apply!(preserve_tags);
412
418
  apply!(skip_images);
419
+ apply!(link_style);
413
420
  apply!(output_format);
414
421
  apply!(include_document_structure);
415
422
  apply!(extract_images);
@@ -13,7 +13,7 @@ pub mod validation;
13
13
  pub use conversion::{ConversionOptions, ConversionOptionsUpdate};
14
14
  pub use preprocessing::{PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset};
15
15
  pub use validation::{
16
- CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
16
+ CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
17
17
  };
18
18
 
19
19
  // Note: InlineImageConfig is re-exported from the inline_images module,
@@ -172,6 +172,33 @@ impl HighlightStyle {
172
172
  }
173
173
  }
174
174
 
175
+ /// Link rendering style in Markdown output.
176
+ ///
177
+ /// Controls whether links and images use inline `[text](url)` syntax or
178
+ /// reference-style `[text][1]` syntax with definitions collected at the end.
179
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
180
+ pub enum LinkStyle {
181
+ /// Inline links: `[text](url)`. Default.
182
+ #[default]
183
+ Inline,
184
+ /// Reference-style links: `[text][1]` with `[1]: url` at end of document.
185
+ Reference,
186
+ }
187
+
188
+ impl LinkStyle {
189
+ /// Parse a link style from a string.
190
+ ///
191
+ /// Accepts "reference" or defaults to Inline.
192
+ /// Input is normalized (lowercased, alphanumeric only).
193
+ #[must_use]
194
+ pub fn parse(value: &str) -> Self {
195
+ match normalize_token(value).as_str() {
196
+ "reference" => Self::Reference,
197
+ _ => Self::Inline,
198
+ }
199
+ }
200
+ }
201
+
175
202
  /// Output format for conversion.
176
203
  ///
177
204
  /// Specifies the target markup language format for the conversion output.
@@ -215,7 +242,8 @@ pub(crate) fn normalize_token(value: &str) -> String {
215
242
  #[cfg(any(feature = "serde", feature = "metadata"))]
216
243
  mod serde_impls {
217
244
  use super::{
218
- CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
245
+ CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat,
246
+ WhitespaceMode,
219
247
  };
220
248
  use serde::{Deserialize, Serialize, Serializer};
221
249
 
@@ -239,6 +267,7 @@ mod serde_impls {
239
267
  impl_deserialize_from_parse!(NewlineStyle, NewlineStyle::parse);
240
268
  impl_deserialize_from_parse!(CodeBlockStyle, CodeBlockStyle::parse);
241
269
  impl_deserialize_from_parse!(HighlightStyle, HighlightStyle::parse);
270
+ impl_deserialize_from_parse!(LinkStyle, LinkStyle::parse);
242
271
  impl_deserialize_from_parse!(OutputFormat, OutputFormat::parse);
243
272
 
244
273
  // Serialize implementations that convert enum variants to their string representations
@@ -324,6 +353,19 @@ mod serde_impls {
324
353
  }
325
354
  }
326
355
 
356
+ impl Serialize for LinkStyle {
357
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
358
+ where
359
+ S: Serializer,
360
+ {
361
+ let s = match self {
362
+ Self::Inline => "inline",
363
+ Self::Reference => "reference",
364
+ };
365
+ serializer.serialize_str(s)
366
+ }
367
+ }
368
+
327
369
  impl Serialize for OutputFormat {
328
370
  fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
329
371
  where
@@ -591,6 +591,30 @@ fn q_element_produces_quotes() {
591
591
  assert!(result.contains(r#""hello""#), "q element should add quotes: {result}");
592
592
  }
593
593
 
594
+ #[test]
595
+ fn test_wikipedia_back_reference_caret_normalized() {
596
+ // Wikipedia back-references use <a href="#cite_ref-N">^</a>
597
+ // The caret should be normalized to ↑ to avoid confusion with markdown footnote syntax
598
+ let html = r##"<p>Some text<sup><a href="#cite_ref-1">^</a></sup> more text</p>"##;
599
+ let result = convert(html, None).unwrap();
600
+ assert!(
601
+ result.contains("[↑](#cite_ref-1)"),
602
+ "Back-reference caret should be normalized to ↑: {result}"
603
+ );
604
+ assert!(
605
+ !result.contains("[^]"),
606
+ "Should not produce [^] which looks like footnote syntax: {result}"
607
+ );
608
+ }
609
+
610
+ #[test]
611
+ fn test_regular_caret_link_not_affected() {
612
+ // Regular links with ^ text but no # href should keep the ^
613
+ let html = r#"<a href="https://example.com">^</a>"#;
614
+ let result = convert(html, None).unwrap();
615
+ assert!(result.contains("[^]"), "Non-anchor caret links should keep ^: {result}");
616
+ }
617
+
594
618
  fn convert(
595
619
  html: &str,
596
620
  opts: Option<html_to_markdown_rs::ConversionOptions>,
@@ -0,0 +1,169 @@
1
+ #![allow(missing_docs)]
2
+
3
+ use html_to_markdown_rs::{ConversionOptions, LinkStyle};
4
+
5
+ fn convert(html: &str, options: Option<ConversionOptions>) -> String {
6
+ html_to_markdown_rs::convert(html, options)
7
+ .unwrap()
8
+ .content
9
+ .unwrap_or_default()
10
+ }
11
+
12
+ fn ref_options() -> ConversionOptions {
13
+ ConversionOptions {
14
+ link_style: LinkStyle::Reference,
15
+ ..Default::default()
16
+ }
17
+ }
18
+
19
+ #[test]
20
+ fn basic_reference_link() {
21
+ let html = r#"<a href="https://example.com">Click here</a>"#;
22
+ let result = convert(html, Some(ref_options()));
23
+ assert!(
24
+ result.contains("[Click here][1]"),
25
+ "Expected reference-style link, got: {result}"
26
+ );
27
+ assert!(
28
+ result.contains("[1]: https://example.com"),
29
+ "Expected reference definition, got: {result}"
30
+ );
31
+ }
32
+
33
+ #[test]
34
+ fn reference_link_with_title() {
35
+ let html = r#"<a href="https://example.com" title="Example">Click</a>"#;
36
+ let result = convert(html, Some(ref_options()));
37
+ assert!(
38
+ result.contains("[Click][1]"),
39
+ "Expected reference-style link, got: {result}"
40
+ );
41
+ assert!(
42
+ result.contains(r#"[1]: https://example.com "Example""#),
43
+ "Expected reference definition with title, got: {result}"
44
+ );
45
+ }
46
+
47
+ #[test]
48
+ fn url_deduplication() {
49
+ let html = r#"<a href="https://example.com">First</a> <a href="https://example.com">Second</a>"#;
50
+ let result = convert(html, Some(ref_options()));
51
+ assert!(
52
+ result.contains("[First][1]"),
53
+ "Expected first link with ref 1, got: {result}"
54
+ );
55
+ assert!(
56
+ result.contains("[Second][1]"),
57
+ "Expected second link reusing ref 1, got: {result}"
58
+ );
59
+ // Should only have one definition
60
+ let count = result.matches("[1]: https://example.com").count();
61
+ assert_eq!(count, 1, "Expected exactly one definition, got: {result}");
62
+ }
63
+
64
+ #[test]
65
+ fn different_titles_different_refs() {
66
+ let html =
67
+ r#"<a href="https://example.com" title="A">First</a> <a href="https://example.com" title="B">Second</a>"#;
68
+ let result = convert(html, Some(ref_options()));
69
+ assert!(
70
+ result.contains("[First][1]"),
71
+ "Expected first link ref 1, got: {result}"
72
+ );
73
+ assert!(
74
+ result.contains("[Second][2]"),
75
+ "Expected second link ref 2 (different title), got: {result}"
76
+ );
77
+ }
78
+
79
+ #[test]
80
+ fn image_reference_style() {
81
+ let html = r#"<img src="https://example.com/img.png" alt="A photo">"#;
82
+ let result = convert(html, Some(ref_options()));
83
+ assert!(
84
+ result.contains("![A photo][1]"),
85
+ "Expected reference-style image, got: {result}"
86
+ );
87
+ assert!(
88
+ result.contains("[1]: https://example.com/img.png"),
89
+ "Expected image reference definition, got: {result}"
90
+ );
91
+ }
92
+
93
+ #[test]
94
+ fn mixed_links_and_images_share_numbering() {
95
+ let html = r#"<a href="https://a.com">Link</a><img src="https://b.com/img.png" alt="Img">"#;
96
+ let result = convert(html, Some(ref_options()));
97
+ assert!(result.contains("[Link][1]"), "Expected link as ref 1, got: {result}");
98
+ assert!(result.contains("![Img][2]"), "Expected image as ref 2, got: {result}");
99
+ }
100
+
101
+ #[test]
102
+ fn autolinks_unaffected() {
103
+ let html = r#"<a href="https://example.com">https://example.com</a>"#;
104
+ let options = ConversionOptions {
105
+ link_style: LinkStyle::Reference,
106
+ autolinks: true,
107
+ ..Default::default()
108
+ };
109
+ let result = convert(html, Some(options));
110
+ // Autolinks should still render as <url>
111
+ assert!(
112
+ result.contains("<https://example.com>"),
113
+ "Autolinks should not be affected by reference style, got: {result}"
114
+ );
115
+ }
116
+
117
+ #[test]
118
+ fn default_inline_unchanged() {
119
+ let html = r#"<a href="https://example.com">Click</a>"#;
120
+ let result = convert(html, None);
121
+ assert!(
122
+ result.contains("[Click](https://example.com)"),
123
+ "Default should use inline style, got: {result}"
124
+ );
125
+ }
126
+
127
+ #[test]
128
+ fn multiple_paragraphs_references_at_end() {
129
+ let html = r#"<p><a href="https://a.com">A</a></p><p><a href="https://b.com">B</a></p>"#;
130
+ let result = convert(html, Some(ref_options()));
131
+ // References should be at the very end
132
+ let ref_section_start = result.find("[1]:").expect("Should have ref section");
133
+ let content_end = result.find("[A][1]").expect("Should have inline ref");
134
+ assert!(
135
+ ref_section_start > content_end,
136
+ "Reference section should be after content"
137
+ );
138
+ }
139
+
140
+ #[test]
141
+ fn empty_href_no_reference() {
142
+ let html = r#"<a href="">Empty</a>"#;
143
+ let result = convert(html, Some(ref_options()));
144
+ // Empty href should not create a reference
145
+ assert!(
146
+ !result.contains("[1]:"),
147
+ "Empty href should not create reference, got: {result}"
148
+ );
149
+ }
150
+
151
+ #[test]
152
+ fn title_with_quotes_escaped() {
153
+ let html = r#"<a href="https://example.com" title='Say "hello"'>Link</a>"#;
154
+ let result = convert(html, Some(ref_options()));
155
+ assert!(
156
+ result.contains(r#"[1]: https://example.com "Say \"hello\"""#),
157
+ "Quotes in title should be escaped, got: {result}"
158
+ );
159
+ }
160
+
161
+ #[test]
162
+ fn media_elements_reference_style() {
163
+ let html = r#"<video src="https://example.com/video.mp4"></video>"#;
164
+ let result = convert(html, Some(ref_options()));
165
+ assert!(
166
+ result.contains("[1]: https://example.com/video.mp4"),
167
+ "Video should use reference style, got: {result}"
168
+ );
169
+ }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-to-markdown
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.2
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
@@ -142,6 +142,7 @@ files:
142
142
  - vendor/html-to-markdown-rs/src/converter/mod.rs
143
143
  - vendor/html-to-markdown-rs/src/converter/plain_text.rs
144
144
  - vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs
145
+ - vendor/html-to-markdown-rs/src/converter/reference_collector.rs
145
146
  - vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs
146
147
  - vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs
147
148
  - vendor/html-to-markdown-rs/src/converter/semantic/figure.rs
@@ -224,6 +225,7 @@ files:
224
225
  - vendor/html-to-markdown-rs/tests/lists_test.rs
225
226
  - vendor/html-to-markdown-rs/tests/plain_output_test.rs
226
227
  - vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
228
+ - vendor/html-to-markdown-rs/tests/reference_links_test.rs
227
229
  - vendor/html-to-markdown-rs/tests/skip_images_test.rs
228
230
  - vendor/html-to-markdown-rs/tests/tables_test.rs
229
231
  - vendor/html-to-markdown-rs/tests/test_custom_elements.rs