html-to-markdown 3.0.2 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +42 -12
  3. data/Gemfile +1 -0
  4. data/Gemfile.lock +27 -55
  5. data/README.md +9 -10
  6. data/Rakefile +4 -10
  7. data/ext/html-to-markdown_rb/Cargo.toml +14 -0
  8. data/ext/html_to_markdown_rb/Cargo.toml +16 -0
  9. data/ext/html_to_markdown_rb/extconf.rb +10 -0
  10. data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +6 -0
  11. data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +9 -0
  12. data/ext/html_to_markdown_rb/src/lib.rs +3941 -0
  13. data/html-to-markdown-rb.gemspec +1 -1
  14. data/lib/html_to_markdown/version.rb +1 -1
  15. data/lib/html_to_markdown.rb +31 -21
  16. data/{ext/html-to-markdown-rb/native/extconf.rb → lib/html_to_markdown_rs.rb} +1 -1
  17. data/sig/html_to_markdown.rbs +17 -5
  18. data/vendor/Cargo.toml +4 -4
  19. data/vendor/html-to-markdown-rs/Cargo.toml +2 -2
  20. data/vendor/html-to-markdown-rs/examples/test_deser.rs +12 -0
  21. data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +1 -1
  22. data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +1 -1
  23. data/vendor/html-to-markdown-rs/src/converter/context.rs +5 -0
  24. data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
  25. data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +38 -14
  26. data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +56 -17
  27. data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +11 -0
  28. data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +17 -0
  29. data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +1 -1
  30. data/vendor/html-to-markdown-rs/src/converter/list/item.rs +10 -2
  31. data/vendor/html-to-markdown-rs/src/converter/main.rs +25 -0
  32. data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +42 -15
  33. data/vendor/html-to-markdown-rs/src/converter/mod.rs +3 -2
  34. data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +69 -0
  35. data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
  36. data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +1 -1
  37. data/vendor/html-to-markdown-rs/src/exports.rs +3 -2
  38. data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
  39. data/vendor/html-to-markdown-rs/src/lib.rs +1 -2
  40. data/vendor/html-to-markdown-rs/src/metadata/config.rs +1 -1
  41. data/vendor/html-to-markdown-rs/src/metadata/mod.rs +5 -5
  42. data/vendor/html-to-markdown-rs/src/options/conversion.rs +14 -13
  43. data/vendor/html-to-markdown-rs/src/options/mod.rs +2 -2
  44. data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +3 -9
  45. data/vendor/html-to-markdown-rs/src/options/validation.rs +46 -4
  46. data/vendor/html-to-markdown-rs/src/types/document.rs +11 -0
  47. data/vendor/html-to-markdown-rs/src/types/result.rs +5 -2
  48. data/vendor/html-to-markdown-rs/src/types/tables.rs +1 -1
  49. data/vendor/html-to-markdown-rs/src/visitor/mod.rs +1 -1
  50. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +1 -1
  51. data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +1 -1
  52. data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +8 -8
  53. data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +6 -0
  54. data/vendor/html-to-markdown-rs/tests/integration_test.rs +27 -3
  55. data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -2
  56. data/vendor/html-to-markdown-rs/tests/lists_test.rs +4 -4
  57. data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +169 -0
  58. metadata +13 -18
  59. data/ext/html-to-markdown-rb/extconf.rb +0 -41
  60. data/ext/html-to-markdown-rb/native/Cargo.lock +0 -934
  61. data/ext/html-to-markdown-rb/native/Cargo.toml +0 -48
  62. data/ext/html-to-markdown-rb/native/README.md +0 -215
  63. data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +0 -54
  64. data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +0 -158
  65. data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -11
  66. data/ext/html-to-markdown-rb/native/src/lib.rs +0 -128
  67. data/ext/html-to-markdown-rb/native/src/options.rs +0 -238
  68. data/ext/html-to-markdown-rb/native/src/types.rs +0 -24
  69. data/lib/html_to_markdown/cli.rb +0 -21
  70. data/lib/html_to_markdown/cli_proxy.rb +0 -74
  71. data/spec/cli_proxy_spec.rb +0 -42
  72. data/spec/spec_helper.rb +0 -10
@@ -87,7 +87,7 @@ Gem::Specification.new do |spec|
87
87
  spec.files = files
88
88
  spec.extra_rdoc_files = ['README.md']
89
89
 
90
- spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
90
+ spec.extensions = ['ext/html_to_markdown_rb/extconf.rb']
91
91
 
92
92
  spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
93
93
  spec.metadata['rubygems_mfa_required'] = 'true'
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module HtmlToMarkdown
4
- VERSION = '3.0.2'
4
+ VERSION = '3.2.0'
5
5
  end
@@ -3,28 +3,38 @@
3
3
  require_relative 'html_to_markdown/version'
4
4
  require 'html_to_markdown_rb'
5
5
 
6
+ # High-performance HTML to Markdown conversion.
7
+ #
8
+ # @example Simple conversion
9
+ # HtmlToMarkdown.convert('<h1>Hello</h1>') # => "# Hello\n\n"
10
+ #
11
+ # @example With options
12
+ # HtmlToMarkdown.convert('<h1>Hello</h1>', heading_style: 'atx')
6
13
  module HtmlToMarkdown
7
- autoload :CLI, 'html_to_markdown/cli'
8
- autoload :CLIProxy, 'html_to_markdown/cli_proxy'
9
-
10
- class << self
11
- alias native_convert convert
12
- end
13
-
14
- module_function
15
-
16
- # Convert HTML to Markdown, returning a Hash with:
17
- # - :content [String, nil] the converted Markdown output
18
- # - :document [nil] document structure (not yet exposed)
19
- # - :metadata [Hash, nil] extracted HTML metadata
20
- # - :tables [Array<Hash>] extracted tables with :grid and :markdown
21
- # - :images [Array<Hash>] extracted inline images
22
- # - :warnings [Array<Hash>] processing warnings
14
+ # Convert HTML to Markdown.
23
15
  #
24
- # @param html [String] HTML string to convert
25
- # @param options [Hash, nil] optional conversion options
26
- # @return [Hash] conversion result
27
- def convert(html, options = nil)
28
- native_convert(html.to_s, options)
16
+ # @param html [String] The HTML content to convert.
17
+ # @param options [Hash] Optional conversion options.
18
+ # Supported keys (all optional):
19
+ # - :heading_style - 'atx', 'atx_closed', 'setext', 'underlined'
20
+ # - :code_block_style - 'backticks', 'tildes', 'indented'
21
+ # - :escape_asterisks - Boolean
22
+ # - :escape_underscores - Boolean
23
+ # - :escape_misc - Boolean
24
+ # - :escape_ascii - Boolean
25
+ # - :strip_newlines - Boolean
26
+ # - :keep_inline_images_in - Array of tag names
27
+ # - :strip_tags - Array of tag names to strip
28
+ # - :preserve_tags - Array of tag names to preserve verbatim
29
+ # (and more, matching ConversionOptions fields)
30
+ # @return [String] The converted Markdown content.
31
+ def self.convert(html, options = {})
32
+ opts = if options.nil? || options.empty?
33
+ nil
34
+ else
35
+ HtmlToMarkdownRs::ConversionOptions.new(options)
36
+ end
37
+ result = HtmlToMarkdownRs.convert(html, opts)
38
+ result.content || ''
29
39
  end
30
40
  end
@@ -1,3 +1,3 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative '../extconf'
3
+ require 'html_to_markdown_rb'
@@ -1,3 +1,16 @@
1
+ # Native extension module (Magnus/rb-sys)
2
+ module HtmlToMarkdownRs
3
+ class ConversionOptions
4
+ def initialize: (Hash[Symbol, untyped]) -> void
5
+ end
6
+
7
+ class ConversionResult
8
+ def content: () -> String?
9
+ end
10
+
11
+ def self.convert: (String html, ConversionOptions? options) -> ConversionResult
12
+ end
13
+
1
14
  # Type definitions for HtmlToMarkdown Ruby gem
2
15
  module HtmlToMarkdown
3
16
  VERSION: String
@@ -8,6 +21,7 @@ module HtmlToMarkdown
8
21
  type whitespace_mode = :normalized | :strict
9
22
  type newline_style = :spaces | :backslash
10
23
  type code_block_style = :indented | :backticks | :tildes
24
+ type link_style = :inline | :reference
11
25
  type output_format = :markdown | :djot
12
26
  type preprocessing_preset = :minimal | :standard | :aggressive
13
27
 
@@ -49,6 +63,7 @@ module HtmlToMarkdown
49
63
  debug?: bool,
50
64
  strip_tags?: Array[String],
51
65
  preserve_tags?: Array[String],
66
+ link_style?: link_style,
52
67
  output_format?: output_format,
53
68
  skip_images?: bool,
54
69
  include_document_structure?: bool,
@@ -126,12 +141,9 @@ module HtmlToMarkdown
126
141
 
127
142
  public
128
143
 
129
- # Convert HTML to Markdown, returning a Hash with content, metadata, tables, images, and warnings.
144
+ # Convert HTML to Markdown, returning the markdown content string.
130
145
  #
131
146
  # Example:
132
147
  # result = HtmlToMarkdown.convert(html)
133
- def self.convert: (String html, ?conversion_options options) -> Hash[String, untyped]
134
-
135
- # Instance method version (created by module_function)
136
- def convert: (String html, ?conversion_options options) -> Hash[String, untyped]
148
+ def self.convert: (String html, ?conversion_options options) -> String
137
149
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "3.0.2"
6
+ version = "3.2.0"
7
7
  edition = "2024"
8
8
  rust-version = "1.85"
9
9
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -18,11 +18,11 @@ clap = { version = "4.6", features = ["derive"] }
18
18
  clap_complete = "4.6"
19
19
  clap_mangen = "0.3"
20
20
  encoding_rs = "0.8"
21
- ext-php-rs = "0.15.8"
21
+ ext-php-rs = "0.15.10"
22
22
  html5ever = "0.39.0"
23
23
  once_cell = "1.21"
24
- pyo3 = { version = "0.28.2", features = ["abi3-py310"] }
25
- rayon = "1.11"
24
+ pyo3 = { version = "0.28.3", features = ["abi3-py310"] }
25
+ rayon = "1.12"
26
26
  regex = "1.12"
27
27
  serde = { version = "1.0", features = ["derive"] }
28
28
  serde_json = "1.0"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "html-to-markdown-rs"
3
- version = "3.0.2"
3
+ version = "3.2.0"
4
4
  edition = "2024"
5
5
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
6
6
  license = "MIT"
@@ -39,7 +39,7 @@ image = { version = "0.25", default-features = false, features = [
39
39
  "bmp",
40
40
  "webp",
41
41
  ], optional = true }
42
- lru = "0.16"
42
+ lru = "0.17"
43
43
  memchr = "2"
44
44
  once_cell = "1.21"
45
45
  regex = "1.12"
@@ -0,0 +1,12 @@
1
+ #![allow(missing_docs)]
2
+ use html_to_markdown_rs::ConversionOptions;
3
+
4
+ fn main() {
5
+ let json = r#"{"headingStyle":"","listIndentType":"","listIndentWidth":2,"bullets":"-*+","strongEmSymbol":"*","escapeAsterisks":false,"escapeUnderscores":false,"escapeMisc":false,"escapeAscii":false,"codeLanguage":"","autolinks":true,"defaultTitle":false,"brInTables":false,"highlightStyle":"","extractMetadata":true,"whitespaceMode":"","stripNewlines":false,"wrap":false,"wrapWidth":80,"convertAsInline":false,"subSymbol":"","supSymbol":"","newlineStyle":"spaces","codeBlockStyle":"tildes","keepInlineImagesIn":null,"preprocessing":{"enabled":false,"preset":"","removeNavigation":false,"removeForms":false},"encoding":"utf-8","debug":false,"stripTags":null,"preserveTags":null,"skipImages":false,"linkStyle":"","outputFormat":"","includeDocumentStructure":false,"extractImages":false,"maxImageSize":5242880,"captureSvg":false,"inferDimensions":true}"#;
6
+
7
+ let opts: ConversionOptions = serde_json::from_str(json).unwrap();
8
+ println!("code_block_style: {:?}", opts.code_block_style);
9
+
10
+ let result = html_to_markdown_rs::convert("<pre><code>some code</code></pre>", Some(opts)).unwrap();
11
+ println!("result: {:?}", result.content);
12
+ }
@@ -40,7 +40,7 @@ pub use super::{Context, DomContext};
40
40
  /// element was handled, `false` otherwise.
41
41
  ///
42
42
  /// # Usage in converter.rs
43
- /// ```ignore
43
+ /// ```text
44
44
  /// if crate::converter::block::dispatch_block_handler(
45
45
  /// &tag_name,
46
46
  /// node_handle,
@@ -29,7 +29,7 @@ pub(crate) use caption::handle_caption;
29
29
  /// Dispatches table element handling to the main convert_table function.
30
30
  ///
31
31
  /// # Usage in converter.rs
32
- /// ```ignore
32
+ /// ```text
33
33
  /// if "table" == tag_name {
34
34
  /// crate::converter::block::table::handle_table(
35
35
  /// node_handle,
@@ -12,6 +12,7 @@ use std::rc::Rc;
12
12
  #[cfg(feature = "inline-images")]
13
13
  use crate::inline_images::InlineImageCollector;
14
14
 
15
+ use crate::converter::reference_collector::ReferenceCollectorHandle;
15
16
  use crate::types::structure_collector::StructureCollectorHandle;
16
17
 
17
18
  /// Handle type for inline image collector when feature is enabled.
@@ -105,6 +106,8 @@ pub struct Context {
105
106
  ///
106
107
  /// Populated when `options.include_document_structure == true`.
107
108
  pub(crate) structure_collector: Option<StructureCollectorHandle>,
109
+ /// Optional reference collector for reference-style links.
110
+ pub(crate) reference_collector: Option<ReferenceCollectorHandle>,
108
111
  }
109
112
 
110
113
  impl Context {
@@ -122,6 +125,7 @@ impl Context {
122
125
  #[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
123
126
  #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
124
127
  structure_collector: Option<StructureCollectorHandle>,
128
+ reference_collector: Option<ReferenceCollectorHandle>,
125
129
  ) -> Self {
126
130
  #[cfg(feature = "metadata")]
127
131
  let (
@@ -186,6 +190,7 @@ impl Context {
186
190
  #[cfg(feature = "visitor")]
187
191
  visitor_error: Rc::new(RefCell::new(None)),
188
192
  structure_collector,
193
+ reference_collector,
189
194
  }
190
195
  }
191
196
  }
@@ -53,7 +53,7 @@ pub use elements::handle as handle_form_elements;
53
53
  ///
54
54
  /// # Example
55
55
  ///
56
- /// ```ignore
56
+ /// ```text
57
57
  /// if dispatch_form_handler(tag_name, &node_handle, &parser, output, options, ctx, depth, dom_ctx) {
58
58
  /// // Tag was handled
59
59
  /// } else {
@@ -128,6 +128,8 @@ pub fn handle_graphic(
128
128
  &alt,
129
129
  title.as_deref(),
130
130
  should_use_alt_text,
131
+ options.link_style,
132
+ ctx.reference_collector.as_ref(),
131
133
  )),
132
134
  VisitResult::Custom(custom) => Some(custom),
133
135
  VisitResult::Skip => None,
@@ -145,6 +147,8 @@ pub fn handle_graphic(
145
147
  &alt,
146
148
  title.as_deref(),
147
149
  should_use_alt_text,
150
+ options.link_style,
151
+ ctx.reference_collector.as_ref(),
148
152
  ))
149
153
  };
150
154
 
@@ -154,6 +158,8 @@ pub fn handle_graphic(
154
158
  &alt,
155
159
  title.as_deref(),
156
160
  should_use_alt_text,
161
+ options.link_style,
162
+ ctx.reference_collector.as_ref(),
157
163
  ));
158
164
 
159
165
  if !options.skip_images {
@@ -189,21 +195,39 @@ pub fn handle_graphic(
189
195
  ///
190
196
  /// If `use_alt_only` is true, returns just the alt text.
191
197
  /// Otherwise returns the full `![alt](src "title")` syntax.
192
- fn format_graphic_markdown(src: &str, alt: &str, title: Option<&str>, use_alt_only: bool) -> String {
198
+ fn format_graphic_markdown(
199
+ src: &str,
200
+ alt: &str,
201
+ title: Option<&str>,
202
+ use_alt_only: bool,
203
+ link_style: crate::options::validation::LinkStyle,
204
+ reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
205
+ ) -> String {
193
206
  if use_alt_only {
194
- alt.to_string()
195
- } else {
196
- let mut buf = String::with_capacity(src.len() + alt.len() + 10);
197
- buf.push_str("![");
198
- buf.push_str(alt);
199
- buf.push_str("](");
200
- buf.push_str(src);
201
- if let Some(title_text) = title {
202
- buf.push_str(" \"");
203
- buf.push_str(title_text);
204
- buf.push('"');
207
+ return alt.to_string();
208
+ }
209
+ if link_style == crate::options::validation::LinkStyle::Reference {
210
+ if let Some(collector) = reference_collector {
211
+ let ref_num = collector.borrow_mut().get_or_insert(src, title);
212
+ let mut buf = String::with_capacity(alt.len() + 10);
213
+ buf.push_str("![");
214
+ buf.push_str(alt);
215
+ buf.push_str("][");
216
+ buf.push_str(&ref_num.to_string());
217
+ buf.push(']');
218
+ return buf;
205
219
  }
206
- buf.push(')');
207
- buf
208
220
  }
221
+ let mut buf = String::with_capacity(src.len() + alt.len() + 10);
222
+ buf.push_str("![");
223
+ buf.push_str(alt);
224
+ buf.push_str("](");
225
+ buf.push_str(src);
226
+ if let Some(title_text) = title {
227
+ buf.push_str(" \"");
228
+ buf.push_str(title_text);
229
+ buf.push('"');
230
+ }
231
+ buf.push(')');
232
+ buf
209
233
  }
@@ -146,7 +146,14 @@ pub fn handle_img(
146
146
  visitor.visit_image(&node_ctx, &src, &alt, title.as_deref())
147
147
  };
148
148
  match visit_result {
149
- VisitResult::Continue => Some(format_image_markdown(&src, &alt, title.as_deref(), should_use_alt_text)),
149
+ VisitResult::Continue => Some(format_image_markdown(
150
+ &src,
151
+ &alt,
152
+ title.as_deref(),
153
+ should_use_alt_text,
154
+ options.link_style,
155
+ ctx.reference_collector.as_ref(),
156
+ )),
150
157
  VisitResult::Custom(custom) => Some(custom),
151
158
  VisitResult::Skip => None,
152
159
  VisitResult::Error(err) => {
@@ -158,11 +165,25 @@ pub fn handle_img(
158
165
  VisitResult::PreserveHtml => Some(serialize_node(node_handle, parser)),
159
166
  }
160
167
  } else {
161
- Some(format_image_markdown(&src, &alt, title.as_deref(), should_use_alt_text))
168
+ Some(format_image_markdown(
169
+ &src,
170
+ &alt,
171
+ title.as_deref(),
172
+ should_use_alt_text,
173
+ options.link_style,
174
+ ctx.reference_collector.as_ref(),
175
+ ))
162
176
  };
163
177
 
164
178
  #[cfg(not(feature = "visitor"))]
165
- let image_output = Some(format_image_markdown(&src, &alt, title.as_deref(), should_use_alt_text));
179
+ let image_output = Some(format_image_markdown(
180
+ &src,
181
+ &alt,
182
+ title.as_deref(),
183
+ should_use_alt_text,
184
+ options.link_style,
185
+ ctx.reference_collector.as_ref(),
186
+ ));
166
187
 
167
188
  // Only output image if skip_images is not enabled
168
189
  if !options.skip_images {
@@ -204,21 +225,39 @@ pub fn handle_img(
204
225
  ///
205
226
  /// If `use_alt_only` is true, returns just the alt text.
206
227
  /// Otherwise returns the full `![alt](src "title")` syntax.
207
- fn format_image_markdown(src: &str, alt: &str, title: Option<&str>, use_alt_only: bool) -> String {
228
+ fn format_image_markdown(
229
+ src: &str,
230
+ alt: &str,
231
+ title: Option<&str>,
232
+ use_alt_only: bool,
233
+ link_style: crate::options::validation::LinkStyle,
234
+ reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
235
+ ) -> String {
208
236
  if use_alt_only {
209
- alt.to_string()
210
- } else {
211
- let mut buf = String::with_capacity(src.len() + alt.len() + 10);
212
- buf.push_str("![");
213
- buf.push_str(alt);
214
- buf.push_str("](");
215
- buf.push_str(src);
216
- if let Some(title_text) = title {
217
- buf.push_str(" \"");
218
- buf.push_str(title_text);
219
- buf.push('"');
237
+ return alt.to_string();
238
+ }
239
+ if link_style == crate::options::validation::LinkStyle::Reference {
240
+ if let Some(collector) = reference_collector {
241
+ let ref_num = collector.borrow_mut().get_or_insert(src, title);
242
+ let mut buf = String::with_capacity(alt.len() + 10);
243
+ buf.push_str("![");
244
+ buf.push_str(alt);
245
+ buf.push_str("][");
246
+ buf.push_str(&ref_num.to_string());
247
+ buf.push(']');
248
+ return buf;
220
249
  }
221
- buf.push(')');
222
- buf
223
250
  }
251
+ let mut buf = String::with_capacity(src.len() + alt.len() + 10);
252
+ buf.push_str("![");
253
+ buf.push_str(alt);
254
+ buf.push_str("](");
255
+ buf.push_str(src);
256
+ if let Some(title_text) = title {
257
+ buf.push_str(" \"");
258
+ buf.push_str(title_text);
259
+ buf.push('"');
260
+ }
261
+ buf.push(')');
262
+ buf
224
263
  }
@@ -115,6 +115,7 @@ pub fn handle_link(
115
115
  title.as_deref(),
116
116
  raw_text.as_str(),
117
117
  options,
118
+ ctx.reference_collector.as_ref(),
118
119
  );
119
120
  push_heading(output, ctx, options, heading_level, link_buffer.as_str());
120
121
  return;
@@ -190,6 +191,13 @@ pub fn handle_link(
190
191
  label = href.clone();
191
192
  }
192
193
 
194
+ // Normalize Wikipedia-style back-reference links: <a href="#cite_ref-N">^</a>
195
+ // These produce `[^](#cite_ref-N)` which is confusing (looks like a footnote).
196
+ // Convert to `[↑](#cite_ref-N)` to avoid ambiguity with markdown footnote syntax.
197
+ if label == "^" && href.starts_with('#') {
198
+ label = "↑".to_string();
199
+ }
200
+
193
201
  let escaped_label = escape_link_label(&label);
194
202
 
195
203
  #[cfg(feature = "visitor")]
@@ -226,6 +234,7 @@ pub fn handle_link(
226
234
  title.as_deref(),
227
235
  label.as_str(),
228
236
  options,
237
+ ctx.reference_collector.as_ref(),
229
238
  );
230
239
  Some(buf)
231
240
  }
@@ -248,6 +257,7 @@ pub fn handle_link(
248
257
  title.as_deref(),
249
258
  label.as_str(),
250
259
  options,
260
+ ctx.reference_collector.as_ref(),
251
261
  );
252
262
  Some(buf)
253
263
  };
@@ -262,6 +272,7 @@ pub fn handle_link(
262
272
  title.as_deref(),
263
273
  label.as_str(),
264
274
  options,
275
+ ctx.reference_collector.as_ref(),
265
276
  );
266
277
  Some(buf)
267
278
  };
@@ -145,6 +145,7 @@ pub(crate) fn handle(
145
145
  title.as_deref(),
146
146
  raw_text.as_str(),
147
147
  options,
148
+ ctx.reference_collector.as_ref(),
148
149
  );
149
150
  push_heading(output, ctx, options, heading_level, link_buffer.as_str());
150
151
  return;
@@ -262,6 +263,7 @@ pub(crate) fn handle(
262
263
  title.as_deref(),
263
264
  label.as_str(),
264
265
  options,
266
+ ctx.reference_collector.as_ref(),
265
267
  );
266
268
  Some(buf)
267
269
  }
@@ -284,6 +286,7 @@ pub(crate) fn handle(
284
286
  title.as_deref(),
285
287
  label.as_str(),
286
288
  options,
289
+ ctx.reference_collector.as_ref(),
287
290
  );
288
291
  Some(buf)
289
292
  };
@@ -298,6 +301,7 @@ pub(crate) fn handle(
298
301
  title.as_deref(),
299
302
  label.as_str(),
300
303
  options,
304
+ ctx.reference_collector.as_ref(),
301
305
  );
302
306
  Some(buf)
303
307
  };
@@ -363,7 +367,20 @@ pub(crate) fn append_markdown_link(
363
367
  title: Option<&str>,
364
368
  raw_text: &str,
365
369
  options: &ConversionOptions,
370
+ reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
366
371
  ) {
372
+ if options.link_style == crate::options::validation::LinkStyle::Reference && !href.is_empty() {
373
+ if let Some(collector) = reference_collector {
374
+ let ref_num = collector.borrow_mut().get_or_insert(href, title);
375
+ output.push('[');
376
+ output.push_str(label);
377
+ output.push_str("][");
378
+ output.push_str(&ref_num.to_string());
379
+ output.push(']');
380
+ return;
381
+ }
382
+ }
383
+
367
384
  output.push('[');
368
385
  output.push_str(label);
369
386
  output.push_str("](");
@@ -64,7 +64,7 @@ pub use super::{Context, DomContext};
64
64
  ///
65
65
  /// # Usage in converter.rs
66
66
  ///
67
- /// ```ignore
67
+ /// ```text
68
68
  /// if crate::converter::inline::dispatch_inline_handler(
69
69
  /// &tag_name,
70
70
  /// &node_handle,
@@ -198,7 +198,11 @@ pub(crate) fn handle_li(
198
198
  } else {
199
199
  let bullets: Vec<char> = options.bullets.chars().collect();
200
200
  let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
201
- let bullet = bullets.get(bullet_index % bullets.len()).copied().unwrap_or('*');
201
+ let bullet = if bullets.is_empty() {
202
+ '*'
203
+ } else {
204
+ bullets[bullet_index % bullets.len()]
205
+ };
202
206
  output.push(bullet);
203
207
  output.push(' ');
204
208
  }
@@ -265,7 +269,11 @@ pub(crate) fn handle_li(
265
269
  } else {
266
270
  let bullets: Vec<char> = options.bullets.chars().collect();
267
271
  let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
268
- let bullet = bullets.get(bullet_index % bullets.len()).copied().unwrap_or('*');
272
+ let bullet = if bullets.is_empty() {
273
+ '*'
274
+ } else {
275
+ bullets[bullet_index % bullets.len()]
276
+ };
269
277
  let bullet_str = bullet.to_string();
270
278
  let text_start = last_line.find(bullet).map_or(0, |pos| pos + 1);
271
279
  (bullet_str, last_line[text_start..].trim().to_string())
@@ -196,6 +196,14 @@ pub(crate) fn convert_html_impl(
196
196
  }
197
197
  }
198
198
 
199
+ let reference_collector = if options.link_style == crate::options::LinkStyle::Reference {
200
+ Some(std::rc::Rc::new(std::cell::RefCell::new(
201
+ crate::converter::reference_collector::ReferenceCollector::new(),
202
+ )))
203
+ } else {
204
+ None
205
+ };
206
+
199
207
  #[cfg(all(feature = "metadata", feature = "visitor"))]
200
208
  let ctx = Context::new(
201
209
  options,
@@ -203,6 +211,7 @@ pub(crate) fn convert_html_impl(
203
211
  metadata_collector,
204
212
  visitor,
205
213
  structure_collector.as_ref().map(std::rc::Rc::clone),
214
+ reference_collector.as_ref().map(std::rc::Rc::clone),
206
215
  );
207
216
  #[cfg(all(feature = "metadata", not(feature = "visitor")))]
208
217
  let ctx = Context::new(
@@ -211,6 +220,7 @@ pub(crate) fn convert_html_impl(
211
220
  metadata_collector,
212
221
  _visitor,
213
222
  structure_collector.as_ref().map(std::rc::Rc::clone),
223
+ reference_collector.as_ref().map(std::rc::Rc::clone),
214
224
  );
215
225
  #[cfg(all(not(feature = "metadata"), feature = "visitor"))]
216
226
  let ctx = Context::new(
@@ -219,6 +229,7 @@ pub(crate) fn convert_html_impl(
219
229
  _metadata_collector,
220
230
  visitor,
221
231
  structure_collector.as_ref().map(std::rc::Rc::clone),
232
+ reference_collector.as_ref().map(std::rc::Rc::clone),
222
233
  );
223
234
  #[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
224
235
  let ctx = Context::new(
@@ -227,6 +238,7 @@ pub(crate) fn convert_html_impl(
227
238
  _metadata_collector,
228
239
  _visitor,
229
240
  structure_collector.as_ref().map(std::rc::Rc::clone),
241
+ reference_collector.as_ref().map(std::rc::Rc::clone),
230
242
  );
231
243
 
232
244
  for child_handle in dom.children() {
@@ -242,6 +254,19 @@ pub(crate) fn convert_html_impl(
242
254
  // reference to the same collector, and Rc::try_unwrap requires exactly one reference.
243
255
  drop(ctx);
244
256
 
257
+ // Append reference-style link definitions if any were collected
258
+ if let Some(rc) = reference_collector {
259
+ if let Ok(collector) = std::rc::Rc::try_unwrap(rc) {
260
+ let ref_section = collector.into_inner().finish();
261
+ if !ref_section.is_empty() {
262
+ let trimmed_len = output.trim_end_matches('\n').len();
263
+ output.truncate(trimmed_len);
264
+ output.push_str("\n\n");
265
+ output.push_str(&ref_section);
266
+ }
267
+ }
268
+ }
269
+
245
270
  // If plain text was requested, discard the markdown output and return plain text.
246
271
  // The full pipeline was still run above so that metadata + visitor callbacks fire.
247
272
  if is_plain_text {