html-to-markdown 3.0.2 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +42 -12
- data/Gemfile +1 -0
- data/Gemfile.lock +27 -55
- data/README.md +9 -10
- data/Rakefile +4 -10
- data/ext/html-to-markdown_rb/Cargo.toml +14 -0
- data/ext/html_to_markdown_rb/Cargo.toml +16 -0
- data/ext/html_to_markdown_rb/extconf.rb +10 -0
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs/version.rb +6 -0
- data/ext/html_to_markdown_rb/src/html_to_markdown_rs.rb +9 -0
- data/ext/html_to_markdown_rb/src/lib.rs +3941 -0
- data/html-to-markdown-rb.gemspec +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/lib/html_to_markdown.rb +31 -21
- data/{ext/html-to-markdown-rb/native/extconf.rb → lib/html_to_markdown_rs.rb} +1 -1
- data/sig/html_to_markdown.rbs +17 -5
- data/vendor/Cargo.toml +4 -4
- data/vendor/html-to-markdown-rs/Cargo.toml +2 -2
- data/vendor/html-to-markdown-rs/examples/test_deser.rs +12 -0
- data/vendor/html-to-markdown-rs/src/converter/block/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/table/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +5 -0
- data/vendor/html-to-markdown-rs/src/converter/form/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +38 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +56 -17
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +11 -0
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +17 -0
- data/vendor/html-to-markdown-rs/src/converter/inline/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +10 -2
- data/vendor/html-to-markdown-rs/src/converter/main.rs +25 -0
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +42 -15
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +3 -2
- data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +69 -0
- data/vendor/html-to-markdown-rs/src/converter/semantic/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/converter/utility/content.rs +1 -1
- data/vendor/html-to-markdown-rs/src/exports.rs +3 -2
- data/vendor/html-to-markdown-rs/src/inline_images.rs +1 -1
- data/vendor/html-to-markdown-rs/src/lib.rs +1 -2
- data/vendor/html-to-markdown-rs/src/metadata/config.rs +1 -1
- data/vendor/html-to-markdown-rs/src/metadata/mod.rs +5 -5
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +14 -13
- data/vendor/html-to-markdown-rs/src/options/mod.rs +2 -2
- data/vendor/html-to-markdown-rs/src/options/preprocessing.rs +3 -9
- data/vendor/html-to-markdown-rs/src/options/validation.rs +46 -4
- data/vendor/html-to-markdown-rs/src/types/document.rs +11 -0
- data/vendor/html-to-markdown-rs/src/types/result.rs +5 -2
- data/vendor/html-to-markdown-rs/src/types/tables.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/state.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers/helpers/traversal.rs +1 -1
- data/vendor/html-to-markdown-rs/src/visitor_helpers.rs +8 -8
- data/vendor/html-to-markdown-rs/tests/commonmark_compliance_test.rs +6 -0
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +27 -3
- data/vendor/html-to-markdown-rs/tests/issue_140_regressions.rs +8 -2
- data/vendor/html-to-markdown-rs/tests/lists_test.rs +4 -4
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +169 -0
- metadata +13 -18
- data/ext/html-to-markdown-rb/extconf.rb +0 -41
- data/ext/html-to-markdown-rb/native/Cargo.lock +0 -934
- data/ext/html-to-markdown-rb/native/Cargo.toml +0 -48
- data/ext/html-to-markdown-rb/native/README.md +0 -215
- data/ext/html-to-markdown-rb/native/src/conversion/inline_images.rs +0 -54
- data/ext/html-to-markdown-rb/native/src/conversion/metadata.rs +0 -158
- data/ext/html-to-markdown-rb/native/src/conversion/mod.rs +0 -11
- data/ext/html-to-markdown-rb/native/src/lib.rs +0 -128
- data/ext/html-to-markdown-rb/native/src/options.rs +0 -238
- data/ext/html-to-markdown-rb/native/src/types.rs +0 -24
- data/lib/html_to_markdown/cli.rb +0 -21
- data/lib/html_to_markdown/cli_proxy.rb +0 -74
- data/spec/cli_proxy_spec.rb +0 -42
- data/spec/spec_helper.rb +0 -10
data/html-to-markdown-rb.gemspec
CHANGED
|
@@ -87,7 +87,7 @@ Gem::Specification.new do |spec|
|
|
|
87
87
|
spec.files = files
|
|
88
88
|
spec.extra_rdoc_files = ['README.md']
|
|
89
89
|
|
|
90
|
-
spec.extensions = ['ext/html-to-markdown-rb/extconf.rb']
|
|
90
|
+
spec.extensions = ['ext/html_to_markdown_rb/extconf.rb']
|
|
91
91
|
|
|
92
92
|
spec.add_dependency 'rb_sys', '>= 0.9', '< 1.0'
|
|
93
93
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
data/lib/html_to_markdown.rb
CHANGED
|
@@ -3,28 +3,38 @@
|
|
|
3
3
|
require_relative 'html_to_markdown/version'
|
|
4
4
|
require 'html_to_markdown_rb'
|
|
5
5
|
|
|
6
|
+
# High-performance HTML to Markdown conversion.
|
|
7
|
+
#
|
|
8
|
+
# @example Simple conversion
|
|
9
|
+
# HtmlToMarkdown.convert('<h1>Hello</h1>') # => "# Hello\n\n"
|
|
10
|
+
#
|
|
11
|
+
# @example With options
|
|
12
|
+
# HtmlToMarkdown.convert('<h1>Hello</h1>', heading_style: 'atx')
|
|
6
13
|
module HtmlToMarkdown
|
|
7
|
-
|
|
8
|
-
autoload :CLIProxy, 'html_to_markdown/cli_proxy'
|
|
9
|
-
|
|
10
|
-
class << self
|
|
11
|
-
alias native_convert convert
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
module_function
|
|
15
|
-
|
|
16
|
-
# Convert HTML to Markdown, returning a Hash with:
|
|
17
|
-
# - :content [String, nil] the converted Markdown output
|
|
18
|
-
# - :document [nil] document structure (not yet exposed)
|
|
19
|
-
# - :metadata [Hash, nil] extracted HTML metadata
|
|
20
|
-
# - :tables [Array<Hash>] extracted tables with :grid and :markdown
|
|
21
|
-
# - :images [Array<Hash>] extracted inline images
|
|
22
|
-
# - :warnings [Array<Hash>] processing warnings
|
|
14
|
+
# Convert HTML to Markdown.
|
|
23
15
|
#
|
|
24
|
-
# @param html [String] HTML
|
|
25
|
-
# @param options [Hash
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
|
|
16
|
+
# @param html [String] The HTML content to convert.
|
|
17
|
+
# @param options [Hash] Optional conversion options.
|
|
18
|
+
# Supported keys (all optional):
|
|
19
|
+
# - :heading_style - 'atx', 'atx_closed', 'setext', 'underlined'
|
|
20
|
+
# - :code_block_style - 'backticks', 'tildes', 'indented'
|
|
21
|
+
# - :escape_asterisks - Boolean
|
|
22
|
+
# - :escape_underscores - Boolean
|
|
23
|
+
# - :escape_misc - Boolean
|
|
24
|
+
# - :escape_ascii - Boolean
|
|
25
|
+
# - :strip_newlines - Boolean
|
|
26
|
+
# - :keep_inline_images_in - Array of tag names
|
|
27
|
+
# - :strip_tags - Array of tag names to strip
|
|
28
|
+
# - :preserve_tags - Array of tag names to preserve verbatim
|
|
29
|
+
# (and more, matching ConversionOptions fields)
|
|
30
|
+
# @return [String] The converted Markdown content.
|
|
31
|
+
def self.convert(html, options = {})
|
|
32
|
+
opts = if options.nil? || options.empty?
|
|
33
|
+
nil
|
|
34
|
+
else
|
|
35
|
+
HtmlToMarkdownRs::ConversionOptions.new(options)
|
|
36
|
+
end
|
|
37
|
+
result = HtmlToMarkdownRs.convert(html, opts)
|
|
38
|
+
result.content || ''
|
|
29
39
|
end
|
|
30
40
|
end
|
data/sig/html_to_markdown.rbs
CHANGED
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# Native extension module (Magnus/rb-sys)
|
|
2
|
+
module HtmlToMarkdownRs
|
|
3
|
+
class ConversionOptions
|
|
4
|
+
def initialize: (Hash[Symbol, untyped]) -> void
|
|
5
|
+
end
|
|
6
|
+
|
|
7
|
+
class ConversionResult
|
|
8
|
+
def content: () -> String?
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def self.convert: (String html, ConversionOptions? options) -> ConversionResult
|
|
12
|
+
end
|
|
13
|
+
|
|
1
14
|
# Type definitions for HtmlToMarkdown Ruby gem
|
|
2
15
|
module HtmlToMarkdown
|
|
3
16
|
VERSION: String
|
|
@@ -8,6 +21,7 @@ module HtmlToMarkdown
|
|
|
8
21
|
type whitespace_mode = :normalized | :strict
|
|
9
22
|
type newline_style = :spaces | :backslash
|
|
10
23
|
type code_block_style = :indented | :backticks | :tildes
|
|
24
|
+
type link_style = :inline | :reference
|
|
11
25
|
type output_format = :markdown | :djot
|
|
12
26
|
type preprocessing_preset = :minimal | :standard | :aggressive
|
|
13
27
|
|
|
@@ -49,6 +63,7 @@ module HtmlToMarkdown
|
|
|
49
63
|
debug?: bool,
|
|
50
64
|
strip_tags?: Array[String],
|
|
51
65
|
preserve_tags?: Array[String],
|
|
66
|
+
link_style?: link_style,
|
|
52
67
|
output_format?: output_format,
|
|
53
68
|
skip_images?: bool,
|
|
54
69
|
include_document_structure?: bool,
|
|
@@ -126,12 +141,9 @@ module HtmlToMarkdown
|
|
|
126
141
|
|
|
127
142
|
public
|
|
128
143
|
|
|
129
|
-
# Convert HTML to Markdown, returning
|
|
144
|
+
# Convert HTML to Markdown, returning the markdown content string.
|
|
130
145
|
#
|
|
131
146
|
# Example:
|
|
132
147
|
# result = HtmlToMarkdown.convert(html)
|
|
133
|
-
def self.convert: (String html, ?conversion_options options) ->
|
|
134
|
-
|
|
135
|
-
# Instance method version (created by module_function)
|
|
136
|
-
def convert: (String html, ?conversion_options options) -> Hash[String, untyped]
|
|
148
|
+
def self.convert: (String html, ?conversion_options options) -> String
|
|
137
149
|
end
|
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["html-to-markdown-rs"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "3.0.2"
|
|
6
|
+
version = "3.2.0"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.85"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -18,11 +18,11 @@ clap = { version = "4.6", features = ["derive"] }
|
|
|
18
18
|
clap_complete = "4.6"
|
|
19
19
|
clap_mangen = "0.3"
|
|
20
20
|
encoding_rs = "0.8"
|
|
21
|
-
ext-php-rs = "0.15.
|
|
21
|
+
ext-php-rs = "0.15.10"
|
|
22
22
|
html5ever = "0.39.0"
|
|
23
23
|
once_cell = "1.21"
|
|
24
|
-
pyo3 = { version = "0.28.
|
|
25
|
-
rayon = "1.
|
|
24
|
+
pyo3 = { version = "0.28.3", features = ["abi3-py310"] }
|
|
25
|
+
rayon = "1.12"
|
|
26
26
|
regex = "1.12"
|
|
27
27
|
serde = { version = "1.0", features = ["derive"] }
|
|
28
28
|
serde_json = "1.0"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "html-to-markdown-rs"
|
|
3
|
-
version = "3.0.2"
|
|
3
|
+
version = "3.2.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -39,7 +39,7 @@ image = { version = "0.25", default-features = false, features = [
|
|
|
39
39
|
"bmp",
|
|
40
40
|
"webp",
|
|
41
41
|
], optional = true }
|
|
42
|
-
lru = "0.
|
|
42
|
+
lru = "0.17"
|
|
43
43
|
memchr = "2"
|
|
44
44
|
once_cell = "1.21"
|
|
45
45
|
regex = "1.12"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
use html_to_markdown_rs::ConversionOptions;
|
|
3
|
+
|
|
4
|
+
fn main() {
|
|
5
|
+
let json = r#"{"headingStyle":"","listIndentType":"","listIndentWidth":2,"bullets":"-*+","strongEmSymbol":"*","escapeAsterisks":false,"escapeUnderscores":false,"escapeMisc":false,"escapeAscii":false,"codeLanguage":"","autolinks":true,"defaultTitle":false,"brInTables":false,"highlightStyle":"","extractMetadata":true,"whitespaceMode":"","stripNewlines":false,"wrap":false,"wrapWidth":80,"convertAsInline":false,"subSymbol":"","supSymbol":"","newlineStyle":"spaces","codeBlockStyle":"tildes","keepInlineImagesIn":null,"preprocessing":{"enabled":false,"preset":"","removeNavigation":false,"removeForms":false},"encoding":"utf-8","debug":false,"stripTags":null,"preserveTags":null,"skipImages":false,"linkStyle":"","outputFormat":"","includeDocumentStructure":false,"extractImages":false,"maxImageSize":5242880,"captureSvg":false,"inferDimensions":true}"#;
|
|
6
|
+
|
|
7
|
+
let opts: ConversionOptions = serde_json::from_str(json).unwrap();
|
|
8
|
+
println!("code_block_style: {:?}", opts.code_block_style);
|
|
9
|
+
|
|
10
|
+
let result = html_to_markdown_rs::convert("<pre><code>some code</code></pre>", Some(opts)).unwrap();
|
|
11
|
+
println!("result: {:?}", result.content);
|
|
12
|
+
}
|
|
@@ -29,7 +29,7 @@ pub(crate) use caption::handle_caption;
|
|
|
29
29
|
/// Dispatches table element handling to the main convert_table function.
|
|
30
30
|
///
|
|
31
31
|
/// # Usage in converter.rs
|
|
32
|
-
/// ```
|
|
32
|
+
/// ```text
|
|
33
33
|
/// if "table" == tag_name {
|
|
34
34
|
/// crate::converter::block::table::handle_table(
|
|
35
35
|
/// node_handle,
|
|
@@ -12,6 +12,7 @@ use std::rc::Rc;
|
|
|
12
12
|
#[cfg(feature = "inline-images")]
|
|
13
13
|
use crate::inline_images::InlineImageCollector;
|
|
14
14
|
|
|
15
|
+
use crate::converter::reference_collector::ReferenceCollectorHandle;
|
|
15
16
|
use crate::types::structure_collector::StructureCollectorHandle;
|
|
16
17
|
|
|
17
18
|
/// Handle type for inline image collector when feature is enabled.
|
|
@@ -105,6 +106,8 @@ pub struct Context {
|
|
|
105
106
|
///
|
|
106
107
|
/// Populated when `options.include_document_structure == true`.
|
|
107
108
|
pub(crate) structure_collector: Option<StructureCollectorHandle>,
|
|
109
|
+
/// Optional reference collector for reference-style links.
|
|
110
|
+
pub(crate) reference_collector: Option<ReferenceCollectorHandle>,
|
|
108
111
|
}
|
|
109
112
|
|
|
110
113
|
impl Context {
|
|
@@ -122,6 +125,7 @@ impl Context {
|
|
|
122
125
|
#[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
|
|
123
126
|
#[cfg(not(feature = "visitor"))] _visitor: Option<()>,
|
|
124
127
|
structure_collector: Option<StructureCollectorHandle>,
|
|
128
|
+
reference_collector: Option<ReferenceCollectorHandle>,
|
|
125
129
|
) -> Self {
|
|
126
130
|
#[cfg(feature = "metadata")]
|
|
127
131
|
let (
|
|
@@ -186,6 +190,7 @@ impl Context {
|
|
|
186
190
|
#[cfg(feature = "visitor")]
|
|
187
191
|
visitor_error: Rc::new(RefCell::new(None)),
|
|
188
192
|
structure_collector,
|
|
193
|
+
reference_collector,
|
|
189
194
|
}
|
|
190
195
|
}
|
|
191
196
|
}
|
|
@@ -128,6 +128,8 @@ pub fn handle_graphic(
|
|
|
128
128
|
&alt,
|
|
129
129
|
title.as_deref(),
|
|
130
130
|
should_use_alt_text,
|
|
131
|
+
options.link_style,
|
|
132
|
+
ctx.reference_collector.as_ref(),
|
|
131
133
|
)),
|
|
132
134
|
VisitResult::Custom(custom) => Some(custom),
|
|
133
135
|
VisitResult::Skip => None,
|
|
@@ -145,6 +147,8 @@ pub fn handle_graphic(
|
|
|
145
147
|
&alt,
|
|
146
148
|
title.as_deref(),
|
|
147
149
|
should_use_alt_text,
|
|
150
|
+
options.link_style,
|
|
151
|
+
ctx.reference_collector.as_ref(),
|
|
148
152
|
))
|
|
149
153
|
};
|
|
150
154
|
|
|
@@ -154,6 +158,8 @@ pub fn handle_graphic(
|
|
|
154
158
|
&alt,
|
|
155
159
|
title.as_deref(),
|
|
156
160
|
should_use_alt_text,
|
|
161
|
+
options.link_style,
|
|
162
|
+
ctx.reference_collector.as_ref(),
|
|
157
163
|
));
|
|
158
164
|
|
|
159
165
|
if !options.skip_images {
|
|
@@ -189,21 +195,39 @@ pub fn handle_graphic(
|
|
|
189
195
|
///
|
|
190
196
|
/// If `use_alt_only` is true, returns just the alt text.
|
|
191
197
|
/// Otherwise returns the full `![alt](src "title")` syntax.
|
|
192
|
-
fn format_graphic_markdown(
|
|
198
|
+
fn format_graphic_markdown(
|
|
199
|
+
src: &str,
|
|
200
|
+
alt: &str,
|
|
201
|
+
title: Option<&str>,
|
|
202
|
+
use_alt_only: bool,
|
|
203
|
+
link_style: crate::options::validation::LinkStyle,
|
|
204
|
+
reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
|
|
205
|
+
) -> String {
|
|
193
206
|
if use_alt_only {
|
|
194
|
-
alt.to_string()
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
buf.push_str("
|
|
203
|
-
buf.push_str(
|
|
204
|
-
buf.push('
|
|
207
|
+
return alt.to_string();
|
|
208
|
+
}
|
|
209
|
+
if link_style == crate::options::validation::LinkStyle::Reference {
|
|
210
|
+
if let Some(collector) = reference_collector {
|
|
211
|
+
let ref_num = collector.borrow_mut().get_or_insert(src, title);
|
|
212
|
+
let mut buf = String::with_capacity(alt.len() + 10);
|
|
213
|
+
buf.push_str("![");
|
|
214
|
+
buf.push_str(alt);
|
|
215
|
+
buf.push_str("][");
|
|
216
|
+
buf.push_str(&ref_num.to_string());
|
|
217
|
+
buf.push(']');
|
|
218
|
+
return buf;
|
|
205
219
|
}
|
|
206
|
-
buf.push(')');
|
|
207
|
-
buf
|
|
208
220
|
}
|
|
221
|
+
let mut buf = String::with_capacity(src.len() + alt.len() + 10);
|
|
222
|
+
buf.push_str("![");
|
|
223
|
+
buf.push_str(alt);
|
|
224
|
+
buf.push_str("](");
|
|
225
|
+
buf.push_str(src);
|
|
226
|
+
if let Some(title_text) = title {
|
|
227
|
+
buf.push_str(" \"");
|
|
228
|
+
buf.push_str(title_text);
|
|
229
|
+
buf.push('"');
|
|
230
|
+
}
|
|
231
|
+
buf.push(')');
|
|
232
|
+
buf
|
|
209
233
|
}
|
|
@@ -146,7 +146,14 @@ pub fn handle_img(
|
|
|
146
146
|
visitor.visit_image(&node_ctx, &src, &alt, title.as_deref())
|
|
147
147
|
};
|
|
148
148
|
match visit_result {
|
|
149
|
-
VisitResult::Continue => Some(format_image_markdown(
|
|
149
|
+
VisitResult::Continue => Some(format_image_markdown(
|
|
150
|
+
&src,
|
|
151
|
+
&alt,
|
|
152
|
+
title.as_deref(),
|
|
153
|
+
should_use_alt_text,
|
|
154
|
+
options.link_style,
|
|
155
|
+
ctx.reference_collector.as_ref(),
|
|
156
|
+
)),
|
|
150
157
|
VisitResult::Custom(custom) => Some(custom),
|
|
151
158
|
VisitResult::Skip => None,
|
|
152
159
|
VisitResult::Error(err) => {
|
|
@@ -158,11 +165,25 @@ pub fn handle_img(
|
|
|
158
165
|
VisitResult::PreserveHtml => Some(serialize_node(node_handle, parser)),
|
|
159
166
|
}
|
|
160
167
|
} else {
|
|
161
|
-
Some(format_image_markdown(
|
|
168
|
+
Some(format_image_markdown(
|
|
169
|
+
&src,
|
|
170
|
+
&alt,
|
|
171
|
+
title.as_deref(),
|
|
172
|
+
should_use_alt_text,
|
|
173
|
+
options.link_style,
|
|
174
|
+
ctx.reference_collector.as_ref(),
|
|
175
|
+
))
|
|
162
176
|
};
|
|
163
177
|
|
|
164
178
|
#[cfg(not(feature = "visitor"))]
|
|
165
|
-
let image_output = Some(format_image_markdown(
|
|
179
|
+
let image_output = Some(format_image_markdown(
|
|
180
|
+
&src,
|
|
181
|
+
&alt,
|
|
182
|
+
title.as_deref(),
|
|
183
|
+
should_use_alt_text,
|
|
184
|
+
options.link_style,
|
|
185
|
+
ctx.reference_collector.as_ref(),
|
|
186
|
+
));
|
|
166
187
|
|
|
167
188
|
// Only output image if skip_images is not enabled
|
|
168
189
|
if !options.skip_images {
|
|
@@ -204,21 +225,39 @@ pub fn handle_img(
|
|
|
204
225
|
///
|
|
205
226
|
/// If `use_alt_only` is true, returns just the alt text.
|
|
206
227
|
/// Otherwise returns the full `![alt](src "title")` syntax.
|
|
207
|
-
fn format_image_markdown(
|
|
228
|
+
fn format_image_markdown(
|
|
229
|
+
src: &str,
|
|
230
|
+
alt: &str,
|
|
231
|
+
title: Option<&str>,
|
|
232
|
+
use_alt_only: bool,
|
|
233
|
+
link_style: crate::options::validation::LinkStyle,
|
|
234
|
+
reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
|
|
235
|
+
) -> String {
|
|
208
236
|
if use_alt_only {
|
|
209
|
-
alt.to_string()
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
buf.push_str("
|
|
218
|
-
buf.push_str(
|
|
219
|
-
buf.push('
|
|
237
|
+
return alt.to_string();
|
|
238
|
+
}
|
|
239
|
+
if link_style == crate::options::validation::LinkStyle::Reference {
|
|
240
|
+
if let Some(collector) = reference_collector {
|
|
241
|
+
let ref_num = collector.borrow_mut().get_or_insert(src, title);
|
|
242
|
+
let mut buf = String::with_capacity(alt.len() + 10);
|
|
243
|
+
buf.push_str("![");
|
|
244
|
+
buf.push_str(alt);
|
|
245
|
+
buf.push_str("][");
|
|
246
|
+
buf.push_str(&ref_num.to_string());
|
|
247
|
+
buf.push(']');
|
|
248
|
+
return buf;
|
|
220
249
|
}
|
|
221
|
-
buf.push(')');
|
|
222
|
-
buf
|
|
223
250
|
}
|
|
251
|
+
let mut buf = String::with_capacity(src.len() + alt.len() + 10);
|
|
252
|
+
buf.push_str("![");
|
|
253
|
+
buf.push_str(alt);
|
|
254
|
+
buf.push_str("](");
|
|
255
|
+
buf.push_str(src);
|
|
256
|
+
if let Some(title_text) = title {
|
|
257
|
+
buf.push_str(" \"");
|
|
258
|
+
buf.push_str(title_text);
|
|
259
|
+
buf.push('"');
|
|
260
|
+
}
|
|
261
|
+
buf.push(')');
|
|
262
|
+
buf
|
|
224
263
|
}
|
|
@@ -115,6 +115,7 @@ pub fn handle_link(
|
|
|
115
115
|
title.as_deref(),
|
|
116
116
|
raw_text.as_str(),
|
|
117
117
|
options,
|
|
118
|
+
ctx.reference_collector.as_ref(),
|
|
118
119
|
);
|
|
119
120
|
push_heading(output, ctx, options, heading_level, link_buffer.as_str());
|
|
120
121
|
return;
|
|
@@ -190,6 +191,13 @@ pub fn handle_link(
|
|
|
190
191
|
label = href.clone();
|
|
191
192
|
}
|
|
192
193
|
|
|
194
|
+
// Normalize Wikipedia-style back-reference links: <a href="#cite_ref-N">^</a>
|
|
195
|
+
// These produce `[^](#cite_ref-N)` which is confusing (looks like a footnote).
|
|
196
|
+
// Convert to `[↑](#cite_ref-N)` to avoid ambiguity with markdown footnote syntax.
|
|
197
|
+
if label == "^" && href.starts_with('#') {
|
|
198
|
+
label = "↑".to_string();
|
|
199
|
+
}
|
|
200
|
+
|
|
193
201
|
let escaped_label = escape_link_label(&label);
|
|
194
202
|
|
|
195
203
|
#[cfg(feature = "visitor")]
|
|
@@ -226,6 +234,7 @@ pub fn handle_link(
|
|
|
226
234
|
title.as_deref(),
|
|
227
235
|
label.as_str(),
|
|
228
236
|
options,
|
|
237
|
+
ctx.reference_collector.as_ref(),
|
|
229
238
|
);
|
|
230
239
|
Some(buf)
|
|
231
240
|
}
|
|
@@ -248,6 +257,7 @@ pub fn handle_link(
|
|
|
248
257
|
title.as_deref(),
|
|
249
258
|
label.as_str(),
|
|
250
259
|
options,
|
|
260
|
+
ctx.reference_collector.as_ref(),
|
|
251
261
|
);
|
|
252
262
|
Some(buf)
|
|
253
263
|
};
|
|
@@ -262,6 +272,7 @@ pub fn handle_link(
|
|
|
262
272
|
title.as_deref(),
|
|
263
273
|
label.as_str(),
|
|
264
274
|
options,
|
|
275
|
+
ctx.reference_collector.as_ref(),
|
|
265
276
|
);
|
|
266
277
|
Some(buf)
|
|
267
278
|
};
|
|
@@ -145,6 +145,7 @@ pub(crate) fn handle(
|
|
|
145
145
|
title.as_deref(),
|
|
146
146
|
raw_text.as_str(),
|
|
147
147
|
options,
|
|
148
|
+
ctx.reference_collector.as_ref(),
|
|
148
149
|
);
|
|
149
150
|
push_heading(output, ctx, options, heading_level, link_buffer.as_str());
|
|
150
151
|
return;
|
|
@@ -262,6 +263,7 @@ pub(crate) fn handle(
|
|
|
262
263
|
title.as_deref(),
|
|
263
264
|
label.as_str(),
|
|
264
265
|
options,
|
|
266
|
+
ctx.reference_collector.as_ref(),
|
|
265
267
|
);
|
|
266
268
|
Some(buf)
|
|
267
269
|
}
|
|
@@ -284,6 +286,7 @@ pub(crate) fn handle(
|
|
|
284
286
|
title.as_deref(),
|
|
285
287
|
label.as_str(),
|
|
286
288
|
options,
|
|
289
|
+
ctx.reference_collector.as_ref(),
|
|
287
290
|
);
|
|
288
291
|
Some(buf)
|
|
289
292
|
};
|
|
@@ -298,6 +301,7 @@ pub(crate) fn handle(
|
|
|
298
301
|
title.as_deref(),
|
|
299
302
|
label.as_str(),
|
|
300
303
|
options,
|
|
304
|
+
ctx.reference_collector.as_ref(),
|
|
301
305
|
);
|
|
302
306
|
Some(buf)
|
|
303
307
|
};
|
|
@@ -363,7 +367,20 @@ pub(crate) fn append_markdown_link(
|
|
|
363
367
|
title: Option<&str>,
|
|
364
368
|
raw_text: &str,
|
|
365
369
|
options: &ConversionOptions,
|
|
370
|
+
reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
|
|
366
371
|
) {
|
|
372
|
+
if options.link_style == crate::options::validation::LinkStyle::Reference && !href.is_empty() {
|
|
373
|
+
if let Some(collector) = reference_collector {
|
|
374
|
+
let ref_num = collector.borrow_mut().get_or_insert(href, title);
|
|
375
|
+
output.push('[');
|
|
376
|
+
output.push_str(label);
|
|
377
|
+
output.push_str("][");
|
|
378
|
+
output.push_str(&ref_num.to_string());
|
|
379
|
+
output.push(']');
|
|
380
|
+
return;
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
|
|
367
384
|
output.push('[');
|
|
368
385
|
output.push_str(label);
|
|
369
386
|
output.push_str("](");
|
|
@@ -198,7 +198,11 @@ pub(crate) fn handle_li(
|
|
|
198
198
|
} else {
|
|
199
199
|
let bullets: Vec<char> = options.bullets.chars().collect();
|
|
200
200
|
let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
|
|
201
|
-
let bullet =
|
|
201
|
+
let bullet = if bullets.is_empty() {
|
|
202
|
+
'*'
|
|
203
|
+
} else {
|
|
204
|
+
bullets[bullet_index % bullets.len()]
|
|
205
|
+
};
|
|
202
206
|
output.push(bullet);
|
|
203
207
|
output.push(' ');
|
|
204
208
|
}
|
|
@@ -265,7 +269,11 @@ pub(crate) fn handle_li(
|
|
|
265
269
|
} else {
|
|
266
270
|
let bullets: Vec<char> = options.bullets.chars().collect();
|
|
267
271
|
let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
|
|
268
|
-
let bullet =
|
|
272
|
+
let bullet = if bullets.is_empty() {
|
|
273
|
+
'*'
|
|
274
|
+
} else {
|
|
275
|
+
bullets[bullet_index % bullets.len()]
|
|
276
|
+
};
|
|
269
277
|
let bullet_str = bullet.to_string();
|
|
270
278
|
let text_start = last_line.find(bullet).map_or(0, |pos| pos + 1);
|
|
271
279
|
(bullet_str, last_line[text_start..].trim().to_string())
|
|
@@ -196,6 +196,14 @@ pub(crate) fn convert_html_impl(
|
|
|
196
196
|
}
|
|
197
197
|
}
|
|
198
198
|
|
|
199
|
+
let reference_collector = if options.link_style == crate::options::LinkStyle::Reference {
|
|
200
|
+
Some(std::rc::Rc::new(std::cell::RefCell::new(
|
|
201
|
+
crate::converter::reference_collector::ReferenceCollector::new(),
|
|
202
|
+
)))
|
|
203
|
+
} else {
|
|
204
|
+
None
|
|
205
|
+
};
|
|
206
|
+
|
|
199
207
|
#[cfg(all(feature = "metadata", feature = "visitor"))]
|
|
200
208
|
let ctx = Context::new(
|
|
201
209
|
options,
|
|
@@ -203,6 +211,7 @@ pub(crate) fn convert_html_impl(
|
|
|
203
211
|
metadata_collector,
|
|
204
212
|
visitor,
|
|
205
213
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
214
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
206
215
|
);
|
|
207
216
|
#[cfg(all(feature = "metadata", not(feature = "visitor")))]
|
|
208
217
|
let ctx = Context::new(
|
|
@@ -211,6 +220,7 @@ pub(crate) fn convert_html_impl(
|
|
|
211
220
|
metadata_collector,
|
|
212
221
|
_visitor,
|
|
213
222
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
223
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
214
224
|
);
|
|
215
225
|
#[cfg(all(not(feature = "metadata"), feature = "visitor"))]
|
|
216
226
|
let ctx = Context::new(
|
|
@@ -219,6 +229,7 @@ pub(crate) fn convert_html_impl(
|
|
|
219
229
|
_metadata_collector,
|
|
220
230
|
visitor,
|
|
221
231
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
232
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
222
233
|
);
|
|
223
234
|
#[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
|
|
224
235
|
let ctx = Context::new(
|
|
@@ -227,6 +238,7 @@ pub(crate) fn convert_html_impl(
|
|
|
227
238
|
_metadata_collector,
|
|
228
239
|
_visitor,
|
|
229
240
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
241
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
230
242
|
);
|
|
231
243
|
|
|
232
244
|
for child_handle in dom.children() {
|
|
@@ -242,6 +254,19 @@ pub(crate) fn convert_html_impl(
|
|
|
242
254
|
// reference to the same collector, and Rc::try_unwrap requires exactly one reference.
|
|
243
255
|
drop(ctx);
|
|
244
256
|
|
|
257
|
+
// Append reference-style link definitions if any were collected
|
|
258
|
+
if let Some(rc) = reference_collector {
|
|
259
|
+
if let Ok(collector) = std::rc::Rc::try_unwrap(rc) {
|
|
260
|
+
let ref_section = collector.into_inner().finish();
|
|
261
|
+
if !ref_section.is_empty() {
|
|
262
|
+
let trimmed_len = output.trim_end_matches('\n').len();
|
|
263
|
+
output.truncate(trimmed_len);
|
|
264
|
+
output.push_str("\n\n");
|
|
265
|
+
output.push_str(&ref_section);
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
245
270
|
// If plain text was requested, discard the markdown output and return plain text.
|
|
246
271
|
// The full pipeline was still run above so that metadata + visitor callbacks fire.
|
|
247
272
|
if is_plain_text {
|