html-to-markdown 3.0.1 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +10 -6
- data/README.md +1 -1
- data/ext/html-to-markdown-rb/native/Cargo.lock +12 -20
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/src/converter/block/heading.rs +14 -10
- data/vendor/html-to-markdown-rs/src/converter/context.rs +5 -0
- data/vendor/html-to-markdown-rs/src/converter/handlers/code_block.rs +4 -0
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +38 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +62 -17
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +11 -0
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +17 -0
- data/vendor/html-to-markdown-rs/src/converter/list/item.rs +14 -0
- data/vendor/html-to-markdown-rs/src/converter/list/ordered.rs +12 -0
- data/vendor/html-to-markdown-rs/src/converter/list/unordered.rs +12 -0
- data/vendor/html-to-markdown-rs/src/converter/main.rs +25 -0
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +42 -15
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +1 -0
- data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +69 -0
- data/vendor/html-to-markdown-rs/src/exports.rs +3 -2
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +8 -1
- data/vendor/html-to-markdown-rs/src/options/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/options/validation.rs +43 -1
- data/vendor/html-to-markdown-rs/src/types/structure_builder.rs +2 -1
- data/vendor/html-to-markdown-rs/src/types/structure_collector.rs +2 -1
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +24 -0
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +169 -0
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c23b51454716c4f5224bc9a0b6cfcfcf3f9935709379395662d9d89cab96f223
|
|
4
|
+
data.tar.gz: '0878f8bad06ca970013d87f6064150bed2db8b5e12d087474acaa4dd17a00559'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e21bd6d2ec9cbd40df454f2b441cb2da333b1c73a062686f830c4bd3368dad2dacec3bb0953f5c8902ad2ab411453c690597d64f0c86103d65b71438c647a7f1
|
|
7
|
+
data.tar.gz: 38cf61f5035e6becae227f4117f10208eb9b0ca2d99b805b6e8feefdc8bf2611e44605c967218ea69892df3af8a417026fb70e20cb3ab9a9e28771c3ecc723c9
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (3.0
|
|
4
|
+
html-to-markdown (3.1.0)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -29,6 +29,7 @@ GEM
|
|
|
29
29
|
diff-lcs (1.6.2)
|
|
30
30
|
drb (2.2.3)
|
|
31
31
|
ffi (1.17.4-arm64-darwin)
|
|
32
|
+
ffi (1.17.4-x86_64-linux-gnu)
|
|
32
33
|
fileutils (1.8.0)
|
|
33
34
|
i18n (1.14.8)
|
|
34
35
|
concurrent-ruby (~> 1.0)
|
|
@@ -40,7 +41,7 @@ GEM
|
|
|
40
41
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
41
42
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
42
43
|
logger (1.7.0)
|
|
43
|
-
minitest (6.0.
|
|
44
|
+
minitest (6.0.3)
|
|
44
45
|
drb (~> 2.0)
|
|
45
46
|
prism (~> 1.5)
|
|
46
47
|
mutex_m (0.3.0)
|
|
@@ -58,7 +59,8 @@ GEM
|
|
|
58
59
|
rb-fsevent (0.11.2)
|
|
59
60
|
rb-inotify (0.11.1)
|
|
60
61
|
ffi (~> 1.0)
|
|
61
|
-
rb_sys (0.9.
|
|
62
|
+
rb_sys (0.9.125)
|
|
63
|
+
json (>= 2)
|
|
62
64
|
rake-compiler-dock (= 1.11.0)
|
|
63
65
|
rbs (3.10.4)
|
|
64
66
|
logger
|
|
@@ -126,6 +128,7 @@ GEM
|
|
|
126
128
|
|
|
127
129
|
PLATFORMS
|
|
128
130
|
arm64-darwin
|
|
131
|
+
x86_64-linux
|
|
129
132
|
|
|
130
133
|
DEPENDENCIES
|
|
131
134
|
html-to-markdown!
|
|
@@ -148,15 +151,16 @@ CHECKSUMS
|
|
|
148
151
|
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
149
152
|
drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
|
|
150
153
|
ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
|
|
154
|
+
ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
|
|
151
155
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
152
|
-
html-to-markdown (3.0
|
|
156
|
+
html-to-markdown (3.1.0)
|
|
153
157
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
154
158
|
json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
|
|
155
159
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
156
160
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
157
161
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
158
162
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
159
|
-
minitest (6.0.
|
|
163
|
+
minitest (6.0.3) sha256=88ac8a1de36c00692420e7cb3cc11a0773bbcb126aee1c249f320160a7d11411
|
|
160
164
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
161
165
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
162
166
|
parser (3.3.11.1) sha256=d17ace7aabe3e72c3cc94043714be27cc6f852f104d81aa284c2281aecc65d54
|
|
@@ -168,7 +172,7 @@ CHECKSUMS
|
|
|
168
172
|
rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
|
|
169
173
|
rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
|
|
170
174
|
rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
|
|
171
|
-
rb_sys (0.9.
|
|
175
|
+
rb_sys (0.9.125) sha256=14efd4e07eaf7c07edb1bab548d2a4767869a47a8821fc5ea52d9bf982ef00a8
|
|
172
176
|
rbs (3.10.4) sha256=b17d7c4be4bb31a11a3b529830f0aa206a807ca42f2e7921a3027dfc6b7e5ce8
|
|
173
177
|
regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
|
|
174
178
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
data/README.md
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
20
|
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
|
|
21
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0
|
|
21
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.1.0" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
|
@@ -59,16 +59,14 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
|
|
59
59
|
|
|
60
60
|
[[package]]
|
|
61
61
|
name = "bindgen"
|
|
62
|
-
version = "0.
|
|
62
|
+
version = "0.72.1"
|
|
63
63
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
64
|
-
checksum = "
|
|
64
|
+
checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
|
|
65
65
|
dependencies = [
|
|
66
66
|
"bitflags",
|
|
67
67
|
"cexpr",
|
|
68
68
|
"clang-sys",
|
|
69
69
|
"itertools",
|
|
70
|
-
"lazy_static",
|
|
71
|
-
"lazycell",
|
|
72
70
|
"proc-macro2",
|
|
73
71
|
"quote",
|
|
74
72
|
"regex",
|
|
@@ -260,7 +258,7 @@ dependencies = [
|
|
|
260
258
|
|
|
261
259
|
[[package]]
|
|
262
260
|
name = "html-to-markdown-rb"
|
|
263
|
-
version = "3.0.
|
|
261
|
+
version = "3.0.1"
|
|
264
262
|
dependencies = [
|
|
265
263
|
"html-to-markdown-rs",
|
|
266
264
|
"magnus",
|
|
@@ -269,7 +267,7 @@ dependencies = [
|
|
|
269
267
|
|
|
270
268
|
[[package]]
|
|
271
269
|
name = "html-to-markdown-rs"
|
|
272
|
-
version = "3.0.
|
|
270
|
+
version = "3.0.1"
|
|
273
271
|
dependencies = [
|
|
274
272
|
"ahash",
|
|
275
273
|
"astral-tl",
|
|
@@ -326,9 +324,9 @@ dependencies = [
|
|
|
326
324
|
|
|
327
325
|
[[package]]
|
|
328
326
|
name = "itertools"
|
|
329
|
-
version = "0.
|
|
327
|
+
version = "0.13.0"
|
|
330
328
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
331
|
-
checksum = "
|
|
329
|
+
checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
|
|
332
330
|
dependencies = [
|
|
333
331
|
"either",
|
|
334
332
|
]
|
|
@@ -345,12 +343,6 @@ version = "1.5.0"
|
|
|
345
343
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
346
344
|
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
|
347
345
|
|
|
348
|
-
[[package]]
|
|
349
|
-
name = "lazycell"
|
|
350
|
-
version = "1.3.0"
|
|
351
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
352
|
-
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
353
|
-
|
|
354
346
|
[[package]]
|
|
355
347
|
name = "libc"
|
|
356
348
|
version = "0.2.183"
|
|
@@ -609,18 +601,18 @@ dependencies = [
|
|
|
609
601
|
|
|
610
602
|
[[package]]
|
|
611
603
|
name = "rb-sys"
|
|
612
|
-
version = "0.9.
|
|
604
|
+
version = "0.9.125"
|
|
613
605
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
614
|
-
checksum = "
|
|
606
|
+
checksum = "85b37650fabd8ba515910a0dc089dcb6348eb3c35fbf91698cb226435be2babc"
|
|
615
607
|
dependencies = [
|
|
616
608
|
"rb-sys-build",
|
|
617
609
|
]
|
|
618
610
|
|
|
619
611
|
[[package]]
|
|
620
612
|
name = "rb-sys-build"
|
|
621
|
-
version = "0.9.
|
|
613
|
+
version = "0.9.125"
|
|
622
614
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
623
|
-
checksum = "
|
|
615
|
+
checksum = "c73b806faa66006e491458b48a78725621c1ac5a2a6efe2614c90711a7780b80"
|
|
624
616
|
dependencies = [
|
|
625
617
|
"bindgen",
|
|
626
618
|
"lazy_static",
|
|
@@ -677,9 +669,9 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
|
|
677
669
|
|
|
678
670
|
[[package]]
|
|
679
671
|
name = "rustc-hash"
|
|
680
|
-
version = "
|
|
672
|
+
version = "2.1.2"
|
|
681
673
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
682
|
-
checksum = "
|
|
674
|
+
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
|
|
683
675
|
|
|
684
676
|
[[package]]
|
|
685
677
|
name = "scopeguard"
|
data/vendor/Cargo.toml
CHANGED
|
@@ -128,16 +128,20 @@ pub(crate) fn handle(
|
|
|
128
128
|
}
|
|
129
129
|
|
|
130
130
|
// Notify the structure collector if present.
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
131
|
+
// Skip headings inside table cells — they are part of the table content,
|
|
132
|
+
// not standalone structural headings.
|
|
133
|
+
if !ctx.in_table_cell {
|
|
134
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
135
|
+
if let Some(node) = node_handle.get(parser) {
|
|
136
|
+
if let tl::Node::Tag(tag) = node {
|
|
137
|
+
let id = tag
|
|
138
|
+
.attributes()
|
|
139
|
+
.get("id")
|
|
140
|
+
.flatten()
|
|
141
|
+
.map(|v| v.as_utf8_str().to_string());
|
|
142
|
+
sc.borrow_mut()
|
|
143
|
+
.push_heading(level as u8, normalized.as_ref(), id.as_deref());
|
|
144
|
+
}
|
|
141
145
|
}
|
|
142
146
|
}
|
|
143
147
|
}
|
|
@@ -12,6 +12,7 @@ use std::rc::Rc;
|
|
|
12
12
|
#[cfg(feature = "inline-images")]
|
|
13
13
|
use crate::inline_images::InlineImageCollector;
|
|
14
14
|
|
|
15
|
+
use crate::converter::reference_collector::ReferenceCollectorHandle;
|
|
15
16
|
use crate::types::structure_collector::StructureCollectorHandle;
|
|
16
17
|
|
|
17
18
|
/// Handle type for inline image collector when feature is enabled.
|
|
@@ -105,6 +106,8 @@ pub struct Context {
|
|
|
105
106
|
///
|
|
106
107
|
/// Populated when `options.include_document_structure == true`.
|
|
107
108
|
pub(crate) structure_collector: Option<StructureCollectorHandle>,
|
|
109
|
+
/// Optional reference collector for reference-style links.
|
|
110
|
+
pub(crate) reference_collector: Option<ReferenceCollectorHandle>,
|
|
108
111
|
}
|
|
109
112
|
|
|
110
113
|
impl Context {
|
|
@@ -122,6 +125,7 @@ impl Context {
|
|
|
122
125
|
#[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
|
|
123
126
|
#[cfg(not(feature = "visitor"))] _visitor: Option<()>,
|
|
124
127
|
structure_collector: Option<StructureCollectorHandle>,
|
|
128
|
+
reference_collector: Option<ReferenceCollectorHandle>,
|
|
125
129
|
) -> Self {
|
|
126
130
|
#[cfg(feature = "metadata")]
|
|
127
131
|
let (
|
|
@@ -186,6 +190,7 @@ impl Context {
|
|
|
186
190
|
#[cfg(feature = "visitor")]
|
|
187
191
|
visitor_error: Rc::new(RefCell::new(None)),
|
|
188
192
|
structure_collector,
|
|
193
|
+
reference_collector,
|
|
189
194
|
}
|
|
190
195
|
}
|
|
191
196
|
}
|
|
@@ -301,6 +301,10 @@ pub fn handle_pre(
|
|
|
301
301
|
{
|
|
302
302
|
format_code_block(&processed_content, language.as_deref(), output, options, ctx);
|
|
303
303
|
}
|
|
304
|
+
|
|
305
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
306
|
+
sc.borrow_mut().push_code(&processed_content, language.as_deref());
|
|
307
|
+
}
|
|
304
308
|
}
|
|
305
309
|
}
|
|
306
310
|
|
|
@@ -128,6 +128,8 @@ pub fn handle_graphic(
|
|
|
128
128
|
&alt,
|
|
129
129
|
title.as_deref(),
|
|
130
130
|
should_use_alt_text,
|
|
131
|
+
options.link_style,
|
|
132
|
+
ctx.reference_collector.as_ref(),
|
|
131
133
|
)),
|
|
132
134
|
VisitResult::Custom(custom) => Some(custom),
|
|
133
135
|
VisitResult::Skip => None,
|
|
@@ -145,6 +147,8 @@ pub fn handle_graphic(
|
|
|
145
147
|
&alt,
|
|
146
148
|
title.as_deref(),
|
|
147
149
|
should_use_alt_text,
|
|
150
|
+
options.link_style,
|
|
151
|
+
ctx.reference_collector.as_ref(),
|
|
148
152
|
))
|
|
149
153
|
};
|
|
150
154
|
|
|
@@ -154,6 +158,8 @@ pub fn handle_graphic(
|
|
|
154
158
|
&alt,
|
|
155
159
|
title.as_deref(),
|
|
156
160
|
should_use_alt_text,
|
|
161
|
+
options.link_style,
|
|
162
|
+
ctx.reference_collector.as_ref(),
|
|
157
163
|
));
|
|
158
164
|
|
|
159
165
|
if !options.skip_images {
|
|
@@ -189,21 +195,39 @@ pub fn handle_graphic(
|
|
|
189
195
|
///
|
|
190
196
|
/// If `use_alt_only` is true, returns just the alt text.
|
|
191
197
|
/// Otherwise returns the full `![alt](src "title")` syntax.
|
|
192
|
-
fn format_graphic_markdown(
|
|
198
|
+
fn format_graphic_markdown(
|
|
199
|
+
src: &str,
|
|
200
|
+
alt: &str,
|
|
201
|
+
title: Option<&str>,
|
|
202
|
+
use_alt_only: bool,
|
|
203
|
+
link_style: crate::options::validation::LinkStyle,
|
|
204
|
+
reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
|
|
205
|
+
) -> String {
|
|
193
206
|
if use_alt_only {
|
|
194
|
-
alt.to_string()
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
buf.push_str("
|
|
203
|
-
buf.push_str(
|
|
204
|
-
buf.push('
|
|
207
|
+
return alt.to_string();
|
|
208
|
+
}
|
|
209
|
+
if link_style == crate::options::validation::LinkStyle::Reference {
|
|
210
|
+
if let Some(collector) = reference_collector {
|
|
211
|
+
let ref_num = collector.borrow_mut().get_or_insert(src, title);
|
|
212
|
+
let mut buf = String::with_capacity(alt.len() + 10);
|
|
213
|
+
buf.push_str("![");
|
|
214
|
+
buf.push_str(alt);
|
|
215
|
+
buf.push_str("][");
|
|
216
|
+
buf.push_str(&ref_num.to_string());
|
|
217
|
+
buf.push(']');
|
|
218
|
+
return buf;
|
|
205
219
|
}
|
|
206
|
-
buf.push(')');
|
|
207
|
-
buf
|
|
208
220
|
}
|
|
221
|
+
let mut buf = String::with_capacity(src.len() + alt.len() + 10);
|
|
222
|
+
buf.push_str("![");
|
|
223
|
+
buf.push_str(alt);
|
|
224
|
+
buf.push_str("](");
|
|
225
|
+
buf.push_str(src);
|
|
226
|
+
if let Some(title_text) = title {
|
|
227
|
+
buf.push_str(" \"");
|
|
228
|
+
buf.push_str(title_text);
|
|
229
|
+
buf.push('"');
|
|
230
|
+
}
|
|
231
|
+
buf.push(')');
|
|
232
|
+
buf
|
|
209
233
|
}
|
|
@@ -146,7 +146,14 @@ pub fn handle_img(
|
|
|
146
146
|
visitor.visit_image(&node_ctx, &src, &alt, title.as_deref())
|
|
147
147
|
};
|
|
148
148
|
match visit_result {
|
|
149
|
-
VisitResult::Continue => Some(format_image_markdown(
|
|
149
|
+
VisitResult::Continue => Some(format_image_markdown(
|
|
150
|
+
&src,
|
|
151
|
+
&alt,
|
|
152
|
+
title.as_deref(),
|
|
153
|
+
should_use_alt_text,
|
|
154
|
+
options.link_style,
|
|
155
|
+
ctx.reference_collector.as_ref(),
|
|
156
|
+
)),
|
|
150
157
|
VisitResult::Custom(custom) => Some(custom),
|
|
151
158
|
VisitResult::Skip => None,
|
|
152
159
|
VisitResult::Error(err) => {
|
|
@@ -158,11 +165,25 @@ pub fn handle_img(
|
|
|
158
165
|
VisitResult::PreserveHtml => Some(serialize_node(node_handle, parser)),
|
|
159
166
|
}
|
|
160
167
|
} else {
|
|
161
|
-
Some(format_image_markdown(
|
|
168
|
+
Some(format_image_markdown(
|
|
169
|
+
&src,
|
|
170
|
+
&alt,
|
|
171
|
+
title.as_deref(),
|
|
172
|
+
should_use_alt_text,
|
|
173
|
+
options.link_style,
|
|
174
|
+
ctx.reference_collector.as_ref(),
|
|
175
|
+
))
|
|
162
176
|
};
|
|
163
177
|
|
|
164
178
|
#[cfg(not(feature = "visitor"))]
|
|
165
|
-
let image_output = Some(format_image_markdown(
|
|
179
|
+
let image_output = Some(format_image_markdown(
|
|
180
|
+
&src,
|
|
181
|
+
&alt,
|
|
182
|
+
title.as_deref(),
|
|
183
|
+
should_use_alt_text,
|
|
184
|
+
options.link_style,
|
|
185
|
+
ctx.reference_collector.as_ref(),
|
|
186
|
+
));
|
|
166
187
|
|
|
167
188
|
// Only output image if skip_images is not enabled
|
|
168
189
|
if !options.skip_images {
|
|
@@ -192,27 +213,51 @@ pub fn handle_img(
|
|
|
192
213
|
}
|
|
193
214
|
}
|
|
194
215
|
}
|
|
216
|
+
|
|
217
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
218
|
+
let src_opt = if src.is_empty() { None } else { Some(src.as_ref()) };
|
|
219
|
+
let alt_opt = if alt.is_empty() { None } else { Some(alt.as_ref()) };
|
|
220
|
+
sc.borrow_mut().push_image(src_opt, alt_opt);
|
|
221
|
+
}
|
|
195
222
|
}
|
|
196
223
|
|
|
197
224
|
/// Format an image as Markdown syntax.
|
|
198
225
|
///
|
|
199
226
|
/// If `use_alt_only` is true, returns just the alt text.
|
|
200
227
|
/// Otherwise returns the full `![alt](src "title")` syntax.
|
|
201
|
-
fn format_image_markdown(
|
|
228
|
+
fn format_image_markdown(
|
|
229
|
+
src: &str,
|
|
230
|
+
alt: &str,
|
|
231
|
+
title: Option<&str>,
|
|
232
|
+
use_alt_only: bool,
|
|
233
|
+
link_style: crate::options::validation::LinkStyle,
|
|
234
|
+
reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
|
|
235
|
+
) -> String {
|
|
202
236
|
if use_alt_only {
|
|
203
|
-
alt.to_string()
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
buf.push_str("
|
|
212
|
-
buf.push_str(
|
|
213
|
-
buf.push('
|
|
237
|
+
return alt.to_string();
|
|
238
|
+
}
|
|
239
|
+
if link_style == crate::options::validation::LinkStyle::Reference {
|
|
240
|
+
if let Some(collector) = reference_collector {
|
|
241
|
+
let ref_num = collector.borrow_mut().get_or_insert(src, title);
|
|
242
|
+
let mut buf = String::with_capacity(alt.len() + 10);
|
|
243
|
+
buf.push_str("![");
|
|
244
|
+
buf.push_str(alt);
|
|
245
|
+
buf.push_str("][");
|
|
246
|
+
buf.push_str(&ref_num.to_string());
|
|
247
|
+
buf.push(']');
|
|
248
|
+
return buf;
|
|
214
249
|
}
|
|
215
|
-
buf.push(')');
|
|
216
|
-
buf
|
|
217
250
|
}
|
|
251
|
+
let mut buf = String::with_capacity(src.len() + alt.len() + 10);
|
|
252
|
+
buf.push_str("![");
|
|
253
|
+
buf.push_str(alt);
|
|
254
|
+
buf.push_str("](");
|
|
255
|
+
buf.push_str(src);
|
|
256
|
+
if let Some(title_text) = title {
|
|
257
|
+
buf.push_str(" \"");
|
|
258
|
+
buf.push_str(title_text);
|
|
259
|
+
buf.push('"');
|
|
260
|
+
}
|
|
261
|
+
buf.push(')');
|
|
262
|
+
buf
|
|
218
263
|
}
|
|
@@ -115,6 +115,7 @@ pub fn handle_link(
|
|
|
115
115
|
title.as_deref(),
|
|
116
116
|
raw_text.as_str(),
|
|
117
117
|
options,
|
|
118
|
+
ctx.reference_collector.as_ref(),
|
|
118
119
|
);
|
|
119
120
|
push_heading(output, ctx, options, heading_level, link_buffer.as_str());
|
|
120
121
|
return;
|
|
@@ -190,6 +191,13 @@ pub fn handle_link(
|
|
|
190
191
|
label = href.clone();
|
|
191
192
|
}
|
|
192
193
|
|
|
194
|
+
// Normalize Wikipedia-style back-reference links: <a href="#cite_ref-N">^</a>
|
|
195
|
+
// These produce `[^](#cite_ref-N)` which is confusing (looks like a footnote).
|
|
196
|
+
// Convert to `[↑](#cite_ref-N)` to avoid ambiguity with markdown footnote syntax.
|
|
197
|
+
if label == "^" && href.starts_with('#') {
|
|
198
|
+
label = "↑".to_string();
|
|
199
|
+
}
|
|
200
|
+
|
|
193
201
|
let escaped_label = escape_link_label(&label);
|
|
194
202
|
|
|
195
203
|
#[cfg(feature = "visitor")]
|
|
@@ -226,6 +234,7 @@ pub fn handle_link(
|
|
|
226
234
|
title.as_deref(),
|
|
227
235
|
label.as_str(),
|
|
228
236
|
options,
|
|
237
|
+
ctx.reference_collector.as_ref(),
|
|
229
238
|
);
|
|
230
239
|
Some(buf)
|
|
231
240
|
}
|
|
@@ -248,6 +257,7 @@ pub fn handle_link(
|
|
|
248
257
|
title.as_deref(),
|
|
249
258
|
label.as_str(),
|
|
250
259
|
options,
|
|
260
|
+
ctx.reference_collector.as_ref(),
|
|
251
261
|
);
|
|
252
262
|
Some(buf)
|
|
253
263
|
};
|
|
@@ -262,6 +272,7 @@ pub fn handle_link(
|
|
|
262
272
|
title.as_deref(),
|
|
263
273
|
label.as_str(),
|
|
264
274
|
options,
|
|
275
|
+
ctx.reference_collector.as_ref(),
|
|
265
276
|
);
|
|
266
277
|
Some(buf)
|
|
267
278
|
};
|
|
@@ -145,6 +145,7 @@ pub(crate) fn handle(
|
|
|
145
145
|
title.as_deref(),
|
|
146
146
|
raw_text.as_str(),
|
|
147
147
|
options,
|
|
148
|
+
ctx.reference_collector.as_ref(),
|
|
148
149
|
);
|
|
149
150
|
push_heading(output, ctx, options, heading_level, link_buffer.as_str());
|
|
150
151
|
return;
|
|
@@ -262,6 +263,7 @@ pub(crate) fn handle(
|
|
|
262
263
|
title.as_deref(),
|
|
263
264
|
label.as_str(),
|
|
264
265
|
options,
|
|
266
|
+
ctx.reference_collector.as_ref(),
|
|
265
267
|
);
|
|
266
268
|
Some(buf)
|
|
267
269
|
}
|
|
@@ -284,6 +286,7 @@ pub(crate) fn handle(
|
|
|
284
286
|
title.as_deref(),
|
|
285
287
|
label.as_str(),
|
|
286
288
|
options,
|
|
289
|
+
ctx.reference_collector.as_ref(),
|
|
287
290
|
);
|
|
288
291
|
Some(buf)
|
|
289
292
|
};
|
|
@@ -298,6 +301,7 @@ pub(crate) fn handle(
|
|
|
298
301
|
title.as_deref(),
|
|
299
302
|
label.as_str(),
|
|
300
303
|
options,
|
|
304
|
+
ctx.reference_collector.as_ref(),
|
|
301
305
|
);
|
|
302
306
|
Some(buf)
|
|
303
307
|
};
|
|
@@ -363,7 +367,20 @@ pub(crate) fn append_markdown_link(
|
|
|
363
367
|
title: Option<&str>,
|
|
364
368
|
raw_text: &str,
|
|
365
369
|
options: &ConversionOptions,
|
|
370
|
+
reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
|
|
366
371
|
) {
|
|
372
|
+
if options.link_style == crate::options::validation::LinkStyle::Reference && !href.is_empty() {
|
|
373
|
+
if let Some(collector) = reference_collector {
|
|
374
|
+
let ref_num = collector.borrow_mut().get_or_insert(href, title);
|
|
375
|
+
output.push('[');
|
|
376
|
+
output.push_str(label);
|
|
377
|
+
output.push_str("][");
|
|
378
|
+
output.push_str(&ref_num.to_string());
|
|
379
|
+
output.push(']');
|
|
380
|
+
return;
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
|
|
367
384
|
output.push('[');
|
|
368
385
|
output.push_str(label);
|
|
369
386
|
output.push_str("](");
|
|
@@ -204,6 +204,8 @@ pub(crate) fn handle_li(
|
|
|
204
204
|
}
|
|
205
205
|
}
|
|
206
206
|
|
|
207
|
+
let item_start_pos = output.len();
|
|
208
|
+
|
|
207
209
|
let children = tag.children();
|
|
208
210
|
{
|
|
209
211
|
for child_handle in children.top().iter() {
|
|
@@ -213,6 +215,18 @@ pub(crate) fn handle_li(
|
|
|
213
215
|
|
|
214
216
|
trim_trailing_whitespace(output);
|
|
215
217
|
|
|
218
|
+
if !ctx.in_table_cell {
|
|
219
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
220
|
+
if item_start_pos <= output.len() && output.is_char_boundary(item_start_pos) {
|
|
221
|
+
let rendered = &output[item_start_pos..];
|
|
222
|
+
let content = rendered.trim();
|
|
223
|
+
if !content.is_empty() {
|
|
224
|
+
sc.borrow_mut().push_list_item(content);
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
216
230
|
#[cfg(feature = "visitor")]
|
|
217
231
|
if let Some(ref visitor_handle) = ctx.visitor {
|
|
218
232
|
use crate::visitor::{NodeContext, NodeType, VisitResult};
|
|
@@ -107,6 +107,12 @@ pub(crate) fn handle_ol(
|
|
|
107
107
|
}
|
|
108
108
|
}
|
|
109
109
|
|
|
110
|
+
if !ctx.in_table_cell {
|
|
111
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
112
|
+
sc.borrow_mut().push_list_start(true);
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
110
116
|
process_list_children(
|
|
111
117
|
*node_handle,
|
|
112
118
|
parser,
|
|
@@ -121,6 +127,12 @@ pub(crate) fn handle_ol(
|
|
|
121
127
|
dom_ctx,
|
|
122
128
|
);
|
|
123
129
|
|
|
130
|
+
if !ctx.in_table_cell {
|
|
131
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
132
|
+
sc.borrow_mut().push_list_end();
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
124
136
|
add_nested_list_trailing_separator(output, ctx);
|
|
125
137
|
|
|
126
138
|
#[cfg(feature = "visitor")]
|
|
@@ -101,6 +101,12 @@ pub(crate) fn handle_ul(
|
|
|
101
101
|
}
|
|
102
102
|
}
|
|
103
103
|
|
|
104
|
+
if !ctx.in_table_cell {
|
|
105
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
106
|
+
sc.borrow_mut().push_list_start(false);
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
104
110
|
process_list_children(
|
|
105
111
|
*node_handle,
|
|
106
112
|
parser,
|
|
@@ -115,6 +121,12 @@ pub(crate) fn handle_ul(
|
|
|
115
121
|
dom_ctx,
|
|
116
122
|
);
|
|
117
123
|
|
|
124
|
+
if !ctx.in_table_cell {
|
|
125
|
+
if let Some(ref sc) = ctx.structure_collector {
|
|
126
|
+
sc.borrow_mut().push_list_end();
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
118
130
|
add_nested_list_trailing_separator(output, ctx);
|
|
119
131
|
|
|
120
132
|
#[cfg(feature = "visitor")]
|
|
@@ -196,6 +196,14 @@ pub(crate) fn convert_html_impl(
|
|
|
196
196
|
}
|
|
197
197
|
}
|
|
198
198
|
|
|
199
|
+
let reference_collector = if options.link_style == crate::options::LinkStyle::Reference {
|
|
200
|
+
Some(std::rc::Rc::new(std::cell::RefCell::new(
|
|
201
|
+
crate::converter::reference_collector::ReferenceCollector::new(),
|
|
202
|
+
)))
|
|
203
|
+
} else {
|
|
204
|
+
None
|
|
205
|
+
};
|
|
206
|
+
|
|
199
207
|
#[cfg(all(feature = "metadata", feature = "visitor"))]
|
|
200
208
|
let ctx = Context::new(
|
|
201
209
|
options,
|
|
@@ -203,6 +211,7 @@ pub(crate) fn convert_html_impl(
|
|
|
203
211
|
metadata_collector,
|
|
204
212
|
visitor,
|
|
205
213
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
214
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
206
215
|
);
|
|
207
216
|
#[cfg(all(feature = "metadata", not(feature = "visitor")))]
|
|
208
217
|
let ctx = Context::new(
|
|
@@ -211,6 +220,7 @@ pub(crate) fn convert_html_impl(
|
|
|
211
220
|
metadata_collector,
|
|
212
221
|
_visitor,
|
|
213
222
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
223
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
214
224
|
);
|
|
215
225
|
#[cfg(all(not(feature = "metadata"), feature = "visitor"))]
|
|
216
226
|
let ctx = Context::new(
|
|
@@ -219,6 +229,7 @@ pub(crate) fn convert_html_impl(
|
|
|
219
229
|
_metadata_collector,
|
|
220
230
|
visitor,
|
|
221
231
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
232
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
222
233
|
);
|
|
223
234
|
#[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
|
|
224
235
|
let ctx = Context::new(
|
|
@@ -227,6 +238,7 @@ pub(crate) fn convert_html_impl(
|
|
|
227
238
|
_metadata_collector,
|
|
228
239
|
_visitor,
|
|
229
240
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
241
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
230
242
|
);
|
|
231
243
|
|
|
232
244
|
for child_handle in dom.children() {
|
|
@@ -242,6 +254,19 @@ pub(crate) fn convert_html_impl(
|
|
|
242
254
|
// reference to the same collector, and Rc::try_unwrap requires exactly one reference.
|
|
243
255
|
drop(ctx);
|
|
244
256
|
|
|
257
|
+
// Append reference-style link definitions if any were collected
|
|
258
|
+
if let Some(rc) = reference_collector {
|
|
259
|
+
if let Ok(collector) = std::rc::Rc::try_unwrap(rc) {
|
|
260
|
+
let ref_section = collector.into_inner().finish();
|
|
261
|
+
if !ref_section.is_empty() {
|
|
262
|
+
let trimmed_len = output.trim_end_matches('\n').len();
|
|
263
|
+
output.truncate(trimmed_len);
|
|
264
|
+
output.push_str("\n\n");
|
|
265
|
+
output.push_str(&ref_section);
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
245
270
|
// If plain text was requested, discard the markdown output and return plain text.
|
|
246
271
|
// The full pipeline was still run above so that metadata + visitor callbacks fire.
|
|
247
272
|
if is_plain_text {
|
|
@@ -78,11 +78,20 @@ pub(crate) fn handle_audio(
|
|
|
78
78
|
};
|
|
79
79
|
|
|
80
80
|
if should_output_media_link(&src) {
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
if let Some(ref collector) = ctx.reference_collector {
|
|
82
|
+
let ref_num = collector.borrow_mut().get_or_insert(&src, None);
|
|
83
|
+
output.push('[');
|
|
84
|
+
output.push_str(&src);
|
|
85
|
+
output.push_str("][");
|
|
86
|
+
output.push_str(&ref_num.to_string());
|
|
87
|
+
output.push(']');
|
|
88
|
+
} else {
|
|
89
|
+
output.push('[');
|
|
90
|
+
output.push_str(&src);
|
|
91
|
+
output.push_str("](");
|
|
92
|
+
output.push_str(&src);
|
|
93
|
+
output.push(')');
|
|
94
|
+
}
|
|
86
95
|
if !ctx.in_paragraph && !ctx.convert_as_inline {
|
|
87
96
|
output.push_str("\n\n");
|
|
88
97
|
}
|
|
@@ -132,11 +141,20 @@ pub(crate) fn handle_video(
|
|
|
132
141
|
};
|
|
133
142
|
|
|
134
143
|
if should_output_media_link(&src) {
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
144
|
+
if let Some(ref collector) = ctx.reference_collector {
|
|
145
|
+
let ref_num = collector.borrow_mut().get_or_insert(&src, None);
|
|
146
|
+
output.push('[');
|
|
147
|
+
output.push_str(&src);
|
|
148
|
+
output.push_str("][");
|
|
149
|
+
output.push_str(&ref_num.to_string());
|
|
150
|
+
output.push(']');
|
|
151
|
+
} else {
|
|
152
|
+
output.push('[');
|
|
153
|
+
output.push_str(&src);
|
|
154
|
+
output.push_str("](");
|
|
155
|
+
output.push_str(&src);
|
|
156
|
+
output.push(')');
|
|
157
|
+
}
|
|
140
158
|
if !ctx.in_paragraph && !ctx.convert_as_inline {
|
|
141
159
|
output.push_str("\n\n");
|
|
142
160
|
}
|
|
@@ -199,11 +217,20 @@ pub(crate) fn handle_iframe(tag: &HTMLTag, output: &mut String, ctx: &Context) {
|
|
|
199
217
|
.map_or(Cow::Borrowed(""), |v| v.as_utf8_str());
|
|
200
218
|
|
|
201
219
|
if !src.is_empty() {
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
220
|
+
if let Some(ref collector) = ctx.reference_collector {
|
|
221
|
+
let ref_num = collector.borrow_mut().get_or_insert(&src, None);
|
|
222
|
+
output.push('[');
|
|
223
|
+
output.push_str(&src);
|
|
224
|
+
output.push_str("][");
|
|
225
|
+
output.push_str(&ref_num.to_string());
|
|
226
|
+
output.push(']');
|
|
227
|
+
} else {
|
|
228
|
+
output.push('[');
|
|
229
|
+
output.push_str(&src);
|
|
230
|
+
output.push_str("](");
|
|
231
|
+
output.push_str(&src);
|
|
232
|
+
output.push(')');
|
|
233
|
+
}
|
|
207
234
|
if !ctx.in_paragraph && !ctx.convert_as_inline {
|
|
208
235
|
output.push_str("\n\n");
|
|
209
236
|
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
//! Collector for reference-style link definitions.
|
|
2
|
+
|
|
3
|
+
use std::cell::RefCell;
|
|
4
|
+
use std::collections::HashMap;
|
|
5
|
+
use std::rc::Rc;
|
|
6
|
+
|
|
7
|
+
/// Shared handle for passing the collector through the conversion context.
|
|
8
|
+
pub type ReferenceCollectorHandle = Rc<RefCell<ReferenceCollector>>;
|
|
9
|
+
|
|
10
|
+
/// Identity of a reference definition: a destination URL plus an optional title.
///
/// Two registrations share a reference number only when both fields are equal.
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
struct ReferenceKey {
    url: String,
    title: Option<String>,
}

/// Collects link/image references during conversion and produces a reference
/// definitions section at the end of the document.
#[derive(Debug, Default)]
pub struct ReferenceCollector {
    /// Lookup from a URL+title pair to the number already assigned to it.
    map: HashMap<ReferenceKey, usize>,
    /// Definitions in first-seen order, stored as `(number, url, title)`.
    entries: Vec<(usize, String, Option<String>)>,
}

impl ReferenceCollector {
    /// Create a new, empty reference collector.
    pub fn new() -> Self {
        Self::default()
    }

    /// Register a URL (and optional title) and return its 1-based reference number.
    ///
    /// If the same URL+title pair was already registered, the existing number is
    /// returned and no new definition is recorded. The same URL with a different
    /// title is treated as a distinct reference.
    pub fn get_or_insert(&mut self, url: &str, title: Option<&str>) -> usize {
        let key = ReferenceKey {
            url: url.to_owned(),
            title: title.map(str::to_owned),
        };
        if let Some(&existing) = self.map.get(&key) {
            return existing;
        }
        // Numbers are handed out sequentially in first-seen order, starting at 1.
        let num = self.entries.len() + 1;
        self.entries.push((num, key.url.clone(), key.title.clone()));
        self.map.insert(key, num);
        num
    }

    /// Produce the reference definitions section.
    ///
    /// Each definition is written on its own line as `[n]: url` or
    /// `[n]: url "title"`, in the order the references were first registered.
    /// Returns an empty string when no references were collected.
    pub fn finish(&self) -> String {
        let mut out = String::new();
        for (num, url, title) in &self.entries {
            out.push('[');
            out.push_str(&num.to_string());
            out.push_str("]: ");
            out.push_str(url);
            if let Some(t) = title {
                out.push_str(" \"");
                // Escape backslashes as well as double quotes: an unescaped
                // trailing `\` would otherwise escape the closing quote and
                // leave the title unterminated.
                for ch in t.chars() {
                    if matches!(ch, '\\' | '"') {
                        out.push('\\');
                    }
                    out.push(ch);
                }
                out.push('"');
            }
            out.push('\n');
        }
        out
    }
}
|
|
@@ -18,6 +18,7 @@ pub use crate::metadata::{
|
|
|
18
18
|
};
|
|
19
19
|
|
|
20
20
|
pub use crate::options::{
|
|
21
|
-
CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle,
|
|
22
|
-
NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset,
|
|
21
|
+
CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, LinkStyle,
|
|
22
|
+
ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset,
|
|
23
|
+
WhitespaceMode,
|
|
23
24
|
};
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::options::preprocessing::PreprocessingOptions;
|
|
6
6
|
use crate::options::validation::{
|
|
7
|
-
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
7
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
8
8
|
};
|
|
9
9
|
|
|
10
10
|
/// Main conversion options for HTML to Markdown conversion.
|
|
@@ -94,6 +94,8 @@ pub struct ConversionOptions {
|
|
|
94
94
|
pub preserve_tags: Vec<String>,
|
|
95
95
|
/// Skip conversion of `<img>` elements (omit images from output).
|
|
96
96
|
pub skip_images: bool,
|
|
97
|
+
/// Link rendering style (inline or reference).
|
|
98
|
+
pub link_style: LinkStyle,
|
|
97
99
|
/// Target output format (Markdown, plain text, etc.).
|
|
98
100
|
pub output_format: OutputFormat,
|
|
99
101
|
/// Include structured document tree in result.
|
|
@@ -142,6 +144,7 @@ impl Default for ConversionOptions {
|
|
|
142
144
|
strip_tags: Vec::new(),
|
|
143
145
|
preserve_tags: Vec::new(),
|
|
144
146
|
skip_images: false,
|
|
147
|
+
link_style: LinkStyle::default(),
|
|
145
148
|
output_format: OutputFormat::default(),
|
|
146
149
|
include_document_structure: false,
|
|
147
150
|
extract_images: false,
|
|
@@ -207,6 +210,7 @@ impl ConversionOptionsBuilder {
|
|
|
207
210
|
builder_setter!(newline_style, NewlineStyle);
|
|
208
211
|
builder_setter!(highlight_style, HighlightStyle);
|
|
209
212
|
builder_setter_into!(code_language, String);
|
|
213
|
+
builder_setter!(link_style, LinkStyle);
|
|
210
214
|
builder_setter!(autolinks, bool);
|
|
211
215
|
builder_setter!(default_title, bool);
|
|
212
216
|
builder_setter!(br_in_tables, bool);
|
|
@@ -356,6 +360,8 @@ pub struct ConversionOptionsUpdate {
|
|
|
356
360
|
pub preserve_tags: Option<Vec<String>>,
|
|
357
361
|
/// Optional override for [`ConversionOptions::skip_images`].
|
|
358
362
|
pub skip_images: Option<bool>,
|
|
363
|
+
/// Optional override for [`ConversionOptions::link_style`].
|
|
364
|
+
pub link_style: Option<LinkStyle>,
|
|
359
365
|
/// Optional override for [`ConversionOptions::output_format`].
|
|
360
366
|
pub output_format: Option<OutputFormat>,
|
|
361
367
|
/// Optional override for [`ConversionOptions::include_document_structure`].
|
|
@@ -410,6 +416,7 @@ impl ConversionOptions {
|
|
|
410
416
|
apply!(strip_tags);
|
|
411
417
|
apply!(preserve_tags);
|
|
412
418
|
apply!(skip_images);
|
|
419
|
+
apply!(link_style);
|
|
413
420
|
apply!(output_format);
|
|
414
421
|
apply!(include_document_structure);
|
|
415
422
|
apply!(extract_images);
|
|
@@ -13,7 +13,7 @@ pub mod validation;
|
|
|
13
13
|
pub use conversion::{ConversionOptions, ConversionOptionsUpdate};
|
|
14
14
|
pub use preprocessing::{PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset};
|
|
15
15
|
pub use validation::{
|
|
16
|
-
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
16
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
17
17
|
};
|
|
18
18
|
|
|
19
19
|
// Note: InlineImageConfig is re-exported from the inline_images module,
|
|
@@ -172,6 +172,33 @@ impl HighlightStyle {
|
|
|
172
172
|
}
|
|
173
173
|
}
|
|
174
174
|
|
|
175
|
+
/// How links and images are written in the Markdown output.
///
/// `Inline` keeps the destination next to the text (`[text](url)`), while
/// `Reference` emits `[text][1]` and gathers the definitions at the end.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum LinkStyle {
    /// `[text](url)`. This is the default variant.
    #[default]
    Inline,
    /// `[text][1]`, with `[1]: url` at end of document.
    Reference,
}
|
|
187
|
+
|
|
188
|
+
impl LinkStyle {
|
|
189
|
+
/// Parse a link style from a string.
|
|
190
|
+
///
|
|
191
|
+
/// Accepts "reference" or defaults to Inline.
|
|
192
|
+
/// Input is normalized (lowercased, alphanumeric only).
|
|
193
|
+
#[must_use]
|
|
194
|
+
pub fn parse(value: &str) -> Self {
|
|
195
|
+
match normalize_token(value).as_str() {
|
|
196
|
+
"reference" => Self::Reference,
|
|
197
|
+
_ => Self::Inline,
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
175
202
|
/// Output format for conversion.
|
|
176
203
|
///
|
|
177
204
|
/// Specifies the target markup language format for the conversion output.
|
|
@@ -215,7 +242,8 @@ pub(crate) fn normalize_token(value: &str) -> String {
|
|
|
215
242
|
#[cfg(any(feature = "serde", feature = "metadata"))]
|
|
216
243
|
mod serde_impls {
|
|
217
244
|
use super::{
|
|
218
|
-
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat,
|
|
245
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat,
|
|
246
|
+
WhitespaceMode,
|
|
219
247
|
};
|
|
220
248
|
use serde::{Deserialize, Serialize, Serializer};
|
|
221
249
|
|
|
@@ -239,6 +267,7 @@ mod serde_impls {
|
|
|
239
267
|
impl_deserialize_from_parse!(NewlineStyle, NewlineStyle::parse);
|
|
240
268
|
impl_deserialize_from_parse!(CodeBlockStyle, CodeBlockStyle::parse);
|
|
241
269
|
impl_deserialize_from_parse!(HighlightStyle, HighlightStyle::parse);
|
|
270
|
+
impl_deserialize_from_parse!(LinkStyle, LinkStyle::parse);
|
|
242
271
|
impl_deserialize_from_parse!(OutputFormat, OutputFormat::parse);
|
|
243
272
|
|
|
244
273
|
// Serialize implementations that convert enum variants to their string representations
|
|
@@ -324,6 +353,19 @@ mod serde_impls {
|
|
|
324
353
|
}
|
|
325
354
|
}
|
|
326
355
|
|
|
356
|
+
impl Serialize for LinkStyle {
|
|
357
|
+
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
|
358
|
+
where
|
|
359
|
+
S: Serializer,
|
|
360
|
+
{
|
|
361
|
+
let s = match self {
|
|
362
|
+
Self::Inline => "inline",
|
|
363
|
+
Self::Reference => "reference",
|
|
364
|
+
};
|
|
365
|
+
serializer.serialize_str(s)
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
|
|
327
369
|
impl Serialize for OutputFormat {
|
|
328
370
|
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
|
329
371
|
where
|
|
@@ -226,7 +226,8 @@ fn make_node_id(node_type: &str, text: &str, index: usize) -> String {
|
|
|
226
226
|
let mut hasher = DefaultHasher::new();
|
|
227
227
|
node_type.hash(&mut hasher);
|
|
228
228
|
// Only hash a prefix of the text to keep cost bounded.
|
|
229
|
-
text
|
|
229
|
+
let end = crate::converter::utility::content::floor_char_boundary(text, text.len().min(64));
|
|
230
|
+
text[..end].hash(&mut hasher);
|
|
230
231
|
index.hash(&mut hasher);
|
|
231
232
|
let digest = hasher.finish();
|
|
232
233
|
format!("{node_type}-{digest:016x}")
|
|
@@ -347,7 +347,8 @@ impl StructureCollector {
|
|
|
347
347
|
|
|
348
348
|
let mut hasher = DefaultHasher::new();
|
|
349
349
|
node_type.hash(&mut hasher);
|
|
350
|
-
text
|
|
350
|
+
let end = crate::converter::utility::content::floor_char_boundary(text, text.len().min(64));
|
|
351
|
+
text[..end].hash(&mut hasher);
|
|
351
352
|
index.hash(&mut hasher);
|
|
352
353
|
let digest = hasher.finish();
|
|
353
354
|
format!("{node_type}-{digest:016x}")
|
|
@@ -591,6 +591,30 @@ fn q_element_produces_quotes() {
|
|
|
591
591
|
assert!(result.contains(r#""hello""#), "q element should add quotes: {result}");
|
|
592
592
|
}
|
|
593
593
|
|
|
594
|
+
#[test]
fn test_wikipedia_back_reference_caret_normalized() {
    // Input mimics a Wikipedia back-reference: a `^` link pointing at `#cite_ref-N`.
    // The test expects the caret to come out as ↑ so the link text does not
    // resemble Markdown footnote syntax.
    let markup = r##"<p>Some text<sup><a href="#cite_ref-1">^</a></sup> more text</p>"##;
    let result = convert(markup, None).unwrap();

    let has_arrow_link = result.contains("[↑](#cite_ref-1)");
    assert!(
        has_arrow_link,
        "Back-reference caret should be normalized to ↑: {result}"
    );

    let has_footnote_lookalike = result.contains("[^]");
    assert!(
        !has_footnote_lookalike,
        "Should not produce [^] which looks like footnote syntax: {result}"
    );
}
|
|
609
|
+
|
|
610
|
+
#[test]
fn test_regular_caret_link_not_affected() {
    // A `^` link whose href is an ordinary URL (not a `#` anchor) is expected
    // to keep its caret text.
    let markup = r#"<a href="https://example.com">^</a>"#;
    let result = convert(markup, None).unwrap();
    let caret_kept = result.contains("[^]");
    assert!(caret_kept, "Non-anchor caret links should keep ^: {result}");
}
|
|
617
|
+
|
|
594
618
|
fn convert(
|
|
595
619
|
html: &str,
|
|
596
620
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
|
|
3
|
+
use html_to_markdown_rs::{ConversionOptions, LinkStyle};
|
|
4
|
+
|
|
5
|
+
fn convert(html: &str, options: Option<ConversionOptions>) -> String {
|
|
6
|
+
html_to_markdown_rs::convert(html, options)
|
|
7
|
+
.unwrap()
|
|
8
|
+
.content
|
|
9
|
+
.unwrap_or_default()
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
fn ref_options() -> ConversionOptions {
|
|
13
|
+
ConversionOptions {
|
|
14
|
+
link_style: LinkStyle::Reference,
|
|
15
|
+
..Default::default()
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
#[test]
fn basic_reference_link() {
    // A plain anchor is expected to become `[text][1]` plus a `[1]: url` definition.
    let markup = r#"<a href="https://example.com">Click here</a>"#;
    let result = convert(markup, Some(ref_options()));

    let has_inline_ref = result.contains("[Click here][1]");
    assert!(has_inline_ref, "Expected reference-style link, got: {result}");

    let has_definition = result.contains("[1]: https://example.com");
    assert!(has_definition, "Expected reference definition, got: {result}");
}
|
|
32
|
+
|
|
33
|
+
#[test]
fn reference_link_with_title() {
    // The `title` attribute is expected to appear, quoted, in the definition line.
    let markup = r#"<a href="https://example.com" title="Example">Click</a>"#;
    let result = convert(markup, Some(ref_options()));

    let has_inline_ref = result.contains("[Click][1]");
    assert!(has_inline_ref, "Expected reference-style link, got: {result}");

    let has_titled_definition = result.contains(r#"[1]: https://example.com "Example""#);
    assert!(
        has_titled_definition,
        "Expected reference definition with title, got: {result}"
    );
}
|
|
46
|
+
|
|
47
|
+
#[test]
fn url_deduplication() {
    // Two links to the same URL are expected to share reference number 1.
    let markup = r#"<a href="https://example.com">First</a> <a href="https://example.com">Second</a>"#;
    let result = convert(markup, Some(ref_options()));

    let first_uses_one = result.contains("[First][1]");
    assert!(first_uses_one, "Expected first link with ref 1, got: {result}");

    let second_reuses_one = result.contains("[Second][1]");
    assert!(
        second_reuses_one,
        "Expected second link reusing ref 1, got: {result}"
    );

    // The shared definition must be emitted only once.
    let definitions = result.matches("[1]: https://example.com").count();
    assert_eq!(definitions, 1, "Expected exactly one definition, got: {result}");
}
|
|
63
|
+
|
|
64
|
+
#[test]
fn different_titles_different_refs() {
    // Same URL, different titles: each pair is expected to get its own number.
    let markup =
        r#"<a href="https://example.com" title="A">First</a> <a href="https://example.com" title="B">Second</a>"#;
    let result = convert(markup, Some(ref_options()));

    let first_is_one = result.contains("[First][1]");
    assert!(first_is_one, "Expected first link ref 1, got: {result}");

    let second_is_two = result.contains("[Second][2]");
    assert!(
        second_is_two,
        "Expected second link ref 2 (different title), got: {result}"
    );
}
|
|
78
|
+
|
|
79
|
+
#[test]
fn image_reference_style() {
    // Images are expected to use `![alt][n]` with a matching definition.
    let markup = r#"<img src="https://example.com/img.png" alt="A photo">"#;
    let result = convert(markup, Some(ref_options()));

    let has_image_ref = result.contains("![A photo][1]");
    assert!(has_image_ref, "Expected reference-style image, got: {result}");

    let has_definition = result.contains("[1]: https://example.com/img.png");
    assert!(
        has_definition,
        "Expected image reference definition, got: {result}"
    );
}
|
|
92
|
+
|
|
93
|
+
#[test]
fn mixed_links_and_images_share_numbering() {
    // A link followed by an image: the test expects one shared counter,
    // so the link is 1 and the image is 2.
    let markup = r#"<a href="https://a.com">Link</a><img src="https://b.com/img.png" alt="Img">"#;
    let result = convert(markup, Some(ref_options()));

    let link_is_first = result.contains("[Link][1]");
    assert!(link_is_first, "Expected link as ref 1, got: {result}");

    let image_is_second = result.contains("![Img][2]");
    assert!(image_is_second, "Expected image as ref 2, got: {result}");
}
|
|
100
|
+
|
|
101
|
+
#[test]
fn autolinks_unaffected() {
    // Reference style plus autolinks: a link whose text equals its URL is
    // expected to still render in `<url>` form.
    let mut options = ref_options();
    options.autolinks = true;

    let markup = r#"<a href="https://example.com">https://example.com</a>"#;
    let result = convert(markup, Some(options));

    let rendered_as_autolink = result.contains("<https://example.com>");
    assert!(
        rendered_as_autolink,
        "Autolinks should not be affected by reference style, got: {result}"
    );
}
|
|
116
|
+
|
|
117
|
+
#[test]
fn default_inline_unchanged() {
    // With no options supplied, links are expected to stay inline.
    let markup = r#"<a href="https://example.com">Click</a>"#;
    let result = convert(markup, None);

    let is_inline = result.contains("[Click](https://example.com)");
    assert!(is_inline, "Default should use inline style, got: {result}");
}
|
|
126
|
+
|
|
127
|
+
#[test]
fn multiple_paragraphs_references_at_end() {
    let html = r#"<p><a href="https://a.com">A</a></p><p><a href="https://b.com">B</a></p>"#;
    let result = convert(html, Some(ref_options()));

    assert!(
        result.contains("[A][1]"),
        "Expected first paragraph link as ref 1, got: {result}"
    );

    // Measure against the LAST in-text reference. Comparing only with the first
    // paragraph would still pass if definitions were emitted between paragraphs.
    let content_end = result.find("[B][2]").expect("Should have inline ref");
    let ref_section_start = result.find("[1]:").expect("Should have ref section");
    assert!(
        ref_section_start > content_end,
        "Reference section should be after content, got: {result}"
    );

    // Both definitions belong to the trailing section, in numbering order.
    let second_definition = result.find("[2]:").expect("Should have second definition");
    assert!(
        second_definition > ref_section_start,
        "Definitions should be listed in order, got: {result}"
    );
}
|
|
139
|
+
|
|
140
|
+
#[test]
fn empty_href_no_reference() {
    // An anchor with an empty href is expected to produce no definition line.
    let markup = r#"<a href="">Empty</a>"#;
    let result = convert(markup, Some(ref_options()));

    let has_definition = result.contains("[1]:");
    assert!(
        !has_definition,
        "Empty href should not create reference, got: {result}"
    );
}
|
|
150
|
+
|
|
151
|
+
#[test]
fn title_with_quotes_escaped() {
    // Double quotes inside the title are expected to be backslash-escaped in
    // the definition line.
    let markup = r#"<a href="https://example.com" title='Say "hello"'>Link</a>"#;
    let result = convert(markup, Some(ref_options()));

    let has_escaped_title = result.contains(r#"[1]: https://example.com "Say \"hello\"""#);
    assert!(
        has_escaped_title,
        "Quotes in title should be escaped, got: {result}"
    );
}
|
|
160
|
+
|
|
161
|
+
#[test]
fn media_elements_reference_style() {
    // A `<video>` source is expected to be registered as a reference definition.
    let markup = r#"<video src="https://example.com/video.mp4"></video>"#;
    let result = convert(markup, Some(ref_options()));

    let has_definition = result.contains("[1]: https://example.com/video.mp4");
    assert!(
        has_definition,
        "Video should use reference style, got: {result}"
    );
}
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.0
|
|
4
|
+
version: 3.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-04-01 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -142,6 +142,7 @@ files:
|
|
|
142
142
|
- vendor/html-to-markdown-rs/src/converter/mod.rs
|
|
143
143
|
- vendor/html-to-markdown-rs/src/converter/plain_text.rs
|
|
144
144
|
- vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs
|
|
145
|
+
- vendor/html-to-markdown-rs/src/converter/reference_collector.rs
|
|
145
146
|
- vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs
|
|
146
147
|
- vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs
|
|
147
148
|
- vendor/html-to-markdown-rs/src/converter/semantic/figure.rs
|
|
@@ -224,6 +225,7 @@ files:
|
|
|
224
225
|
- vendor/html-to-markdown-rs/tests/lists_test.rs
|
|
225
226
|
- vendor/html-to-markdown-rs/tests/plain_output_test.rs
|
|
226
227
|
- vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
|
|
228
|
+
- vendor/html-to-markdown-rs/tests/reference_links_test.rs
|
|
227
229
|
- vendor/html-to-markdown-rs/tests/skip_images_test.rs
|
|
228
230
|
- vendor/html-to-markdown-rs/tests/tables_test.rs
|
|
229
231
|
- vendor/html-to-markdown-rs/tests/test_custom_elements.rs
|