html-to-markdown 3.0.2 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/html-to-markdown-rb/native/Cargo.toml +1 -1
- data/lib/html_to_markdown/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/Cargo.toml +1 -1
- data/vendor/html-to-markdown-rs/src/converter/context.rs +5 -0
- data/vendor/html-to-markdown-rs/src/converter/handlers/graphic.rs +38 -14
- data/vendor/html-to-markdown-rs/src/converter/handlers/image.rs +56 -17
- data/vendor/html-to-markdown-rs/src/converter/handlers/link.rs +11 -0
- data/vendor/html-to-markdown-rs/src/converter/inline/link.rs +17 -0
- data/vendor/html-to-markdown-rs/src/converter/main.rs +25 -0
- data/vendor/html-to-markdown-rs/src/converter/media/embedded.rs +42 -15
- data/vendor/html-to-markdown-rs/src/converter/mod.rs +1 -0
- data/vendor/html-to-markdown-rs/src/converter/reference_collector.rs +69 -0
- data/vendor/html-to-markdown-rs/src/exports.rs +3 -2
- data/vendor/html-to-markdown-rs/src/options/conversion.rs +8 -1
- data/vendor/html-to-markdown-rs/src/options/mod.rs +1 -1
- data/vendor/html-to-markdown-rs/src/options/validation.rs +43 -1
- data/vendor/html-to-markdown-rs/tests/integration_test.rs +24 -0
- data/vendor/html-to-markdown-rs/tests/reference_links_test.rs +169 -0
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c23b51454716c4f5224bc9a0b6cfcfcf3f9935709379395662d9d89cab96f223
|
|
4
|
+
data.tar.gz: '0878f8bad06ca970013d87f6064150bed2db8b5e12d087474acaa4dd17a00559'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e21bd6d2ec9cbd40df454f2b441cb2da333b1c73a062686f830c4bd3368dad2dacec3bb0953f5c8902ad2ab411453c690597d64f0c86103d65b71438c647a7f1
|
|
7
|
+
data.tar.gz: 38cf61f5035e6becae227f4117f10208eb9b0ca2d99b805b6e8feefdc8bf2611e44605c967218ea69892df3af8a417026fb70e20cb3ab9a9e28771c3ecc723c9
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html-to-markdown (3.0.2)
|
|
4
|
+
html-to-markdown (3.1.0)
|
|
5
5
|
rb_sys (>= 0.9, < 1.0)
|
|
6
6
|
|
|
7
7
|
GEM
|
|
@@ -153,7 +153,7 @@ CHECKSUMS
|
|
|
153
153
|
ffi (1.17.4-arm64-darwin) sha256=19071aaf1419251b0a46852abf960e77330a3b334d13a4ab51d58b31a937001b
|
|
154
154
|
ffi (1.17.4-x86_64-linux-gnu) sha256=9d3db14c2eae074b382fa9c083fe95aec6e0a1451da249eab096c34002bc752d
|
|
155
155
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
156
|
-
html-to-markdown (3.0.2)
|
|
156
|
+
html-to-markdown (3.1.0)
|
|
157
157
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
158
158
|
json (2.19.3) sha256=289b0bb53052a1fa8c34ab33cc750b659ba14a5c45f3fcf4b18762dc67c78646
|
|
159
159
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
data/README.md
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/html-to-markdown?label=Java&color=007ec6" alt="Java">
|
|
19
19
|
</a>
|
|
20
20
|
<a href="https://pkg.go.dev/github.com/kreuzberg-dev/html-to-markdown/packages/go/v3/htmltomarkdown">
|
|
21
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.0.2" alt="Go">
|
|
21
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/html-to-markdown?label=Go&color=007ec6&filter=v3.1.0" alt="Go">
|
|
22
22
|
</a>
|
|
23
23
|
<a href="https://www.nuget.org/packages/KreuzbergDev.HtmlToMarkdown/">
|
|
24
24
|
<img src="https://img.shields.io/nuget/v/KreuzbergDev.HtmlToMarkdown?label=C%23&color=007ec6" alt="C#">
|
data/vendor/Cargo.toml
CHANGED
|
@@ -12,6 +12,7 @@ use std::rc::Rc;
|
|
|
12
12
|
#[cfg(feature = "inline-images")]
|
|
13
13
|
use crate::inline_images::InlineImageCollector;
|
|
14
14
|
|
|
15
|
+
use crate::converter::reference_collector::ReferenceCollectorHandle;
|
|
15
16
|
use crate::types::structure_collector::StructureCollectorHandle;
|
|
16
17
|
|
|
17
18
|
/// Handle type for inline image collector when feature is enabled.
|
|
@@ -105,6 +106,8 @@ pub struct Context {
|
|
|
105
106
|
///
|
|
106
107
|
/// Populated when `options.include_document_structure == true`.
|
|
107
108
|
pub(crate) structure_collector: Option<StructureCollectorHandle>,
|
|
109
|
+
/// Optional reference collector for reference-style links.
|
|
110
|
+
pub(crate) reference_collector: Option<ReferenceCollectorHandle>,
|
|
108
111
|
}
|
|
109
112
|
|
|
110
113
|
impl Context {
|
|
@@ -122,6 +125,7 @@ impl Context {
|
|
|
122
125
|
#[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
|
|
123
126
|
#[cfg(not(feature = "visitor"))] _visitor: Option<()>,
|
|
124
127
|
structure_collector: Option<StructureCollectorHandle>,
|
|
128
|
+
reference_collector: Option<ReferenceCollectorHandle>,
|
|
125
129
|
) -> Self {
|
|
126
130
|
#[cfg(feature = "metadata")]
|
|
127
131
|
let (
|
|
@@ -186,6 +190,7 @@ impl Context {
|
|
|
186
190
|
#[cfg(feature = "visitor")]
|
|
187
191
|
visitor_error: Rc::new(RefCell::new(None)),
|
|
188
192
|
structure_collector,
|
|
193
|
+
reference_collector,
|
|
189
194
|
}
|
|
190
195
|
}
|
|
191
196
|
}
|
|
@@ -128,6 +128,8 @@ pub fn handle_graphic(
|
|
|
128
128
|
&alt,
|
|
129
129
|
title.as_deref(),
|
|
130
130
|
should_use_alt_text,
|
|
131
|
+
options.link_style,
|
|
132
|
+
ctx.reference_collector.as_ref(),
|
|
131
133
|
)),
|
|
132
134
|
VisitResult::Custom(custom) => Some(custom),
|
|
133
135
|
VisitResult::Skip => None,
|
|
@@ -145,6 +147,8 @@ pub fn handle_graphic(
|
|
|
145
147
|
&alt,
|
|
146
148
|
title.as_deref(),
|
|
147
149
|
should_use_alt_text,
|
|
150
|
+
options.link_style,
|
|
151
|
+
ctx.reference_collector.as_ref(),
|
|
148
152
|
))
|
|
149
153
|
};
|
|
150
154
|
|
|
@@ -154,6 +158,8 @@ pub fn handle_graphic(
|
|
|
154
158
|
&alt,
|
|
155
159
|
title.as_deref(),
|
|
156
160
|
should_use_alt_text,
|
|
161
|
+
options.link_style,
|
|
162
|
+
ctx.reference_collector.as_ref(),
|
|
157
163
|
));
|
|
158
164
|
|
|
159
165
|
if !options.skip_images {
|
|
@@ -189,21 +195,39 @@ pub fn handle_graphic(
|
|
|
189
195
|
///
|
|
190
196
|
/// If `use_alt_only` is true, returns just the alt text.
|
|
191
197
|
/// Otherwise returns the full `![alt](src "title")` syntax.
|
|
192
|
-
fn format_graphic_markdown(
|
|
198
|
+
fn format_graphic_markdown(
|
|
199
|
+
src: &str,
|
|
200
|
+
alt: &str,
|
|
201
|
+
title: Option<&str>,
|
|
202
|
+
use_alt_only: bool,
|
|
203
|
+
link_style: crate::options::validation::LinkStyle,
|
|
204
|
+
reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
|
|
205
|
+
) -> String {
|
|
193
206
|
if use_alt_only {
|
|
194
|
-
alt.to_string()
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
buf.push_str("
|
|
203
|
-
buf.push_str(
|
|
204
|
-
buf.push('
|
|
207
|
+
return alt.to_string();
|
|
208
|
+
}
|
|
209
|
+
if link_style == crate::options::validation::LinkStyle::Reference {
|
|
210
|
+
if let Some(collector) = reference_collector {
|
|
211
|
+
let ref_num = collector.borrow_mut().get_or_insert(src, title);
|
|
212
|
+
let mut buf = String::with_capacity(alt.len() + 10);
|
|
213
|
+
buf.push_str("![");
|
|
214
|
+
buf.push_str(alt);
|
|
215
|
+
buf.push_str("][");
|
|
216
|
+
buf.push_str(&ref_num.to_string());
|
|
217
|
+
buf.push(']');
|
|
218
|
+
return buf;
|
|
205
219
|
}
|
|
206
|
-
buf.push(')');
|
|
207
|
-
buf
|
|
208
220
|
}
|
|
221
|
+
let mut buf = String::with_capacity(src.len() + alt.len() + 10);
|
|
222
|
+
buf.push_str("![");
|
|
223
|
+
buf.push_str(alt);
|
|
224
|
+
buf.push_str("](");
|
|
225
|
+
buf.push_str(src);
|
|
226
|
+
if let Some(title_text) = title {
|
|
227
|
+
buf.push_str(" \"");
|
|
228
|
+
buf.push_str(title_text);
|
|
229
|
+
buf.push('"');
|
|
230
|
+
}
|
|
231
|
+
buf.push(')');
|
|
232
|
+
buf
|
|
209
233
|
}
|
|
@@ -146,7 +146,14 @@ pub fn handle_img(
|
|
|
146
146
|
visitor.visit_image(&node_ctx, &src, &alt, title.as_deref())
|
|
147
147
|
};
|
|
148
148
|
match visit_result {
|
|
149
|
-
VisitResult::Continue => Some(format_image_markdown(
|
|
149
|
+
VisitResult::Continue => Some(format_image_markdown(
|
|
150
|
+
&src,
|
|
151
|
+
&alt,
|
|
152
|
+
title.as_deref(),
|
|
153
|
+
should_use_alt_text,
|
|
154
|
+
options.link_style,
|
|
155
|
+
ctx.reference_collector.as_ref(),
|
|
156
|
+
)),
|
|
150
157
|
VisitResult::Custom(custom) => Some(custom),
|
|
151
158
|
VisitResult::Skip => None,
|
|
152
159
|
VisitResult::Error(err) => {
|
|
@@ -158,11 +165,25 @@ pub fn handle_img(
|
|
|
158
165
|
VisitResult::PreserveHtml => Some(serialize_node(node_handle, parser)),
|
|
159
166
|
}
|
|
160
167
|
} else {
|
|
161
|
-
Some(format_image_markdown(
|
|
168
|
+
Some(format_image_markdown(
|
|
169
|
+
&src,
|
|
170
|
+
&alt,
|
|
171
|
+
title.as_deref(),
|
|
172
|
+
should_use_alt_text,
|
|
173
|
+
options.link_style,
|
|
174
|
+
ctx.reference_collector.as_ref(),
|
|
175
|
+
))
|
|
162
176
|
};
|
|
163
177
|
|
|
164
178
|
#[cfg(not(feature = "visitor"))]
|
|
165
|
-
let image_output = Some(format_image_markdown(
|
|
179
|
+
let image_output = Some(format_image_markdown(
|
|
180
|
+
&src,
|
|
181
|
+
&alt,
|
|
182
|
+
title.as_deref(),
|
|
183
|
+
should_use_alt_text,
|
|
184
|
+
options.link_style,
|
|
185
|
+
ctx.reference_collector.as_ref(),
|
|
186
|
+
));
|
|
166
187
|
|
|
167
188
|
// Only output image if skip_images is not enabled
|
|
168
189
|
if !options.skip_images {
|
|
@@ -204,21 +225,39 @@ pub fn handle_img(
|
|
|
204
225
|
///
|
|
205
226
|
/// If `use_alt_only` is true, returns just the alt text.
|
|
206
227
|
/// Otherwise returns the full `![alt](src "title")` syntax.
|
|
207
|
-
fn format_image_markdown(
|
|
228
|
+
fn format_image_markdown(
|
|
229
|
+
src: &str,
|
|
230
|
+
alt: &str,
|
|
231
|
+
title: Option<&str>,
|
|
232
|
+
use_alt_only: bool,
|
|
233
|
+
link_style: crate::options::validation::LinkStyle,
|
|
234
|
+
reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
|
|
235
|
+
) -> String {
|
|
208
236
|
if use_alt_only {
|
|
209
|
-
alt.to_string()
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
buf.push_str("
|
|
218
|
-
buf.push_str(
|
|
219
|
-
buf.push('
|
|
237
|
+
return alt.to_string();
|
|
238
|
+
}
|
|
239
|
+
if link_style == crate::options::validation::LinkStyle::Reference {
|
|
240
|
+
if let Some(collector) = reference_collector {
|
|
241
|
+
let ref_num = collector.borrow_mut().get_or_insert(src, title);
|
|
242
|
+
let mut buf = String::with_capacity(alt.len() + 10);
|
|
243
|
+
buf.push_str("![");
|
|
244
|
+
buf.push_str(alt);
|
|
245
|
+
buf.push_str("][");
|
|
246
|
+
buf.push_str(&ref_num.to_string());
|
|
247
|
+
buf.push(']');
|
|
248
|
+
return buf;
|
|
220
249
|
}
|
|
221
|
-
buf.push(')');
|
|
222
|
-
buf
|
|
223
250
|
}
|
|
251
|
+
let mut buf = String::with_capacity(src.len() + alt.len() + 10);
|
|
252
|
+
buf.push_str("![");
|
|
253
|
+
buf.push_str(alt);
|
|
254
|
+
buf.push_str("](");
|
|
255
|
+
buf.push_str(src);
|
|
256
|
+
if let Some(title_text) = title {
|
|
257
|
+
buf.push_str(" \"");
|
|
258
|
+
buf.push_str(title_text);
|
|
259
|
+
buf.push('"');
|
|
260
|
+
}
|
|
261
|
+
buf.push(')');
|
|
262
|
+
buf
|
|
224
263
|
}
|
|
@@ -115,6 +115,7 @@ pub fn handle_link(
|
|
|
115
115
|
title.as_deref(),
|
|
116
116
|
raw_text.as_str(),
|
|
117
117
|
options,
|
|
118
|
+
ctx.reference_collector.as_ref(),
|
|
118
119
|
);
|
|
119
120
|
push_heading(output, ctx, options, heading_level, link_buffer.as_str());
|
|
120
121
|
return;
|
|
@@ -190,6 +191,13 @@ pub fn handle_link(
|
|
|
190
191
|
label = href.clone();
|
|
191
192
|
}
|
|
192
193
|
|
|
194
|
+
// Normalize Wikipedia-style back-reference links: <a href="#cite_ref-N">^</a>
|
|
195
|
+
// These produce `[^](#cite_ref-N)` which is confusing (looks like a footnote).
|
|
196
|
+
// Convert to `[↑](#cite_ref-N)` to avoid ambiguity with markdown footnote syntax.
|
|
197
|
+
if label == "^" && href.starts_with('#') {
|
|
198
|
+
label = "↑".to_string();
|
|
199
|
+
}
|
|
200
|
+
|
|
193
201
|
let escaped_label = escape_link_label(&label);
|
|
194
202
|
|
|
195
203
|
#[cfg(feature = "visitor")]
|
|
@@ -226,6 +234,7 @@ pub fn handle_link(
|
|
|
226
234
|
title.as_deref(),
|
|
227
235
|
label.as_str(),
|
|
228
236
|
options,
|
|
237
|
+
ctx.reference_collector.as_ref(),
|
|
229
238
|
);
|
|
230
239
|
Some(buf)
|
|
231
240
|
}
|
|
@@ -248,6 +257,7 @@ pub fn handle_link(
|
|
|
248
257
|
title.as_deref(),
|
|
249
258
|
label.as_str(),
|
|
250
259
|
options,
|
|
260
|
+
ctx.reference_collector.as_ref(),
|
|
251
261
|
);
|
|
252
262
|
Some(buf)
|
|
253
263
|
};
|
|
@@ -262,6 +272,7 @@ pub fn handle_link(
|
|
|
262
272
|
title.as_deref(),
|
|
263
273
|
label.as_str(),
|
|
264
274
|
options,
|
|
275
|
+
ctx.reference_collector.as_ref(),
|
|
265
276
|
);
|
|
266
277
|
Some(buf)
|
|
267
278
|
};
|
|
@@ -145,6 +145,7 @@ pub(crate) fn handle(
|
|
|
145
145
|
title.as_deref(),
|
|
146
146
|
raw_text.as_str(),
|
|
147
147
|
options,
|
|
148
|
+
ctx.reference_collector.as_ref(),
|
|
148
149
|
);
|
|
149
150
|
push_heading(output, ctx, options, heading_level, link_buffer.as_str());
|
|
150
151
|
return;
|
|
@@ -262,6 +263,7 @@ pub(crate) fn handle(
|
|
|
262
263
|
title.as_deref(),
|
|
263
264
|
label.as_str(),
|
|
264
265
|
options,
|
|
266
|
+
ctx.reference_collector.as_ref(),
|
|
265
267
|
);
|
|
266
268
|
Some(buf)
|
|
267
269
|
}
|
|
@@ -284,6 +286,7 @@ pub(crate) fn handle(
|
|
|
284
286
|
title.as_deref(),
|
|
285
287
|
label.as_str(),
|
|
286
288
|
options,
|
|
289
|
+
ctx.reference_collector.as_ref(),
|
|
287
290
|
);
|
|
288
291
|
Some(buf)
|
|
289
292
|
};
|
|
@@ -298,6 +301,7 @@ pub(crate) fn handle(
|
|
|
298
301
|
title.as_deref(),
|
|
299
302
|
label.as_str(),
|
|
300
303
|
options,
|
|
304
|
+
ctx.reference_collector.as_ref(),
|
|
301
305
|
);
|
|
302
306
|
Some(buf)
|
|
303
307
|
};
|
|
@@ -363,7 +367,20 @@ pub(crate) fn append_markdown_link(
|
|
|
363
367
|
title: Option<&str>,
|
|
364
368
|
raw_text: &str,
|
|
365
369
|
options: &ConversionOptions,
|
|
370
|
+
reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
|
|
366
371
|
) {
|
|
372
|
+
if options.link_style == crate::options::validation::LinkStyle::Reference && !href.is_empty() {
|
|
373
|
+
if let Some(collector) = reference_collector {
|
|
374
|
+
let ref_num = collector.borrow_mut().get_or_insert(href, title);
|
|
375
|
+
output.push('[');
|
|
376
|
+
output.push_str(label);
|
|
377
|
+
output.push_str("][");
|
|
378
|
+
output.push_str(&ref_num.to_string());
|
|
379
|
+
output.push(']');
|
|
380
|
+
return;
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
|
|
367
384
|
output.push('[');
|
|
368
385
|
output.push_str(label);
|
|
369
386
|
output.push_str("](");
|
|
@@ -196,6 +196,14 @@ pub(crate) fn convert_html_impl(
|
|
|
196
196
|
}
|
|
197
197
|
}
|
|
198
198
|
|
|
199
|
+
let reference_collector = if options.link_style == crate::options::LinkStyle::Reference {
|
|
200
|
+
Some(std::rc::Rc::new(std::cell::RefCell::new(
|
|
201
|
+
crate::converter::reference_collector::ReferenceCollector::new(),
|
|
202
|
+
)))
|
|
203
|
+
} else {
|
|
204
|
+
None
|
|
205
|
+
};
|
|
206
|
+
|
|
199
207
|
#[cfg(all(feature = "metadata", feature = "visitor"))]
|
|
200
208
|
let ctx = Context::new(
|
|
201
209
|
options,
|
|
@@ -203,6 +211,7 @@ pub(crate) fn convert_html_impl(
|
|
|
203
211
|
metadata_collector,
|
|
204
212
|
visitor,
|
|
205
213
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
214
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
206
215
|
);
|
|
207
216
|
#[cfg(all(feature = "metadata", not(feature = "visitor")))]
|
|
208
217
|
let ctx = Context::new(
|
|
@@ -211,6 +220,7 @@ pub(crate) fn convert_html_impl(
|
|
|
211
220
|
metadata_collector,
|
|
212
221
|
_visitor,
|
|
213
222
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
223
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
214
224
|
);
|
|
215
225
|
#[cfg(all(not(feature = "metadata"), feature = "visitor"))]
|
|
216
226
|
let ctx = Context::new(
|
|
@@ -219,6 +229,7 @@ pub(crate) fn convert_html_impl(
|
|
|
219
229
|
_metadata_collector,
|
|
220
230
|
visitor,
|
|
221
231
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
232
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
222
233
|
);
|
|
223
234
|
#[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
|
|
224
235
|
let ctx = Context::new(
|
|
@@ -227,6 +238,7 @@ pub(crate) fn convert_html_impl(
|
|
|
227
238
|
_metadata_collector,
|
|
228
239
|
_visitor,
|
|
229
240
|
structure_collector.as_ref().map(std::rc::Rc::clone),
|
|
241
|
+
reference_collector.as_ref().map(std::rc::Rc::clone),
|
|
230
242
|
);
|
|
231
243
|
|
|
232
244
|
for child_handle in dom.children() {
|
|
@@ -242,6 +254,19 @@ pub(crate) fn convert_html_impl(
|
|
|
242
254
|
// reference to the same collector, and Rc::try_unwrap requires exactly one reference.
|
|
243
255
|
drop(ctx);
|
|
244
256
|
|
|
257
|
+
// Append reference-style link definitions if any were collected
|
|
258
|
+
if let Some(rc) = reference_collector {
|
|
259
|
+
if let Ok(collector) = std::rc::Rc::try_unwrap(rc) {
|
|
260
|
+
let ref_section = collector.into_inner().finish();
|
|
261
|
+
if !ref_section.is_empty() {
|
|
262
|
+
let trimmed_len = output.trim_end_matches('\n').len();
|
|
263
|
+
output.truncate(trimmed_len);
|
|
264
|
+
output.push_str("\n\n");
|
|
265
|
+
output.push_str(&ref_section);
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
|
|
245
270
|
// If plain text was requested, discard the markdown output and return plain text.
|
|
246
271
|
// The full pipeline was still run above so that metadata + visitor callbacks fire.
|
|
247
272
|
if is_plain_text {
|
|
@@ -78,11 +78,20 @@ pub(crate) fn handle_audio(
|
|
|
78
78
|
};
|
|
79
79
|
|
|
80
80
|
if should_output_media_link(&src) {
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
if let Some(ref collector) = ctx.reference_collector {
|
|
82
|
+
let ref_num = collector.borrow_mut().get_or_insert(&src, None);
|
|
83
|
+
output.push('[');
|
|
84
|
+
output.push_str(&src);
|
|
85
|
+
output.push_str("][");
|
|
86
|
+
output.push_str(&ref_num.to_string());
|
|
87
|
+
output.push(']');
|
|
88
|
+
} else {
|
|
89
|
+
output.push('[');
|
|
90
|
+
output.push_str(&src);
|
|
91
|
+
output.push_str("](");
|
|
92
|
+
output.push_str(&src);
|
|
93
|
+
output.push(')');
|
|
94
|
+
}
|
|
86
95
|
if !ctx.in_paragraph && !ctx.convert_as_inline {
|
|
87
96
|
output.push_str("\n\n");
|
|
88
97
|
}
|
|
@@ -132,11 +141,20 @@ pub(crate) fn handle_video(
|
|
|
132
141
|
};
|
|
133
142
|
|
|
134
143
|
if should_output_media_link(&src) {
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
144
|
+
if let Some(ref collector) = ctx.reference_collector {
|
|
145
|
+
let ref_num = collector.borrow_mut().get_or_insert(&src, None);
|
|
146
|
+
output.push('[');
|
|
147
|
+
output.push_str(&src);
|
|
148
|
+
output.push_str("][");
|
|
149
|
+
output.push_str(&ref_num.to_string());
|
|
150
|
+
output.push(']');
|
|
151
|
+
} else {
|
|
152
|
+
output.push('[');
|
|
153
|
+
output.push_str(&src);
|
|
154
|
+
output.push_str("](");
|
|
155
|
+
output.push_str(&src);
|
|
156
|
+
output.push(')');
|
|
157
|
+
}
|
|
140
158
|
if !ctx.in_paragraph && !ctx.convert_as_inline {
|
|
141
159
|
output.push_str("\n\n");
|
|
142
160
|
}
|
|
@@ -199,11 +217,20 @@ pub(crate) fn handle_iframe(tag: &HTMLTag, output: &mut String, ctx: &Context) {
|
|
|
199
217
|
.map_or(Cow::Borrowed(""), |v| v.as_utf8_str());
|
|
200
218
|
|
|
201
219
|
if !src.is_empty() {
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
220
|
+
if let Some(ref collector) = ctx.reference_collector {
|
|
221
|
+
let ref_num = collector.borrow_mut().get_or_insert(&src, None);
|
|
222
|
+
output.push('[');
|
|
223
|
+
output.push_str(&src);
|
|
224
|
+
output.push_str("][");
|
|
225
|
+
output.push_str(&ref_num.to_string());
|
|
226
|
+
output.push(']');
|
|
227
|
+
} else {
|
|
228
|
+
output.push('[');
|
|
229
|
+
output.push_str(&src);
|
|
230
|
+
output.push_str("](");
|
|
231
|
+
output.push_str(&src);
|
|
232
|
+
output.push(')');
|
|
233
|
+
}
|
|
207
234
|
if !ctx.in_paragraph && !ctx.convert_as_inline {
|
|
208
235
|
output.push_str("\n\n");
|
|
209
236
|
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
//! Collector for reference-style link definitions.
|
|
2
|
+
|
|
3
|
+
use std::cell::RefCell;
|
|
4
|
+
use std::collections::HashMap;
|
|
5
|
+
use std::rc::Rc;
|
|
6
|
+
|
|
7
|
+
/// Shared handle for passing the collector through the conversion context.
|
|
8
|
+
pub type ReferenceCollectorHandle = Rc<RefCell<ReferenceCollector>>;
|
|
9
|
+
|
|
10
|
+
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
|
|
11
|
+
struct ReferenceKey {
|
|
12
|
+
url: String,
|
|
13
|
+
title: Option<String>,
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/// Collects link/image references during conversion and produces a reference
|
|
17
|
+
/// definitions section at the end of the document.
|
|
18
|
+
#[derive(Debug, Default)]
|
|
19
|
+
pub struct ReferenceCollector {
|
|
20
|
+
map: HashMap<ReferenceKey, usize>,
|
|
21
|
+
entries: Vec<(usize, String, Option<String>)>,
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
impl ReferenceCollector {
|
|
25
|
+
/// Create a new, empty reference collector.
|
|
26
|
+
pub fn new() -> Self {
|
|
27
|
+
Self::default()
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/// Register a URL (and optional title) and return its 1-based reference number.
|
|
31
|
+
///
|
|
32
|
+
/// If the same URL+title pair was already registered, the existing number is returned.
|
|
33
|
+
pub fn get_or_insert(&mut self, url: &str, title: Option<&str>) -> usize {
|
|
34
|
+
let key = ReferenceKey {
|
|
35
|
+
url: url.to_string(),
|
|
36
|
+
title: title.map(String::from),
|
|
37
|
+
};
|
|
38
|
+
if let Some(&num) = self.map.get(&key) {
|
|
39
|
+
return num;
|
|
40
|
+
}
|
|
41
|
+
let num = self.entries.len() + 1;
|
|
42
|
+
self.map.insert(key, num);
|
|
43
|
+
self.entries.push((num, url.to_string(), title.map(String::from)));
|
|
44
|
+
num
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/// Produce the reference definitions section.
|
|
48
|
+
///
|
|
49
|
+
/// Returns an empty string when no references were collected.
|
|
50
|
+
pub fn finish(&self) -> String {
|
|
51
|
+
if self.entries.is_empty() {
|
|
52
|
+
return String::new();
|
|
53
|
+
}
|
|
54
|
+
let mut out = String::new();
|
|
55
|
+
for (num, url, title) in &self.entries {
|
|
56
|
+
out.push('[');
|
|
57
|
+
out.push_str(&num.to_string());
|
|
58
|
+
out.push_str("]: ");
|
|
59
|
+
out.push_str(url);
|
|
60
|
+
if let Some(t) = title {
|
|
61
|
+
out.push_str(" \"");
|
|
62
|
+
out.push_str(&t.replace('"', "\\\""));
|
|
63
|
+
out.push('"');
|
|
64
|
+
}
|
|
65
|
+
out.push('\n');
|
|
66
|
+
}
|
|
67
|
+
out
|
|
68
|
+
}
|
|
69
|
+
}
|
|
@@ -18,6 +18,7 @@ pub use crate::metadata::{
|
|
|
18
18
|
};
|
|
19
19
|
|
|
20
20
|
pub use crate::options::{
|
|
21
|
-
CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle,
|
|
22
|
-
NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset,
|
|
21
|
+
CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, LinkStyle,
|
|
22
|
+
ListIndentType, NewlineStyle, OutputFormat, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset,
|
|
23
|
+
WhitespaceMode,
|
|
23
24
|
};
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::options::preprocessing::PreprocessingOptions;
|
|
6
6
|
use crate::options::validation::{
|
|
7
|
-
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
7
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
8
8
|
};
|
|
9
9
|
|
|
10
10
|
/// Main conversion options for HTML to Markdown conversion.
|
|
@@ -94,6 +94,8 @@ pub struct ConversionOptions {
|
|
|
94
94
|
pub preserve_tags: Vec<String>,
|
|
95
95
|
/// Skip conversion of `<img>` elements (omit images from output).
|
|
96
96
|
pub skip_images: bool,
|
|
97
|
+
/// Link rendering style (inline or reference).
|
|
98
|
+
pub link_style: LinkStyle,
|
|
97
99
|
/// Target output format (Markdown, plain text, etc.).
|
|
98
100
|
pub output_format: OutputFormat,
|
|
99
101
|
/// Include structured document tree in result.
|
|
@@ -142,6 +144,7 @@ impl Default for ConversionOptions {
|
|
|
142
144
|
strip_tags: Vec::new(),
|
|
143
145
|
preserve_tags: Vec::new(),
|
|
144
146
|
skip_images: false,
|
|
147
|
+
link_style: LinkStyle::default(),
|
|
145
148
|
output_format: OutputFormat::default(),
|
|
146
149
|
include_document_structure: false,
|
|
147
150
|
extract_images: false,
|
|
@@ -207,6 +210,7 @@ impl ConversionOptionsBuilder {
|
|
|
207
210
|
builder_setter!(newline_style, NewlineStyle);
|
|
208
211
|
builder_setter!(highlight_style, HighlightStyle);
|
|
209
212
|
builder_setter_into!(code_language, String);
|
|
213
|
+
builder_setter!(link_style, LinkStyle);
|
|
210
214
|
builder_setter!(autolinks, bool);
|
|
211
215
|
builder_setter!(default_title, bool);
|
|
212
216
|
builder_setter!(br_in_tables, bool);
|
|
@@ -356,6 +360,8 @@ pub struct ConversionOptionsUpdate {
|
|
|
356
360
|
pub preserve_tags: Option<Vec<String>>,
|
|
357
361
|
/// Optional override for [`ConversionOptions::skip_images`].
|
|
358
362
|
pub skip_images: Option<bool>,
|
|
363
|
+
/// Optional override for [`ConversionOptions::link_style`].
|
|
364
|
+
pub link_style: Option<LinkStyle>,
|
|
359
365
|
/// Optional override for [`ConversionOptions::output_format`].
|
|
360
366
|
pub output_format: Option<OutputFormat>,
|
|
361
367
|
/// Optional override for [`ConversionOptions::include_document_structure`].
|
|
@@ -410,6 +416,7 @@ impl ConversionOptions {
|
|
|
410
416
|
apply!(strip_tags);
|
|
411
417
|
apply!(preserve_tags);
|
|
412
418
|
apply!(skip_images);
|
|
419
|
+
apply!(link_style);
|
|
413
420
|
apply!(output_format);
|
|
414
421
|
apply!(include_document_structure);
|
|
415
422
|
apply!(extract_images);
|
|
@@ -13,7 +13,7 @@ pub mod validation;
|
|
|
13
13
|
pub use conversion::{ConversionOptions, ConversionOptionsUpdate};
|
|
14
14
|
pub use preprocessing::{PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset};
|
|
15
15
|
pub use validation::{
|
|
16
|
-
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
16
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat, WhitespaceMode,
|
|
17
17
|
};
|
|
18
18
|
|
|
19
19
|
// Note: InlineImageConfig is re-exported from the inline_images module,
|
|
@@ -172,6 +172,33 @@ impl HighlightStyle {
|
|
|
172
172
|
}
|
|
173
173
|
}
|
|
174
174
|
|
|
175
|
+
/// Link rendering style in Markdown output.
|
|
176
|
+
///
|
|
177
|
+
/// Controls whether links and images use inline `[text](url)` syntax or
|
|
178
|
+
/// reference-style `[text][1]` syntax with definitions collected at the end.
|
|
179
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
|
|
180
|
+
pub enum LinkStyle {
|
|
181
|
+
/// Inline links: `[text](url)`. Default.
|
|
182
|
+
#[default]
|
|
183
|
+
Inline,
|
|
184
|
+
/// Reference-style links: `[text][1]` with `[1]: url` at end of document.
|
|
185
|
+
Reference,
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
impl LinkStyle {
|
|
189
|
+
/// Parse a link style from a string.
|
|
190
|
+
///
|
|
191
|
+
/// Accepts "reference" or defaults to Inline.
|
|
192
|
+
/// Input is normalized (lowercased, alphanumeric only).
|
|
193
|
+
#[must_use]
|
|
194
|
+
pub fn parse(value: &str) -> Self {
|
|
195
|
+
match normalize_token(value).as_str() {
|
|
196
|
+
"reference" => Self::Reference,
|
|
197
|
+
_ => Self::Inline,
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
175
202
|
/// Output format for conversion.
|
|
176
203
|
///
|
|
177
204
|
/// Specifies the target markup language format for the conversion output.
|
|
@@ -215,7 +242,8 @@ pub(crate) fn normalize_token(value: &str) -> String {
|
|
|
215
242
|
#[cfg(any(feature = "serde", feature = "metadata"))]
|
|
216
243
|
mod serde_impls {
|
|
217
244
|
use super::{
|
|
218
|
-
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, OutputFormat,
|
|
245
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, LinkStyle, ListIndentType, NewlineStyle, OutputFormat,
|
|
246
|
+
WhitespaceMode,
|
|
219
247
|
};
|
|
220
248
|
use serde::{Deserialize, Serialize, Serializer};
|
|
221
249
|
|
|
@@ -239,6 +267,7 @@ mod serde_impls {
|
|
|
239
267
|
impl_deserialize_from_parse!(NewlineStyle, NewlineStyle::parse);
|
|
240
268
|
impl_deserialize_from_parse!(CodeBlockStyle, CodeBlockStyle::parse);
|
|
241
269
|
impl_deserialize_from_parse!(HighlightStyle, HighlightStyle::parse);
|
|
270
|
+
impl_deserialize_from_parse!(LinkStyle, LinkStyle::parse);
|
|
242
271
|
impl_deserialize_from_parse!(OutputFormat, OutputFormat::parse);
|
|
243
272
|
|
|
244
273
|
// Serialize implementations that convert enum variants to their string representations
|
|
@@ -324,6 +353,19 @@ mod serde_impls {
|
|
|
324
353
|
}
|
|
325
354
|
}
|
|
326
355
|
|
|
356
|
+
impl Serialize for LinkStyle {
|
|
357
|
+
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
|
358
|
+
where
|
|
359
|
+
S: Serializer,
|
|
360
|
+
{
|
|
361
|
+
let s = match self {
|
|
362
|
+
Self::Inline => "inline",
|
|
363
|
+
Self::Reference => "reference",
|
|
364
|
+
};
|
|
365
|
+
serializer.serialize_str(s)
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
|
|
327
369
|
impl Serialize for OutputFormat {
|
|
328
370
|
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
|
329
371
|
where
|
|
@@ -591,6 +591,30 @@ fn q_element_produces_quotes() {
|
|
|
591
591
|
assert!(result.contains(r#""hello""#), "q element should add quotes: {result}");
|
|
592
592
|
}
|
|
593
593
|
|
|
594
|
+
#[test]
|
|
595
|
+
fn test_wikipedia_back_reference_caret_normalized() {
|
|
596
|
+
// Wikipedia back-references use <a href="#cite_ref-N">^</a>
|
|
597
|
+
// The caret should be normalized to ↑ to avoid confusion with markdown footnote syntax
|
|
598
|
+
let html = r##"<p>Some text<sup><a href="#cite_ref-1">^</a></sup> more text</p>"##;
|
|
599
|
+
let result = convert(html, None).unwrap();
|
|
600
|
+
assert!(
|
|
601
|
+
result.contains("[↑](#cite_ref-1)"),
|
|
602
|
+
"Back-reference caret should be normalized to ↑: {result}"
|
|
603
|
+
);
|
|
604
|
+
assert!(
|
|
605
|
+
!result.contains("[^]"),
|
|
606
|
+
"Should not produce [^] which looks like footnote syntax: {result}"
|
|
607
|
+
);
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
#[test]
|
|
611
|
+
fn test_regular_caret_link_not_affected() {
|
|
612
|
+
// Regular links with ^ text but no # href should keep the ^
|
|
613
|
+
let html = r#"<a href="https://example.com">^</a>"#;
|
|
614
|
+
let result = convert(html, None).unwrap();
|
|
615
|
+
assert!(result.contains("[^]"), "Non-anchor caret links should keep ^: {result}");
|
|
616
|
+
}
|
|
617
|
+
|
|
594
618
|
fn convert(
|
|
595
619
|
html: &str,
|
|
596
620
|
opts: Option<html_to_markdown_rs::ConversionOptions>,
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
#![allow(missing_docs)]
|
|
2
|
+
|
|
3
|
+
use html_to_markdown_rs::{ConversionOptions, LinkStyle};
|
|
4
|
+
|
|
5
|
+
fn convert(html: &str, options: Option<ConversionOptions>) -> String {
|
|
6
|
+
html_to_markdown_rs::convert(html, options)
|
|
7
|
+
.unwrap()
|
|
8
|
+
.content
|
|
9
|
+
.unwrap_or_default()
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
fn ref_options() -> ConversionOptions {
|
|
13
|
+
ConversionOptions {
|
|
14
|
+
link_style: LinkStyle::Reference,
|
|
15
|
+
..Default::default()
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
#[test]
|
|
20
|
+
fn basic_reference_link() {
|
|
21
|
+
let html = r#"<a href="https://example.com">Click here</a>"#;
|
|
22
|
+
let result = convert(html, Some(ref_options()));
|
|
23
|
+
assert!(
|
|
24
|
+
result.contains("[Click here][1]"),
|
|
25
|
+
"Expected reference-style link, got: {result}"
|
|
26
|
+
);
|
|
27
|
+
assert!(
|
|
28
|
+
result.contains("[1]: https://example.com"),
|
|
29
|
+
"Expected reference definition, got: {result}"
|
|
30
|
+
);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
#[test]
|
|
34
|
+
fn reference_link_with_title() {
|
|
35
|
+
let html = r#"<a href="https://example.com" title="Example">Click</a>"#;
|
|
36
|
+
let result = convert(html, Some(ref_options()));
|
|
37
|
+
assert!(
|
|
38
|
+
result.contains("[Click][1]"),
|
|
39
|
+
"Expected reference-style link, got: {result}"
|
|
40
|
+
);
|
|
41
|
+
assert!(
|
|
42
|
+
result.contains(r#"[1]: https://example.com "Example""#),
|
|
43
|
+
"Expected reference definition with title, got: {result}"
|
|
44
|
+
);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
#[test]
|
|
48
|
+
fn url_deduplication() {
|
|
49
|
+
let html = r#"<a href="https://example.com">First</a> <a href="https://example.com">Second</a>"#;
|
|
50
|
+
let result = convert(html, Some(ref_options()));
|
|
51
|
+
assert!(
|
|
52
|
+
result.contains("[First][1]"),
|
|
53
|
+
"Expected first link with ref 1, got: {result}"
|
|
54
|
+
);
|
|
55
|
+
assert!(
|
|
56
|
+
result.contains("[Second][1]"),
|
|
57
|
+
"Expected second link reusing ref 1, got: {result}"
|
|
58
|
+
);
|
|
59
|
+
// Should only have one definition
|
|
60
|
+
let count = result.matches("[1]: https://example.com").count();
|
|
61
|
+
assert_eq!(count, 1, "Expected exactly one definition, got: {result}");
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
#[test]
|
|
65
|
+
fn different_titles_different_refs() {
|
|
66
|
+
let html =
|
|
67
|
+
r#"<a href="https://example.com" title="A">First</a> <a href="https://example.com" title="B">Second</a>"#;
|
|
68
|
+
let result = convert(html, Some(ref_options()));
|
|
69
|
+
assert!(
|
|
70
|
+
result.contains("[First][1]"),
|
|
71
|
+
"Expected first link ref 1, got: {result}"
|
|
72
|
+
);
|
|
73
|
+
assert!(
|
|
74
|
+
result.contains("[Second][2]"),
|
|
75
|
+
"Expected second link ref 2 (different title), got: {result}"
|
|
76
|
+
);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
#[test]
|
|
80
|
+
fn image_reference_style() {
|
|
81
|
+
let html = r#"<img src="https://example.com/img.png" alt="A photo">"#;
|
|
82
|
+
let result = convert(html, Some(ref_options()));
|
|
83
|
+
assert!(
|
|
84
|
+
result.contains("![A photo][1]"),
|
|
85
|
+
"Expected reference-style image, got: {result}"
|
|
86
|
+
);
|
|
87
|
+
assert!(
|
|
88
|
+
result.contains("[1]: https://example.com/img.png"),
|
|
89
|
+
"Expected image reference definition, got: {result}"
|
|
90
|
+
);
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
#[test]
|
|
94
|
+
fn mixed_links_and_images_share_numbering() {
|
|
95
|
+
let html = r#"<a href="https://a.com">Link</a><img src="https://b.com/img.png" alt="Img">"#;
|
|
96
|
+
let result = convert(html, Some(ref_options()));
|
|
97
|
+
assert!(result.contains("[Link][1]"), "Expected link as ref 1, got: {result}");
|
|
98
|
+
assert!(result.contains("![Img][2]"), "Expected image as ref 2, got: {result}");
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
#[test]
|
|
102
|
+
fn autolinks_unaffected() {
|
|
103
|
+
let html = r#"<a href="https://example.com">https://example.com</a>"#;
|
|
104
|
+
let options = ConversionOptions {
|
|
105
|
+
link_style: LinkStyle::Reference,
|
|
106
|
+
autolinks: true,
|
|
107
|
+
..Default::default()
|
|
108
|
+
};
|
|
109
|
+
let result = convert(html, Some(options));
|
|
110
|
+
// Autolinks should still render as <url>
|
|
111
|
+
assert!(
|
|
112
|
+
result.contains("<https://example.com>"),
|
|
113
|
+
"Autolinks should not be affected by reference style, got: {result}"
|
|
114
|
+
);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
#[test]
|
|
118
|
+
fn default_inline_unchanged() {
|
|
119
|
+
let html = r#"<a href="https://example.com">Click</a>"#;
|
|
120
|
+
let result = convert(html, None);
|
|
121
|
+
assert!(
|
|
122
|
+
result.contains("[Click](https://example.com)"),
|
|
123
|
+
"Default should use inline style, got: {result}"
|
|
124
|
+
);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
#[test]
|
|
128
|
+
fn multiple_paragraphs_references_at_end() {
|
|
129
|
+
let html = r#"<p><a href="https://a.com">A</a></p><p><a href="https://b.com">B</a></p>"#;
|
|
130
|
+
let result = convert(html, Some(ref_options()));
|
|
131
|
+
// References should be at the very end
|
|
132
|
+
let ref_section_start = result.find("[1]:").expect("Should have ref section");
|
|
133
|
+
let content_end = result.find("[A][1]").expect("Should have inline ref");
|
|
134
|
+
assert!(
|
|
135
|
+
ref_section_start > content_end,
|
|
136
|
+
"Reference section should be after content"
|
|
137
|
+
);
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
#[test]
|
|
141
|
+
fn empty_href_no_reference() {
|
|
142
|
+
let html = r#"<a href="">Empty</a>"#;
|
|
143
|
+
let result = convert(html, Some(ref_options()));
|
|
144
|
+
// Empty href should not create a reference
|
|
145
|
+
assert!(
|
|
146
|
+
!result.contains("[1]:"),
|
|
147
|
+
"Empty href should not create reference, got: {result}"
|
|
148
|
+
);
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
#[test]
|
|
152
|
+
fn title_with_quotes_escaped() {
|
|
153
|
+
let html = r#"<a href="https://example.com" title='Say "hello"'>Link</a>"#;
|
|
154
|
+
let result = convert(html, Some(ref_options()));
|
|
155
|
+
assert!(
|
|
156
|
+
result.contains(r#"[1]: https://example.com "Say \"hello\"""#),
|
|
157
|
+
"Quotes in title should be escaped, got: {result}"
|
|
158
|
+
);
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
#[test]
|
|
162
|
+
fn media_elements_reference_style() {
|
|
163
|
+
let html = r#"<video src="https://example.com/video.mp4"></video>"#;
|
|
164
|
+
let result = convert(html, Some(ref_options()));
|
|
165
|
+
assert!(
|
|
166
|
+
result.contains("[1]: https://example.com/video.mp4"),
|
|
167
|
+
"Video should use reference style, got: {result}"
|
|
168
|
+
);
|
|
169
|
+
}
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html-to-markdown
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.0
|
|
4
|
+
version: 3.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
@@ -142,6 +142,7 @@ files:
|
|
|
142
142
|
- vendor/html-to-markdown-rs/src/converter/mod.rs
|
|
143
143
|
- vendor/html-to-markdown-rs/src/converter/plain_text.rs
|
|
144
144
|
- vendor/html-to-markdown-rs/src/converter/preprocessing_helpers.rs
|
|
145
|
+
- vendor/html-to-markdown-rs/src/converter/reference_collector.rs
|
|
145
146
|
- vendor/html-to-markdown-rs/src/converter/semantic/attributes.rs
|
|
146
147
|
- vendor/html-to-markdown-rs/src/converter/semantic/definition_list.rs
|
|
147
148
|
- vendor/html-to-markdown-rs/src/converter/semantic/figure.rs
|
|
@@ -224,6 +225,7 @@ files:
|
|
|
224
225
|
- vendor/html-to-markdown-rs/tests/lists_test.rs
|
|
225
226
|
- vendor/html-to-markdown-rs/tests/plain_output_test.rs
|
|
226
227
|
- vendor/html-to-markdown-rs/tests/preprocessing_tests.rs
|
|
228
|
+
- vendor/html-to-markdown-rs/tests/reference_links_test.rs
|
|
227
229
|
- vendor/html-to-markdown-rs/tests/skip_images_test.rs
|
|
228
230
|
- vendor/html-to-markdown-rs/tests/tables_test.rs
|
|
229
231
|
- vendor/html-to-markdown-rs/tests/test_custom_elements.rs
|