inkmark 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Cargo.lock +940 -0
- data/Cargo.toml +27 -0
- data/LICENSE.txt +21 -0
- data/NOTICE +16 -0
- data/README.md +1166 -0
- data/ext/inkmark/Cargo.toml +31 -0
- data/ext/inkmark/build.rs +5 -0
- data/ext/inkmark/extconf.rb +6 -0
- data/ext/inkmark/src/autolink.rs +167 -0
- data/ext/inkmark/src/chunks_by_heading.rs +325 -0
- data/ext/inkmark/src/chunks_by_size.rs +302 -0
- data/ext/inkmark/src/document.rs +411 -0
- data/ext/inkmark/src/emoji.rs +197 -0
- data/ext/inkmark/src/handler.rs +758 -0
- data/ext/inkmark/src/heading.rs +262 -0
- data/ext/inkmark/src/highlight.rs +202 -0
- data/ext/inkmark/src/image.rs +284 -0
- data/ext/inkmark/src/lib.rs +54 -0
- data/ext/inkmark/src/link.rs +291 -0
- data/ext/inkmark/src/options.rs +231 -0
- data/ext/inkmark/src/plain_text.rs +445 -0
- data/ext/inkmark/src/scheme_filter.rs +319 -0
- data/ext/inkmark/src/stats.rs +453 -0
- data/ext/inkmark/src/tag_filter.rs +226 -0
- data/ext/inkmark/src/toc.rs +221 -0
- data/ext/inkmark/src/truncate.rs +267 -0
- data/ext/inkmark/src/url_match.rs +178 -0
- data/lib/inkmark/event.rb +342 -0
- data/lib/inkmark/native.rb +8 -0
- data/lib/inkmark/options.rb +698 -0
- data/lib/inkmark/toc.rb +40 -0
- data/lib/inkmark/version.rb +6 -0
- data/lib/inkmark.rb +711 -0
- data/sig/inkmark.rbs +219 -0
- metadata +208 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
//! Image attribute injection filter and URL matcher.
|
|
2
|
+
//!
|
|
3
|
+
//! When enabled, replaces pulldown-cmark's default image event sequence
|
|
4
|
+
//! (`Start(Tag::Image) ... End(TagEnd::Image)`) with a single `Event::Html`
|
|
5
|
+
//! carrying a hand-built `<img>` tag that includes the "modern" loading and
|
|
6
|
+
//! decoding hints:
|
|
7
|
+
//!
|
|
8
|
+
//! ```html
|
|
9
|
+
//! <img src="..." alt="..." loading="lazy" decoding="async" />
|
|
10
|
+
//! ```
|
|
11
|
+
//!
|
|
12
|
+
//! Pulldown-cmark's `Tag::Image` struct doesn't expose an "extra attributes"
|
|
13
|
+
//! field, so rewriting the Tag in place isn't enough—we have to bypass
|
|
14
|
+
//! the built-in image writer entirely and emit the HTML ourselves. Alt,
|
|
15
|
+
//! title, and URL are escaped through the same `pulldown-cmark-escape`
|
|
16
|
+
//! functions the upstream html writer uses, so the output stays byte-
|
|
17
|
+
//! compatible with what pulldown-cmark would have produced plus the two
|
|
18
|
+
//! extra attributes.
|
|
19
|
+
|
|
20
|
+
use globset::GlobSet;
|
|
21
|
+
use pulldown_cmark::{CowStr, Event, Tag, TagEnd};
|
|
22
|
+
use pulldown_cmark_escape::{escape_href, escape_html};
|
|
23
|
+
|
|
24
|
+
use crate::url_match::is_host_allowed;
|
|
25
|
+
|
|
26
|
+
/// Rewrite every image in the event stream as a self-contained `Event::Html`
|
|
27
|
+
/// carrying `<img ... loading="lazy" decoding="async">`.
|
|
28
|
+
///
|
|
29
|
+
/// We consume the input Vec to own each event, then rebuild with
|
|
30
|
+
/// `Vec::with_capacity(events.len())` so passthrough events move by value
|
|
31
|
+
/// and image events are replaced with a single Html event.
|
|
32
|
+
pub fn add_lazy_loading(events: Vec<Event<'_>>) -> Vec<Event<'_>> {
|
|
33
|
+
let mut out: Vec<Event<'_>> = Vec::with_capacity(events.len());
|
|
34
|
+
let mut iter = events.into_iter();
|
|
35
|
+
|
|
36
|
+
while let Some(event) = iter.next() {
|
|
37
|
+
match event {
|
|
38
|
+
Event::Start(Tag::Image {
|
|
39
|
+
dest_url, title, ..
|
|
40
|
+
}) => {
|
|
41
|
+
// Consume events up to the matching End(Image), accumulating
|
|
42
|
+
// alt text from Text and Code payloads. Images can contain
|
|
43
|
+
// inline formatting (e.g. ``), which
|
|
44
|
+
// produces Start(Strong)/Text/End(Strong) events; the bare
|
|
45
|
+
// text content is what we want for the alt attribute.
|
|
46
|
+
let mut alt = String::new();
|
|
47
|
+
for inner in iter.by_ref() {
|
|
48
|
+
match inner {
|
|
49
|
+
Event::End(TagEnd::Image) => break,
|
|
50
|
+
Event::Text(t) | Event::Code(t) => alt.push_str(&t),
|
|
51
|
+
_ => {}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
let html = build_img_tag(&dest_url, &alt, &title);
|
|
56
|
+
out.push(Event::Html(CowStr::Boxed(html.into_boxed_str())));
|
|
57
|
+
}
|
|
58
|
+
other => out.push(other),
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
out
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/// Drop images whose `src` host isn't in the allowlist. The whole
|
|
66
|
+
/// `Start(Image) ... End(Image)` sequence is replaced with a single
|
|
67
|
+
/// `Event::Text` carrying the image's alt text, or removed entirely
|
|
68
|
+
/// when alt is empty. Non-web URLs pass through: [`is_host_allowed`]
|
|
69
|
+
/// returns true for any URL with no parseable host.
|
|
70
|
+
///
|
|
71
|
+
/// Alt accumulation matches `add_lazy_loading`: images can contain
|
|
72
|
+
/// markdown like ``, producing
|
|
73
|
+
/// Start(Strong)/Text/End(Strong) events—we pull the raw text payloads
|
|
74
|
+
/// out and discard formatting.
|
|
75
|
+
pub fn filter_by_hosts<'a>(events: Vec<Event<'a>>, set: &GlobSet) -> Vec<Event<'a>> {
|
|
76
|
+
let mut out: Vec<Event<'a>> = Vec::with_capacity(events.len());
|
|
77
|
+
let mut iter = events.into_iter();
|
|
78
|
+
|
|
79
|
+
while let Some(event) = iter.next() {
|
|
80
|
+
match event {
|
|
81
|
+
Event::Start(Tag::Image { ref dest_url, .. }) if !is_host_allowed(dest_url, set) => {
|
|
82
|
+
let mut alt = String::new();
|
|
83
|
+
for inner in iter.by_ref() {
|
|
84
|
+
match inner {
|
|
85
|
+
Event::End(TagEnd::Image) => break,
|
|
86
|
+
Event::Text(t) | Event::Code(t) => alt.push_str(&t),
|
|
87
|
+
_ => {}
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
if !alt.is_empty() {
|
|
91
|
+
out.push(Event::Text(CowStr::Boxed(alt.into_boxed_str())));
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
other => out.push(other),
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
out
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/// Construct the `<img>` HTML string with `loading="lazy"` and
|
|
102
|
+
/// `decoding="async"` attributes. `src` is escaped as a URL (percent-
|
|
103
|
+
/// encoded where necessary); `alt` and `title` are HTML-attribute
|
|
104
|
+
/// escaped. The output matches pulldown-cmark's built-in image writer
|
|
105
|
+
/// plus the two extra hint attributes.
|
|
106
|
+
#[inline]
|
|
107
|
+
fn build_img_tag(src: &str, alt: &str, title: &str) -> String {
|
|
108
|
+
// Rough capacity estimate: base tag (~60) + src + alt + title length.
|
|
109
|
+
let mut out = String::with_capacity(60 + src.len() + alt.len() + title.len());
|
|
110
|
+
out.push_str("<img src=\"");
|
|
111
|
+
|
|
112
|
+
// escape_href percent-encodes problematic bytes and also handles HTML
|
|
113
|
+
// specials (&, <, etc.). Matches pulldown-cmark's upstream behavior.
|
|
114
|
+
let _ = escape_href(&mut out, src);
|
|
115
|
+
out.push_str("\" alt=\"");
|
|
116
|
+
let _ = escape_html(&mut out, alt);
|
|
117
|
+
out.push('"');
|
|
118
|
+
if !title.is_empty() {
|
|
119
|
+
out.push_str(" title=\"");
|
|
120
|
+
let _ = escape_html(&mut out, title);
|
|
121
|
+
out.push('"');
|
|
122
|
+
}
|
|
123
|
+
out.push_str(" loading=\"lazy\" decoding=\"async\" />");
|
|
124
|
+
out
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
#[cfg(test)]
mod tests {
    use super::{add_lazy_loading, build_img_tag, filter_by_hosts};
    use globset::{Glob, GlobSetBuilder};
    use pulldown_cmark::{CowStr, Event, LinkType, Tag, TagEnd};

    /// Build a `GlobSet` allowlist from literal glob patterns.
    fn host_set(patterns: &[&str]) -> globset::GlobSet {
        let mut b = GlobSetBuilder::new();
        for p in patterns {
            b.add(Glob::new(p).unwrap());
        }
        b.build().unwrap()
    }

    #[test]
    fn basic_tag() {
        let html = build_img_tag("img.png", "a picture", "");
        assert_eq!(
            html,
            r#"<img src="img.png" alt="a picture" loading="lazy" decoding="async" />"#
        );
    }

    #[test]
    fn with_title() {
        let html = build_img_tag("img.png", "alt", "the title");
        assert_eq!(
            html,
            r#"<img src="img.png" alt="alt" title="the title" loading="lazy" decoding="async" />"#
        );
    }

    #[test]
    fn escapes_alt_html_specials() {
        // Attempted HTML injection in alt—must come out escaped as entities.
        let html = build_img_tag("img.png", "a\"b<c>d&e", "");
        assert!(html.contains("alt=\"a&quot;b&lt;c&gt;d&amp;e\""));
    }

    #[test]
    fn escapes_url_ampersand() {
        let html = build_img_tag("img.png?a=1&b=2", "alt", "");
        // pulldown-cmark-escape writes `&` as `&amp;` in hrefs.
        assert!(html.contains("src=\"img.png?a=1&amp;b=2\""));
    }

    #[test]
    fn empty_alt_still_valid() {
        let html = build_img_tag("img.png", "", "");
        assert_eq!(
            html,
            r#"<img src="img.png" alt="" loading="lazy" decoding="async" />"#
        );
    }

    #[test]
    fn title_skipped_when_empty() {
        let html = build_img_tag("img.png", "alt", "");
        assert!(!html.contains("title="));
    }

    #[test]
    fn add_lazy_loading_collapses_image_events_into_html() {
        // Start(Image) + Text("alt") + End(Image) → single Html event with loading=
        let events = vec![
            Event::Start(Tag::Image {
                link_type: LinkType::Inline,
                dest_url: CowStr::Borrowed("photo.jpg"),
                title: CowStr::Borrowed(""),
                id: CowStr::Borrowed(""),
            }),
            Event::Text(CowStr::Borrowed("alt text")),
            Event::End(TagEnd::Image),
        ];
        let out = add_lazy_loading(events);
        assert_eq!(out.len(), 1, "should collapse to one event");
        match &out[0] {
            Event::Html(html) => {
                assert!(
                    html.contains("loading="),
                    "missing loading attribute: {html}"
                );
                assert!(html.contains("alt=\"alt text\""), "missing alt: {html}");
                assert!(html.contains("src=\"photo.jpg\""), "missing src: {html}");
            }
            other => panic!("expected Html event, got {other:?}"),
        }
    }

    #[test]
    fn filter_by_hosts_drops_disallowed_image_to_alt_text() {
        let events = vec![
            Event::Start(Tag::Image {
                link_type: LinkType::Inline,
                dest_url: CowStr::Borrowed("https://evil.com/bad.png"),
                title: CowStr::Borrowed(""),
                id: CowStr::Borrowed(""),
            }),
            Event::Text(CowStr::Borrowed("fallback alt")),
            Event::End(TagEnd::Image),
        ];
        let out = filter_by_hosts(events, &host_set(&["example.net"]));
        assert_eq!(out.len(), 1);
        match &out[0] {
            Event::Text(t) => assert_eq!(t.as_ref(), "fallback alt"),
            other => panic!("expected Text event, got {other:?}"),
        }
    }

    #[test]
    fn filter_by_hosts_drops_disallowed_image_with_empty_alt_entirely() {
        let events = vec![
            Event::Start(Tag::Image {
                link_type: LinkType::Inline,
                dest_url: CowStr::Borrowed("https://evil.com/x.png"),
                title: CowStr::Borrowed(""),
                id: CowStr::Borrowed(""),
            }),
            Event::End(TagEnd::Image),
        ];
        let out = filter_by_hosts(events, &host_set(&["example.net"]));
        assert!(out.is_empty(), "expected zero events, got {out:?}");
    }

    #[test]
    fn filter_by_hosts_keeps_allowed_images_untouched() {
        let events = vec![
            Event::Start(Tag::Image {
                link_type: LinkType::Inline,
                dest_url: CowStr::Borrowed("https://cdn.example.net/ok.png"),
                title: CowStr::Borrowed(""),
                id: CowStr::Borrowed(""),
            }),
            Event::Text(CowStr::Borrowed("alt")),
            Event::End(TagEnd::Image),
        ];
        let out = filter_by_hosts(events, &host_set(&["*.example.net"]));
        assert_eq!(out.len(), 3);
        assert!(matches!(out[0], Event::Start(Tag::Image { .. })));
    }

    #[test]
    fn filter_by_hosts_leaves_relative_images_alone() {
        // A relative path is expected to survive even an empty allowlist.
        let events = vec![
            Event::Start(Tag::Image {
                link_type: LinkType::Inline,
                dest_url: CowStr::Borrowed("/local/pic.png"),
                title: CowStr::Borrowed(""),
                id: CowStr::Borrowed(""),
            }),
            Event::Text(CowStr::Borrowed("alt")),
            Event::End(TagEnd::Image),
        ];
        let out = filter_by_hosts(events, &host_set(&[]));
        assert_eq!(out.len(), 3);
        assert!(matches!(out[0], Event::Start(Tag::Image { .. })));
    }
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#![forbid(unsafe_code)]
|
|
2
|
+
|
|
3
|
+
use magnus::{function, prelude::*, Error, Ruby};
|
|
4
|
+
|
|
5
|
+
mod autolink;
|
|
6
|
+
mod chunks_by_heading;
|
|
7
|
+
mod chunks_by_size;
|
|
8
|
+
mod document;
|
|
9
|
+
mod emoji;
|
|
10
|
+
mod handler;
|
|
11
|
+
mod heading;
|
|
12
|
+
mod highlight;
|
|
13
|
+
mod image;
|
|
14
|
+
mod link;
|
|
15
|
+
mod options;
|
|
16
|
+
mod plain_text;
|
|
17
|
+
mod scheme_filter;
|
|
18
|
+
mod stats;
|
|
19
|
+
mod tag_filter;
|
|
20
|
+
mod toc;
|
|
21
|
+
mod truncate;
|
|
22
|
+
mod url_match;
|
|
23
|
+
|
|
24
|
+
#[magnus::init]
|
|
25
|
+
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
26
|
+
let inkmark = ruby.define_class("Inkmark", ruby.class_object())?;
|
|
27
|
+
inkmark.define_singleton_method("_native_to_html", function!(document::native_to_html, 2))?;
|
|
28
|
+
inkmark.define_singleton_method("_native_to_markdown", function!(document::native_to_markdown, 2))?;
|
|
29
|
+
inkmark.define_singleton_method("_native_to_plain_text", function!(document::native_to_plain_text, 2))?;
|
|
30
|
+
inkmark.define_singleton_method(
|
|
31
|
+
"_native_chunks_by_heading",
|
|
32
|
+
function!(chunks_by_heading::native_chunks_by_heading, 2),
|
|
33
|
+
)?;
|
|
34
|
+
inkmark.define_singleton_method(
|
|
35
|
+
"_native_chunks_by_size",
|
|
36
|
+
function!(chunks_by_size::native_chunks_by_size, 2),
|
|
37
|
+
)?;
|
|
38
|
+
inkmark.define_singleton_method(
|
|
39
|
+
"_native_truncate_markdown",
|
|
40
|
+
function!(truncate::native_truncate_markdown, 3),
|
|
41
|
+
)?;
|
|
42
|
+
inkmark.define_singleton_method(
|
|
43
|
+
"_native_render_full",
|
|
44
|
+
function!(document::native_render_full, 2),
|
|
45
|
+
)?;
|
|
46
|
+
inkmark.define_singleton_method("_syntax_css", function!(highlight::syntax_css, 1))?;
|
|
47
|
+
inkmark.define_singleton_method("_syntax_themes", function!(highlight::syntax_themes, 0))?;
|
|
48
|
+
inkmark.define_singleton_method("_native_walk", function!(handler::native_walk, 3))?;
|
|
49
|
+
inkmark.define_singleton_method(
|
|
50
|
+
"_native_render_with_handlers",
|
|
51
|
+
function!(handler::native_render_with_handlers, 3),
|
|
52
|
+
)?;
|
|
53
|
+
Ok(())
|
|
54
|
+
}
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
//! External link `rel` attribute injection filter and URL matcher.
|
|
2
|
+
//!
|
|
3
|
+
//! When enabled, replaces the `Start(Tag::Link)` and matching
|
|
4
|
+
//! `End(TagEnd::Link)` events for every external link with hand-built
|
|
5
|
+
//! `<a href="..." rel="nofollow noopener">` / `</a>` HTML events. Inner
|
|
6
|
+
//! events (text, emphasis, inline code, images) pass through unchanged,
|
|
7
|
+
//! so pulldown-cmark's built-in writers still render the link content:
|
|
8
|
+
//! we only replace the opening and closing tags.
|
|
9
|
+
//!
|
|
10
|
+
//! "External" here means the URL starts with `http://` or `https://`
|
|
11
|
+
//! (case-insensitive). Relative paths, anchor fragments, and non-web
|
|
12
|
+
//! schemes (`mailto:`, `tel:`, `javascript:`) are not touched.
|
|
13
|
+
|
|
14
|
+
use globset::GlobSet;
|
|
15
|
+
use pulldown_cmark::{CowStr, Event, Tag, TagEnd};
|
|
16
|
+
use pulldown_cmark_escape::{escape_href, escape_html};
|
|
17
|
+
|
|
18
|
+
use crate::url_match::is_host_allowed;
|
|
19
|
+
|
|
20
|
+
/// Add `rel="nofollow noopener"` to every external `<a>` tag by replacing
|
|
21
|
+
/// its `Start(Link)` event with a synthesized `Event::Html` opening tag
|
|
22
|
+
/// and its matching `End(Link)` event with a `</a>` close tag.
|
|
23
|
+
pub fn add_nofollow(events: Vec<Event<'_>>) -> Vec<Event<'_>> {
|
|
24
|
+
let mut out: Vec<Event<'_>> = Vec::with_capacity(events.len());
|
|
25
|
+
let mut iter = events.into_iter();
|
|
26
|
+
|
|
27
|
+
while let Some(event) = iter.next() {
|
|
28
|
+
match event {
|
|
29
|
+
Event::Start(Tag::Link {
|
|
30
|
+
link_type: _,
|
|
31
|
+
ref dest_url,
|
|
32
|
+
ref title,
|
|
33
|
+
id: _,
|
|
34
|
+
}) if is_external(dest_url) => {
|
|
35
|
+
let open = build_link_open(dest_url, title);
|
|
36
|
+
out.push(Event::Html(CowStr::Boxed(open.into_boxed_str())));
|
|
37
|
+
|
|
38
|
+
// Consume inner events through the matching End(Link),
|
|
39
|
+
// depth-counting so a nested link doesn't break the
|
|
40
|
+
// close-tag pairing. CommonMark disallows nested links
|
|
41
|
+
// in valid markdown, so depth should always reach zero on
|
|
42
|
+
// the first End we see.
|
|
43
|
+
let mut depth: usize = 1;
|
|
44
|
+
for inner in iter.by_ref() {
|
|
45
|
+
let is_link_start = matches!(&inner, Event::Start(Tag::Link { .. }));
|
|
46
|
+
let is_link_end = matches!(&inner, Event::End(TagEnd::Link));
|
|
47
|
+
|
|
48
|
+
if is_link_start {
|
|
49
|
+
depth += 1;
|
|
50
|
+
out.push(inner);
|
|
51
|
+
} else if is_link_end {
|
|
52
|
+
depth -= 1;
|
|
53
|
+
if depth == 0 {
|
|
54
|
+
out.push(Event::Html(CowStr::Borrowed("</a>")));
|
|
55
|
+
break;
|
|
56
|
+
}
|
|
57
|
+
out.push(inner);
|
|
58
|
+
} else {
|
|
59
|
+
out.push(inner);
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
other => out.push(other),
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
out
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/// Drop `<a>` tags whose destination URL's host isn't in the allowlist,
|
|
71
|
+
/// leaving the inner content (text, emphasis, images) in place as a
|
|
72
|
+
/// bare phrase. Non-web URLs (relative paths, `mailto:`, etc.) pass
|
|
73
|
+
/// through.
|
|
74
|
+
pub fn filter_by_hosts<'a>(events: Vec<Event<'a>>, set: &GlobSet) -> Vec<Event<'a>> {
|
|
75
|
+
let mut out: Vec<Event<'a>> = Vec::with_capacity(events.len());
|
|
76
|
+
let mut iter = events.into_iter();
|
|
77
|
+
|
|
78
|
+
while let Some(event) = iter.next() {
|
|
79
|
+
match event {
|
|
80
|
+
Event::Start(Tag::Link { ref dest_url, .. }) if !is_host_allowed(dest_url, set) => {
|
|
81
|
+
let mut depth: usize = 1;
|
|
82
|
+
for inner in iter.by_ref() {
|
|
83
|
+
match &inner {
|
|
84
|
+
Event::Start(Tag::Link { .. }) => {
|
|
85
|
+
depth += 1;
|
|
86
|
+
out.push(inner);
|
|
87
|
+
}
|
|
88
|
+
Event::End(TagEnd::Link) => {
|
|
89
|
+
depth -= 1;
|
|
90
|
+
if depth == 0 {
|
|
91
|
+
break;
|
|
92
|
+
}
|
|
93
|
+
out.push(inner);
|
|
94
|
+
}
|
|
95
|
+
_ => out.push(inner),
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
other => out.push(other),
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
out
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/// Report whether `url` is an absolute web URL.
///
/// The text before the first `://` must equal `http` or `https`, compared
/// ASCII case-insensitively. Anything without a `://` separator (relative
/// paths, `#fragment`s, `mailto:` / `tel:` / `javascript:` URLs) yields
/// `false`, as does a protocol-relative `//host` URL.
#[inline]
fn is_external(url: &str) -> bool {
    match url.split_once("://") {
        Some((scheme, _rest)) => ["http", "https"]
            .iter()
            .any(|web| scheme.eq_ignore_ascii_case(web)),
        None => false,
    }
}
|
|
115
|
+
|
|
116
|
+
/// Construct the `<a href="..." title="..." rel="nofollow noopener">`
|
|
117
|
+
/// opening tag. The URL goes through `escape_href` (percent-encoding +
|
|
118
|
+
/// HTML-special escaping, matching pulldown-cmark's upstream behavior),
|
|
119
|
+
/// and the title through `escape_html` for attribute context.
|
|
120
|
+
#[inline]
|
|
121
|
+
fn build_link_open(href: &str, title: &str) -> String {
|
|
122
|
+
let mut out = String::with_capacity(40 + href.len() + title.len());
|
|
123
|
+
out.push_str("<a href=\"");
|
|
124
|
+
let _ = escape_href(&mut out, href);
|
|
125
|
+
out.push('"');
|
|
126
|
+
if !title.is_empty() {
|
|
127
|
+
out.push_str(" title=\"");
|
|
128
|
+
let _ = escape_html(&mut out, title);
|
|
129
|
+
out.push('"');
|
|
130
|
+
}
|
|
131
|
+
out.push_str(" rel=\"nofollow noopener\">");
|
|
132
|
+
out
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
#[cfg(test)]
mod tests {
    use super::{add_nofollow, build_link_open, filter_by_hosts, is_external};
    use globset::{Glob, GlobSetBuilder};
    use pulldown_cmark::{CowStr, Event, LinkType, Tag, TagEnd};

    /// Build a `GlobSet` allowlist from literal glob patterns.
    fn host_set(patterns: &[&str]) -> globset::GlobSet {
        let mut b = GlobSetBuilder::new();
        for p in patterns {
            b.add(Glob::new(p).unwrap());
        }
        b.build().unwrap()
    }

    #[test]
    fn external_detection() {
        assert!(is_external("http://example.net"));
        assert!(is_external("https://example.net"));
        assert!(is_external("HTTPS://EXAMPLE.NET"));
        assert!(is_external("Http://mixed.case"));

        assert!(!is_external("/local/path"));
        assert!(!is_external("relative.html"));
        assert!(!is_external("#anchor"));
        assert!(!is_external("mailto:user@example.net"));
        assert!(!is_external("tel:+1234567890"));
        assert!(!is_external("javascript:alert(1)"));
        assert!(!is_external("//protocol-relative.com"));
        assert!(!is_external(""));
        assert!(!is_external("h"));
        assert!(!is_external("http"));
        assert!(!is_external("https"));
    }

    #[test]
    fn open_tag_basic() {
        assert_eq!(
            build_link_open("https://example.net", ""),
            r#"<a href="https://example.net" rel="nofollow noopener">"#
        );
    }

    #[test]
    fn open_tag_with_title() {
        assert_eq!(
            build_link_open("https://example.net", "the title"),
            r#"<a href="https://example.net" title="the title" rel="nofollow noopener">"#
        );
    }

    #[test]
    fn open_tag_escapes_url_ampersand() {
        // `&` in the URL must be written as the `&amp;` entity.
        let tag = build_link_open("https://example.net/?a=1&b=2", "");
        assert!(tag.contains(r#"href="https://example.net/?a=1&amp;b=2""#));
    }

    #[test]
    fn open_tag_escapes_title_specials() {
        // Quotes and angle brackets in the title must come out as entities.
        let tag = build_link_open("https://example.net", r#"a "quoted" <title>"#);
        assert!(tag.contains("&quot;quoted&quot;"));
        assert!(tag.contains("&lt;title&gt;"));
    }

    #[test]
    fn add_nofollow_adds_rel_to_external_link() {
        // Start(Link) + Text("click") + End(Link) → Html open + Text + Html close
        let events = vec![
            Event::Start(Tag::Link {
                link_type: LinkType::Inline,
                dest_url: CowStr::Borrowed("https://example.net"),
                title: CowStr::Borrowed(""),
                id: CowStr::Borrowed(""),
            }),
            Event::Text(CowStr::Borrowed("click")),
            Event::End(TagEnd::Link),
        ];
        let out = add_nofollow(events);
        // Should produce: Html(open), Text("click"), Html("</a>")
        assert_eq!(out.len(), 3, "expected 3 events, got {}", out.len());
        match &out[0] {
            Event::Html(html) => {
                assert!(
                    html.contains("nofollow"),
                    "opening tag must contain nofollow: {html}"
                );
                assert!(
                    html.contains("https://example.net"),
                    "opening tag must contain href: {html}"
                );
            }
            other => panic!("expected Html open event, got {other:?}"),
        }
        match &out[2] {
            Event::Html(html) => assert_eq!(html.as_ref(), "</a>"),
            other => panic!("expected Html close event, got {other:?}"),
        }
    }

    #[test]
    fn filter_by_hosts_drops_disallowed_link_tags_keeping_text() {
        // Start(Link to evil) + Text("click me") + End(Link) →
        // just Text("click me"), with the link wrapper gone.
        let events = vec![
            Event::Start(Tag::Link {
                link_type: LinkType::Inline,
                dest_url: CowStr::Borrowed("https://evil.com"),
                title: CowStr::Borrowed(""),
                id: CowStr::Borrowed(""),
            }),
            Event::Text(CowStr::Borrowed("click me")),
            Event::End(TagEnd::Link),
        ];
        let out = filter_by_hosts(events, &host_set(&["example.net"]));
        assert_eq!(out.len(), 1);
        match &out[0] {
            Event::Text(t) => assert_eq!(t.as_ref(), "click me"),
            other => panic!("expected Text, got {other:?}"),
        }
    }

    #[test]
    fn filter_by_hosts_keeps_allowed_links_untouched() {
        let events = vec![
            Event::Start(Tag::Link {
                link_type: LinkType::Inline,
                dest_url: CowStr::Borrowed("https://cdn.example.net/doc"),
                title: CowStr::Borrowed(""),
                id: CowStr::Borrowed(""),
            }),
            Event::Text(CowStr::Borrowed("ok")),
            Event::End(TagEnd::Link),
        ];
        let out = filter_by_hosts(events, &host_set(&["*.example.net"]));
        assert_eq!(out.len(), 3);
        assert!(matches!(out[0], Event::Start(Tag::Link { .. })));
        assert!(matches!(out[2], Event::End(TagEnd::Link)));
    }

    #[test]
    fn filter_by_hosts_leaves_relative_and_mailto_alone() {
        // Even with an empty allowlist that blocks everything external,
        // a relative link passes through unchanged.
        let events = vec![
            Event::Start(Tag::Link {
                link_type: LinkType::Inline,
                dest_url: CowStr::Borrowed("/local"),
                title: CowStr::Borrowed(""),
                id: CowStr::Borrowed(""),
            }),
            Event::Text(CowStr::Borrowed("home")),
            Event::End(TagEnd::Link),
        ];
        let out = filter_by_hosts(events, &host_set(&[]));
        assert_eq!(out.len(), 3);
        assert!(matches!(out[0], Event::Start(Tag::Link { .. })));
    }
}
|