inkmark 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Cargo.lock +940 -0
- data/Cargo.toml +27 -0
- data/LICENSE.txt +21 -0
- data/NOTICE +16 -0
- data/README.md +1166 -0
- data/ext/inkmark/Cargo.toml +31 -0
- data/ext/inkmark/build.rs +5 -0
- data/ext/inkmark/extconf.rb +6 -0
- data/ext/inkmark/src/autolink.rs +167 -0
- data/ext/inkmark/src/chunks_by_heading.rs +325 -0
- data/ext/inkmark/src/chunks_by_size.rs +302 -0
- data/ext/inkmark/src/document.rs +411 -0
- data/ext/inkmark/src/emoji.rs +197 -0
- data/ext/inkmark/src/handler.rs +758 -0
- data/ext/inkmark/src/heading.rs +262 -0
- data/ext/inkmark/src/highlight.rs +202 -0
- data/ext/inkmark/src/image.rs +284 -0
- data/ext/inkmark/src/lib.rs +54 -0
- data/ext/inkmark/src/link.rs +291 -0
- data/ext/inkmark/src/options.rs +231 -0
- data/ext/inkmark/src/plain_text.rs +445 -0
- data/ext/inkmark/src/scheme_filter.rs +319 -0
- data/ext/inkmark/src/stats.rs +453 -0
- data/ext/inkmark/src/tag_filter.rs +226 -0
- data/ext/inkmark/src/toc.rs +221 -0
- data/ext/inkmark/src/truncate.rs +267 -0
- data/ext/inkmark/src/url_match.rs +178 -0
- data/lib/inkmark/event.rb +342 -0
- data/lib/inkmark/native.rb +8 -0
- data/lib/inkmark/options.rb +698 -0
- data/lib/inkmark/toc.rb +40 -0
- data/lib/inkmark/version.rb +6 -0
- data/lib/inkmark.rb +711 -0
- data/sig/inkmark.rbs +219 -0
- metadata +208 -0
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
//! Streaming scheme-allowlist filter over pulldown-cmark events.
|
|
2
|
+
//!
|
|
3
|
+
//! Wraps any `Iterator<Item = Event<'a>>` and filters out markdown-emitted
|
|
4
|
+
//! Link/Image events whose URL scheme is not in the respective allowlist.
|
|
5
|
+
//!
|
|
6
|
+
//! This replaces:
|
|
7
|
+
//!
|
|
8
|
+
//! - Blocked link: `Start(Link)` and matching `End(Link)` both dropped;
|
|
9
|
+
//! inner events (text, emphasis, nested markup) pass through as bare
|
|
10
|
+
//! content. Defensive depth counting handles malformed nested links.
|
|
11
|
+
//! - Blocked image: the entire `Start(Image) ... End(Image)` sequence is
|
|
12
|
+
//! replaced with a single `Event::Text` carrying the accumulated alt
|
|
13
|
+
//! text, or nothing if alt was empty.
|
|
14
|
+
//! - `Option<&[String]>` is `None` for "don't filter this kind" (caller
|
|
15
|
+
//! opted out). Empty slice `Some(&[])` blocks every absolute URL.
|
|
16
|
+
|
|
17
|
+
use pulldown_cmark::{CowStr, Event, Tag, TagEnd};
|
|
18
|
+
|
|
19
|
+
use crate::url_match::is_scheme_allowed;
|
|
20
|
+
|
|
21
|
+
/// Streaming adapter that drops Link/Image events with disallowed schemes.
|
|
22
|
+
///
|
|
23
|
+
/// `'e` is the lifetime of the `Event` borrows from the underlying parser,
|
|
24
|
+
/// `'s` is the lifetime of the scheme-allowlist slices.
|
|
25
|
+
/// Separate lifetimes matter because in the fast path the slices come
|
|
26
|
+
/// from `&'static` `OnceLock` storage (outlives everything), while events
|
|
27
|
+
/// are tied to the source string—tying them into one lifetime would
|
|
28
|
+
/// constrain callers unnecessarily.
|
|
29
|
+
pub struct SchemeFilter<'e, 's, I: Iterator<Item = Event<'e>>> {
|
|
30
|
+
inner: I,
|
|
31
|
+
link_allowed: Option<&'s [String]>,
|
|
32
|
+
image_allowed: Option<&'s [String]>,
|
|
33
|
+
// State for in-progress link drop. 0 = not skipping; N>0 = inside a
|
|
34
|
+
// blocked link with N nested Link starts yet to close. CommonMark
|
|
35
|
+
// disallows nested links so this is usually 1, but the counter keeps
|
|
36
|
+
// us correct on malformed / extension-emitted streams.
|
|
37
|
+
skipping_link_depth: usize,
|
|
38
|
+
// State for in-progress image drop.
|
|
39
|
+
skipping_image: bool,
|
|
40
|
+
image_alt: String,
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
impl<'e, 's, I: Iterator<Item = Event<'e>>> SchemeFilter<'e, 's, I> {
|
|
44
|
+
pub fn new(
|
|
45
|
+
inner: I,
|
|
46
|
+
link_allowed: Option<&'s [String]>,
|
|
47
|
+
image_allowed: Option<&'s [String]>,
|
|
48
|
+
) -> Self {
|
|
49
|
+
Self {
|
|
50
|
+
inner,
|
|
51
|
+
link_allowed,
|
|
52
|
+
image_allowed,
|
|
53
|
+
skipping_link_depth: 0,
|
|
54
|
+
skipping_image: false,
|
|
55
|
+
image_alt: String::new(),
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
impl<'e, 's, I: Iterator<Item = Event<'e>>> Iterator for SchemeFilter<'e, 's, I> {
|
|
61
|
+
type Item = Event<'e>;
|
|
62
|
+
|
|
63
|
+
fn next(&mut self) -> Option<Event<'e>> {
|
|
64
|
+
loop {
|
|
65
|
+
// Image skipping mode
|
|
66
|
+
if self.skipping_image {
|
|
67
|
+
match self.inner.next()? {
|
|
68
|
+
Event::End(TagEnd::Image) => {
|
|
69
|
+
self.skipping_image = false;
|
|
70
|
+
if !self.image_alt.is_empty() {
|
|
71
|
+
let alt = std::mem::take(&mut self.image_alt);
|
|
72
|
+
return Some(Event::Text(CowStr::Boxed(alt.into_boxed_str())));
|
|
73
|
+
}
|
|
74
|
+
continue;
|
|
75
|
+
}
|
|
76
|
+
Event::Text(t) | Event::Code(t) => {
|
|
77
|
+
self.image_alt.push_str(&t);
|
|
78
|
+
continue;
|
|
79
|
+
}
|
|
80
|
+
_ => continue,
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Link skipping mode
|
|
85
|
+
if self.skipping_link_depth > 0 {
|
|
86
|
+
let ev = self.inner.next()?;
|
|
87
|
+
match &ev {
|
|
88
|
+
Event::Start(Tag::Link { .. }) => {
|
|
89
|
+
self.skipping_link_depth += 1;
|
|
90
|
+
return Some(ev);
|
|
91
|
+
}
|
|
92
|
+
Event::End(TagEnd::Link) => {
|
|
93
|
+
self.skipping_link_depth -= 1;
|
|
94
|
+
if self.skipping_link_depth == 0 {
|
|
95
|
+
continue;
|
|
96
|
+
} else {
|
|
97
|
+
return Some(ev);
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
_ => return Some(ev),
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// Check each event for a drop trigger
|
|
105
|
+
let ev = self.inner.next()?;
|
|
106
|
+
match &ev {
|
|
107
|
+
Event::Start(Tag::Link { dest_url, .. })
|
|
108
|
+
if self
|
|
109
|
+
.link_allowed
|
|
110
|
+
.is_some_and(|s| !is_scheme_allowed(dest_url, s)) =>
|
|
111
|
+
{
|
|
112
|
+
self.skipping_link_depth = 1;
|
|
113
|
+
continue;
|
|
114
|
+
}
|
|
115
|
+
Event::Start(Tag::Image { dest_url, .. })
|
|
116
|
+
if self
|
|
117
|
+
.image_allowed
|
|
118
|
+
.is_some_and(|s| !is_scheme_allowed(dest_url, s)) =>
|
|
119
|
+
{
|
|
120
|
+
self.skipping_image = true;
|
|
121
|
+
self.image_alt.clear();
|
|
122
|
+
continue;
|
|
123
|
+
}
|
|
124
|
+
_ => return Some(ev),
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
#[cfg(test)]
mod tests {
    use super::SchemeFilter;
    use pulldown_cmark::{CowStr, Event, LinkType, Tag, TagEnd};

    /// Builds an owned, lowercased allowlist from string literals.
    fn schemes(s: &[&str]) -> Vec<String> {
        let mut list = Vec::with_capacity(s.len());
        for name in s {
            list.push(name.to_ascii_lowercase());
        }
        list
    }

    /// Feeds `events` through a `SchemeFilter` and collects the output.
    fn run<'a>(
        events: Vec<Event<'a>>,
        link: Option<&'a [String]>,
        image: Option<&'a [String]>,
    ) -> Vec<Event<'a>> {
        let filter = SchemeFilter::new(events.into_iter(), link, image);
        filter.collect()
    }

    /// `Start(Link)` for an inline link to `url` with empty title and id.
    fn link_start(url: &str) -> Event<'_> {
        Event::Start(Tag::Link {
            link_type: LinkType::Inline,
            dest_url: CowStr::Borrowed(url),
            title: CowStr::Borrowed(""),
            id: CowStr::Borrowed(""),
        })
    }

    /// `Start(Image)` for an inline image at `url` with empty title and id.
    fn image_start(url: &str) -> Event<'_> {
        Event::Start(Tag::Image {
            link_type: LinkType::Inline,
            dest_url: CowStr::Borrowed(url),
            title: CowStr::Borrowed(""),
            id: CowStr::Borrowed(""),
        })
    }

    /// Borrowed text event.
    fn text(s: &str) -> Event<'_> {
        Event::Text(CowStr::Borrowed(s))
    }

    #[test]
    fn passes_allowed_link_through() {
        let allowed = schemes(&["https"]);
        let events = vec![
            link_start("https://example.net"),
            text("t"),
            Event::End(TagEnd::Link),
        ];
        let out = run(events, Some(&allowed), None);
        assert_eq!(out.len(), 3);
        assert!(matches!(out[0], Event::Start(Tag::Link { .. })));
    }

    #[test]
    fn drops_blocked_link_keeps_text() {
        let allowed = schemes(&["http", "https"]);
        let events = vec![
            link_start("javascript:alert(1)"),
            text("click"),
            Event::End(TagEnd::Link),
        ];
        let out = run(events, Some(&allowed), None);
        assert_eq!(out.len(), 1);
        match &out[0] {
            Event::Text(t) => assert_eq!(t.as_ref(), "click"),
            other => panic!("expected Text, got {other:?}"),
        }
    }

    #[test]
    fn drops_blocked_image_to_alt() {
        let allowed = schemes(&["http", "https"]);
        let events = vec![
            image_start("data:image/svg+xml,<svg/>"),
            text("fallback"),
            Event::End(TagEnd::Image),
        ];
        let out = run(events, None, Some(&allowed));
        assert_eq!(out.len(), 1);
        match &out[0] {
            Event::Text(t) => assert_eq!(t.as_ref(), "fallback"),
            other => panic!("expected Text, got {other:?}"),
        }
    }

    #[test]
    fn drops_blocked_image_empty_alt_entirely() {
        let allowed = schemes(&["https"]);
        let events = vec![
            image_start("data:image/svg+xml,<svg/>"),
            Event::End(TagEnd::Image),
        ];
        let out = run(events, None, Some(&allowed));
        assert!(out.is_empty());
    }

    #[test]
    fn fuses_link_and_image_filters_in_one_pass() {
        let link = schemes(&["https"]);
        let image = schemes(&["https"]);
        let events = vec![
            link_start("javascript:alert(1)"),
            text("bad link"),
            Event::End(TagEnd::Link),
            text(" and "),
            image_start("data:x"),
            text("bad pic"),
            Event::End(TagEnd::Image),
        ];
        let out = run(events, Some(&link), Some(&image));
        // Expected output: Text("bad link"), Text(" and "), Text("bad pic").
        assert_eq!(out.len(), 3);
        assert!(matches!(out[0], Event::Text(_)));
        assert!(matches!(out[1], Event::Text(_)));
        assert!(matches!(out[2], Event::Text(_)));
    }

    #[test]
    fn relative_urls_pass_through() {
        // An empty allowlist blocks every absolute URL, yet relative URLs
        // must still be let through.
        let allowed = schemes(&[]);
        let events = vec![
            link_start("/local"),
            text("home"),
            Event::End(TagEnd::Link),
        ];
        let out = run(events, Some(&allowed), None);
        assert_eq!(out.len(), 3);
        assert!(matches!(out[0], Event::Start(Tag::Link { .. })));
    }

    #[test]
    fn none_allowlist_means_no_filter() {
        let events = vec![
            link_start("javascript:alert(1)"),
            text("click"),
            Event::End(TagEnd::Link),
        ];
        let out = run(events, None, None);
        assert_eq!(out.len(), 3);
        assert!(matches!(out[0], Event::Start(Tag::Link { .. })));
    }

    #[test]
    fn independent_link_and_image_control() {
        // Link filtering is off and image filtering is on, so only the
        // image is expected to be dropped.
        let image = schemes(&["https"]);
        let events = vec![
            link_start("javascript:x"),
            text("stays"),
            Event::End(TagEnd::Link),
            image_start("data:x"),
            text("dropped"),
            Event::End(TagEnd::Image),
        ];
        let out = run(events, None, Some(&image));
        // The link contributes 3 events; the image collapses to 1 Text.
        assert_eq!(out.len(), 4);
        assert!(matches!(out[0], Event::Start(Tag::Link { .. })));
        assert!(matches!(out[3], Event::Text(_)));
    }
}
|